# Hybrid Approach for Autonomous Assessment of Domain Specific Comparative Questions

In [1]:
import logging
from pprint import pprint
from gensim import corpora, models, similarities
#corpora: This module implements the concept of a Dictionary – a mapping between words and their integer ids.

In [2]:
documents = ["SOLUTION: A multiprogramming operating system is a software that loads one or more programs in main memory and executes them using a single CPU (central processing unit). In fact, the CPU executes only one program at a time while other programs are waiting in queue. In multiprogramming system when one program is busy with input/output operation, the CPU executes another program that is in queue. In this way, multiprogramming operating system uses the CPU time and other resources of computer to improve the performance of computer. A multitasking operating system is a software that has a single CPU. The CPU executes only one program at a time but it rapidly switches between multiple programs and it appears as if all the users' programs are being executes at the same time", 
             "NOTES: In a multiprogramming system there are one or more programs loaded in main memory which are ready to execute. Only one program at a time is able to get the CPU for executing its instructions while all the others are waiting their turn. The main idea of multiprogramming is to maximize the use of CPU time. Indeed, suppose the currently running process is performing an I/O task which, by definition, does not need the CPU to be accomplished. Then, the OS may interrupt that process and give the control to one of the other in-main-memory programs that are ready to execute i.e. process context switching. In this way, no CPU time is wasted by the system waiting for the I/O task to be completed, and a running process keeps executing until either it voluntarily releases the CPU or when it blocks for an I/O operation. Therefore, the ultimate goal of multiprogramming is to keep the CPU busy as long as there are processes ready to execute. Note that in order for such a system to function properly, the OS must be able to load multiple programs into separate areas of the main memory and provide the required protection to avoid the chance of one process being modified by another one. Other problems that need to be addressed when having multiple programs in memory is fragmentation as programs enter or leave the main memory. Another issue that needs to be handled as well is that large programs may not fit at once in memory which can be solved by using pagination and virtual memory. Please, refer to this article for more details on that. Finally, note that if there are N ready processes and all of those are highly CPU-bound i.e., they mostly execute CPU tasks and none or very few I/O operations, in the very worst case one program might wait all the other N-1 ones to complete before executing. Multitasking has the same meaning of multiprogramming but in a more general sense, as it refers to having multiple programs, processes, tasks, threads running at the same time. This term is used in modern operating systems when multiple tasks share a common processing resource e.g. CPU and Memory. At any time the CPU is executing one task only while other tasks waiting their turn. The illusion of parallelism is achieved when the CPU is reassigned to another task i.e. process or thread context switching. There are subtle differences between multitasking and multiprogramming. A task in a multitasking operating system is not a whole application program but it can also refer to a thread of execution when one process is divided into sub-tasks. Each smaller task does not hijack the CPU until it finishes like in the older multiprogramming but rather a fair share amount of the CPU time called quantum. Just to make it easy to remember, both multiprogramming and multitasking operating systems are (CPU) time sharing systems. However, while in multiprogramming older OSs one program as a whole keeps running until it blocks, in multitasking modern OSs time sharing is best manifested because each running process takes only a fair quantum of the CPU time."]

In [3]:
stoplist = set( 'for of a the and to in'.split() )
texts = [ [word for word in document.lower().split() if word not in stoplist ] for document in documents ]
pprint( texts )


[['solution:',
  'multiprogramming',
  'operating',
  'system',
  'is',
  'software',
  'that',
  'loads',
  'one',
  'or',
  'more',
  'programs',
  'main',
  'memory',
  'executes',
  'them',
  'using',
  'single',
  'cpu',
  '(central',
  'processing',
  'unit).',
  'fact,',
  'cpu',
  'executes',
  'only',
  'one',
  'program',
  'at',
  'time',
  'while',
  'other',
  'programs',
  'are',
  'waiting',
  'queue.',
  'multiprogramming',
  'system',
  'when',
  'one',
  'program',
  'is',
  'busy',
  'with',
  'input/output',
  'operation,',
  'cpu',
  'executes',
  'another',
  'program',
  'that',
  'is',
  'queue.',
  'this',
  'way,',
  'multiprogramming',
  'operating',
  'system',
  'uses',
  'cpu',
  'time',
  'other',
  'resources',
  'computer',
  'improve',
  'performance',
  'computer.',
  'multitasking',
  'operating',
  'system',
  'is',
  'software',
  'that',
  'has',
  'single',
  'cpu.',
  'cpu',
  'executes',
  'only',
  'one',
  'program',
  'at',
  'time',
  'but'

In [4]:
from collections import defaultdict
frequency = defaultdict( int )
for text in texts:
    for token in text:
        frequency[ token ] += 1
pprint( frequency )


defaultdict(<class 'int'>,
            {'(central': 1,
             '(cpu)': 1,
             'able': 2,
             'accomplished.': 1,
             'achieved': 1,
             'addressed': 1,
             'all': 4,
             'also': 1,
             'amount': 1,
             'an': 2,
             'another': 4,
             'any': 1,
             'appears': 1,
             'application': 1,
             'are': 11,
             'areas': 1,
             'article': 1,
             'as': 7,
             'at': 7,
             'avoid': 1,
             'be': 6,
             'because': 1,
             'before': 1,
             'being': 2,
             'best': 1,
             'between': 2,
             'blocks': 1,
             'blocks,': 1,
             'both': 1,
             'busy': 2,
             'but': 4,
             'by': 4,
             'called': 1,
             'can': 2,
             'case': 1,
             'chance': 1,
             'common': 1,
             'complete': 1,
        

In [5]:
texts2 = [ [ token for token in text if frequency[ token ] > 1 ] for text in texts ]
 
pprint( texts2 )
dictionary = corpora.Dictionary( texts2 )
print( dictionary )
print( dictionary.token2id )


[['multiprogramming',
  'operating',
  'system',
  'is',
  'software',
  'that',
  'one',
  'or',
  'more',
  'programs',
  'main',
  'memory',
  'executes',
  'using',
  'single',
  'cpu',
  'processing',
  'cpu',
  'executes',
  'only',
  'one',
  'program',
  'at',
  'time',
  'while',
  'other',
  'programs',
  'are',
  'waiting',
  'queue.',
  'multiprogramming',
  'system',
  'when',
  'one',
  'program',
  'is',
  'busy',
  'cpu',
  'executes',
  'another',
  'program',
  'that',
  'is',
  'queue.',
  'this',
  'way,',
  'multiprogramming',
  'operating',
  'system',
  'cpu',
  'time',
  'other',
  'multitasking',
  'operating',
  'system',
  'is',
  'software',
  'that',
  'has',
  'single',
  'cpu',
  'executes',
  'only',
  'one',
  'program',
  'at',
  'time',
  'but',
  'it',
  'between',
  'multiple',
  'programs',
  'it',
  'as',
  'if',
  'all',
  'programs',
  'are',
  'being',
  'executes',
  'at',
  'same',
  'time'],
 ['multiprogramming',
  'system',
  'there',
  'ar

In [14]:
query1 = 'QUIZ1: Multiprogramming is a system software that loads one or more programs in main memory and execute by single CPU. In fact, CPU executes the program one by one and other programs are waiting in queue. If one program is not ready to execute then other program in queue is executed.Multitasking operating system is a system software that performs multiple tasks in a single CPU. CPU executes one task at a time and other are waiting for execution.'
query1vec = dictionary.doc2bow( query1.split() )# convert into bag of words
         
 
corpus = [ dictionary.doc2bow( text ) for text in texts ]
corpora.MmCorpus.serialize( './deerwster.mm', corpus )
corpora.SvmLightCorpus.serialize('./corpus.svmlight', corpus)
tfidf = models.TfidfModel( corpus ) #term frequency-inverse document frequency

corpus_tfidf = tfidf[ corpus ]
for doc in corpus_tfidf:
    print( doc )


[(10, 0.8219949365267865), (29, 0.3287979746107146), (31, 0.3287979746107146), (32, 0.3287979746107146)]
[(42, 0.0982946374365981), (43, 0.0982946374365981), (44, 0.29488391230979427), (45, 0.1965892748731962), (46, 0.0982946374365981), (47, 0.0982946374365981), (48, 0.0982946374365981), (49, 0.0982946374365981), (50, 0.0982946374365981), (51, 0.0982946374365981), (52, 0.14744195615489714), (53, 0.0982946374365981), (54, 0.0982946374365981), (55, 0.0982946374365981), (56, 0.1965892748731962), (57, 0.0982946374365981), (58, 0.0982946374365981), (59, 0.0982946374365981), (60, 0.14744195615489714), (61, 0.0982946374365981), (62, 0.0982946374365981), (63, 0.1965892748731962), (64, 0.0982946374365981), (65, 0.0982946374365981), (66, 0.0982946374365981), (67, 0.0982946374365981), (68, 0.3931785497463924), (69, 0.0982946374365981), (70, 0.1965892748731962), (71, 0.0982946374365981), (72, 0.24573659359149524), (73, 0.0982946374365981), (74, 0.0982946374365981), (75, 0.0982946374365981), (76, 0

In [15]:
# latent semantic analysis
lsi = models.LsiModel( corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity( lsi[ corpus ] )
veclsi = lsi[ query1vec ]

sims = index[ veclsi ]
for i, sim in enumerate( sims):
    pprint( query1+":::"+documents[i]+". Similarity_score_is {}".format( sim*100 )+ "%")
result = (sims*100)
b = sum(result)
print ("similarity of Quize1 is ", b/2, "%")


('QUIZ1: Multiprogramming is a system software that loads one or more programs '
 'in main memory and execute by single CPU. In fact, CPU executes the program '
 'one by one and other programs are waiting in queue. If one program is not '
 'ready to execute then other program in queue is executed.Multitasking '
 'operating system is a system software that performs multiple tasks in a '
 'single CPU. CPU executes one task at a time and other are waiting for '
 'execution.:::SOLUTION: A multiprogramming operating system is a software '
 'that loads one or more programs in main memory and executes them using a '
 'single CPU (central processing unit). In fact, the CPU executes only one '
 'program at a time while other programs are waiting in queue. In '
 'multiprogramming system when one program is busy with input/output '
 'operation, the CPU executes another program that is in queue. In this way, '
 'multiprogramming operating system uses the CPU time and other resources of '
 'compute

In [16]:
query2 = 'QUIZ2: Multiprogramming operating system is a software that loads one or more programs in main memory, execute them using a CPU. It executes the only one program at the same time and other programs are waiting in queue. If one program is busy in input/output operation the system will execute the other program. While on the other hand multitasking operating system is a software that perform multiple task using a single CPU. It executes only one program at a time. It rapidly switches between the programs and appear if all the programs are executed at the same time.'
query2vec = dictionary.doc2bow( query2.split() )
         
 
corpus = [ dictionary.doc2bow( text ) for text in texts ]
corpora.MmCorpus.serialize( './deerwster.mm', corpus )
corpora.SvmLightCorpus.serialize('./corpus.svmlight', corpus)
tfidf = models.TfidfModel( corpus )
corpus_tfidf = tfidf[ corpus ]
for doc in corpus_tfidf:
    print( doc )
# latent semantic analysis
lsi = models.LsiModel( corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity( lsi[ corpus ] )
veclsi = lsi[ query2vec ]

sims = index[ veclsi ]
for i, sim in enumerate( sims):
    pprint( query2+":::"+documents[i]+". Similarity_score_is {}".format( sim*100 )+ "%")
result = (sims*100)
b = sum(result)
print ("similarity of Quize2 is ", b/2, "%")


[(10, 0.8219949365267865), (29, 0.3287979746107146), (31, 0.3287979746107146), (32, 0.3287979746107146)]
[(42, 0.0982946374365981), (43, 0.0982946374365981), (44, 0.29488391230979427), (45, 0.1965892748731962), (46, 0.0982946374365981), (47, 0.0982946374365981), (48, 0.0982946374365981), (49, 0.0982946374365981), (50, 0.0982946374365981), (51, 0.0982946374365981), (52, 0.14744195615489714), (53, 0.0982946374365981), (54, 0.0982946374365981), (55, 0.0982946374365981), (56, 0.1965892748731962), (57, 0.0982946374365981), (58, 0.0982946374365981), (59, 0.0982946374365981), (60, 0.14744195615489714), (61, 0.0982946374365981), (62, 0.0982946374365981), (63, 0.1965892748731962), (64, 0.0982946374365981), (65, 0.0982946374365981), (66, 0.0982946374365981), (67, 0.0982946374365981), (68, 0.3931785497463924), (69, 0.0982946374365981), (70, 0.1965892748731962), (71, 0.0982946374365981), (72, 0.24573659359149524), (73, 0.0982946374365981), (74, 0.0982946374365981), (75, 0.0982946374365981), (76, 0

In [17]:
query3 = 'QUIZ3: A multiprogramming operating system loads more than one program into the main memory and executes them one by one. It executes one program while other programs are on waiting. When one program is busy with input/output operation the CPU execute another program. While on the other side multitasking operating system performs the tasks on the same time on a single CPU. CPU executes only one program at a time and switches between multiple programs.'
query3vec = dictionary.doc2bow( query3.split() )
         
 
corpus = [ dictionary.doc2bow( text ) for text in texts ]
corpora.MmCorpus.serialize( './deerwster.mm', corpus )
corpora.SvmLightCorpus.serialize('./corpus.svmlight', corpus)
tfidf = models.TfidfModel( corpus )
corpus_tfidf = tfidf[ corpus ]
for doc in corpus_tfidf:
    print( doc )
# latent semantic analysis
lsi = models.LsiModel( corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity( lsi[ corpus ] )
veclsi = lsi[ query3vec ]

sims = index[ veclsi ]
for i, sim in enumerate( sims):
    pprint( query3+":::"+documents[i]+". Similarity_score_is {}".format( sim*100 )+ "%")
result = (sims*100)
b = sum(result)
print ("similarity of Quize3 is ", b/2, "%")
    


[(10, 0.8219949365267865), (29, 0.3287979746107146), (31, 0.3287979746107146), (32, 0.3287979746107146)]
[(42, 0.0982946374365981), (43, 0.0982946374365981), (44, 0.29488391230979427), (45, 0.1965892748731962), (46, 0.0982946374365981), (47, 0.0982946374365981), (48, 0.0982946374365981), (49, 0.0982946374365981), (50, 0.0982946374365981), (51, 0.0982946374365981), (52, 0.14744195615489714), (53, 0.0982946374365981), (54, 0.0982946374365981), (55, 0.0982946374365981), (56, 0.1965892748731962), (57, 0.0982946374365981), (58, 0.0982946374365981), (59, 0.0982946374365981), (60, 0.14744195615489714), (61, 0.0982946374365981), (62, 0.0982946374365981), (63, 0.1965892748731962), (64, 0.0982946374365981), (65, 0.0982946374365981), (66, 0.0982946374365981), (67, 0.0982946374365981), (68, 0.3931785497463924), (69, 0.0982946374365981), (70, 0.1965892748731962), (71, 0.0982946374365981), (72, 0.24573659359149524), (73, 0.0982946374365981), (74, 0.0982946374365981), (75, 0.0982946374365981), (76, 0

In [18]:
query4 = 'QUIZ4: Multiprogramming operating system is the software that loads multiple programs or more than one program on a single CPU. In multiprogramming system the multiple programs are loads in single CPU but programs are loads one by one. When one program is on the stage of loading the program wait in a queue till at the first program is executed. In one CPU or single CPU the programs is executed in a queue. In multiprogramming the CPU and other resources make the performance of computer. While on the other hand multitasking operating system is the software that share or execute multiple task on a single CPU. In this multitasking the single CPU execute one task or program at a time but it become switching between the other multiple tasks. It is appear as it user execute the all programs at the same time.'
query4vec = dictionary.doc2bow( query4.split() )
         
 
corpus = [ dictionary.doc2bow( text ) for text in texts ]
corpora.MmCorpus.serialize( './deerwster.mm', corpus )
corpora.SvmLightCorpus.serialize('./corpus.svmlight', corpus)
tfidf = models.TfidfModel( corpus )
corpus_tfidf = tfidf[ corpus ]
for doc in corpus_tfidf:
    print( doc )
# latent semantic analysis
lsi = models.LsiModel( corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity( lsi[ corpus ] )
veclsi = lsi[ query4vec ]

sims = index[ veclsi ]
for i, sim in enumerate( sims):
    pprint( query4+":::"+documents[i]+". Similarity_score_is {}".format( sim*100 )+ "%")
result = (sims*100)
b = sum(result)
print ("similarity of Quize4 is ", b/2, "%")


[(10, 0.8219949365267865), (29, 0.3287979746107146), (31, 0.3287979746107146), (32, 0.3287979746107146)]
[(42, 0.0982946374365981), (43, 0.0982946374365981), (44, 0.29488391230979427), (45, 0.1965892748731962), (46, 0.0982946374365981), (47, 0.0982946374365981), (48, 0.0982946374365981), (49, 0.0982946374365981), (50, 0.0982946374365981), (51, 0.0982946374365981), (52, 0.14744195615489714), (53, 0.0982946374365981), (54, 0.0982946374365981), (55, 0.0982946374365981), (56, 0.1965892748731962), (57, 0.0982946374365981), (58, 0.0982946374365981), (59, 0.0982946374365981), (60, 0.14744195615489714), (61, 0.0982946374365981), (62, 0.0982946374365981), (63, 0.1965892748731962), (64, 0.0982946374365981), (65, 0.0982946374365981), (66, 0.0982946374365981), (67, 0.0982946374365981), (68, 0.3931785497463924), (69, 0.0982946374365981), (70, 0.1965892748731962), (71, 0.0982946374365981), (72, 0.24573659359149524), (73, 0.0982946374365981), (74, 0.0982946374365981), (75, 0.0982946374365981), (76, 0

In [19]:
query5 = 'QUIZ5: Multiprogramming operating system is a software that loads one or more programs in memory. A single CPU executes them. In this system a single program is execute at one time and other programs are waiting in queue. During input/output operating of a program, the CPU execute next program that is waiting in queue. While multitasking operating system is a software that performs multiple tasks at the same time in a computer that has a single CPU. CPU executes one program at a time and other programs are switching rapidly at same time.'
query5vec = dictionary.doc2bow( query5.split() )
         
 
corpus = [ dictionary.doc2bow( text ) for text in texts ]
corpora.MmCorpus.serialize( './deerwster.mm', corpus )
corpora.SvmLightCorpus.serialize('./corpus.svmlight', corpus)
tfidf = models.TfidfModel( corpus )
corpus_tfidf = tfidf[ corpus ]
for doc in corpus_tfidf:
    print( doc )
# latent semantic analysis
lsi = models.LsiModel( corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity( lsi[ corpus ] )
veclsi = lsi[ query5vec ]

sims = index[ veclsi ]
for i, sim in enumerate( sims):
    pprint( query5+":::"+documents[i]+". Similarity_score_is {}".format( sim*100 )+ "%")
    
result = (sims*100)
b = sum(result)
print ("similarity of Quize5 is ", b/2, "%")


[(10, 0.8219949365267865), (29, 0.3287979746107146), (31, 0.3287979746107146), (32, 0.3287979746107146)]
[(42, 0.0982946374365981), (43, 0.0982946374365981), (44, 0.29488391230979427), (45, 0.1965892748731962), (46, 0.0982946374365981), (47, 0.0982946374365981), (48, 0.0982946374365981), (49, 0.0982946374365981), (50, 0.0982946374365981), (51, 0.0982946374365981), (52, 0.14744195615489714), (53, 0.0982946374365981), (54, 0.0982946374365981), (55, 0.0982946374365981), (56, 0.1965892748731962), (57, 0.0982946374365981), (58, 0.0982946374365981), (59, 0.0982946374365981), (60, 0.14744195615489714), (61, 0.0982946374365981), (62, 0.0982946374365981), (63, 0.1965892748731962), (64, 0.0982946374365981), (65, 0.0982946374365981), (66, 0.0982946374365981), (67, 0.0982946374365981), (68, 0.3931785497463924), (69, 0.0982946374365981), (70, 0.1965892748731962), (71, 0.0982946374365981), (72, 0.24573659359149524), (73, 0.0982946374365981), (74, 0.0982946374365981), (75, 0.0982946374365981), (76, 0

In [20]:
Dummy_Query = 'Dummy_Query: Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming,Multiprogramming, .'
Dummy_Query_vec = dictionary.doc2bow( Dummy_Query.split() )
         
 
corpus = [ dictionary.doc2bow( text ) for text in texts ]
corpora.MmCorpus.serialize( './deerwster.mm', corpus )
corpora.SvmLightCorpus.serialize('./corpus.svmlight', corpus)
tfidf = models.TfidfModel( corpus )
corpus_tfidf = tfidf[ corpus ]
for doc in corpus_tfidf:
    print( doc )
# latent semantic analysis
lsi = models.LsiModel( corpus, id2word=dictionary, num_topics=2)
index = similarities.MatrixSimilarity( lsi[ corpus ] )
veclsi = lsi[ Dummy_Query_vec ]

sims = index[ veclsi ]
for i, sim in enumerate( sims):
    pprint( Dummy_Query+":::"+documents[i]+". Similarity_score_is {}".format( sim*100 )+ "%")
result = (sims*100)
b = sum(result)
print ("similarity of Dummy_Quiz is ", b/2, "%")  


[(10, 0.8219949365267865), (29, 0.3287979746107146), (31, 0.3287979746107146), (32, 0.3287979746107146)]
[(42, 0.0982946374365981), (43, 0.0982946374365981), (44, 0.29488391230979427), (45, 0.1965892748731962), (46, 0.0982946374365981), (47, 0.0982946374365981), (48, 0.0982946374365981), (49, 0.0982946374365981), (50, 0.0982946374365981), (51, 0.0982946374365981), (52, 0.14744195615489714), (53, 0.0982946374365981), (54, 0.0982946374365981), (55, 0.0982946374365981), (56, 0.1965892748731962), (57, 0.0982946374365981), (58, 0.0982946374365981), (59, 0.0982946374365981), (60, 0.14744195615489714), (61, 0.0982946374365981), (62, 0.0982946374365981), (63, 0.1965892748731962), (64, 0.0982946374365981), (65, 0.0982946374365981), (66, 0.0982946374365981), (67, 0.0982946374365981), (68, 0.3931785497463924), (69, 0.0982946374365981), (70, 0.1965892748731962), (71, 0.0982946374365981), (72, 0.24573659359149524), (73, 0.0982946374365981), (74, 0.0982946374365981), (75, 0.0982946374365981), (76, 0

In [21]:
pwd

'/home/mqasim/Data/Fast/Git/FAST_Data/Semester6/purposal'

In [22]:
pwd

'/home/mqasim/Data/Fast/Git/FAST_Data/Semester6/purposal'