In [56]:
import os

from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

In [57]:
# Split demoDoc.txt into sentences; every sentence is treated as one document.
file_docs = []

with open('demoDoc.txt') as f:
    file_docs.extend(sent_tokenize(f.read()))

In [58]:
# Show the sentences produced by sent_tokenize (one string per sentence).
file_docs

['Mars is the fourth planet in our solar system.',
 'It is second-smallest planet in the Solar System after Mercury.',
 'Saturn is yellow planet.']

In [59]:
# One list of lower-cased word tokens per sentence in file_docs.
gen_docs = [list(map(str.lower, word_tokenize(sentence)))
            for sentence in file_docs]

In [60]:
import gensim


In [61]:
# Show the lower-cased token lists, one inner list per sentence.
gen_docs

[['mars',
  'is',
  'the',
  'fourth',
  'planet',
  'in',
  'our',
  'solar',
  'system',
  '.'],
 ['it',
  'is',
  'second-smallest',
  'planet',
  'in',
  'the',
  'solar',
  'system',
  'after',
  'mercury',
  '.'],
 ['saturn', 'is', 'yellow', 'planet', '.']]

In [62]:
# Assign an integer id to every distinct token that occurs in gen_docs.
dictionary = gensim.corpora.Dictionary(gen_docs)

In [63]:
# Show the token -> integer id mapping held by the dictionary.
dictionary.token2id


{'.': 0,
 'fourth': 1,
 'in': 2,
 'is': 3,
 'mars': 4,
 'our': 5,
 'planet': 6,
 'solar': 7,
 'system': 8,
 'the': 9,
 'after': 10,
 'it': 11,
 'mercury': 12,
 'second-smallest': 13,
 'saturn': 14,
 'yellow': 15}

In [64]:
# doc2bow converts one token list at a time, so apply it to every tokenized sentence.
corpus = list(map(dictionary.doc2bow, gen_docs))

In [65]:
# Show each sentence as a list of (token id, count) pairs.
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1)],
 [(0, 1),
  (2, 1),
  (3, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1)],
 [(0, 1), (3, 1), (6, 1), (14, 1), (15, 1)]]

In [66]:
import numpy as np

In [67]:
# Fit a TF-IDF model on the bag-of-words corpus, then print every sentence's
# weighted terms as [token, weight] pairs rounded to two decimals.
tf_idf = gensim.models.TfidfModel(corpus)
for weighted_doc in tf_idf[corpus]:
    readable = [[dictionary[token_id], np.around(weight, decimals=2)]
                for token_id, weight in weighted_doc]
    print(readable)

[['fourth', 0.53], ['in', 0.2], ['mars', 0.53], ['our', 0.53], ['solar', 0.2], ['system', 0.2], ['the', 0.2]]
[['in', 0.17], ['solar', 0.17], ['system', 0.17], ['the', 0.17], ['after', 0.47], ['it', 0.47], ['mercury', 0.47], ['second-smallest', 0.47]]
[['saturn', 0.71], ['yellow', 0.71]]


In [68]:
# Similarity stores its index shards on disk using 'workdir/' as the filename
# prefix, so make sure that directory exists before the index is used.
os.makedirs('workdir', exist_ok=True)

# Index over the TF-IDF-weighted corpus; num_features is the vocabulary size.
sims = gensim.similarities.Similarity('workdir/', tf_idf[corpus],
                                      num_features=len(dictionary))

In [69]:
# Split queryDoc.txt into sentences; every sentence is a separate query.
file2_docs = []

with open('queryDoc.txt') as f:
    file2_docs.extend(sent_tokenize(f.read()))

# Lower-cased word tokens, one list per query sentence.
query_docs = [list(map(str.lower, word_tokenize(sentence)))
              for sentence in file2_docs]
# query_doc_bow is the bag-of-words corpus of the query document,
# encoded with the ids of the existing dictionary.
query_doc_bow = list(map(dictionary.doc2bow, query_docs))
#query_doc_bow is the corpus of the query document

In [70]:
# Show the query sentences.
file2_docs

['Saturn is the sixth planet from the Sun.', 'Mars is a red planet.']

In [71]:
# Show the lower-cased tokens of each query sentence.
query_docs

[['saturn', 'is', 'the', 'sixth', 'planet', 'from', 'the', 'sun', '.'],
 ['mars', 'is', 'a', 'red', 'planet', '.']]

In [72]:
# Show the bag-of-words corpus of the query document:
# one list of (token id, count) pairs per query sentence.
query_doc_bow

[[(0, 1), (3, 1), (6, 1), (9, 2), (14, 1)], [(0, 1), (3, 1), (4, 1), (6, 1)]]

In [73]:
# Score every query sentence against the indexed sentences.
for query_bow in query_doc_bow:
    # weight the query's bag-of-words with the TF-IDF model before querying the index
    query_doc_tf_idf = tf_idf[query_bow]
    print(query_bow)
    # one similarity score per indexed sentence, in corpus order
    print('Comparing Result:', sims[query_doc_tf_idf])

[(0, 1), (3, 1), (6, 1), (9, 2), (14, 1)]
Comparing Result: [0.11641413 0.10281226 0.56890744]
[(0, 1), (3, 1), (4, 1), (6, 1)]
Comparing Result: [0.5311302 0.        0.       ]


In [74]:
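# The highest score in each result row marks the closest match: the first query sentence is most similar to the last document sentence (0.57), while the second query sentence is most similar to the first document sentence (0.53).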

In [75]:
# Split demoDoc2.txt into sentences; every sentence is treated as one document.
file_docs = []

with open('demoDoc2.txt') as f:
    file_docs.extend(sent_tokenize(f.read()))

In [76]:
# One list of lower-cased word tokens per sentence in file_docs.
gen_docs = [list(map(str.lower, word_tokenize(sentence)))
            for sentence in file_docs]

In [77]:
# Show the lower-cased token lists, one inner list per sentence.
gen_docs

[['malls',
  'are',
  'great',
  'places',
  'to',
  'shop',
  ',',
  'i',
  'can',
  'find',
  'sandwiches',
  'under',
  'one',
  'roof',
  '.'],
 ['i',
  'love',
  'eating',
  'toasted',
  'cheese',
  'and',
  'tuna',
  'sandwiches',
  '.'],
 ['should',
  'we',
  'start',
  'class',
  'now',
  ',',
  'or',
  'should',
  'we',
  'wait',
  'for',
  'everyone',
  'to',
  'get',
  'here',
  '?']]

In [78]:
# Rebuild the token -> id dictionary from the current gen_docs.
dictionary = gensim.corpora.Dictionary(gen_docs)

In [79]:
# doc2bow converts one token list at a time, so apply it to every tokenized sentence.
corpus = list(map(dictionary.doc2bow, gen_docs))

In [80]:
# Show each sentence as a list of (token id, count) pairs.
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1),
  (13, 1),
  (14, 1)],
 [(1, 1),
  (6, 1),
  (11, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1)],
 [(0, 1),
  (13, 1),
  (21, 1),
  (22, 1),
  (23, 1),
  (24, 1),
  (25, 1),
  (26, 1),
  (27, 1),
  (28, 1),
  (29, 2),
  (30, 1),
  (31, 1),
  (32, 2)]]

In [81]:
# Fit a TF-IDF model on the bag-of-words corpus, then print every sentence's
# weighted terms as [token, weight] pairs rounded to two decimals.
tf_idf = gensim.models.TfidfModel(corpus)
for weighted_doc in tf_idf[corpus]:
    readable = [[dictionary[token_id], np.around(weight, decimals=2)]
                for token_id, weight in weighted_doc]
    print(readable)

[[',', 0.11], ['.', 0.11], ['are', 0.31], ['can', 0.31], ['find', 0.31], ['great', 0.31], ['i', 0.11], ['malls', 0.31], ['one', 0.31], ['places', 0.31], ['roof', 0.31], ['sandwiches', 0.11], ['shop', 0.31], ['to', 0.11], ['under', 0.31]]
[['.', 0.15], ['i', 0.15], ['sandwiches', 0.15], ['and', 0.4], ['cheese', 0.4], ['eating', 0.4], ['love', 0.4], ['toasted', 0.4], ['tuna', 0.4]]
[[',', 0.09], ['to', 0.09], ['?', 0.23], ['class', 0.23], ['everyone', 0.23], ['for', 0.23], ['get', 0.23], ['here', 0.23], ['now', 0.23], ['or', 0.23], ['should', 0.47], ['start', 0.23], ['wait', 0.23], ['we', 0.47]]


In [82]:
# Similarity stores its index shards on disk using 'workdir/' as the filename
# prefix, so make sure that directory exists before the index is used.
os.makedirs('workdir', exist_ok=True)

# Index over the TF-IDF-weighted corpus; num_features is the vocabulary size.
sims = gensim.similarities.Similarity('workdir/', tf_idf[corpus],
                                      num_features=len(dictionary))

In [89]:
avg_sims = []  # one average similarity per query sentence

# Split queryDoc2.txt into sentences; every sentence is scored separately.
file2_docs = []

with open('queryDoc2.txt') as f:
    file2_docs.extend(sent_tokenize(f.read()))

for line in file2_docs:
    # lower-cased word tokens of the query sentence
    query_doc = [w.lower() for w in word_tokenize(line)]
    # bag-of-words encoded with the ids of the corpus dictionary
    query_doc_bow = dictionary.doc2bow(query_doc)
    # weight the query with the TF-IDF model fitted on the corpus
    query_doc_tf_idf = tf_idf[query_doc_bow]
    # query the index once and reuse the scores (one per corpus sentence)
    doc_sims = sims[query_doc_tf_idf]
    print('Comparing Result:', doc_sims)
    # average similarity of this query sentence over all corpus sentences
    sum_of_sims = np.sum(doc_sims, dtype=np.float32)
    avg = sum_of_sims / len(file_docs)
    print(f'avg: {avg}')
    avg_sims.append(avg)

# The overall score is the SUM of the per-query averages (not their mean).
# np.float was removed in NumPy 1.24; np.float64 is the dtype it stood for.
total_avg = np.sum(avg_sims, dtype=np.float64)
# Express as a whole-number percentage; the sum can exceed 1, so cap at 100.
percentage_of_similarity = min(round(float(total_avg) * 100), 100)

print("Total similarity percentage:", percentage_of_similarity, "%")

Comparing Result: [0.3690919  0.03038312 0.13209888]
avg: 0.17719129721323648
Comparing Result: [0.02851599 0.03681399 0.32011765]
avg: 0.12848254044850668
Comparing Result: [0.03263272 0.         0.9407785 ]
avg: 0.3244704008102417
Total similarity percentage: 63 %


In [84]:
# Show the query sentences.
file2_docs

['Malls are good for shopping.',
 'What kind of bread is used for sandwiches?',
 'Do we have to start class now, or should we wait for everyone to come here?']

In [90]:
# Show the average similarity recorded for each query sentence.
avg_sims

[0.17719129721323648, 0.12848254044850668, 0.3244704008102417]