### Representation of a Text Document in Vector Space Model & Computing Similarity between Two Documents.

In [49]:
import nltk
from nltk.corpus import stopwords,PlaintextCorpusReader
from nltk.tokenize import word_tokenize
import numpy as np
import math
from scipy import spatial

# Define Stopwords
stop_words = set(stopwords.words())

names = []
# Input: Number of Documents and Document Names
d = int(input("Enter Number of Documents : "))
for i in range(d):
    print("Enter Filename ",i+1,": ")
    name = input()
    names.append(name)
vocab = []                         # Vocabulary
for t in range(d): 
    vcb = []                       # Temporary Vocabulary for each File
    name = names[t]+".txt"
    file = open(name,'r')
    for i in file:
        # Preprocessing of Data
        token = word_tokenize(i.lower())
        token = [word.lower() for word in token if word not in stop_words]
        token = [word.lower() for word in token if word.isalpha()]
        # Adding words to Temporary Vocabulary
        for item in token:
            vcb.append(item)
    # Adding words to Final Vocabulary
    for i in vcb:
        if i not in vocab  :
            vocab.append(i)
    del(vcb)
    
# Output: Vocabulary
print("\nThe Vocabulary\n",vocab)

# Creating Document Vectors
DV = []
for t in range(d):
    name = names[t]+".txt"
    file = open(name,'r')
    vec = [0]*len(vocab)             
    for i in file:
        # Preprocessing of Data
        token = word_tokenize(i.lower())
        token = [word.lower() for word in token if word not in stop_words]
        token = [word.lower() for word in token if word.isalpha()]
        # Calculate Term Frequencies
        for item in token:
            vec[vocab.index(item)]+=1
    # Document Vectors appended together to form a Matrix
    DV.append(vec) 
    del(vec)
    
# Output: Document Vectors
for t in range(d):
    name = names[t]+".txt"
    print("\nThe Document Vector for", name,"\n",DV[t])
    
# Calculating Document Frequency
DF = [0]*len(vocab) 
for i in range(len(vocab)):
    for j in range(d):
        if DV[j][i]>0:             # If Term Frequency > 0
            DF[i]+=1
# Output: Document Frequency
print("\nDocument Frequencies\n",DF) 

# Calculating Inverse Document Frequency
IDF = []
for i in DF:
    IDF.append(math.log((d/i),2))
#print("\nInverse Document Frequencies\n",IDF)

# Calculating Weight of each Term in Vocabulary
WGT = []
for i in range(d):
    wt = [0]*len(vocab)
    for j in range(len(vocab)):
        wt[j] = DV[i][j]*IDF[j]
    WGT.append(wt)
    del(wt)
#print("\nWeights of Vocabulary Terms\n",WGT)

# Calculating Cosine Similarity between Documents
for t in range(d):
    cos = []
    for m in range(t,d):
        if(m!=t):
            name = names[t]+".txt"
            name1 = names[m]+".txt"
            cos_sim = 1 - spatial.distance.cosine(WGT[t], WGT[m]) 
            cos.append(cos_sim)
            # Output: Cosine Similarity
            print("\n\nThe Cosine Similarity between ",name," and ",name1," = ",cos_sim)

Enter Number of Documents : 3
Enter Filename  1 : 
a
Enter Filename  2 : 
b
Enter Filename  3 : 
c

The Vocabulary
 ['deep', 'learning', 'allows', 'computational', 'models', 'composed', 'multiple', 'processing', 'layers', 'learn', 'representations', 'data', 'levels', 'abstraction', 'methods', 'dramatically', 'improved', 'speech', 'recognition', 'visual', 'object', 'detection', 'many', 'domains', 'discovery', 'genomics', 'discovers', 'intricate', 'structure', 'large', 'sets', 'using', 'backpropagation', 'algorithm', 'indicate', 'machine', 'change', 'internal', 'parameters', 'used', 'compute', 'representation', 'layer', 'previous', 'convolutional', 'nets', 'brought', 'breakthroughs', 'images', 'video', 'audio', 'whereas', 'recurrent', 'shone', 'light', 'sequential', 'text', 'powerful', 'form', 'enables', 'computers', 'solve', 'perceptual', 'problems', 'image', 'increasingly', 'making', 'entry', 'biological', 'sciences', 'artificial', 'neural', 'networks', 'use', 'discover', 'patterns', '

In [50]:
# Check Cosine Similarity between an Input Query and Documents
# Input: Query
qr=input("Enter your Query : ")
query=qr.split(" ")
QV=[0]*len(vocab)                 # Query Vector
for i in vocab:
    if i in query:
        QV[vocab.index(i)]+=1     # Calculate Term Frequency
wt=[0]*len(vocab)                 # Weights of Vocabulary Terms
for j in range(len(vocab)):
    wt[j]=QV[j]*IDF[j]
# Calculating Cosine Similarity between Documents and Query
cos=[]
for i in range(d):
    name= names[i]+".txt"
    cos_sim = 1 - spatial.distance.cosine(wt, WGT[i])
    cos.append(cos_sim)
    # Output: Cosine Similarity
    print("\nThe Cosine Similarity between Query and ",name," = ",cos_sim)

Enter your Query : Deep learning discovers intricate structure in l arge data sets by using the backpropagation algorithm to indicate how a machine should change its internal parameters that are used to compute the representation in each lay er from the representation in the previous layer.

The Cosine Similarity between Query and  a.txt  =  0.5250730209536698

The Cosine Similarity between Query and  b.txt  =  0.013266487822794604

The Cosine Similarity between Query and  c.txt  =  0.006727244525535969
