In [None]:
import gdown

url = 'https://drive.google.com/drive/folders/1D3sM984cM_aKdgr6HfxM7UvT_n526A3e?usp=share_link'
opath = '/content'
gdown.download_folder(url, quiet=True, use_cookies=False)

In [76]:
import glob
import string
import nltk
import re
import numpy as np
from math import log10
from math import sqrt
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict
from nltk.tokenize import word_tokenize

In [None]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [88]:
def document_preprocessing(text):
  # Removes punctuations.
  p_text = text.translate(str.maketrans('', '', string.punctuation))

  # Removes newline characters.
  p_text = p_text.translate(str.maketrans({'\n': ' '}))
  
  # Removes tab characters.
  p_text = p_text.translate(str.maketrans({'\t': ''}))

  # lowers the text.
  p_text = p_text.lower()

  # Removes whitespaces at the beginning and end of the text.
  p_text = p_text.strip()

  # Substitutes multiple whitespaces with a single whitespace.
  p_text = re.sub(r"\s\s+", " ", p_text) 

  # Splitting text into tokens
  p_tokens = re.split('\W+', p_text)

  stopwords = nltk.corpus.stopwords.words('english')

  # Stopword and single character Removal
  p_tokens = [token for token in p_tokens if token not in stopwords and len(token) > 1] 

  # # Stem Tokens
  # stemmer = PorterStemmer()
  # p_tokens = [stemmer.stem(token) for token in p_tokens]

  # # Lemmatize tokens
  # lemmatizer = WordNetLemmatizer()
  # p_tokens = [lemmatizer.lemmatize(token) for token in p_tokens]

  return p_tokens

In [89]:
def get_doc_dict():
  doc_dict = {}
  file_list = glob.glob('/content/Corpus/*')
  for file in file_list:
    file_name = file.split('/')[-1]
    with open(file, 'r') as file:
      data = file.read()
      doc_dict[file_name] = document_preprocessing(data)
  return doc_dict

In [90]:
def get_wordList(doc_dict):
  wordList = []
  for doc in doc_dict:
    wordList += doc_dict[doc]
  return list(set(wordList))

In [91]:
def wf_doc(vocabulary, doc_dict):
  tf_docs = {}
  for doc_id in doc_dict.keys():
    tf_docs[doc_id] = {}

  for word in vocabulary:
    for doc_id, doc in doc_dict.items():
      if doc.count(word) > 0:
        tf_docs[doc_id][word] = 1 + log10(doc.count(word))
  return tf_docs

In [92]:
def df_doc(vocabulary, doc_dict):
  df = {}
  for word in vocabulary:
    freq = 0
    for doc in doc_dict.values():
      if word in doc:
        freq += 1
      df[word] = freq
  return df

In [93]:
def idf_doc(vocabulary, doc_freq, length):
  idf = {}
  for word in vocabulary:
    idf[word] = np.log10((length) / doc_freq[word])
  return idf

In [94]:
def tfidf(vocabulary, tf, idf, doc_dict):
  tf_idf = {}
  for doc_id in doc_dict.keys():
    tf_idf[doc_id] = {}
  for word in vocabulary:
    for doc_id, doc in doc_dict.items():
      tf_idf[doc_id][word] = tf[doc_id][word] * idf[doc_id]
  return tf_idf

In [95]:
document_dictionary = get_doc_dict()
for doc_name, doc in document_dictionary.items():
  print(doc_name, ':', doc)
N = len(document_dictionary)
print(N)

paypal.txt : ['paypal', 'paypal', 'american', 'ecommerce', 'company', 'formed', 'march', '2000', 'specializes', 'internet', 'money', 'transfers', 'heavily', 'used', 'internet', 'auction', 'company', 'ebay', 'owned', 'paypal', '2002', '2015', 'paypal', 'product', 'merger', 'xcom', 'confinity', 'allowed', 'users', 'make', 'payments', 'purchased', 'goods', 'exchange', 'money', 'accounts', 'secure', 'online', 'transaction', 'watching', 'paypal', 'become', 'premier', 'choice', 'internet', 'auction', 'shoppers', 'online', 'marketplace', 'giant', 'ebay', 'acquired', 'paypal', '15', 'billion', 'october', '2002', 'company', 'offers', 'users', 'ability', 'link', 'paypal', 'accounts', 'bank', 'accounts', 'making', 'transfers', 'payments', 'efficient', 'money', 'orders', 'checks', 'fees', 'collected', 'ebay', 'certain', 'transactions', 'determined', 'based', 'amount', 'transaction', 'nature', 'transaction', 'currency', 'type', 'transaction', '2015', 'paypal', 'spun', 'independent', 'company', 'con

In [96]:
vocabulary = get_wordList(document_dictionary)
print(vocabulary)
print(len(vocabulary))

['antonio', 'north', 'jean', 'revenue', 'wearing', 'frankfurt', 'eaters', 'amass', 'besse', 'proof', 'continuing', 'ridehailing', 'transparently', 'increments', 'distinguished', 'precise', 'shortlived', 'fuel', 'chance', 'finnish', 'rabbit', 'clip', 'slide', 'boyhood', 'events', 'icon', 'globe', 'converted', 'asked', 'advertisement', 'quasar', 'socially', 'ornamentals', 'help', 'mining', 'ip', 'lets', 'wars', 'aired', 'fortunes', 'skeleton', 'occurred', 'trioangle', '43', 'browse', 'boots', 'title', 'engineer', 'tested', 'different', 'investigated', 'validate', 'symbol', '1976', 'sanjeev', 'simple', 'threads', 'covers', 'get', 'versions', 'individual', 'dots', 'supervise', 'html', 'equal', '1011', 'driverpartners', 'workstations', '45000', 'hosting', 'worldwide', 'nature', 'according', 'redownload', 'started', 'looks', 'nmt', 'durability', 'bookstore', 'selfexamination', 'attempt', 'boom', 'craft', 'subscriptions', 'contacts', 'wished', 'manufacturing', 'trust', 'parker', 'wants', 'mod

In [97]:
document_wf = wf_doc(vocabulary=vocabulary, doc_dict=document_dictionary)
document_wf
for doc in document_wf.keys():
  print(doc, ':', document_wf[doc])

paypal.txt : {'distinguished': 1.0, 'individual': 1.0, 'nature': 1.0, 'wished': 1.0, 'giant': 1.0, 'processed': 1.0, 'sue': 1.0, 'among': 1.0, 'freedom': 1.3010299956639813, 'chartered': 1.0, 'ownership': 1.0, 'losses': 1.0, 'number': 1.0, 'internet': 1.4771212547196624, 'portable': 1.0, 'routine': 1.0, 'requires': 1.0, 'standing': 1.0, 'gradually': 1.0, 'secure': 1.0, 'advancements': 1.0, 'name': 1.0, 'payments': 1.3010299956639813, 'continued': 1.0, 'company': 1.845098040014257, 'universities': 1.0, 'protection': 1.0, 'making': 1.0, 'control': 1.0, 'entitled': 1.0, 'first': 1.0, 'misled': 1.0, '2015': 1.3010299956639813, 'offers': 1.3010299956639813, 'measures': 1.0, 'exchange': 1.0, 'antiphishing': 1.0, 'incorporation': 1.6020599913279625, 'largescale': 1.0, 'policies': 1.0, '19th': 1.3010299956639813, 'partnership': 1.3010299956639813, 'choice': 1.0, 'pursuit': 1.0, 'proceeds': 1.0, 'constitution': 1.3010299956639813, 'rights': 1.0, 'series': 1.0, 'refund': 1.0, 'two': 1.3010299956

In [98]:
document_df = df_doc(vocabulary, document_dictionary)
print(document_df)
# for word, freq in document_df:
#   print(word, ':', freq)

{'antonio': 1, 'north': 5, 'jean': 1, 'revenue': 4, 'wearing': 1, 'frankfurt': 1, 'eaters': 1, 'amass': 1, 'besse': 1, 'proof': 1, 'continuing': 1, 'ridehailing': 1, 'transparently': 1, 'increments': 1, 'distinguished': 1, 'precise': 1, 'shortlived': 1, 'fuel': 1, 'chance': 2, 'finnish': 1, 'rabbit': 1, 'clip': 1, 'slide': 1, 'boyhood': 1, 'events': 4, 'icon': 1, 'globe': 3, 'converted': 2, 'asked': 3, 'advertisement': 2, 'quasar': 1, 'socially': 1, 'ornamentals': 1, 'help': 12, 'mining': 1, 'ip': 1, 'lets': 6, 'wars': 2, 'aired': 1, 'fortunes': 2, 'skeleton': 1, 'occurred': 2, 'trioangle': 1, '43': 1, 'browse': 1, 'boots': 2, 'title': 1, 'engineer': 1, 'tested': 1, 'different': 11, 'investigated': 1, 'validate': 1, 'symbol': 1, '1976': 1, 'sanjeev': 1, 'simple': 9, 'threads': 2, 'covers': 2, 'get': 16, 'versions': 4, 'individual': 2, 'dots': 1, 'supervise': 1, 'html': 1, 'equal': 2, '1011': 1, 'driverpartners': 2, 'workstations': 2, '45000': 1, 'hosting': 1, 'worldwide': 8, 'nature': 

In [99]:
document_idf = idf_doc(vocabulary, document_df, N)
document_idf

{'antonio': 1.6127838567197355,
 'north': 0.9138138523837167,
 'jean': 1.6127838567197355,
 'revenue': 1.0107238653917732,
 'wearing': 1.6127838567197355,
 'frankfurt': 1.6127838567197355,
 'eaters': 1.6127838567197355,
 'amass': 1.6127838567197355,
 'besse': 1.6127838567197355,
 'proof': 1.6127838567197355,
 'continuing': 1.6127838567197355,
 'ridehailing': 1.6127838567197355,
 'transparently': 1.6127838567197355,
 'increments': 1.6127838567197355,
 'distinguished': 1.6127838567197355,
 'precise': 1.6127838567197355,
 'shortlived': 1.6127838567197355,
 'fuel': 1.6127838567197355,
 'chance': 1.3117538610557542,
 'finnish': 1.6127838567197355,
 'rabbit': 1.6127838567197355,
 'clip': 1.6127838567197355,
 'slide': 1.6127838567197355,
 'boyhood': 1.6127838567197355,
 'events': 1.0107238653917732,
 'icon': 1.6127838567197355,
 'globe': 1.135662602000073,
 'converted': 1.3117538610557542,
 'asked': 1.135662602000073,
 'advertisement': 1.3117538610557542,
 'quasar': 1.6127838567197355,
 'soci

In [109]:
def normalise(dict):
  denominator = 0
  for val in dict.values():
    denominator += val**2
  
  denominator = sqrt(denominator)
  if denominator > 0:
    for key in dict.keys():
      dict[key] /= denominator 
  return dict

In [116]:
def vsm(query):
  query = document_preprocessing(query)
  query_vocab = [word for word in query if word in vocabulary]
  query_vocab = list(set(query_vocab))

  query_wfidf = {}
  for word in query_vocab:
    query_wfidf[word] = 1 + log10(query.count(word)) * document_idf[word]

  n_query_wfidf = normalise(query_wfidf)
  # print(query_wfidf)

  similarity_scores = {}
  for doc_name in document_dictionary.keys():
    score = 0
    doc_wf = {}
    for word in query_vocab:
      try:
        doc_wf[word] = document_wf[doc_name][word]
      except KeyError:
        doc_wf[word] = 0
  
    n_doc_wf = normalise(doc_wf)
    # print(doc_name, doc_wf)
    
    for word in query_vocab:
      score += n_query_wfidf[word] * n_doc_wf[word]
    
    similarity_scores[doc_name] = score
  
  # print(similarity_scores)
  sorted_value = OrderedDict(sorted(similarity_scores.items(), key=lambda x:x[1], reverse=True))
  top_10 = {k: sorted_value[k] for k in list(sorted_value)[:10]}
  return top_10


In [122]:
query = input("Search : ")
while(query != 'exit'):
  dic = vsm(query)
  # print(dic)
  for key, val in dic.items():
    print(key, ':', val)
  print('')
  query = input("Search : ")

Search : ‘Developing your Zomato business account and profile is a great way to boost your restaurant’s online reputation
zomato.txt : 0.9583166500174
swiggy.txt : 0.6528389362225576
skype.txt : 0.597927653666375
google.txt : 0.5967716739046143
youtube.txt : 0.5930052475787645
messenger.txt : 0.5919513864161252
reddit.txt : 0.5888217881498098
paypal.txt : 0.5849374782143337
telegram.txt : 0.5222329678670936
bing.txt : 0.5222329678670936

Search :  Warwickshire, came from an ancient family and was the heiress to some land 
shakespeare.txt : 0.9861030676644635
levis.txt : 0.5773502691896258
Adobe.txt : 0.5724721440973308
operating.txt : 0.4082482904638631
HP.txt : 0.4082482904638631
huawei.txt : 0.4082482904638631
puma.txt : 0.4082482904638631
Lenovo.txt : 0.4082482904638631
google.txt : 0.4082482904638631
nike.txt : 0.4082482904638631

Search : exit
