In [None]:
import gdown

# Shared Google Drive folder link handed to gdown below.
url = 'https://drive.google.com/drive/folders/1D3sM984cM_aKdgr6HfxM7UvT_n526A3e?usp=share_link'
# NOTE(review): opath is assigned but never passed to download_folder below;
# presumably it was meant as the download destination -- confirm before wiring it in.
opath = '/content'
# Fetch the folder with progress output suppressed and without using cookies.
gdown.download_folder(url, quiet=True, use_cookies=False)

In [18]:
import glob
import string
import nltk
import re
import numpy as np
from math import log10
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from collections import OrderedDict
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources named 'stopwords', 'wordnet' and 'omw-1.4'.
# document_preprocessing reads nltk.corpus.stopwords and uses WordNetLemmatizer;
# presumably 'wordnet' / 'omw-1.4' back the lemmatizer -- TODO confirm.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [5]:
def document_preprocessing(text):
  """Convert raw text into a list of normalised tokens.

  Steps, in order: strip ASCII punctuation, lower-case, split on runs of
  non-word characters, drop English stop words and single-character tokens,
  Porter-stem, then WordNet-lemmatize.

  Args:
    text: The raw document (or query) string.

  Returns:
    A list of processed token strings, in their original order (duplicates
    are kept so callers can count term occurrences).
  """
  # Delete punctuation outright, so "e-commerce" becomes "ecommerce".
  p_text = text.translate(str.maketrans('', '', string.punctuation))

  p_text = p_text.lower()

  # One split on runs of non-word characters handles spaces, newlines, tabs and
  # leading/trailing whitespace in a single pass. Tabs therefore act as word
  # separators instead of being deleted, which used to glue neighbouring words
  # together. The raw string avoids the invalid "\W" escape warning.
  p_tokens = re.split(r'\W+', p_text)

  # A set gives constant-time membership tests; the distinct local name avoids
  # shadowing the imported `stopwords` corpus module.
  stop_words = set(nltk.corpus.stopwords.words('english'))

  # Stop-word and single-character removal (this also discards the empty
  # strings re.split produces at the edges of the text).
  p_tokens = [token for token in p_tokens
              if len(token) > 1 and token not in stop_words]

  # Stem tokens.
  stemmer = PorterStemmer()
  p_tokens = [stemmer.stem(token) for token in p_tokens]

  # Lemmatize the stemmed tokens.
  lemmatizer = WordNetLemmatizer()
  p_tokens = [lemmatizer.lemmatize(token) for token in p_tokens]

  return p_tokens

In [6]:
def get_doc_dict(corpus_glob='/content/Corpus/*'):
  """Read and preprocess every file matched by a glob pattern.

  Args:
    corpus_glob: Glob pattern selecting the corpus files. Defaults to the
      location this notebook has always read from.

  Returns:
    A dict mapping each file's base name to the token list produced by
    document_preprocessing for that file's contents.
  """
  import os  # function-scope import: the notebook's import cell does not pull in os

  doc_dict = {}
  # Sorting makes the document order reproducible; glob order is arbitrary.
  for path in sorted(glob.glob(corpus_glob)):
    # Skip sub-directories, which open() cannot read.
    if not os.path.isfile(path):
      continue
    # A separate handle name keeps the loop variable from being rebound.
    with open(path, 'r', encoding='utf-8') as handle:
      doc_dict[os.path.basename(path)] = document_preprocessing(handle.read())
  return doc_dict

In [30]:
def get_wordList(doc_dict):
  """Collect the distinct tokens appearing in any document.

  Args:
    doc_dict: Mapping of document id to an iterable of tokens.

  Returns:
    A list holding each distinct token once; the order is arbitrary.
  """
  unique_tokens = set()
  for tokens in doc_dict.values():
    unique_tokens.update(tokens)
  return list(unique_tokens)

In [55]:
def wf_doc(vocabulary, doc_dict):
  """Compute the log-scaled term weight of each vocabulary word per document.

  The weight of a word occurring `c > 0` times in a document is
  `1 + log10(c)`. Words that do not occur in a document get no entry, so each
  per-document mapping is sparse.

  Args:
    vocabulary: Iterable of words to weigh.
    doc_dict: Mapping of document id to its list of tokens.

  Returns:
    A dict mapping document id to a dict of {word: weight}. Every document id
    is present, even when its mapping is empty.
  """
  from collections import Counter  # function-scope import: not in the notebook's import cell

  tf_docs = {}
  for doc_id, doc in doc_dict.items():
    # Count each document once instead of calling doc.count() per word.
    counts = Counter(doc)
    weights = {}
    for word in vocabulary:
      occurrences = counts.get(word, 0)
      if occurrences > 0:
        weights[word] = 1 + log10(occurrences)
    tf_docs[doc_id] = weights
  return tf_docs

In [52]:
def df_doc(vocabulary, doc_dict):
  """Count, for each vocabulary word, how many documents contain it.

  Args:
    vocabulary: Iterable of words to count.
    doc_dict: Mapping of document id to its list of tokens.

  Returns:
    A dict mapping every vocabulary word to its document frequency. A word
    found in no document (including when doc_dict is empty) maps to 0.
  """
  # Build each document's set of distinct terms once so membership is O(1).
  doc_term_sets = [set(doc) for doc in doc_dict.values()]

  df = {}
  for word in vocabulary:
    # Assigned once per word, outside any document loop, so the entry exists
    # even when there are no documents at all.
    df[word] = sum(1 for terms in doc_term_sets if word in terms)
  return df

In [21]:
def idf_doc(vocabulary, doc_freq, length):
  """Compute the inverse document frequency of each vocabulary word.

  Args:
    vocabulary: Iterable of words to score.
    doc_freq: Mapping of word to its document frequency.
    length: The value divided by each document frequency (the caller in this
      notebook passes the number of documents).

  Returns:
    A dict mapping each word to log10(length / doc_freq[word]).
  """
  return {term: np.log10(length / doc_freq[term]) for term in vocabulary}

In [23]:
def tfidf(vocabulary, tf, idf, doc_dict):
  """Combine per-document term weights with idf into tf-idf weights.

  Args:
    vocabulary: Iterable of words to score.
    tf: Mapping of document id to a {word: term weight} dict. The per-document
      dicts may be sparse, as produced by wf_doc.
    idf: Mapping of word to its inverse document frequency.
    doc_dict: Mapping whose keys are the document ids to score.

  Returns:
    A dict mapping each document id to a {word: tf * idf} dict with an entry
    for every vocabulary word; a word absent from a document scores 0.0.
  """
  tf_idf = {}
  for doc_id in doc_dict.keys():
    doc_tf = tf.get(doc_id, {})
    # idf is keyed by word (not by document id), and a word missing from the
    # sparse tf mapping has a term weight of zero.
    tf_idf[doc_id] = {
        word: doc_tf.get(word, 0.0) * idf[word] for word in vocabulary
    }
  return tf_idf

In [34]:
# Build the preprocessed corpus and remember how many documents it holds.
document_dictionary = get_doc_dict()
N = len(document_dictionary)

# Show every document's token list, then the document count.
for doc_name in document_dictionary:
  doc = document_dictionary[doc_name]
  print(doc_name, ':', doc)
print(N)

paypal.txt : ['paypal', 'paypal', 'american', 'ecommerc', 'compani', 'form', 'march', '2000', 'special', 'internet', 'money', 'transfer', 'heavili', 'use', 'internet', 'auction', 'compani', 'ebay', 'own', 'paypal', '2002', '2015', 'paypal', 'product', 'merger', 'xcom', 'confin', 'allow', 'user', 'make', 'payment', 'purchas', 'good', 'exchang', 'money', 'account', 'secur', 'onlin', 'transact', 'watch', 'paypal', 'becom', 'premier', 'choic', 'internet', 'auction', 'shopper', 'onlin', 'marketplac', 'giant', 'ebay', 'acquir', 'paypal', '15', 'billion', 'octob', '2002', 'compani', 'offer', 'user', 'abil', 'link', 'paypal', 'account', 'bank', 'account', 'make', 'transfer', 'payment', 'effici', 'money', 'order', 'check', 'fee', 'collect', 'ebay', 'certain', 'transact', 'determin', 'base', 'amount', 'transact', 'natur', 'transact', 'currenc', 'type', 'transact', '2015', 'paypal', 'spun', 'independ', 'compani', 'continu', 'use', 'ebay', 'sophist', 'seri', 'secur', 'advanc', 'help', 'paypal', 'r

In [33]:
# Distinct terms across the corpus; print the list and, on the next line, its size.
vocabulary = get_wordList(document_dictionary)
print(vocabulary, len(vocabulary), sep='\n')

['town', 'largest', 'greenlak', 'gyroscop', 'wrong', 'cunnington', 'antonio', 'critic', 'north', 'jean', 'thursday', 'regiment', 'arrest', 'leet', 'priorit', 'shirtpockets', 'judg', 'diversifi', 'disparag', 'patentinfring', '1958', 'bezo', 'bulletin', 'follicl', 'textbas', 'hourli', '18500', 'hail', 'frankfurt', 'subscriptionbas', 'postscript', 'amass', 'fromabigail', 'cement', 'divid', 'liber', 'eu', 'string', 'merchant', 'mostrecogniz', 'inde', 'field', 'motorol', 'overestim', 'polici', '5810', 'valid', 'braill', 'cloud', 'consid', 'stagnat', 'pink', '200', 'proof', 'sale', 'protocol', 'intgrat', 'abil', 'sophist', 'split', 'histor', 'freescal', '23', 'launch', 'direct', '50000', 'unionist', 'fruitbear', 'polyest', 'tag', 'polic', 'survivalist', 'fuel', 'move', 'finnish', 'concept', 'scientist', 'rabbit', 'count', 'bomber', 'clip', 'slide', 'boyhood', 'amband', 'icon', 'globe', 'aven', 'offtop', 'faster', 'f22', 'furnisur', 'natur', 'juli', 'intend', 'ultim', 'patienc', 'homer', 'con

In [56]:
# Log-scaled term weights per document, printed one document per line.
document_wf = wf_doc(vocabulary=vocabulary, doc_dict=document_dictionary)
for doc, weights in document_wf.items():
  print(doc, ':', weights)

paypal.txt : {'liber': 1.0, 'polici': 1.0, 'abil': 1.0, 'sophist': 1.0, 'natur': 1.0, 'help': 1.0, 'fact': 1.3010299956639813, 'public': 1.0, 'purpos': 1.0, 'sole': 1.0, 'monasteri': 1.0, 'earli': 1.0, 'rais': 1.0, 'link': 1.0, 'owner': 1.0, 'strict': 1.0, 'largescal': 1.0, 'giant': 1.0, 'implement': 1.0, 'subject': 1.0, 'stand': 1.0, 'secur': 1.3010299956639813, 'competit': 1.0, 'monopoli': 1.0, 'sue': 1.0, 'offer': 1.3010299956639813, 'among': 1.0, 'sen': 1.0, 'made': 1.0, 'consum': 1.0, 'money': 1.4771212547196624, 'constitut': 1.3010299956639813, 'freedom': 1.3010299956639813, 'payment': 1.3010299956639813, 'britain': 1.0, 'devic': 1.0, 'germani': 1.0, 'wish': 1.0, 'readili': 1.0, 'merger': 1.0, 'limit': 1.3010299956639813, 'excess': 1.0, 'investor': 1.0, 'contract': 1.0, 'special': 1.0, 'ownership': 1.0, 'larg': 1.3010299956639813, 'commerc': 1.0, 'mediev': 1.0, 'wherebi': 1.3010299956639813, 'fusion': 1.3010299956639813, 'law': 1.4771212547196624, 'may': 1.6989700043360187, 'char

In [53]:
# Document frequency of every vocabulary term, printed as a single dict.
document_df = df_doc(vocabulary, document_dictionary)
print(document_df)
# Disabled per-term listing. Iterating a dict directly yields keys only, so
# unpacking (word, freq) pairs requires .items():
# for word, freq in document_df.items():
#   print(word, ':', freq)

{'town': 2, 'largest': 15, 'greenlak': 1, 'gyroscop': 1, 'wrong': 1, 'cunnington': 1, 'antonio': 1, 'critic': 5, 'north': 5, 'jean': 1, 'thursday': 1, 'regiment': 1, 'arrest': 1, 'leet': 1, 'priorit': 1, 'shirtpockets': 1, 'judg': 1, 'diversifi': 3, 'disparag': 1, 'patentinfring': 1, '1958': 3, 'bezo': 2, 'bulletin': 1, 'follicl': 1, 'textbas': 1, 'hourli': 1, '18500': 1, 'hail': 3, 'frankfurt': 1, 'subscriptionbas': 1, 'postscript': 1, 'amass': 1, 'fromabigail': 1, 'cement': 1, 'divid': 1, 'liber': 2, 'eu': 3, 'string': 1, 'merchant': 1, 'mostrecogniz': 1, 'inde': 1, 'field': 8, 'motorol': 1, 'overestim': 1, 'polici': 4, '5810': 1, 'valid': 2, 'braill': 1, 'cloud': 7, 'consid': 7, 'stagnat': 1, 'pink': 2, '200': 5, 'proof': 1, 'sale': 13, 'protocol': 1, 'intgrat': 1, 'abil': 8, 'sophist': 2, 'split': 5, 'histor': 2, 'freescal': 1, '23': 4, 'launch': 17, 'direct': 6, '50000': 1, 'unionist': 1, 'fruitbear': 1, 'polyest': 1, 'tag': 2, 'polic': 1, 'survivalist': 1, 'fuel': 1, 'move': 9, '

In [57]:
# Inverse document frequency per vocabulary term; N is len(document_dictionary)
# as computed in the corpus-building cell.
document_idf = idf_doc(vocabulary, document_df, N)
# Bare trailing expression so the notebook displays the resulting dict.
document_idf

{'town': 1.3117538610557542,
 'largest': 0.4366925976640542,
 'greenlak': 1.6127838567197355,
 'gyroscop': 1.6127838567197355,
 'wrong': 1.6127838567197355,
 'cunnington': 1.6127838567197355,
 'antonio': 1.6127838567197355,
 'critic': 0.9138138523837167,
 'north': 0.9138138523837167,
 'jean': 1.6127838567197355,
 'thursday': 1.6127838567197355,
 'regiment': 1.6127838567197355,
 'arrest': 1.6127838567197355,
 'leet': 1.6127838567197355,
 'priorit': 1.6127838567197355,
 'shirtpockets': 1.6127838567197355,
 'judg': 1.6127838567197355,
 'diversifi': 1.135662602000073,
 'disparag': 1.6127838567197355,
 'patentinfring': 1.6127838567197355,
 '1958': 1.135662602000073,
 'bezo': 1.3117538610557542,
 'bulletin': 1.6127838567197355,
 'follicl': 1.6127838567197355,
 'textbas': 1.6127838567197355,
 'hourli': 1.6127838567197355,
 '18500': 1.6127838567197355,
 'hail': 1.135662602000073,
 'frankfurt': 1.6127838567197355,
 'subscriptionbas': 1.6127838567197355,
 'postscript': 1.6127838567197355,
 'amas

In [26]:
def vsm(query, doc_weights=None, idf=None, top_k=5):
  """Rank documents against a free-text query and return the best matches.

  The query is preprocessed like the documents. Each distinct query term
  occurring `c` times gets the weight `(1 + log10(c)) * idf[term]`; terms with
  no idf entry are ignored. A document's score is the sum, over the query
  terms, of the query weight times the document's weight for that term (terms
  missing from the document contribute zero).

  Args:
    query: Raw query string.
    doc_weights: Mapping of document id to a {word: weight} dict. Defaults to
      the notebook-level `document_wf`.
    idf: Mapping of word to inverse document frequency. Defaults to the
      notebook-level `document_idf`.
    top_k: Number of best-scoring documents to return (default 5).

  Returns:
    A dict of at most `top_k` {document id: score} entries, ordered from the
    highest score to the lowest.
  """
  # Resolve the defaults at call time so the function uses the tables that
  # exist when it is called, not when this cell was executed.
  if doc_weights is None:
    doc_weights = document_wf
  if idf is None:
    idf = document_idf

  query_tokens = document_preprocessing(query)

  # Count each query term in a single pass.
  query_counts = {}
  for token in query_tokens:
    query_counts[token] = query_counts.get(token, 0) + 1

  # The parentheses matter: the whole log-scaled weight is multiplied by idf.
  query_wfidf = {
      word: (1 + log10(count)) * idf[word]
      for word, count in query_counts.items()
      if word in idf
  }

  similarity_scores = {}
  for doc_id, weights in doc_weights.items():
    similarity_scores[doc_id] = sum(
        q_weight * weights.get(word, 0.0)
        for word, q_weight in query_wfidf.items())

  ranked = sorted(similarity_scores.items(), key=lambda item: item[1], reverse=True)
  return dict(ranked[:top_k])


TypeError: ignored