# Document Retrieval using TF-IDF Weighted Rank and TF-IDF Cosine Similarity

In [2]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math


In [3]:
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
path='/content/drive/MyDrive/4th_Sem/IR/A1/stories'
os.chdir(path)
arr = os.listdir('.')

# Preprocessing

In [6]:
def convert_lower_case(data):
    return np.char.lower(data)

In [7]:
def remove_stop_words(data):
    stop_words = stopwords.words('english')
    words = word_tokenize(str(data))
    new_text = ""
    for w in words:
        if w not in stop_words and len(w) > 1:
            new_text = new_text + " " + w
    return new_text

In [8]:
def remove_punctuation(data):
    symbols = "!\"#$%&()*+-./:;<=>?@[\]^_`{|}~\n"
    for i in range(len(symbols)):
        data = np.char.replace(data, symbols[i], ' ')
        data = np.char.replace(data, "  ", " ")
    data = np.char.replace(data, ',', '')
    return data

In [9]:
def remove_apostrophe(data):
    return np.char.replace(data, "'", "")

In [10]:
def stemming(data):
    stemmer= PorterStemmer()
    
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        new_text = new_text + " " + stemmer.stem(w)
    return new_text

In [11]:
def convert_numbers(data):
    tokens = word_tokenize(str(data))
    new_text = ""
    for w in tokens:
        try:
            w = num2words(int(w))
        except:
            a = 0
        new_text = new_text + " " + w
    new_text = np.char.replace(new_text, "-", " ")
    return new_text

In [12]:
def preprocess(data):
    data = convert_lower_case(data)
    data = remove_punctuation(data) 
    data = remove_apostrophe(data)
    data = remove_stop_words(data)
    data = convert_numbers(data)
    data = stemming(data)
    return data

## Extracting Data

In [13]:
processed_text = []
processed_title = []

for i in arr:
    file = open(i, 'r', encoding="utf8", errors='ignore')
    text = file.read()
    file.close()

    processed_text.append(word_tokenize(str(preprocess(text))))
    processed_title.append(word_tokenize(str(preprocess(i))))

## Calculating IDF for all words

In [15]:
DF = {}
IDF={}

for i in range(len(processed_text)):
    tokens = processed_text[i]
    for w in tokens:
        try:
            DF[w].add(i)
        except:
            DF[w] = {i}

for i in DF:
    IDF[i] = np.log(len(processed_text)/(len(DF[i])+1))

In [16]:
idf_score=list(IDF.values())

In [18]:
vocab=list(IDF.keys())

In [50]:
max=0
for i in processed_text:
  if len(i)>max:
    max=len(i)
max

59245

### Calculating TF 

In [60]:
bin_tf_idf =  np.asarray([[0.00 for i in range(len(processed_text))] for j in range(len(vocab))])
count_tf_idf =  np.asarray([[0.00 for i in range(len(processed_text))] for j in range(len(vocab))])
freq_tf_idf =  np.asarray([[0.00 for i in range(len(processed_text))] for j in range(len(vocab))])
log_tf_idf =  np.asarray([[0.00 for i in range(len(processed_text))] for j in range(len(vocab))])
norm_tf_idf =  np.asarray([[0.00 for i in range(len(processed_text))] for j in range(len(vocab))])
bin_tf_idf.shape

(33899, 453)

In [61]:
log_tf_idf.shape

(33899, 453)

In [63]:
for i in range(len(vocab)):
  for j in range(len(processed_text)):
    if vocab[i] in processed_text[j]:
      count=processed_text[j].count(vocab[i])
      bin_tf_idf[i][j]=idf_score[i]
      count_tf_idf[i][j]=idf_score[i]*count
      freq_tf_idf[i][j]=idf_score[i]*count/len(processed_text[j])
      log_tf_idf[i][j]=idf_score[i]*(np.log(1+count))
      norm_tf_idf[i][j]=idf_score[i]*(0.5+0.5*(count)/max)


In [52]:
df_bin=pd.DataFrame(bin_tf_idf, index=vocab ,columns = arr)
df_bin.head()

Unnamed: 0,18.lws,13chil.txt,19.lws,3gables.txt,14.lws,16.lws,20.lws,100west.txt,3lpigs.txt,17.lws,asop,aluminum.hum,adler.txt,advsayed.txt,alad10.txt,aircon.txt,7voysinb.txt,alissadl.txt,7oldsamr.txt,arctic.txt,aisle.six,aminegg.txt,archive,6ablemen.txt,5orange.txt,abbey.txt,abyss.txt,3student.txt,advtthum.txt,aquith.txt,antcrick.txt,adv_alad.txt,aesopa10.txt,angry_ca.txt,altside.hum,aesop11.txt,3wishes.txt,3sonnets.vrs,4moons.txt,arcadia.sty,...,szechuan,telefone.txt,terrorbears.txt,wall.art,vgilante.txt,vaincrow.txt,timem.hac,vampword.txt,tinsoldr.txt,tree.txt,timetrav.txt,unluckwr.txt,times.fic,traitor.txt,toilet.s,veiledl.txt,thewave,vainsong.txt,thanksg,tuc_mees,the-tree.txt,tin,wanderer.fun,vday.hum,uglyduck.txt,valen,wrt,wolflamb.txt,wolfcran.txt,wolf7kid.txt,withdraw.cyb,wlgirl.txt,wombat.und,whgdsreg.reg,zombies.txt,wisteria.txt,weaver.txt,yukon.txt,write,weeprncs.txt
adventur,1.721443,0.0,1.721443,1.721443,1.721443,1.721443,1.721443,0.0,0.0,1.721443,0.0,0.0,0.0,1.721443,0.0,0.0,1.721443,0.0,0.0,0.0,0.0,0.0,1.721443,0.0,1.721443,1.721443,0.0,1.721443,1.721443,0.0,0.0,1.721443,0.0,0.0,0.0,0.0,0.0,0.0,1.721443,0.0,...,1.721443,0.0,0.0,0.0,1.721443,0.0,1.721443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.721443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.721443,0.0,0.0,0.0,0.0
lone,1.759183,0.0,1.759183,1.759183,1.759183,1.759183,1.759183,1.759183,0.0,1.759183,0.0,0.0,0.0,0.0,1.759183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.759183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.759183,0.0,1.759183,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.759183,0.0,1.759183,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.759183,0.0,0.0,0.0,0.0,0.0,1.759183,1.759183,0.0,0.0,0.0,0.0,0.0,1.759183,0.0,0.0,0.0,0.0,1.759183,0.0,1.759183,0.0,0.0
wolf,2.532373,0.0,2.532373,2.532373,2.532373,2.532373,2.532373,0.0,2.532373,2.532373,2.532373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.532373,0.0,0.0,2.532373,2.532373,0.0,2.532373,0.0,0.0,2.532373,0.0,...,0.0,0.0,0.0,0.0,2.532373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.532373,2.532373,2.532373,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.532373,0.0,0.0
scientif,2.714695,0.0,2.714695,0.0,2.714695,2.714695,2.714695,0.0,0.0,2.714695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.714695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.714695,0.0,2.714695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
electron,1.811827,0.0,1.811827,0.0,1.811827,1.811827,1.811827,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0,0.0,...,1.811827,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0


In [53]:
df_count=pd.DataFrame(count_tf_idf, index=vocab ,columns = arr)
df_count.head()

Unnamed: 0,18.lws,13chil.txt,19.lws,3gables.txt,14.lws,16.lws,20.lws,100west.txt,3lpigs.txt,17.lws,asop,aluminum.hum,adler.txt,advsayed.txt,alad10.txt,aircon.txt,7voysinb.txt,alissadl.txt,7oldsamr.txt,arctic.txt,aisle.six,aminegg.txt,archive,6ablemen.txt,5orange.txt,abbey.txt,abyss.txt,3student.txt,advtthum.txt,aquith.txt,antcrick.txt,adv_alad.txt,aesopa10.txt,angry_ca.txt,altside.hum,aesop11.txt,3wishes.txt,3sonnets.vrs,4moons.txt,arcadia.sty,...,szechuan,telefone.txt,terrorbears.txt,wall.art,vgilante.txt,vaincrow.txt,timem.hac,vampword.txt,tinsoldr.txt,tree.txt,timetrav.txt,unluckwr.txt,times.fic,traitor.txt,toilet.s,veiledl.txt,thewave,vainsong.txt,thanksg,tuc_mees,the-tree.txt,tin,wanderer.fun,vday.hum,uglyduck.txt,valen,wrt,wolflamb.txt,wolfcran.txt,wolf7kid.txt,withdraw.cyb,wlgirl.txt,wombat.und,whgdsreg.reg,zombies.txt,wisteria.txt,weaver.txt,yukon.txt,write,weeprncs.txt
adventur,3.442886,0.0,5.164329,5.164329,3.442886,1.721443,6.885772,0.0,0.0,5.164329,0.0,0.0,0.0,1.721443,0.0,0.0,17.21443,0.0,0.0,0.0,0.0,0.0,6.885772,0.0,3.442886,1.721443,0.0,5.164329,3.442886,0.0,0.0,3.442886,0.0,0.0,0.0,0.0,0.0,0.0,1.721443,0.0,...,1.721443,0.0,0.0,0.0,3.442886,0.0,5.164329,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.721443,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.885772,0.0,0.0,0.0,0.0
lone,28.146933,0.0,21.1102,1.759183,3.518367,5.27755,10.5551,1.759183,0.0,7.036733,0.0,0.0,0.0,0.0,3.518367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,12.314283,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.759183,0.0,1.759183,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,15.83265,0.0,3.518367,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.759183,0.0,0.0,0.0,0.0,0.0,1.759183,3.518367,0.0,0.0,0.0,0.0,0.0,1.759183,0.0,0.0,0.0,0.0,3.518367,0.0,54.534682,0.0,0.0
wolf,35.453225,0.0,32.920851,2.532373,5.064746,5.064746,15.194239,0.0,40.517971,10.129493,15.194239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.064746,0.0,0.0,103.827301,2.532373,0.0,235.510706,0.0,0.0,2.532373,0.0,...,0.0,0.0,0.0,0.0,20.258985,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.258985,27.856105,65.841703,0.0,0.0,0.0,0.0,0.0,0.0,0.0,20.258985,0.0,0.0
scientif,19.002863,0.0,32.576337,0.0,5.429389,5.429389,16.288168,0.0,0.0,10.858779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.714695,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.429389,0.0,8.144084,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
electron,1.811827,0.0,1.811827,0.0,1.811827,3.623654,1.811827,0.0,0.0,3.623654,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0,0.0,...,1.811827,0.0,0.0,0.0,7.247308,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.811827,0.0,0.0,0.0,0.0,0.0


In [54]:
df_freq=pd.DataFrame(freq_tf_idf, index=vocab ,columns = arr)
df_freq.head()

Unnamed: 0,18.lws,13chil.txt,19.lws,3gables.txt,14.lws,16.lws,20.lws,100west.txt,3lpigs.txt,17.lws,asop,aluminum.hum,adler.txt,advsayed.txt,alad10.txt,aircon.txt,7voysinb.txt,alissadl.txt,7oldsamr.txt,arctic.txt,aisle.six,aminegg.txt,archive,6ablemen.txt,5orange.txt,abbey.txt,abyss.txt,3student.txt,advtthum.txt,aquith.txt,antcrick.txt,adv_alad.txt,aesopa10.txt,angry_ca.txt,altside.hum,aesop11.txt,3wishes.txt,3sonnets.vrs,4moons.txt,arcadia.sty,...,szechuan,telefone.txt,terrorbears.txt,wall.art,vgilante.txt,vaincrow.txt,timem.hac,vampword.txt,tinsoldr.txt,tree.txt,timetrav.txt,unluckwr.txt,times.fic,traitor.txt,toilet.s,veiledl.txt,thewave,vainsong.txt,thanksg,tuc_mees,the-tree.txt,tin,wanderer.fun,vday.hum,uglyduck.txt,valen,wrt,wolflamb.txt,wolfcran.txt,wolf7kid.txt,withdraw.cyb,wlgirl.txt,wombat.und,whgdsreg.reg,zombies.txt,wisteria.txt,weaver.txt,yukon.txt,write,weeprncs.txt
adventur,0.001464,0.0,0.003132,0.001825,0.007248,0.001311,0.005875,0.0,0.0,0.005607,0.0,0.0,0.0,0.001867,0.0,0.0,0.004649,0.0,0.0,0.0,0.0,0.0,0.000544,0.0,0.001031,0.000417,0.0,0.00171,0.008805,0.0,0.0,0.003734,0.0,0.0,0.0,0.0,0.0,0.0,0.001155,0.0,...,0.003985,0.0,0.0,0.0,5.8e-05,0.0,0.000332,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000851,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001269,0.0,0.0,0.0,0.0
lone,0.011967,0.0,0.012802,0.000622,0.007407,0.004019,0.009006,0.001014,0.0,0.00764,0.0,0.0,0.0,0.0,0.001419,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.003688,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00031,0.0,0.000711,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000267,0.0,0.000226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002788,0.0,0.0,0.0,0.0,0.0,0.001757,0.007767,0.0,0.0,0.0,0.0,0.0,0.001424,0.0,0.0,0.0,0.0,0.000648,0.0,0.007797,0.0,0.0
wolf,0.015074,0.0,0.019964,0.000895,0.010663,0.003857,0.012964,0.0,0.076449,0.010998,0.085843,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001141,0.0,0.0,0.018279,0.004506,0.0,0.012362,0.0,0.0,0.0017,0.0,...,0.0,0.0,0.0,0.0,0.000342,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.12353,0.125478,0.116328,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002897,0.0,0.0
scientif,0.008079,0.0,0.019755,0.0,0.01143,0.004135,0.013898,0.0,0.0,0.01179,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000657,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,9.2e-05,0.0,0.000523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
electron,0.00077,0.0,0.001099,0.0,0.003814,0.00276,0.001546,0.0,0.0,0.003934,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000143,0.0,0.0,0.0,0.003318,0.0,0.0,0.0,0.0,0.0,0.0,0.003224,0.0,0.0,0.0,0.0,0.0,0.0,...,0.004194,0.0,0.0,0.0,0.000122,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.00135,0.0,0.0,0.0,0.001714,0.0,0.0,0.0,0.0,0.008236,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000339,0.0,0.0,0.0,0.0,0.0


In [55]:
df_log=pd.DataFrame(log_tf_idf, index=vocab ,columns = arr)
df_log.head()

Unnamed: 0,18.lws,13chil.txt,19.lws,3gables.txt,14.lws,16.lws,20.lws,100west.txt,3lpigs.txt,17.lws,asop,aluminum.hum,adler.txt,advsayed.txt,alad10.txt,aircon.txt,7voysinb.txt,alissadl.txt,7oldsamr.txt,arctic.txt,aisle.six,aminegg.txt,archive,6ablemen.txt,5orange.txt,abbey.txt,abyss.txt,3student.txt,advtthum.txt,aquith.txt,antcrick.txt,adv_alad.txt,aesopa10.txt,angry_ca.txt,altside.hum,aesop11.txt,3wishes.txt,3sonnets.vrs,4moons.txt,arcadia.sty,...,szechuan,telefone.txt,terrorbears.txt,wall.art,vgilante.txt,vaincrow.txt,timem.hac,vampword.txt,tinsoldr.txt,tree.txt,timetrav.txt,unluckwr.txt,times.fic,traitor.txt,toilet.s,veiledl.txt,thewave,vainsong.txt,thanksg,tuc_mees,the-tree.txt,tin,wanderer.fun,vday.hum,uglyduck.txt,valen,wrt,wolflamb.txt,wolfcran.txt,wolf7kid.txt,withdraw.cyb,wlgirl.txt,wombat.und,whgdsreg.reg,zombies.txt,wisteria.txt,weaver.txt,yukon.txt,write,weeprncs.txt
adventur,1.891198,0.0,2.386427,2.386427,1.891198,1.193213,2.770556,0.0,0.0,2.386427,0.0,0.0,0.0,1.193213,0.0,0.0,4.12784,0.0,0.0,0.0,0.0,0.0,2.770556,0.0,1.891198,1.193213,0.0,2.386427,1.891198,0.0,0.0,1.891198,0.0,0.0,0.0,0.0,0.0,0.0,1.193213,0.0,...,1.193213,0.0,0.0,0.0,1.891198,0.0,2.386427,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.193213,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.770556,0.0,0.0,0.0,0.0
lone,4.984142,0.0,4.512216,1.219373,1.93266,2.438746,3.423213,1.219373,0.0,2.831296,0.0,0.0,0.0,0.0,1.93266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.658119,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.219373,0.0,1.219373,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.050669,0.0,1.93266,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.219373,0.0,0.0,0.0,0.0,0.0,1.219373,1.93266,0.0,0.0,0.0,0.0,0.0,1.219373,0.0,0.0,0.0,0.0,1.93266,0.0,6.096865,0.0,0.0
wolf,6.857794,0.0,6.683078,1.755307,2.782096,2.782096,4.927771,0.0,7.174754,4.075697,4.927771,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.782096,0.0,0.0,9.465174,1.755307,0.0,11.505318,0.0,0.0,1.755307,0.0,...,0.0,0.0,0.0,0.0,5.564193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.564193,6.292711,8.346289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.564193,0.0,0.0
scientif,5.645049,0.0,6.963055,0.0,2.982397,2.982397,5.282552,0.0,0.0,4.369133,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.881683,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.982397,0.0,3.763366,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
electron,1.255863,0.0,1.255863,0.0,1.255863,1.990495,1.255863,0.0,0.0,1.990495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.255863,0.0,0.0,0.0,1.255863,0.0,0.0,0.0,0.0,0.0,0.0,1.255863,0.0,0.0,0.0,0.0,0.0,0.0,...,1.255863,0.0,0.0,0.0,2.916023,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.255863,0.0,0.0,0.0,1.255863,0.0,0.0,0.0,0.0,1.255863,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.255863,0.0,0.0,0.0,0.0,0.0


In [64]:
df_norm=pd.DataFrame(norm_tf_idf, index=vocab ,columns = arr)
df_norm.head()

Unnamed: 0,18.lws,13chil.txt,19.lws,3gables.txt,14.lws,16.lws,20.lws,100west.txt,3lpigs.txt,17.lws,asop,aluminum.hum,adler.txt,advsayed.txt,alad10.txt,aircon.txt,7voysinb.txt,alissadl.txt,7oldsamr.txt,arctic.txt,aisle.six,aminegg.txt,archive,6ablemen.txt,5orange.txt,abbey.txt,abyss.txt,3student.txt,advtthum.txt,aquith.txt,antcrick.txt,adv_alad.txt,aesopa10.txt,angry_ca.txt,altside.hum,aesop11.txt,3wishes.txt,3sonnets.vrs,4moons.txt,arcadia.sty,...,szechuan,telefone.txt,terrorbears.txt,wall.art,vgilante.txt,vaincrow.txt,timem.hac,vampword.txt,tinsoldr.txt,tree.txt,timetrav.txt,unluckwr.txt,times.fic,traitor.txt,toilet.s,veiledl.txt,thewave,vainsong.txt,thanksg,tuc_mees,the-tree.txt,tin,wanderer.fun,vday.hum,uglyduck.txt,valen,wrt,wolflamb.txt,wolfcran.txt,wolf7kid.txt,withdraw.cyb,wlgirl.txt,wombat.und,whgdsreg.reg,zombies.txt,wisteria.txt,weaver.txt,yukon.txt,write,weeprncs.txt
adventur,0.860751,0.0,0.860765,0.860765,0.860751,0.860736,0.86078,0.0,0.0,0.860765,0.0,0.0,0.0,0.860736,0.0,0.0,0.860867,0.0,0.0,0.0,0.0,0.0,0.86078,0.0,0.860751,0.860736,0.0,0.860765,0.860751,0.0,0.0,0.860751,0.0,0.0,0.0,0.0,0.0,0.0,0.860736,0.0,...,0.860736,0.0,0.0,0.0,0.860751,0.0,0.860765,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.860736,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.86078,0.0,0.0,0.0,0.0
lone,0.879829,0.0,0.87977,0.879606,0.879621,0.879636,0.879681,0.879606,0.0,0.879651,0.0,0.0,0.0,0.0,0.879621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.879696,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.879606,0.0,0.879606,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.879725,0.0,0.879621,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.879606,0.0,0.0,0.0,0.0,0.0,0.879606,0.879621,0.0,0.0,0.0,0.0,0.0,0.879606,0.0,0.0,0.0,0.0,0.879621,0.0,0.880052,0.0,0.0
wolf,1.266486,0.0,1.266464,1.266208,1.266229,1.266229,1.266315,0.0,1.266529,1.266272,1.266315,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.266229,0.0,0.0,1.267063,1.266208,0.0,1.268174,0.0,0.0,1.266208,0.0,...,0.0,0.0,0.0,0.0,1.266358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.266358,1.266422,1.266742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.266358,0.0,0.0
scientif,1.357508,0.0,1.357622,0.0,1.357393,1.357393,1.357485,0.0,0.0,1.357439,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.35737,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.357393,0.0,1.357416,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
electron,0.905929,0.0,0.905929,0.0,0.905929,0.905944,0.905929,0.0,0.0,0.905944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.905929,0.0,0.0,0.0,0.905929,0.0,0.0,0.0,0.0,0.0,0.0,0.905929,0.0,0.0,0.0,0.0,0.0,0.0,...,0.905929,0.0,0.0,0.0,0.905975,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.905929,0.0,0.0,0.0,0.905929,0.0,0.0,0.0,0.0,0.905929,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.905929,0.0,0.0,0.0,0.0,0.0


In [65]:
df_bin.to_csv('/content/Output/Bin_Tf-idf.csv')
df_count.to_csv('/content/Output/Count_Tf-idf.csv')
df_freq.to_csv('/content/Output/Freq_Tf-idf.csv')
df_log.to_csv('/content/Output/Log_Tf-idf.csv')
df_norm.to_csv('/content/Output/Norm_Tf-idf.csv')

In [None]:
query= "Without the drive of Rebeccah's insistence, Kate lost her momentum. She stood next a slatted oak bench, canisters still clutched, surveying"
query_vector={}

for i in vocab:
    query_vector[i] = 0.00

query_token=word_tokenize(str(preprocess(query)))

for t in query_token:
  query_vector[t]=IDF[t]*(query_token.count(t))/len(query_token)


In [98]:
query_vector.values()

dict_values([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.04155335449483334, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.05172656842428118, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.03211083153770832, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,

# TF-IDF Matching Score Ranking

1) Binary *TF*

Advantage :

1. Very simple to understand and implement.


Disadvantage : 

1.   Gives no semantic Information
2.   High size with lots of zeros consumes memory



In [118]:
score={}
for i in arr:
  score[i] = np.dot(df_bin[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Top 5 Matches for bin TF scheme:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Top 5 Matches for bin TF scheme:
Keys: Values
ghost  : 8.159614745135666  
vgilante.txt  : 3.4830316514319772  
rocket.sf  : 2.7152060720246123  
roger1.txt  : 2.524048430205908  
fgoose.txt  : 2.393142091628711  


2) Count/BOW model
Advantage :

1.  Very simple to understand and implement.

Disadvantage : 

1.   High dimensional feature vector due to large size of Vocabulary
2.   Assumes all words are independent of each other. 
3.   It leads to a highly sparse vectors





In [119]:
score={}
for i in arr:
  score[i] = np.dot(df_count[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Top 5 Matches for Raw Count TF scheme:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Top 5 Matches for Raw Count TF scheme:
Keys: Values
ghost  : 104.5835098596016  
vgilante.txt  : 48.468390300341596  
radar_ra.txt  : 22.0510372545445  
hitch3.txt  : 14.4510264261402  
gulliver.txt  : 12.617411939399894  


3) **Term Frequency Model**

*Advantages*

- Easy to compute
- You have some basic metric to extract the most descriptive terms in a document
- You can easily compute the similarity between 2 documents using it

*Disadvantages*


* Slow for large vocabularies.
* It assumes that the counts of different words provide independent evidence of similarity.
* It makes no use of semantic similarities between words.


In [121]:
score={}
for i in arr:
  score[i] = np.dot(df_freq[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Top 5 Matches forStandard TF scheme:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Top 5 Matches forStandard TF scheme:
Keys: Values
ghost  : 0.06958317355928251  
fea3  : 0.01520582072355144  
vday.hum  : 0.01016178781095313  
graymare.txt  : 0.0033979224910596576  
fic7  : 0.0032656103489848544  


4) **Log Normalized Model**

*Advantages*

*   Normalizes Common elements





In [122]:
score={}
for i in arr:
  score[i] = np.dot(df_log[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Top 5 Matches for log weighted TF scheme:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Top 5 Matches for log weighted TF scheme:
Keys: Values
ghost  : 17.030091104905253  
vgilante.txt  : 7.32731458360322  
hitch3.txt  : 4.112992832258883  
fgoose.txt  : 3.915792246281595  
radar_ra.txt  : 3.717623780215789  


5) **Double Normalized Scaling**

*Advantages*

* Normalizes Common elements
* Prevents bias towards longer documents


In [123]:
score={}
for i in arr:
  score[i] = np.dot(df_norm[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Top 5 Matches for Double Normalized TF scheme:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Top 5 Matches for Double Normalized TF scheme:
Keys: Values
ghost  : 4.080690008316502  
vgilante.txt  : 1.7419248761869173  
rocket.sf  : 1.3576828512933599  
roger1.txt  : 1.2620511554682288  
fgoose.txt  : 1.196669113333133  


# TF-IDF Cosine Similarity Ranking

In [111]:
def cosine_sim(a, b):
    cos_sim = np.dot(a, b)/(np.linalg.norm(a)*np.linalg.norm(b))
    return cos_sim

In [124]:
score={}
for i in arr:
  score[i] = cosine_sim(df_bin[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Dictionary with 5 highest values:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Dictionary with 5 highest values:
Keys: Values
ghost  : 0.17391473692587864  
enc  : 0.04272881561129746  
roger1.txt  : 0.03886267306328776  
vday.hum  : 0.03851652630542407  
running.txt  : 0.02591687490642025  


In [125]:
score={}
for i in arr:
  score[i] = cosine_sim(df_count[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Dictionary with 5 highest values:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Dictionary with 5 highest values:
Keys: Values
ghost  : 0.5870949240459775  
vday.hum  : 0.1520566254845284  
fea3  : 0.09234933479463948  
fic7  : 0.036532872760937404  
fgoose.txt  : 0.035957578194830656  


In [126]:
score={}
for i in arr:
  score[i] = cosine_sim(df_freq[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Dictionary with 5 highest values:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Dictionary with 5 highest values:
Keys: Values
ghost  : 0.5870949240459776  
vday.hum  : 0.15205662548452836  
fea3  : 0.09234933479463948  
fic7  : 0.03653287276093739  
fgoose.txt  : 0.03595757819483067  


In [127]:
score={}
for i in arr:
  score[i] = cosine_sim(df_log[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Dictionary with 5 highest values:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Dictionary with 5 highest values:
Keys: Values
ghost  : 0.3712293890342363  
vday.hum  : 0.0856186608694815  
fea3  : 0.047335156777324354  
fic7  : 0.03873575354733747  
fgoose.txt  : 0.0377466911334025  


In [128]:
score={}
for i in arr:
  score[i] = cosine_sim(df_norm[i],list(query_vector.values()))

k = Counter(score)
  
high = k.most_common(5) 
  
print("Dictionary with 5 highest values:")
print("Keys: Values")
  
for i in high:
    print(i[0]," :",i[1]," ")

Dictionary with 5 highest values:
Keys: Values
ghost  : 0.17394693283933085  
enc  : 0.04272851970833655  
roger1.txt  : 0.03886244316029367  
vday.hum  : 0.03851978975461948  
running.txt  : 0.025916712996746263  
