In [65]:
import numpy as np

from time import time
import os
from os import listdir
from os.path import isfile, join
from sys import getsizeof
from collections import Mapping, Container

import glob
import string
import math
from nltk.stem.porter import *

In [48]:
def directory_listing(dir_path):
    '''
    Input:  string (path to directory)
    Output: list of strings (full paths to files in the directory)
    '''
    
    return glob.glob(dir_path + '*.txt')

In [29]:
path = os.getcwd() +'/HillaryEmails/'
all_files = directory_listing(path)
print(all_files)
#all_files

['/Users/arkapravasaha/Documents/CI6226/Assignment/Information-Retrieval-and-Analysis/HillaryEmails/3644.txt', '/Users/arkapravasaha/Documents/CI6226/Assignment/Information-Retrieval-and-Analysis/HillaryEmails/5235.txt', '/Users/arkapravasaha/Documents/CI6226/Assignment/Information-Retrieval-and-Analysis/HillaryEmails/1053.txt', '/Users/arkapravasaha/Documents/CI6226/Assignment/Information-Retrieval-and-Analysis/HillaryEmails/7422.txt', '/Users/arkapravasaha/Documents/CI6226/Assignment/Information-Retrieval-and-Analysis/HillaryEmails/7344.txt', '/Users/arkapravasaha/Documents/CI6226/Assignment/Information-Retrieval-and-Analysis/HillaryEmails/1735.txt', '/Users/arkapravasaha/Documents/CI6226/Assignment/Information-Retrieval-and-Analysis/HillaryEmails/5553.txt', '/Users/arkapravasaha/Documents/CI6226/Assignment/Information-Retrieval-and-Analysis/HillaryEmails/4895.txt', '/Users/arkapravasaha/Documents/CI6226/Assignment/Information-Retrieval-and-Analysis/HillaryEmails/3122.txt', '/Users/a

In [49]:
def read_file(file_path):
    '''
    Input:  string (full path to file)
    Output: string/text (full contents of a file)
    '''
    
    lines = [line.rstrip('\n') for line in open(file_path)]
    return ' '.join(lines)

In [31]:
file_text = read_file(all_files[0])
file_text

'UNCLASSIFIED U.S. Department of State Case No. F-2014-20439 Doc No. C05766459 Date: 07/31/2015 RELEASE IN FULL CONFIDENTIAL October 23, 2009 For: Hillary From: Sid Re: Tony Blair, EU presidency, Tory Party, & Berlin trip One of your agenda items behind the scenes on your Berlin trip can be to discuss the future of the EU, the European presidency and the prospects of Tony Blair. If Blair does not become EU president the position will likely be filled by .a third rank nonentity in the Brussels bureaucratic mode incapable of realizing the possibilities in the creation of the office, continuing the feebleness of Europe as a political idea and reality. Of course, it is in the US interest to have a strong Europe—and the naming of the first European president might be the most important opportunity for the US to strengthen Europe, to give it actual sinew, for a long time and a long time to come. The Conservative Party is split over Europe and Blair. Hague represents the Tory right, which is 

In [50]:
def tokenization(text, file_path):
    '''
    Input:  text(file contents), string (document id = path to file)
    Output: list of pairs < string(token) , string (document id) >
    '''
    doc_id = int(os.path.basename(file_path).replace('.txt','')) #retrieve document id from file path
    tokens = text.split()
    return [(token, doc_id) for token in tokens]
    

In [34]:
token_pairs = tokenization(file_text, all_files[0])
print("length of pairs: ", len(token_pairs))
print(token_pairs)
#token_pairs

length of pairs:  605
[('UNCLASSIFIED', 3644), ('U.S.', 3644), ('Department', 3644), ('of', 3644), ('State', 3644), ('Case', 3644), ('No.', 3644), ('F-2014-20439', 3644), ('Doc', 3644), ('No.', 3644), ('C05766459', 3644), ('Date:', 3644), ('07/31/2015', 3644), ('RELEASE', 3644), ('IN', 3644), ('FULL', 3644), ('CONFIDENTIAL', 3644), ('October', 3644), ('23,', 3644), ('2009', 3644), ('For:', 3644), ('Hillary', 3644), ('From:', 3644), ('Sid', 3644), ('Re:', 3644), ('Tony', 3644), ('Blair,', 3644), ('EU', 3644), ('presidency,', 3644), ('Tory', 3644), ('Party,', 3644), ('&', 3644), ('Berlin', 3644), ('trip', 3644), ('One', 3644), ('of', 3644), ('your', 3644), ('agenda', 3644), ('items', 3644), ('behind', 3644), ('the', 3644), ('scenes', 3644), ('on', 3644), ('your', 3644), ('Berlin', 3644), ('trip', 3644), ('can', 3644), ('be', 3644), ('to', 3644), ('discuss', 3644), ('the', 3644), ('future', 3644), ('of', 3644), ('the', 3644), ('EU,', 3644), ('the', 3644), ('European', 3644), ('presidency'

In [51]:
def linguistic_modules(token_pairs):
    '''
    Input:  list of pairs < token , document id >
    Output: list of pairs < modified token , document id >
    
    modified token: removing all punctuation symbols (!@#$%^&*()-_=+’`~”:;/.,?[]{}<>),lowercasingand stemming.
    '''
    
    stemmer = PorterStemmer()  
    return [(stemmer.stem(token.translate(str.maketrans('','',string.punctuation)).lower()), doc_id)  
            for token, doc_id in token_pairs  
                if token.translate(str.maketrans('','',string.punctuation)) is not ''] #if statement to check empty token
    

In [36]:
modified_token_pairs = linguistic_modules(token_pairs)
print('length after modification: ', len(modified_token_pairs))
print(modified_token_pairs)
#modified_token_pairs

length after modification:  604
[('unclassifi', 3644), ('us', 3644), ('depart', 3644), ('of', 3644), ('state', 3644), ('case', 3644), ('no', 3644), ('f201420439', 3644), ('doc', 3644), ('no', 3644), ('c05766459', 3644), ('date', 3644), ('07312015', 3644), ('releas', 3644), ('in', 3644), ('full', 3644), ('confidenti', 3644), ('octob', 3644), ('23', 3644), ('2009', 3644), ('for', 3644), ('hillari', 3644), ('from', 3644), ('sid', 3644), ('re', 3644), ('toni', 3644), ('blair', 3644), ('eu', 3644), ('presid', 3644), ('tori', 3644), ('parti', 3644), ('berlin', 3644), ('trip', 3644), ('one', 3644), ('of', 3644), ('your', 3644), ('agenda', 3644), ('item', 3644), ('behind', 3644), ('the', 3644), ('scene', 3644), ('on', 3644), ('your', 3644), ('berlin', 3644), ('trip', 3644), ('can', 3644), ('be', 3644), ('to', 3644), ('discuss', 3644), ('the', 3644), ('futur', 3644), ('of', 3644), ('the', 3644), ('eu', 3644), ('the', 3644), ('european', 3644), ('presid', 3644), ('and', 3644), ('the', 3644), ('p

In [67]:
l = [('unclassifi', '133'),('us', '133'),('depart', '133'), ('!','12')]
linguistic_modules(l)

[('unclassifi', '133'), ('us', '133'), ('depart', '133')]

In [52]:
def sort_tokens(token_pairs):
    '''
    Input:  list of pairs < token , document id >
    Output: sorted list of pairs < token , document id >
    
    perform sorting of the token list: first by tokens (alphabetical order), and then by document ids 
    '''
    
    return sorted(token_pairs, key=lambda element: (element[0], element[1]))

In [38]:
sorted_modified_token_pairs = sort_tokens(modified_token_pairs)
sorted_modified_token_pairs

[('07312015', 3644),
 ('07312015', 3644),
 ('07312015', 3644),
 ('07312015', 3644),
 ('1', 3644),
 ('2', 3644),
 ('2009', 3644),
 ('23', 3644),
 ('a', 3644),
 ('a', 3644),
 ('a', 3644),
 ('a', 3644),
 ('a', 3644),
 ('a', 3644),
 ('a', 3644),
 ('a', 3644),
 ('a', 3644),
 ('a', 3644),
 ('a', 3644),
 ('abl', 3644),
 ('about', 3644),
 ('about', 3644),
 ('act', 3644),
 ('actual', 3644),
 ('affili', 3644),
 ('against', 3644),
 ('agenda', 3644),
 ('align', 3644),
 ('align', 3644),
 ('all', 3644),
 ('also', 3644),
 ('american', 3644),
 ('an', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('and', 3644),
 ('antiamerican', 3644),
 ('antieuropean', 3644),
 ('antieuropean', 3644),
 ('anyth', 3644),
 ('appreci', 3644),
 ('are', 3644),
 ('are', 3644),
 ('are', 3644),
 ('argument', 3644),
 ('aristocrat', 3644),
 ('as', 3644),
 ('as', 3644),
 ('as', 3644),
 ('as', 36

In [53]:
def transformation_into_postings(sorted_token_pairs):
    '''
    Input: sorted list of pairs < token , document id >
    Output: inverted index
    
    Used dictionary data structure (Hash table)
    '''
    
    dictionary_ = {}
    for a, b in sorted_token_pairs:
        dictionary_.setdefault(a, []).append(b)
#     dictionary_ = {key:list(sorted(set(value))) for (key, value) in dictionary_.items()}
    for key in dictionary_:
        value = dictionary_[key]
        posting = list(sorted(set(value)))
        dictionary_[key] = (len(posting),posting)
    return dictionary_

In [40]:
posting_list = transformation_into_postings(sorted_modified_token_pairs)
posting_list

{'07312015': (1, [3644]),
 '1': (1, [3644]),
 '2': (1, [3644]),
 '2009': (1, [3644]),
 '23': (1, [3644]),
 'a': (1, [3644]),
 'abl': (1, [3644]),
 'about': (1, [3644]),
 'act': (1, [3644]),
 'actual': (1, [3644]),
 'affili': (1, [3644]),
 'against': (1, [3644]),
 'agenda': (1, [3644]),
 'align': (1, [3644]),
 'all': (1, [3644]),
 'also': (1, [3644]),
 'american': (1, [3644]),
 'an': (1, [3644]),
 'and': (1, [3644]),
 'antiamerican': (1, [3644]),
 'antieuropean': (1, [3644]),
 'anyth': (1, [3644]),
 'appreci': (1, [3644]),
 'are': (1, [3644]),
 'argument': (1, [3644]),
 'aristocrat': (1, [3644]),
 'as': (1, [3644]),
 'at': (1, [3644]),
 'away': (1, [3644]),
 'balanc': (1, [3644]),
 'be': (1, [3644]),
 'becom': (1, [3644]),
 'behind': (1, [3644]),
 'berlin': (1, [3644]),
 'best': (1, [3644]),
 'bit': (1, [3644]),
 'blair': (1, [3644]),
 'bori': (1, [3644]),
 'britain': (1, [3644]),
 'brussel': (1, [3644]),
 'bureaucrat': (1, [3644]),
 'bush': (1, [3644]),
 'but': (1, [3644]),
 'by': (1, 

In [72]:
a = [483, 483, 1526, 1526, 1840, 1840, 1840, 1840]
list(set(a))

[1840, 483, 1526]

In [73]:
d = {'a': [1,1,2], 'c': [3], 'b': [2,3,2]}
{key:list(set(value)) for (key, value) in d.items()}


{'a': [1, 2], 'c': [3], 'b': [2, 3]}

In [74]:
dict1 = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
# Double each value in the dictionary
double_dict1 = {k:v*2 for (k,v) in dict1.items()}
print(double_dict1)

{'a': 2, 'b': 4, 'c': 6, 'd': 8, 'e': 10}


In [54]:
def postings_list_merge(postings_lists):
    '''
    Input: list of postings lists 
    Output: merged postings list
    
    Intersect the postings lists in increasing order of length
    '''
    
    sorted_postings_lists = sorted(postings_lists, key = lambda l : l[0], reverse = True)
    first_len, first_list = sorted_postings_lists.pop()
    while sorted_postings_lists:
        second_len, second_list = sorted_postings_lists.pop()
        merged_list = []
        p1 = 0
        p2 = 0
        length = 0
        while p1 < first_len and p2 < second_len:
            if first_list[p1] == second_list[p2]:
                merged_list.append(first_list[p1])
                p1 += 1
                p2 += 1
                length += 1
            elif first_list[p1] < second_list[p2]:
                p1 += 1
            else:
                p2 += 1
        first_list = merged_list
        first_len = length
    return merged_list

In [28]:
def tfidf_weight(token,docid,term_freq,posting_list):
    if token not in term_freq[docid]:
        return 0
    tf = 1+math.log10(term_freq[docid][token])
    idf = math.log10((len(term_freq)-1)/posting_list[token][0])
    return (tf*idf)

In [55]:
# def tfidf_score(query_tokens,docid,tfidf,posting_list):
def tfidf_score(query_tokens,docid,tfidf):
    '''
    Input: tokens in query, document id, term frequencies
    Output: tf-idf score
    
    Compute tf-idf score of a document for a query
    '''
    
    score = 0
    for token in query_tokens:
#         score += tfidf_weight(token,docid,term_freq,posting_list)
        score += tfidf[docid][token]
    return score

In [56]:
def cosine_score(doc,tfidf,length):
    score = 0
    for term in tfidf[0]:
        if term in tfidf[doc]:
            score += tfidf[0][term]*tfidf[doc][term]
    score /= (length[0]*length[doc])
    return score

In [57]:
# MAIN
start = time()

path = os.getcwd() +'/HillaryEmails/'
# print(path)
all_files = directory_listing(path)
# print(all_files)
#all_files = all_files[:10]

all_token_pairs = []

for file in all_files:
    file_text = read_file(file)
    token_pairs = tokenization(file_text, file)
    modified_token_pairs = linguistic_modules(token_pairs)
    all_token_pairs = all_token_pairs + modified_token_pairs

end = time()
print(end - start)

freq = [{} for i in range(len(all_files)+1)]
for token,docid in all_token_pairs:
    if token not in freq[docid]:
        freq[docid][token] = 0
    freq[docid][token] += 1

447.6319410800934


In [58]:
sorted_tokens = sort_tokens(all_token_pairs)

In [59]:
start = time()

posting_list = transformation_into_postings(sorted_tokens)
# posting_list

end = time()
print(end - start)

2.2403481006622314


In [68]:
def deep_getsizeof(o, ids):
    d = deep_getsizeof
    if id(o) in ids:
        return 0
    r = getsizeof(o)
    ids.add(id(o))
    r = getsizeof(o)
    ids.add(id(o))
    if isinstance(o, str):
        return r
    if isinstance(o, Mapping):
        return r + sum(d(k, ids) + d(v, ids) for k, v in o.items())
    if isinstance(o, Container):
        return r + sum(d(x, ids) for x in o)
    return r

In [70]:
deep_getsizeof(posting_list,set())
deep_getsizeof({},set())

240

In [23]:
for term in posting_list:
    if posting_list[term][0]<=20 and  posting_list[term][0]>=5:
        print(term,posting_list[term][0])

00 9
0045 5
00b 7
0110 9
0129 7
0130 6
02082025 5
02102025 5
02138 11
022002 6
0235 5
0333 5
0359 5
040252 5
04232035 6
04252035 5
050638 6
051614 9
054058 5
054156 5
055920 16
0600 10
062736 6
0630 12
0633 6
0635 7
0636 8
0638 6
06397276 15
0641 6
0654 5
065750 5
065804 16
0700 5
070027 6
070639 14
071355 5
072417 8
072721 6
072912 7
0730 7
073445 5
074104 6
075103 9
075145 14
075659 5
075714 9
075852 20
080012 6
081224 6
08142024 8
081654 5
081725 13
082038 5
082403 5
082557 14
083056 12
083145 12
083317 11
083921 5
084238 13
084447 5
0845 8
0851 6
085135 12
085416 5
085813 5
0900 5
090318 12
090817 13
091404 7
091441 12
0917 9
092145 5
0923 5
092625 9
0930 9
093006 5
093131 10
093448 10
0935 8
093633 9
094314 5
094319 7
094848 6
094939 5
095503 5
095645 9
095907 6
0g2 8
100000 13
1005am 7
100909 5
100bn 7
100k 5
100pm 13
1011 16
1012 16
101235 18
1017 17
102 11
1020am 14
102252 7
1026 18
1029 16
103 19
1030am 10
103111 5
103203 15
1033 16
1034 20
1036 19
1037 13
1038 18
103901 6
104

bath 6
baton 5
battalion 10
batter 6
batteri 7
battlefield 10
battleground 7
bauer 17
bayh 7
bazaar 8
bca 6
bcl 8
bdbco 14
bdnews24com 6
bdr 5
beach 19
beacon 7
beal 19
bealecastategov 10
bean 16
beard 6
beatric 6
beck 20
becker 5
becki 5
bed 18
bedinhstategov 5
beef 5
beer 10
befit 5
befriend 7
beg 10
beget 9
behalpa 17
behaviour 8
behead 5
behemoth 11
behest 12
behindthescen 5
beholden 6
belaru 14
belatedli 6
belgian 9
belgrad 6
bell 6
bellegard 5
belliard 10
bellicos 5
belliger 14
bellow 6
belqasim 19
belt 8
beltway 11
benazir 8
bench 15
benchmark 17
bend 6
beneath 15
benefactor 6
benefic 7
benign 5
benjaminmoncrieflemieuxsenategov 8
bent 15
benton 5
berkeley 6
berlusconi 19
bermuda 8
bermudez 7
bernard 15
berniertoth 12
berrusien 6
beset 10
besieg 6
bestsel 17
betti 6
bf 8
bfg 7
bg 5
bhutto 12
bia 9
bibl 13
biblic 12
bic 7
bicker 10
bidder 5
bidenasdov 10
bigot 7
bigotri 11
bildt 12
billi 15
bimonthli 7
binat 5
binder 18
bing 6
bingaman 5
bio 20
biofuel 9
biograph 10
biographi 9
bi

entail 14
entebb 6
entic 11
entireti 7
entranc 14
entrench 14
entrepreneuri 8
entrepreneurship 12
enu 13
envelop 19
environmentalist 5
envisag 6
eotausch 5
ep 10
epa 10
epap 6
epic 12
epicent 9
epidem 16
epp 9
equadorani 5
equit 18
equivoc 11
er 17
erad 8
eras 11
erekat 16
erin 6
ernest 5
eros 8
errandboy 5
errat 16
erron 5
ert 5
escort 18
esmtp 12
espionag 8
espirito 6
espo 5
espous 11
esq 8
esquir 13
essay 16
esta 5
estado 6
esteem 8
estonia 5
estrang 14
eta 13
ethan 6
ethiopia 10
ethiopian 13
eton 6
etonian 8
etranger 9
eugen 11
euphem 10
europa 5
euroz 5
euu 6
evan 14
evangel 16
evas 18
evenhanded 5
evenli 6
evergreenhdr22clintonemailcom 11
evertough 5
evict 7
evidenc 10
evok 8
evolut 18
evyenia 6
ex 12
exagger 9
exam 11
exce 10
exceedingli 8
exchequ 7
excis 7
excruci 8
exec 17
exemplifi 10
exhibit 18
exhort 5
exig 5
exmilitari 8
exodu 6
exoner 5
expat 10
expatri 8
expedi 6
expedit 13
expediti 10
expeditionari 7
expel 16
expend 14
expenditur 15
explicit 12
exponenti 7
exposur 6
exp

jewelri 6
jfk 11
jiabao 9
jigsaw 19
jihad 13
jihadi 5
jihadist 13
jillian 6
jilotylcstategovi 6
jilotylcstategoy 10
jima 5
jintao 7
jirga 6
jj 10
joanna 5
jobkil 6
jobless 6
jock 6
jockey 11
joel 8
johannesburg 5
johna 5
johnstown 7
jolt 6
jona 5
jonespw2stategov 5
jordanian 19
jorg 11
jornal 6
joschka 6
joyc 16
joyou 6
jp 6
jsullivanilstategov 8
jsullivanjjstategov 13
jsullivannstategov 7
juarez 12
juba 8
jubil 8
judaism 5
judd 6
judea 12
judeh 10
judgement 5
juggl 11
julia 18
juliana 6
julien 17
julio 15
juliu 6
jumpstart 7
junctur 8
junta 8
juri 18
justica 6
justif 19
juwali 11
k1a 8
kaczynski 6
kadima 7
kagan 7
kaiser 6
kalyan 7
kamala 14
kampala 9
kamran 5
kan 7
kandahar 19
kansa 9
kantor 5
kara 6
karabakh 5
karachi 11
karim 5
karin 6
kashmir 11
kate 8
katherin 14
katheyle 6
kathi 5
kathmandu 5
katrina 9
katz 5
kaufman 5
kayani 6
kazakh 7
kazakhstan 13
kb 13
kbh 6
kean 12
keat 5
keenli 6
keeper 7
keib 19
keith 17
kellycstategov 10
kemp 6
kendal 8
kendrick 5
kenna 11
kennan 5
kenne

permrep 12
perri 11
persever 6
persian 12
persona 7
persorg 6
persorgb6 6
persuas 19
pertain 5
pertin 7
peruvian 5
pervaiz 8
pervas 17
pervers 11
pervez 8
peshawar 15
pessimist 15
pester 5
pet 6
pete 12
peterson 8
petit 18
petraeuss 19
petrochem 6
petroleo 10
petti 14
pew 14
pfizer 6
ph 12
pharma 8
pharmaceut 7
phenomenon 16
philadelphia 15
philanthrop 12
philanthropi 16
philanthropist 14
philli 7
philosoph 8
philosophi 20
phoni 6
picker 14
pickup 13
pie 17
pier 8
pierci 9
pierreloui 12
pig 10
pigott 10
pih 10
pile 15
pillag 5
pilot 20
pin 14
pinato 6
ping 6
pioneer 15
pipe 13
piraci 6
pirat 9
piss 5
pit 10
pitfal 18
pitt 5
pittsburgh 14
placard 6
placehold 6
placement 9
plagu 17
plainli 10
plaintiff 7
planner 16
plastic 11
plate 9
platitud 6
platt 6
plaudit 7
plausibl 15
playbook 5
plaza 13
plea 19
plenari 15
plight 7
plo 11
plouff 7
plowden 12
plug 20
plummet 5
plung 6
plural 20
pmelect 9
po 8
podium 15
poem 9
poet 8
poetri 8
pogo 5
pogrom 8
poignant 5
pointedli 6
pointless 13
points

sr 9
srap 20
sreebni 11
srsg 6
ssaudabayev 9
sspecialassistantsstategov 5
ssrap 8
ssrapstaffassist 6
sta 10
stab 11
stack 14
stadium 10
stagger 9
stagnant 5
stalbott 13
stalin 9
stalk 6
stamp 8
stan 17
standardbear 6
standbi 7
standoff 20
standpoint 10
stanford 17
stanton 16
stapl 5
stare 12
stark 20
starkli 6
starr 8
startl 9
startup 8
starv 11
stasi 6
stat 6
statecraft 19
stateforeign 7
stategov 17
statehood 13
statenew 6
staterun 8
statesman 17
statesmanship 7
statesmen 12
stateusaid 13
stateusaidmcc 9
statewid 9
static 7
stationeri 6
statut 8
statutori 5
staunch 8
stave 5
ste 5
steadfast 13
steadili 11
steal 13
stealth 9
steep 8
stein 6
steinbergjbstategov 20
stella 5
stepbystep 6
stephani 19
stereotyp 6
sterl 10
sterntdstategov 8
steroid 7
steward 5
stewardship 5
stf 6
stiff 10
stiffen 6
stifl 13
stiglitz 5
sting 6
stint 11
stipend 8
stipul 9
stj 5
stockholm 11
stockpil 5
stoer 15
stoke 11
stolen 6
stomach 7
stonebridg 7
storag 16
storey 5
storylin 9
stout 16
stove 6
straddl 7
str

In [18]:
for doc in range(1,len(freq)):
    for term in freq[doc]:
        tf = 1+math.log10(freq[doc][term])
        idf = math.log10((len(freq)-1)/posting_list[term][0])
        freq[doc][term] = tf*idf

In [19]:
length = [0]
for i in range(1,len(freq)):
    length.append(np.linalg.norm(np.array(list(freq[i].values()))))

In [14]:
postings_list_merge([posting_list['mail'],posting_list['phone'],posting_list['clinton']])

[1325, 1397]

In [26]:
# User Queries with TF-IDF
stemmer = PorterStemmer()
query = input('Enter your query: ')
while query is not '':
    start = time()
    tokens = query.split()  
    stemmed_tokens = [stemmer.stem(token.translate(str.maketrans('','',string.punctuation)).lower()) 
            for token in tokens
                if token.translate(str.maketrans('','',string.punctuation)) is not '']
    docs = postings_list_merge([posting_list[tok] for tok in stemmed_tokens])
    end = time()
    t1 = time()
    result = sorted(docs,key = lambda d: tfidf_score(set(stemmed_tokens),d,freq))
    t2 = time()
    print('Time taken to retrieve: '+str(end-start))
    print('Time taken to rank: '+str(t2-t1))
    print('Ranked list of files with query: '+str([str(res)+'.txt' for res in result]))
    print()
    query = input('Enter your query: ')

Enter your query: Nazi Hitler
Time taken to rank: 2.6941299438476562e-05
Ranked list of files with query: ['3560.txt', '6488.txt', '6489.txt', '3606.txt', '5806.txt', '6751.txt']
Time taken: 0.00020003318786621094

Enter your query: Nazi mastermind Hitler
Time taken to rank: 1.2159347534179688e-05
Ranked list of files with query: ['3606.txt']
Time taken: 0.0003368854522705078

Enter your query: 


In [29]:
# User Queries with Cosine Similarity
stemmer = PorterStemmer()
query = input('Enter your query: ')
while query is not '':
    start = time()
    tokens = query.split()  
    stemmed_tokens = [stemmer.stem(token.translate(str.maketrans('','',string.punctuation)).lower()) 
            for token in tokens
                if token.translate(str.maketrans('','',string.punctuation)) is not '']
    docs = postings_list_merge([posting_list[tok] for tok in stemmed_tokens])
    freq[0] = {}
    for tok in stemmed_tokens:
        if tok not in freq:
            freq[0][tok] = 0
        freq[0][tok] += 1
    for tok in stemmed_tokens:
        tf = 1+math.log10(freq[0][tok])
        idf = math.log10((len(freq)-1)/posting_list[tok][0])
        freq[0][tok] = tf*idf
    length[0] = np.linalg.norm(np.array(list(freq[0].values())))
    end = time()
    t1 = time()
    result = sorted(docs,key = lambda d: cosine_score(d,freq,length))
    t2 = time()
    print('Time taken to retrieve: '+str(end-start))
    print('Time taken to rank: '+str(t2-t1))
    print('Ranked list of files with query: '+str([str(res)+'.txt' for res in result]))
    print()
    query = input('Enter your query: ')

Enter your query: Clinton's email and phone
Time taken to rank: 0.0001800060272216797
Ranked list of files with query: ['5490.txt', '5288.txt', '5998.txt', '1397.txt', '5264.txt', '7541.txt', '4299.txt', '137.txt', '133.txt', '132.txt', '7907.txt', '126.txt', '32.txt', '6675.txt', '6030.txt', '6823.txt', '6674.txt', '6024.txt', '5300.txt', '3280.txt', '665.txt', '4335.txt', '2420.txt', '3911.txt', '2355.txt', '5991.txt', '3910.txt', '2353.txt', '1416.txt', '2536.txt', '5549.txt', '2118.txt', '2043.txt']
Time taken: 0.005117893218994141

Enter your query: Hillary Clinton email about politics and presidential election
Time taken to rank: 8.869171142578125e-05
Ranked list of files with query: ['5789.txt', '6258.txt', '6037.txt', '5998.txt', '3263.txt', '5098.txt', '2119.txt', '4299.txt', '3903.txt', '137.txt', '133.txt', '132.txt', '126.txt']
Time taken: 0.007580757141113281

Enter your query: 
