In [1]:
import numpy as np

import time
import os
from os import listdir
from os.path import isfile, join

import glob
import string
from nltk.stem.porter import *

In [2]:
def directory_listing(dir_path):
    '''
    List the .txt files stored directly inside a directory.

    Input:  string (path to directory, with or without a trailing separator)
    Output: list of strings (paths to the .txt files in the directory,
            sorted lexicographically so the listing is the same on every run)
    '''

    # os.path.join adds the separator only when it is missing; plain string
    # concatenation built the wrong pattern for a path without a trailing '/'.
    pattern = os.path.join(dir_path, '*.txt')
    # glob returns matches in arbitrary order, so sort for reproducibility.
    return sorted(glob.glob(pattern))

In [31]:
# Build the corpus directory path relative to the current working directory
# and list the .txt files found in it.
path = os.getcwd() +'/HillaryEmails/'
all_files = directory_listing(path)
# NOTE(review): the recorded output of this cell is an empty list, i.e. no
# files were matched when it was last run -- confirm the working directory.
print(all_files)
#all_files

[]


In [3]:
def read_file(file_path, encoding=None):
    '''
    Read a text file and flatten it into a single line.

    Input:  string (full path to file)
            encoding (optional): text encoding handed to open(); the default
            None keeps the platform default encoding
    Output: string/text (full contents of the file, with the trailing newline
            of every line removed and the lines joined by single spaces)
    '''

    # The context manager closes the handle even if reading fails; the bare
    # open() call used previously left one open file per document.
    with open(file_path, encoding=encoding) as file_handle:
        lines = [line.rstrip('\n') for line in file_handle]
    return ' '.join(lines)

In [62]:
# Smoke test: load the first listed file and display its flattened text.
# Indexing all_files[0] raises IndexError when the listing is empty.
file_text = read_file(all_files[0])
file_text

'UNCLASSIFIED U.S. Department of State Case No. F-2015-04841 Doc No. C05739714 Date: 05/13/2015 STATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM. SUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER. RELEASE IN FULL From: Mills, Cheryl D <MillsCD©state.gova Sent Sunday, October 14, 2012 9:46 AM To: Subject: Re: Bloomberg: Libyan Ambassador\'s Death Not a Political Issue, Says Dad In car driving so can in a bit From: H [mallto:H0R22@clintonemail.corri] Sent: Sunday, October 14, 2012 09:39 AM To: Mills, Cheryl D Subject: Re: Bloomberg: Libyan Ambassador\'s Death Not a Political Issue, Says Dad Very nice. Can you talk? From: Mills, Cheryl D [mailto:MillsCD@state,gov] Sent: Sunday, October 14, 2012 09:00 AM To: H Subject Fw: Bloomberg: Libyan Ambassador\'s Death Not a Political Issue, Says Dad , From: Hensman, Chris D Sent: Sunday, October 14, 2012 08:19 AM To: Coordination Cc: Mills, Cheryl D; Reines, Philippe I Subject: Fw: Bloomberg: Libyan Ambassador\'s Death 

In [4]:
def tokenization(text, file_path):
    '''
    Split a document into whitespace-separated tokens tagged with its id.

    Input:  text (file contents), string (path to the file; the base name
            without its extension must be an integer, e.g. ".../1234.txt")
    Output: list of pairs < string (token) , int (document id) >

    Raises ValueError when the file name (without extension) is not numeric.
    '''
    # Strip only the extension: str.replace('.txt', '') removed every
    # occurrence of '.txt' in the name and missed other spellings like '.TXT'.
    file_stem, _extension = os.path.splitext(os.path.basename(file_path))
    doc_id = int(file_stem)
    # split() without arguments splits on any run of whitespace and never
    # produces empty tokens.
    return [(token, doc_id) for token in text.split()]
    

In [15]:
# Tokenize the single sample document and report how many pairs it yields.
# NOTE(review): this cell needs file_text from the sample-file cell; the
# recorded NameError suggests it was last run in a kernel where that cell had
# not been executed -- re-run the notebook top to bottom to confirm.
token_pairs = tokenization(file_text, all_files[0])
print("length of pairs: ", len(token_pairs))
#token_pairs

NameError: name 'file_text' is not defined

In [5]:
def linguistic_modules(token_pairs):
    '''
    Normalize tokens before they are indexed.

    Input:  list of pairs < token , document id >
    Output: list of pairs < modified token , document id >

    modified token: the token with all ASCII punctuation symbols removed
    (the characters in string.punctuation), then lowercased and stemmed with
    the Porter stemmer. Typographic quotes such as ’ and ” are not part of
    string.punctuation and are therefore left in place.
    Tokens that consist only of punctuation are dropped from the output.
    '''

    stemmer = PorterStemmer()
    # Build the deletion table once instead of twice per token.
    strip_punctuation = str.maketrans('', '', string.punctuation)

    modified_pairs = []
    for token, doc_id in token_pairs:
        cleaned = token.translate(strip_punctuation)
        # Truthiness check instead of `is not ''`: identity comparison with a
        # literal depends on string interning and warns on Python 3.8+.
        if cleaned:
            modified_pairs.append((stemmer.stem(cleaned.lower()), doc_id))
    return modified_pairs
    

In [66]:
# Normalize the sample document's tokens and report how many remain after
# punctuation-only tokens have been dropped.
modified_token_pairs = linguistic_modules(token_pairs)
print('length after modification: ', len(modified_token_pairs))
#modified_token_pairs

length after modification:  1313


In [67]:
# Ad-hoc check on hand-written pairs: the punctuation-only token '!' is
# expected to be dropped from the result (see the recorded output).
# Document ids are strings here, unlike the ints produced by tokenization.
l = [('unclassifi', '133'),('us', '133'),('depart', '133'), ('!','12')]
linguistic_modules(l)

[('unclassifi', '133'), ('us', '133'), ('depart', '133')]

In [6]:
def sort_tokens(token_pairs):
    '''
    Order the token pairs for index construction.

    Input:  list of pairs < token , document id >
    Output: new sorted list of pairs < token , document id >

    Pairs are ordered alphabetically by token; pairs sharing a token are
    ordered by document id. The input list is left unmodified.
    '''

    def token_then_doc_id(pair):
        # Primary key: the token; secondary key: the document id.
        return pair[0], pair[1]

    ordered_pairs = list(token_pairs)
    ordered_pairs.sort(key=token_then_doc_id)
    return ordered_pairs

In [69]:
# Sort the sample document's normalized pairs by token, then by document id.
sorted_modified_token_pairs = sort_tokens(modified_token_pairs)
#sorted_modified_token_pairs

In [7]:
def transformation_into_postings(sorted_token_pairs):
    '''
    Build the inverted index from the token pairs.

    Input: sorted list of pairs < token , document id >
    Output: inverted index, a dictionary (hash table) mapping each token to a
            pair < number of documents , sorted list of unique document ids >

    Dictionary keys appear in the order in which tokens are first seen in
    the input.
    '''

    # First pass: gather every document id seen for each token.
    doc_ids_by_token = {}
    for token, doc_id in sorted_token_pairs:
        if token not in doc_ids_by_token:
            doc_ids_by_token[token] = []
        doc_ids_by_token[token].append(doc_id)

    # Second pass: deduplicate, sort, and attach the document frequency.
    inverted_index = {}
    for token, doc_ids in doc_ids_by_token.items():
        unique_doc_ids = sorted(set(doc_ids))
        inverted_index[token] = (len(unique_doc_ids), unique_doc_ids)
    return inverted_index

In [71]:
# Build the inverted index for the single sample document.
# Note: posting_list is reassigned later by the cell that indexes all files.
posting_list = transformation_into_postings(sorted_modified_token_pairs)
#posting_list

In [72]:
# Scratch experiment: deduplicating through set() does not keep the ids in
# ascending order (see the recorded output), so a postings list has to be
# sorted explicitly after deduplication.
a = [483, 483, 1526, 1526, 1840, 1840, 1840, 1840]
list(set(a))

[1840, 483, 1526]

In [73]:
# Scratch experiment: dictionary comprehension that removes duplicate ids
# from every value list while keeping the keys unchanged.
d = {'a': [1,1,2], 'c': [3], 'b': [2,3,2]}
{key:list(set(value)) for (key, value) in d.items()}


{'a': [1, 2], 'c': [3], 'b': [2, 3]}

In [74]:
# Scratch experiment with dictionary comprehensions; not used by the
# indexing cells visible in this notebook.
dict1 = {'a': 1, 'b': 2, 'c': 3, 'd': 4, 'e': 5}
# Double each value in the dictionary
double_dict1 = {k:v*2 for (k,v) in dict1.items()}
print(double_dict1)

{'a': 2, 'b': 4, 'c': 6, 'd': 8, 'e': 10}


In [11]:
def postings_list_merge(postings_lists):
    '''
    Intersect several postings lists (conjunctive / AND query).

    Input: list of postings lists, each a pair
           < length , sorted list of document ids > as stored in the index
    Output: merged postings list (sorted list of the document ids that occur
            in every input list); a new list is returned and the inputs are
            not modified

    The lists are intersected in increasing order of their stated length so
    that the intermediate result stays as small as possible.
    An empty input yields an empty list, and a single postings list yields a
    copy of its document ids.
    '''

    # Nothing to intersect; previously this raised IndexError on pop().
    if not postings_lists:
        return []

    ordered_lists = sorted(postings_lists, key=lambda posting: posting[0])
    # Start from the shortest list. Copying also covers the single-list case,
    # which previously failed because the result variable was never assigned.
    merged_list = list(ordered_lists[0][1])

    for _, other_list in ordered_lists[1:]:
        # An empty intersection cannot grow again, so stop early.
        if not merged_list:
            break
        intersection = []
        p1 = 0
        p2 = 0
        # Classic two-pointer walk over two sorted lists; bounds use the real
        # list lengths rather than the stored length field.
        while p1 < len(merged_list) and p2 < len(other_list):
            if merged_list[p1] == other_list[p2]:
                intersection.append(merged_list[p1])
                p1 += 1
                p2 += 1
            elif merged_list[p1] < other_list[p2]:
                p1 += 1
            else:
                p2 += 1
        merged_list = intersection
    return merged_list

In [9]:
# MAIN
# Collect the normalized (token, document id) pairs of every file in the
# corpus directory into all_token_pairs.
#start = time.time()

path = os.getcwd() +'/HillaryEmails/'
all_files = directory_listing(path)

all_token_pairs = []

for file_path in all_files:
    file_text = read_file(file_path)
    token_pairs = tokenization(file_text, file_path)
    modified_token_pairs = linguistic_modules(token_pairs)
    # extend() appends in place; rebuilding the list with `+` on every
    # iteration copied all previously collected pairs each time.
    all_token_pairs.extend(modified_token_pairs)

#end = time.time()
#print(end - start)

In [10]:
# Sort the pairs collected from all files and build the corpus-wide inverted
# index. This rebinds posting_list, replacing the single-document index.
#start = time.time()

sorted_token_pairs = sort_tokens(all_token_pairs)
posting_list = transformation_into_postings(sorted_token_pairs)
# NOTE(review): displaying the whole index produces a very long output (see
# the recorded output); consider showing only a small sample.
posting_list


#end = time.time()
#print(end - start)

{'0': (194,
  [15,
   17,
   28,
   32,
   44,
   45,
   50,
   76,
   96,
   111,
   126,
   129,
   130,
   137,
   145,
   174,
   188,
   192,
   193,
   200,
   204,
   217,
   222,
   225,
   228,
   230,
   233,
   238,
   262,
   266,
   284,
   287,
   288,
   372,
   399,
   587,
   589,
   610,
   725,
   744,
   747,
   756,
   769,
   778,
   779,
   827,
   891,
   909,
   1189,
   1285,
   1375,
   1574,
   1579,
   1613,
   1614,
   1693,
   1831,
   1839,
   1923,
   1974,
   2035,
   2047,
   2076,
   2107,
   2114,
   2226,
   2228,
   2341,
   2344,
   2348,
   2409,
   2425,
   2584,
   2590,
   2640,
   2642,
   2684,
   2697,
   2949,
   2955,
   3028,
   3120,
   3276,
   3280,
   3281,
   3307,
   3319,
   3336,
   3341,
   3357,
   3376,
   3388,
   3696,
   3734,
   3751,
   3769,
   3774,
   3790,
   3799,
   3839,
   3878,
   3882,
   3885,
   3952,
   3965,
   3973,
   3987,
   4148,
   4161,
   4226,
   4265,
   4283,
   4349,
   4356,
   4359,
   4360,
 

In [14]:
# Example conjunctive query: ids of the documents containing all three terms.
# The keys must be in the normalized form stored in the index; a term that is
# missing from posting_list raises KeyError.
postings_list_merge([posting_list['mail'],posting_list['phone'],posting_list['clinton']])

[1325, 1397]