In [1]:
import numpy as np

import os
from os import listdir
from os.path import isfile, join

import glob
import string
from nltk.stem.porter import *

In [2]:
def directory_listing(dir_path):
    '''
    Input:  string (path to directory)
    Output: list of strings (full paths to files in the directory)
    '''
    
    return glob.glob(dir_path + '*.txt')

In [3]:
path = os.getcwd() +'/HillaryEmails/'
all_files = directory_listing(path)
all_files

['/home/tasnim/projects/temp/information-retrieval/HillaryEmails/133.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/483.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/5619.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/4262.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/7604.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/6811.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/1840.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/2566.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/1526.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/2123.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/7607.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/6397.txt',
 '/home/tasnim/projects/temp/information-retrieval/HillaryEmails/6014.txt',
 '/home/tasnim

In [4]:
def read_file(file_path):
    '''
    Input:  string (full path to file)
    Output: string/text (full contents of a file)
    '''
    
    lines = [line.rstrip('\n') for line in open(all_files[0])]
    return ' '.join(lines)

In [5]:
file_text = read_file(all_files[0])
file_text

'UNCLASSIFIED U.S. Department of State Case No. F-2015-04841 Doc No. C05739714 Date: 05/13/2015 STATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM. SUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER. RELEASE IN FULL From: Mills, Cheryl D <MillsCD©state.gova Sent Sunday, October 14, 2012 9:46 AM To: Subject: Re: Bloomberg: Libyan Ambassador\'s Death Not a Political Issue, Says Dad In car driving so can in a bit From: H [mallto:H0R22@clintonemail.corri] Sent: Sunday, October 14, 2012 09:39 AM To: Mills, Cheryl D Subject: Re: Bloomberg: Libyan Ambassador\'s Death Not a Political Issue, Says Dad Very nice. Can you talk? From: Mills, Cheryl D [mailto:MillsCD@state,gov] Sent: Sunday, October 14, 2012 09:00 AM To: H Subject Fw: Bloomberg: Libyan Ambassador\'s Death Not a Political Issue, Says Dad , From: Hensman, Chris D Sent: Sunday, October 14, 2012 08:19 AM To: Coordination Cc: Mills, Cheryl D; Reines, Philippe I Subject: Fw: Bloomberg: Libyan Ambassador\'s Death 

In [6]:
def tokenization(text, file_path):
    '''
    Input:  text(file contents), string (document id = path to file)
    Output: list of pairs < string(token) , string (document id) >
    '''
    doc_id = os.path.basename(file_path).replace('.txt','') #retrieve document id from file path
    tokens = text.split()
    return [(token, doc_id) for token in tokens]
    

In [7]:
token_pairs = tokenization(file_text, all_files[0])
print("length of pairs: ", len(token_pairs))
token_pairs

length of pairs:  1326


[('UNCLASSIFIED', '133'),
 ('U.S.', '133'),
 ('Department', '133'),
 ('of', '133'),
 ('State', '133'),
 ('Case', '133'),
 ('No.', '133'),
 ('F-2015-04841', '133'),
 ('Doc', '133'),
 ('No.', '133'),
 ('C05739714', '133'),
 ('Date:', '133'),
 ('05/13/2015', '133'),
 ('STATE', '133'),
 ('DEPT.', '133'),
 ('-', '133'),
 ('PRODUCED', '133'),
 ('TO', '133'),
 ('HOUSE', '133'),
 ('SELECT', '133'),
 ('BENGHAZI', '133'),
 ('COMM.', '133'),
 ('SUBJECT', '133'),
 ('TO', '133'),
 ('AGREEMENT', '133'),
 ('ON', '133'),
 ('SENSITIVE', '133'),
 ('INFORMATION', '133'),
 ('&', '133'),
 ('REDACTIONS.', '133'),
 ('NO', '133'),
 ('FOIA', '133'),
 ('WAIVER.', '133'),
 ('RELEASE', '133'),
 ('IN', '133'),
 ('FULL', '133'),
 ('From:', '133'),
 ('Mills,', '133'),
 ('Cheryl', '133'),
 ('D', '133'),
 ('<MillsCD©state.gova', '133'),
 ('Sent', '133'),
 ('Sunday,', '133'),
 ('October', '133'),
 ('14,', '133'),
 ('2012', '133'),
 ('9:46', '133'),
 ('AM', '133'),
 ('To:', '133'),
 ('Subject:', '133'),
 ('Re:', '133'),

In [8]:
def linguistic_modules(token_pairs):
    '''
    Input:  list of pairs < token , document id >
    Output: list of pairs < modified token , document id >
    
    modified token: removing all punctuation symbols (!@#$%^&*()-_=+’`~”:;/.,?[]{}<>),lowercasingand stemming.
    '''
    
    stemmer = PorterStemmer()  
    return [(stemmer.stem(token.translate(str.maketrans('','',string.punctuation)).lower()), doc_id)  
            for token, doc_id in token_pairs  
                if token.translate(str.maketrans('','',string.punctuation)) is not ''] #if statement to check empty token
    

In [9]:
modified_token_pairs = linguistic_modules(token_pairs)
print('length after modification: ', len(modified_token_pairs))
modified_token_pairs

length after modification:  1313


[('unclassifi', '133'),
 ('us', '133'),
 ('depart', '133'),
 ('of', '133'),
 ('state', '133'),
 ('case', '133'),
 ('no', '133'),
 ('f201504841', '133'),
 ('doc', '133'),
 ('no', '133'),
 ('c05739714', '133'),
 ('date', '133'),
 ('05132015', '133'),
 ('state', '133'),
 ('dept', '133'),
 ('produc', '133'),
 ('to', '133'),
 ('hous', '133'),
 ('select', '133'),
 ('benghazi', '133'),
 ('comm', '133'),
 ('subject', '133'),
 ('to', '133'),
 ('agreement', '133'),
 ('on', '133'),
 ('sensit', '133'),
 ('inform', '133'),
 ('redact', '133'),
 ('no', '133'),
 ('foia', '133'),
 ('waiver', '133'),
 ('releas', '133'),
 ('in', '133'),
 ('full', '133'),
 ('from', '133'),
 ('mill', '133'),
 ('cheryl', '133'),
 ('d', '133'),
 ('millscd©stategova', '133'),
 ('sent', '133'),
 ('sunday', '133'),
 ('octob', '133'),
 ('14', '133'),
 ('2012', '133'),
 ('946', '133'),
 ('am', '133'),
 ('to', '133'),
 ('subject', '133'),
 ('re', '133'),
 ('bloomberg', '133'),
 ('libyan', '133'),
 ('ambassador', '133'),
 ('death',

In [10]:
l = [('unclassifi', '133'),('us', '133'),('depart', '133'), ('!','12')]
linguistic_modules(l)

[('unclassifi', '133'), ('us', '133'), ('depart', '133')]

In [11]:
def sort_tokens(token_pairs):
    '''
    Input:  list of pairs < token , document id >
    Output: sorted list of pairs < token , document id >
    
    perform sorting of the token list: first by tokens (alphabetical order), and then by document ids 
    '''
    
    return sorted(token_pairs, key=lambda element: (element[0], element[1]))

In [12]:
sorted_modified_token_pairs = sort_tokens(modified_token_pairs)
sorted_modified_token_pairs

[('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('0758', '133'),
 ('0819', '133'),
 ('0900', '133'),
 ('0939', '133'),
 ('11', '133'),
 ('12', '133'),
 ('14', '133'),
 ('14', '133'),
 ('14', '133'),
 ('14', '133'),
 ('14', '133'),
 ('14', '133'),
 ('16member', '133'),
 ('2011', '133'),
 ('2012', '133'),
 ('2012', '133'),
 ('2012', '133'),
 ('2012', '133'),
 ('2012', '133'),
 ('2012', '133'),
 ('3', '133'),
 ('77', '133'),
 ('946', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('a', '133'),
 ('abhorr', '133'),
 (

In [13]:
path = os.getcwd() +'/HillaryEmails/'
all_files = directory_listing(path)
#all_files = all_files[:10]

all_token_pairs = []

for i in range(len(all_files)):
    file_text = read_file(all_files[i])
    token_pairs = tokenization(file_text, all_files[i])
    modified_token_pairs = linguistic_modules(token_pairs)
    all_token_pairs = all_token_pairs + modified_token_pairs
  

In [14]:
sorted_token_pairs = sort_tokens(all_token_pairs)
sorted_token_pairs

[('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1840'),
 ('05132015', '1840'),
 ('05132015', '1840'),
 ('05132015', '1840'),
 ('05132015', '1840'),
 ('05132015', '1840'),
 ('05132015', '2123'),
 ('05132015', '2123'),
 ('05132015', '2123'),
 ('05132015', '2123'),
 ('05132015', '2123'),
 ('05132015', '2123'),
 ('05132015', '2566'),
 ('05132015', '2566'),
 ('05132015', '2566'),
 ('05132015', '2566'),
 ('05132015', '2566'),
 ('05132015', '2566'),
 ('05132015', '4262'),
 ('05132015', '4262'),
 ('05132015', '4262'),
 ('05132015', '4262'),
 ('05132015', '4262'),
 ('05132015', '4262'),
 ('05132015', '483'),
 ('05132015', '483'),
 ('05132015', '483'),
 ('05132015', '483'),
 ('05132015', '483'),
 ('05132015', '483'),
 ('05132015', '5619'),
 ('05132015', '5619'),
