In [1]:
import numpy as np

import os
from os import listdir
from os.path import isfile, join

import glob
import string
from nltk.stem.porter import *

In [2]:
def directory_listing(dir_path):
    '''
    Input:  string (path to directory)
    Output: list of strings (full paths to files in the directory)
    '''
    
    return glob.glob(dir_path + '*.txt')

In [3]:
path = os.getcwd() +'/HillaryEmails/'
all_files = directory_listing(path)
#all_files

In [4]:
def read_file(file_path):
    '''
    Input:  string (full path to file)
    Output: string/text (full contents of a file)
    '''
    
    lines = [line.rstrip('\n') for line in open(all_files[0])]
    return ' '.join(lines)

In [5]:
file_text = read_file(all_files[0])
file_text

'UNCLASSIFIED U.S. Department of State Case No. F-2015-04841 Doc No. C05739714 Date: 05/13/2015 STATE DEPT. - PRODUCED TO HOUSE SELECT BENGHAZI COMM. SUBJECT TO AGREEMENT ON SENSITIVE INFORMATION & REDACTIONS. NO FOIA WAIVER. RELEASE IN FULL From: Mills, Cheryl D <MillsCD©state.gova Sent Sunday, October 14, 2012 9:46 AM To: Subject: Re: Bloomberg: Libyan Ambassador\'s Death Not a Political Issue, Says Dad In car driving so can in a bit From: H [mallto:H0R22@clintonemail.corri] Sent: Sunday, October 14, 2012 09:39 AM To: Mills, Cheryl D Subject: Re: Bloomberg: Libyan Ambassador\'s Death Not a Political Issue, Says Dad Very nice. Can you talk? From: Mills, Cheryl D [mailto:MillsCD@state,gov] Sent: Sunday, October 14, 2012 09:00 AM To: H Subject Fw: Bloomberg: Libyan Ambassador\'s Death Not a Political Issue, Says Dad , From: Hensman, Chris D Sent: Sunday, October 14, 2012 08:19 AM To: Coordination Cc: Mills, Cheryl D; Reines, Philippe I Subject: Fw: Bloomberg: Libyan Ambassador\'s Death 

In [6]:
def tokenization(text, file_path):
    '''
    Input:  text(file contents), string (document id = path to file)
    Output: list of pairs < string(token) , string (document id) >
    '''
    doc_id = os.path.basename(file_path).replace('.txt','') #retrieve document id from file path
    tokens = text.split()
    return [(token, doc_id) for token in tokens]
    

In [7]:
token_pairs = tokenization(file_text, all_files[0])
print("length of pairs: ", len(token_pairs))
#token_pairs

length of pairs:  1326


In [8]:
def linguistic_modules(token_pairs):
    '''
    Input:  list of pairs < token , document id >
    Output: list of pairs < modified token , document id >
    
    modified token: removing all punctuation symbols (!@#$%^&*()-_=+’`~”:;/.,?[]{}<>),lowercasingand stemming.
    '''
    
    stemmer = PorterStemmer()  
    return [(stemmer.stem(token.translate(str.maketrans('','',string.punctuation)).lower()), doc_id)  
            for token, doc_id in token_pairs  
                if token.translate(str.maketrans('','',string.punctuation)) is not ''] #if statement to check empty token
    

In [9]:
modified_token_pairs = linguistic_modules(token_pairs)
print('length after modification: ', len(modified_token_pairs))
#modified_token_pairs

length after modification:  1313


In [10]:
l = [('unclassifi', '133'),('us', '133'),('depart', '133'), ('!','12')]
linguistic_modules(l)

[('unclassifi', '133'), ('us', '133'), ('depart', '133')]

In [11]:
def sort_tokens(token_pairs):
    '''
    Input:  list of pairs < token , document id >
    Output: sorted list of pairs < token , document id >
    
    perform sorting of the token list: first by tokens (alphabetical order), and then by document ids 
    '''
    
    return sorted(token_pairs, key=lambda element: (element[0], element[1]))

In [12]:
sorted_modified_token_pairs = sort_tokens(modified_token_pairs)
#sorted_modified_token_pairs

In [14]:
# MAIN

path = os.getcwd() +'/HillaryEmails/'
all_files = directory_listing(path)
all_files = all_files[:100]

all_token_pairs = []

for i in range(len(all_files)):
    file_text = read_file(all_files[i])
    token_pairs = tokenization(file_text, all_files[i])
    modified_token_pairs = linguistic_modules(token_pairs)
    all_token_pairs = all_token_pairs + modified_token_pairs
    
sorted_token_pairs = sort_tokens(all_token_pairs)
sorted_token_pairs  

[('05132015', '1059'),
 ('05132015', '1059'),
 ('05132015', '1059'),
 ('05132015', '1059'),
 ('05132015', '1059'),
 ('05132015', '1059'),
 ('05132015', '106'),
 ('05132015', '106'),
 ('05132015', '106'),
 ('05132015', '106'),
 ('05132015', '106'),
 ('05132015', '106'),
 ('05132015', '1164'),
 ('05132015', '1164'),
 ('05132015', '1164'),
 ('05132015', '1164'),
 ('05132015', '1164'),
 ('05132015', '1164'),
 ('05132015', '1218'),
 ('05132015', '1218'),
 ('05132015', '1218'),
 ('05132015', '1218'),
 ('05132015', '1218'),
 ('05132015', '1218'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '133'),
 ('05132015', '1432'),
 ('05132015', '1432'),
 ('05132015', '1432'),
 ('05132015', '1432'),
 ('05132015', '1432'),
 ('05132015', '1432'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1526'),
 ('05132015', '1792'),
 ('05132015', '1792'),
