- This notebook reads all emails stored as CSV files under /content/drive/My Drive/Colab Notebooks/s_user_csv and cleans them.
- The processed data is saved and used in other notebooks.

**Set up the Google Colab environment**

In [None]:
from google.colab import drive
# Mount Google Drive under /content/drive. This will prompt for authorization.
# NOTE: treat the authorization code as a credential - paste it only into the
# prompt and never record it in the notebook source.
drive.mount('/content/drive')

!pip install PyDrive

from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive. The order matters: the credentials are fetched after the login.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): this rebinds `drive`, which until here referred to the
# google.colab `drive` module imported at the top of this cell; after this
# line `drive.mount(...)` is no longer reachable through this name.
drive = GoogleDrive(gauth)

**Import the libraries**

In [None]:
import pandas as pd
import os
import warnings
warnings.filterwarnings("ignore")
from joblib import dump, load
import nltk
nltk.download('stopwords')
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

**Google Drive access paths**

In [None]:
# Folder on the mounted Drive that holds the per-user email CSV files.
csv_path = "/content/drive/My Drive/Colab Notebooks/s_user_csv/"
# Folder on the mounted Drive where the joblib outputs are written.
metadata_path = "/content/drive/My Drive/Colab Notebooks/output/"
#metadata_ents_path = '/content/drive/My Drive/Colab Notebooks/metadata_ents'

**Select the content of emails of all users**

In [None]:
def read_all_emails(file_path):
    '''
    Go through all csv files in file_path and collect their 'content' column
    - Arg:
        @file_path (string): the location of csv files
    - Return:
        a list of string (list): the non-empty email contents, files being
        visited in sorted name order
    '''

    list_content = []
    # os.listdir returns names in arbitrary order; sorting keeps the corpus
    # order reproducible from one run to the next
    for f_csv in sorted(os.listdir(file_path)):
        # same substring filter as before: any name containing '.csv' is read
        if '.csv' not in f_csv:
            continue
        # os.path.join copes with file_path given with or without a trailing '/'
        data = pd.read_csv(os.path.join(file_path, f_csv), encoding="ISO-8859-1")
        for content in data['content'].values:
            # pandas reads empty cells as NaN, which is truthy, so a plain
            # truth test lets them through; keep real, non-empty strings only
            if isinstance(content, str) and content:
                list_content.append(content)
    return list_content

# Gather the content of every email found in the csv files under csv_path
list_content = read_all_emails(csv_path)
# Persist the raw corpus as a joblib file inside metadata_path
dump(list_content, f'{metadata_path}list_content.joblib')

- Clean the mail content:
  - Remove the punctuation
  - Remove non-alphanumeric characters
  - Lemmatize the content
  - Remove the stop words
  - Replace the English contractions

In [None]:
# Lookup table: lower-case English contraction -> expanded form.
# clean_data consults it for every token that survives stop-word removal.
contractions = {
    "ain't": "am not",
    "aren't": "are not",
    "can't": "cannot",
    "can't've": "cannot have",
    "'cause": "because",
    "could've": "could have",
    "couldn't": "could not",
    "couldn't've": "could not have",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hadn't've": "had not have",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'd've": "he would have",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "i'd": "i would",
    "i'll": "i will",
    "i'm": "i am",
    "i've": "i have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "mayn't": "may not",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "oughtn't": "ought not",
    "shan't": "shall not",
    "sha'n't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
}

# Start from the English stop words provided by the NLTK stopwords corpus.
en_stop = stopwords.words('english')

# Extra stop words; this list was updated while tuning the hyperparameters
# of the model. A few entries appear twice and are kept as they were.
stoplist = [
    'j', 'one', 're', 'thank', 'thanks', 'enron', 'http',
    'the', 'email', 'to', 'send', 'pm', 'subject', 'please',
    'thanks', 'a', 'cc', 'bcc', 'i', 'from', 'we', 'r', 's', 'make',
    'want', 'forward', 'would', 'u', 'be', 'could', 'this', 'nt', 'say',
    'rb', 'o', 'wr', 'tx', 'with', 'fyi', 'bc', 'he', 'web', 'click',
    'font', 'br', 'net', 'images', 'gif', 'com', 'href', 'text', 'jpg',
    'script', 'clear', 'td', 'size', 'tr', 'face', 'align',
    'align', 'class', 'color', 'pt', 'border', 'com', 'www', 'htm', 'html',
    'width', 'link', 'go', 'pdf', 'news',
]

# Concatenate into a new list rather than extending the NLTK list in place.
en_stop = en_stop + stoplist

def clean_data(list_text):
    '''
    Tokenize every text, drop the stop words and expand known contractions
    - Arg:
        @list_text (list): the raw texts, one entry per email
    - Return:
        a list of token lists (list); a text left without any token after
        filtering is dropped from the result
    '''

    # NOTE(review): simple_preprocess is not imported in the visible cells;
    # presumably gensim.utils.simple_preprocess - confirm and add the import.
    list_content_processed = [list(simple_preprocess(text)) for text in list_text]

    # en_stop is a list; build a set once so each token test is a hash lookup
    stop_words = set(en_stop)

    list_content_clean = []
    for sentence in list_content_processed:
        list_sent = []
        # iterate each token of the current text
        for word in sentence:
            if word in stop_words:
                continue
            # A contraction is replaced by its expansion, appended as ONE
            # token even when the expansion has several words (e.g. "do not").
            # NOTE(review): if the tokenizer splits on apostrophes, no token
            # can equal a contraction key and this lookup never fires;
            # expanding contractions on the raw text before tokenizing may be
            # what was intended - confirm.
            list_sent.append(contractions.get(word, word))

        if list_sent:
            list_content_clean.append(list_sent)
    return list_content_clean

# Run the cleaning step over the whole raw corpus
list_content_clean = clean_data(list_content)

# Persist the cleaned corpus as a joblib file inside metadata_path
dump(list_content_clean, f'{metadata_path}list_content_clean.joblib')