In [1]:
import pandas as pd
import email
import tensorflow as tf

from utils import *
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from string import punctuation
from dateutil import parser

<h3> Importing and basic preparation

In [2]:
# Load the semicolon-delimited dataset. The first CSV column is read as the
# index and then thrown away in favour of a fresh positional index.
db = pd.read_csv('giskard_dataset.csv', sep=';', index_col=0).reset_index(drop=True)
db

Unnamed: 0,Target,Message
0,internal company policy,Message-ID: <32715630.1075840547840.JavaMail.e...
1,alliances / partnerships,Message-ID: <8721012.1075863428380.JavaMail.ev...
2,internal company operations,Message-ID: <8687721.1075852656109.JavaMail.ev...
3,internal company operations,Message-ID: <16562450.1075846168630.JavaMail.e...
4,internal projects -- progress and strategy,Message-ID: <955111.1075858690252.JavaMail.eva...
...,...,...
874,internal company operations,Message-ID: <18983060.1075847582386.JavaMail.e...
875,alliances / partnerships,Message-ID: <29879754.1075863427653.JavaMail.e...
876,talking points,Message-ID: <3001077.1075863428054.JavaMail.ev...
877,internal projects -- progress and strategy,Message-ID: <13669071.1075863428696.JavaMail.e...


In [3]:
# Tokens to discard: NLTK's English stop words plus every character in
# string.punctuation.
stop_words = {*stopwords.words('english'), *punctuation}
# Single lemmatizer instance shared by the text-cleaning cells below.
lemmatizer = WordNetLemmatizer()

In [4]:
# Class distribution of the classification labels (the `Target` column).
db['Target'].value_counts()

regulations and regulators (includes price caps)    184
california energy crisis / california politics      149
internal projects -- progress and strategy          109
internal company operations                          78
political influence / contributions / contacts       76
internal company policy                              67
 company image -- current                            59
legal advice                                         47
alliances / partnerships                             43
company image -- changing / influencing              37
meeting minutes                                      13
talking points                                       13
trip reports                                          4
Name: Target, dtype: int64

In [5]:
# Parse every raw message string into an email Message object, then pull the
# Subject header, the payload and the parsed Date header into their own columns.
parsed = db['Message'].apply(email.message_from_string)
db['Message_usable'] = parsed
db['Subject'] = parsed.apply(lambda msg: msg['Subject'])
db['Body'] = parsed.apply(lambda msg: msg.get_payload())
db['Date'] = parsed.apply(lambda msg: parser.parse(msg['Date']))

# Calendar features derived from the parsed date, all stored as strings:
# four-digit year, full month name, full weekday name.
sent_at = db['Date']
db['Year'] = sent_at.apply(lambda moment: moment.strftime('%Y'))
db['Month'] = sent_at.apply(lambda moment: moment.strftime('%B'))
db['Day'] = sent_at.apply(lambda moment: moment.strftime('%A'))

In [6]:
# Work on a copy so the parsed frame `db` stays untouched (deep copy is the
# pandas default for DataFrame.copy).
db_copy = db.copy()

In [7]:
# The raw message text, the parsed Message object and the full timestamp are
# no longer needed once Subject/Body/Year/Month/Day have been extracted.
columns_to_discard = ['Message', 'Message_usable', 'Date']
db_copy = db_copy.drop(columns=columns_to_discard)
db_copy

Unnamed: 0,Target,Subject,Body,Year,Month,Day
0,internal company policy,Confidential Information and Securities Trading,"To:GILBERT-SMITH, DOUGLAS\nEmail:doug.gilbert-...",2001,July,Wednesday
1,alliances / partnerships,"FW: Venezuela FX and inflation forecasts, June...",\n\n -----Original Message-----\nFrom: \tKoepk...,2001,June,Tuesday
2,internal company operations,Enron Expatriates in India,"John Brindle, David Cromley and others in the ...",2001,June,Wednesday
3,internal company operations,Re: PLEASE READ & RESPOND,Yes. Contact either Gia or John and get the b...,2000,August,Monday
4,internal projects -- progress and strategy,FW: ISO's Response to BPA Rebuttal of Sheffrin...,\nThis is something interesting to put in your...,2001,June,Saturday
...,...,...,...,...,...,...
874,internal company operations,Message from Clark C. Smith,include on the distribution list and send out ...,2001,May,Sunday
875,alliances / partnerships,RE:,"Aruna,\n\nI shall be in London this week. Plea...",2001,June,Monday
876,talking points,RE: Marketing Plan & Meetings,"Sandeep,\n\nThis looks great. I would talk to ...",2001,June,Tuesday
877,internal projects -- progress and strategy,RE: Test Message,"Mike,\n\nThanks for your message. The meeting ...",2001,June,Wednesday


<h3> Cleaning Data

<h4> Subject Column

In [8]:
def list_lemmatizer(list_to_lem,lemmatizer):
    """Lemmatize every token of ``list_to_lem`` as a verb.

    Args:
        list_to_lem: iterable of string tokens.
        lemmatizer: object exposing ``lemmatize(token, pos)``; it is called
            with ``pos='v'`` for each token.

    Returns:
        A new list holding the lemmatized tokens in their original order.
    """
    lemmas = []
    for word in list_to_lem:
        lemmas.append(lemmatizer.lemmatize(word, 'v'))
    return lemmas
# Fetch the 'omw-1.4' NLTK data package (reported as already up-to-date when
# it is present locally).
# NOTE(review): the stop-word list and WordNetLemmatizer used in this notebook
# presumably also need the 'stopwords' and 'wordnet' packages -- confirm they
# are installed on a fresh machine.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/efkanturedi/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [9]:
def _clean_subject(raw_subject):
    """Tokenize, filter, verb-lemmatize and re-join one subject line."""
    # Note that stopwords checker also removes tokens with len strictly less than 4
    kept_tokens = stop_words_check(tokenizer(raw_subject), stop_words)
    return " ".join(list_lemmatizer(kept_tokens, lemmatizer))

db_copy['Subject'] = db_copy['Subject'].apply(_clean_subject)
# Subjects left empty after cleaning are replaced by the literal string 'None'.
db_copy['Subject'] = db_copy['Subject'].replace('','None')

<h4> Body Column

In [10]:
def _clean_body(raw_body):
    """Tokenize, filter, verb-lemmatize and re-join one message body."""
    # Note that stopwords checker also removes short tokens.
    # NOTE(review): this cell originally said "len strictly less than 3" while
    # the Subject cell says "less than 4"; the displayed output drops 3-letter
    # tokens (e.g. "Gia"), which suggests < 4 -- confirm in utils.stop_words_check.
    kept_tokens = stop_words_check(tokenizer(raw_body), stop_words)
    return " ".join(list_lemmatizer(kept_tokens, lemmatizer))

db_copy['Body'] = db_copy['Body'].apply(_clean_body)

In [11]:
# Inspect the frame after the Subject and Body columns have been cleaned.
db_copy

Unnamed: 0,Target,Subject,Body,Year,Month,Day
0,internal company policy,confidential information securities trade,gilbert smith douglas email doug gilbert smith...,2001,July,Wednesday
1,alliances / partnerships,venezuela inflation forecast june 2001,original message koepke gwyn send tuesday june...,2001,June,Tuesday
2,internal company operations,enron expatriate india,john brindle david cromley others corporate bu...,2001,June,Wednesday
3,internal company operations,please read respond,contact either john ball roll thank michael te...,2000,August,Monday
4,internal projects -- progress and strategy,iso's response rebuttal sheffrin study confide...,something interest back pocket original messag...,2001,June,Saturday
...,...,...,...,...,...,...
874,internal company operations,message clark smith,include distribution list send note monday mee...,2001,May,Sunday
875,alliances / partnerships,,aruna shall london week please call monday nex...,2001,June,Monday
876,talking points,market plan meet,sandeep look great would talk head structure d...,2001,June,Tuesday
877,internal projects -- progress and strategy,test message,mike thank message meet tuesday confirm vince ...,2001,June,Wednesday


In [12]:
# Frequency of each cleaned subject line; 'None' is the placeholder written
# for subjects that were empty after cleaning.
db_copy['Subject'].value_counts()

None                                                                                                                                                      51
energy issue                                                                                                                                              46
confidential information securities trade                                                                                                                 17
enron mention                                                                                                                                             13
western wholesale activities power conf call privilege confidential communication attorney client communication attorney work product privilege assert     8
                                                                                                                                                          ..
lay's meet                                                

<h3> Preparing training and test set

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Label vector: the message category to predict.
y = db_copy['Target']
# Feature matrix: every remaining column. The column is named with the
# `columns=` keyword because passing the axis positionally (`drop('Target', 1)`)
# emits a FutureWarning on pandas 1.x and is a TypeError on pandas >= 2.0.
X = db_copy.drop(columns='Target')

  X = db_copy.drop('Target',1)


In [15]:
# Hold out 20% of the rows for evaluation. The split is stratified on the
# label and uses a fixed seed so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [16]:
# Write the held-out features and labels to CSV files in the working directory.
for held_out, destination in ((X_test, 'X_test'), (y_test, 'y_test')):
    held_out.to_csv(destination)

<h3> Training the model

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score

In [18]:
# Calendar-derived columns that are one-hot encoded.
categorical_features = [
  'Year',
  'Month',
  'Day'
]

# NOTE: `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and the old
# name was later removed; switch the keyword when upgrading scikit-learn.
categorical_transformer = Pipeline([
  ('binarizer',OneHotEncoder(
    sparse=False,
    # Encode a Year/Month/Day value that was not present at fit time as an
    # all-zero row instead of raising, so the exported pipeline can score
    # messages from unseen years/months/days.
    handle_unknown='ignore'
  ))
])

In [19]:
# Free-text columns; each one is vectorized on its own with TF-IDF.
core_text_features = ['Subject', 'Body']

# Single-step pipeline wrapping a default-configured TfidfVectorizer.
tfidf_step = ('vectorizer', TfidfVectorizer())
core_text_transformer = Pipeline([tfidf_step])

In [20]:
# Route each column group to its transformer. The text columns are selected
# with a single column name (a string, not a list) so the vectorizer is handed
# one column of documents.
subject_column, body_column = core_text_features
transformers = [
    ('categorical', categorical_transformer, categorical_features),
    ('subject_vect', core_text_transformer, subject_column),
    ('body_vect', core_text_transformer, body_column),
]

preprocessing_pipe = ColumnTransformer(transformers)

Now we are going to try three models: Random Forest, Logistic Regression, and a Support Vector Classifier (SVC).

In [21]:
# Three candidate classifiers, each placed behind the same preprocessing step.
# NOTE(review): all three pipelines reference the same `preprocessing_pipe`
# object, so each fit re-fits that shared transformer. This is harmless while
# every pipeline is fitted on the same training data, but consider cloning it
# per pipeline if that ever changes.

# Random forest: seeded so the reported F1 score is reproducible, matching the
# seed used for the train/test split.
classifier_pipe_rfr = Pipeline(steps=[
  ('preprocessing',preprocessing_pipe),
  ('classifier_rfr', RandomForestClassifier(
    n_estimators=500,
    class_weight='balanced',
    n_jobs=-1,
    random_state=42,
  ))
])

# One-vs-rest, L2-regularised logistic regression with balanced class weights.
classifier_pipe_logit = Pipeline(steps=[
  ('preprocessing',preprocessing_pipe),
  ('classifier_logit', LogisticRegression(
    class_weight='balanced',
    n_jobs=-1,
    multi_class='ovr',
    penalty='l2',
    solver='lbfgs'
  ))
])

# RBF-kernel support vector classifier with balanced class weights.
# NOTE(review): the step is named 'classifier_rfr' although it wraps an SVC
# (looks like a copy-paste leftover); the name is kept so existing references
# to the step keep working.
classifier_pipe_svc = Pipeline(steps=[
  ('preprocessing',preprocessing_pipe),
  ('classifier_rfr', SVC(
    class_weight='balanced',
    kernel='rbf'
  ))
])


In [22]:
# Fit every candidate pipeline on the training split, in the same order as
# they were defined.
for candidate in (classifier_pipe_rfr, classifier_pipe_logit, classifier_pipe_svc):
    candidate.fit(X_train, y_train)

In [23]:
# Weighted-average F1 of the random-forest pipeline on the held-out test split.
f1_score(y_test,classifier_pipe_rfr.predict(X_test),average='weighted')

0.359159466624798

In [24]:
# Weighted-average F1 of the logistic-regression pipeline on the held-out test split.
f1_score(y_test,classifier_pipe_logit.predict(X_test),average='weighted')

0.3699898094969736

In [25]:
# Weighted-average F1 of the SVC pipeline on the held-out test split.
f1_score(y_test,classifier_pipe_svc.predict(X_test),average='weighted')

0.27545905034204277

<h4> Exporting Pipeline

In [26]:
import joblib

In [27]:
# Persist the fitted logistic-regression pipeline (preprocessing + classifier)
# to 'pipeline.joblib' in the working directory.
joblib.dump(classifier_pipe_logit,'pipeline.joblib')

['pipeline.joblib']