# First model for API

### Imports, read in data, and look over

In [39]:
import pandas as pd
import numpy as np
import category_encoders as ce
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('one_hot_encoded_tags_split.csv')

print(df.shape)

df.head()

(9861, 19)


Unnamed: 0,id,uid,from_,subject,msg,content_type,tags,Productivity,Entertainment,Finance,Other,Events,Travel,Shopping,Social,Personal,first_tag,text,second_tag
0,31780,32509,<grangepayments@westernunionspeedpay.com>,Grange Payment Confirmation,"Dear AVRAHAM JACOBSOHN, This is to confirm th...",text/plain,Finance,0,0,1,0,0,0,0,0,0,Finance,<grangepayments@westernunionspeedpay.com> Gran...,empty
1,31779,32508,Chase <no.reply.alerts@chase.com>,Your Debit Card Transaction,This is an Alert to help manage your account ...,text/plain,Finance,0,0,1,0,0,0,0,0,0,Finance,Chase <no.reply.alerts@chase.com> Your Debit C...,empty
2,31738,32467,Amazon Web Services <no-reply-aws@amazon.com>,Resolved 6559329691: Limit Increase: SageMaker,Please let us know if we helped resolve your i...,text/plain,Productivity,1,0,0,0,0,0,0,0,0,Productivity,Amazon Web Services <no-reply-aws@amazon.com> ...,empty
3,31693,32422,Lambda Labs <noreply@github.com>,Bernie Durfee added you to the Lambda Labs tea...,Youve been added to the Labs 18 - Tagger team ...,text/plain,Productivity,1,0,0,0,0,0,0,0,0,Productivity,Lambda Labs <noreply@github.com> Bernie Durfee...,empty
4,31684,32413,Amazon Web Services <no-reply-aws@amazon.com>,Attention required on case 6559329691: Limit I...,"Hello, We haven't heard back from you regard...",text/plain,Productivity,1,0,0,0,0,0,0,0,0,Productivity,Amazon Web Services <no-reply-aws@amazon.com> ...,empty


In [3]:
df.isnull().sum()

id               0
uid              0
from_            0
subject          0
msg              0
content_type     0
tags             0
Productivity     0
Entertainment    0
Finance          0
Other            0
Events           0
Travel           0
Shopping         0
Social           0
Personal         0
first_tag        0
text             0
second_tag       0
dtype: int64

In [4]:
df.columns

Index(['id', 'uid', 'from_', 'subject', 'msg', 'content_type', 'tags',
       'Productivity', 'Entertainment', 'Finance', 'Other', 'Events', 'Travel',
       'Shopping', 'Social', 'Personal', 'first_tag', 'text', 'second_tag'],
      dtype='object')

In [5]:
df['first_tag'].value_counts()

Finance          5428
Entertainment    1590
Shopping          967
Personal          963
Other             452
Productivity      262
Social            137
Travel             34
Events             28
Name: first_tag, dtype: int64

### Train model

In [42]:
train, test = train_test_split(df, stratify=df['first_tag'])

train.shape, test.shape

((7395, 19), (2466, 19))

In [43]:
train['first_tag'].value_counts(normalize=True)

Finance          0.550507
Entertainment    0.161190
Shopping         0.098039
Personal         0.097634
Other            0.045842
Productivity     0.026504
Social           0.013928
Travel           0.003516
Events           0.002840
Name: first_tag, dtype: float64

In [44]:
test['first_tag'].value_counts(normalize=True)

Finance          0.550284
Entertainment    0.161395
Shopping         0.098135
Personal         0.097729
Other            0.045823
Productivity     0.026764
Social           0.013788
Travel           0.003244
Events           0.002839
Name: first_tag, dtype: float64

In [45]:
X_train = train['text']
y_train = train['first_tag'].values

X_test = test['text']
y_test = test['first_tag'].values

In [28]:
y_train.shape

(7395,)

In [46]:
vect = TfidfVectorizer(stop_words='english')

X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [29]:
encoder = ce.OrdinalEncoder()

y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [47]:
model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.7822384428223844

In [31]:
y_train.shape

(7395, 1)

In [48]:
preds = model.predict(X_test)

pd.DataFrame({'text': test['text'],
              'tags': test['tags'],
              'truth': y_test,
              'predictions': preds})

Unnamed: 0,text,tags,truth,predictions
2643,contests@fantasygames.go.com ESPN Eliminator C...,Entertainment,Entertainment,Finance
1042,"""PokerStars Support"" <support@pokerstars.com> ...",Entertainment,Entertainment,Entertainment
536,"""PokerStars Home Games"" <yourclub@starsaccount...",Entertainment,Entertainment,Entertainment
4192,Amazon Payments <no-reply@amazon.com> Payment ...,Finance,Finance,Finance
2244,Brooklyn College Career Center <careernews@bro...,"Productivity, Events",Productivity,Finance
...,...,...,...,...
9709,Hazon <offers@jblasts.com> Pinot & Pomegranate...,Other,Other,Finance
4534,Amazon Payments <no-reply@amazon.com> Payment ...,Finance,Finance,Finance
9516,Dropbox <no-reply@dropbox.com> Download Dropbo...,Productivity,Productivity,Finance
3331,Amazon Payments <no-reply@amazon.com> You rece...,Finance,Finance,Finance


#### Model attains 72.5% accuracy with giving a single tag

In [49]:
joblib.dump(model, 'model1.joblib')

['model1.joblib']

In [50]:
joblib.dump(vect, 'vect1.joblib')

['vect1.joblib']

# Using larger dataset

In [87]:
df = pd.read_csv('../Data/Master_emails.csv')

print(df.shape)

df.head()

(11550, 7)


Unnamed: 0.1,Unnamed: 0,Content_Type,From,Message,Subject,Tags,UID
0,0,text/plain,<grangepayments@westernunionspeedpay.com>,"Dear AVRAHAM JACOBSOHN, This is to confirm th...",Grange Payment Confirmation,Finance,31780
1,1,text/plain,Chase <no.reply.alerts@chase.com>,This is an Alert to help manage your account ...,Your Debit Card Transaction,Finance,31779
2,2,text/plain,Amazon Web Services <no-reply-aws@amazon.com>,Please let us know if we helped resolve your i...,Resolved 6559329691: Limit Increase: SageMaker,Productivity,31738
3,3,text/plain,Lambda Labs <noreply@github.com>,Youve been added to the Labs 18 - Tagger team ...,Bernie Durfee added you to the Lambda Labs tea...,Productivity,31693
4,4,text/plain,Amazon Web Services <no-reply-aws@amazon.com>,"Hello, We haven't heard back from you regard...",Attention required on case 6559329691: Limit I...,Productivity,31684


In [88]:
df = df[['From', 'Message', 'Subject', 'Tags', 'UID']]

df.shape

(11550, 5)

In [89]:
df.isnull().sum()

From        0
Message     6
Subject    50
Tags       50
UID         0
dtype: int64

In [90]:
untagged_test = df[df['Tags'].isna()]

untagged_test.shape

(50, 5)

In [91]:
df['Subject'] = df['Subject'].replace(np.nan, '')
df = df.dropna(subset=['Tags'])
df.shape

(11500, 5)

In [94]:
df['Message'] = df['Message'].replace(np.nan, '')
df.isnull().sum()

From       0
Message    0
Subject    0
Tags       0
UID        0
dtype: int64

In [95]:
df['text'] = df['From'] + ' ' + df['Subject'] + ' ' + df['Message']

In [96]:
df['Tags'].value_counts()

Finance                    5604
Entertainment              1505
Shopping                   1065
Personal, Other             685
Other                       514
Productivity                514
Personal, Productivity      382
Personal                    299
Social                      146
Productivity, Events        130
Entertainment, Finance      102
Personal, Shopping           95
Personal, Events             95
Events                       95
Travel                       92
Personal, Finance            69
Personal, Travel             25
Personal, Entertainment      19
Shopping, Finance            13
Entertainment, Shopping      10
Events, Productivity          8
Travel, Finance               5
Personal, Social              4
Shopping, Entertainment       4
Social, Productivity          4
Productivity, Finance         4
Security                      3
Shopping, Productivity        2
Finance, Travel               2
Finance, Shopping             1
Finance, Productivity         1
Events, 

In [97]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb_result = mlb.fit_transform([str(df.iloc[i]['Tags']).split(', ') for i in range(len(df))])
#df_final = pd.concat([df['text'],pd.DataFrame(mlb_result,columns=list(mlb.classes_))],axis=1)
mlb_df = pd.DataFrame(mlb_result,columns=list(mlb.classes_))
mlb_df.head()

Unnamed: 0,Entertainment,Events,Finance,Other,Personal,Productivity,Security,Shopping,Social,Travel
0,0,0,1,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0


In [98]:
one_hot_df = pd.concat([df, mlb_df], axis=1)

In [99]:
one_hot_df['Tags']

0                     Finance
1                     Finance
2                Productivity
3                Productivity
4                Productivity
                 ...         
11545           Entertainment
11546            Productivity
11547    Events, Productivity
11548            Productivity
11549           Entertainment
Name: Tags, Length: 11550, dtype: object

In [86]:
one_hot_df.isna().sum()

From             50
Message          56
Subject          50
Tags             50
UID              50
text             56
Entertainment    50
Events           50
Finance          50
Other            50
Personal         50
Productivity     50
Security         50
Shopping         50
Social           50
Travel           50
dtype: int64

In [84]:
one_hot_df[one_hot_df['Tags'].str.contains('Personal')]

ValueError: cannot index with vector containing NA / NaN values

In [78]:
one_hot_df['first_tag'] = [tags[0] if tags[0]!="Personal" else tags[1] for tags in df['Tags'].str.split(', ')]
one_hot_df['first_tag'].value_counts()

IndexError: list index out of range

In [76]:
df['Tags'].str.split()

0                      [Finance]
1                      [Finance]
2                 [Productivity]
3                 [Productivity]
4                 [Productivity]
                  ...           
11545            [Entertainment]
11546             [Productivity]
11547    [Events,, Productivity]
11548             [Productivity]
11549            [Entertainment]
Name: Tags, Length: 11500, dtype: object

In [None]:
train, test = train_test_split(df, stratif)