# First model for API

### Imports, read in data, and look over

In [2]:
import pandas as pd
import numpy as np
import category_encoders as ce
import joblib

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df = pd.read_csv('one_hot_encoded_tags_split.csv')

print(df.shape)

df.head()

(9861, 19)


Unnamed: 0,id,uid,from_,subject,msg,content_type,tags,Productivity,Entertainment,Finance,Other,Events,Travel,Shopping,Social,Personal,first_tag,text,second_tag
0,31780,32509,<grangepayments@westernunionspeedpay.com>,Grange Payment Confirmation,"Dear AVRAHAM JACOBSOHN, This is to confirm th...",text/plain,Finance,0,0,1,0,0,0,0,0,0,Finance,<grangepayments@westernunionspeedpay.com> Gran...,empty
1,31779,32508,Chase <no.reply.alerts@chase.com>,Your Debit Card Transaction,This is an Alert to help manage your account ...,text/plain,Finance,0,0,1,0,0,0,0,0,0,Finance,Chase <no.reply.alerts@chase.com> Your Debit C...,empty
2,31738,32467,Amazon Web Services <no-reply-aws@amazon.com>,Resolved 6559329691: Limit Increase: SageMaker,Please let us know if we helped resolve your i...,text/plain,Productivity,1,0,0,0,0,0,0,0,0,Productivity,Amazon Web Services <no-reply-aws@amazon.com> ...,empty
3,31693,32422,Lambda Labs <noreply@github.com>,Bernie Durfee added you to the Lambda Labs tea...,Youve been added to the Labs 18 - Tagger team ...,text/plain,Productivity,1,0,0,0,0,0,0,0,0,Productivity,Lambda Labs <noreply@github.com> Bernie Durfee...,empty
4,31684,32413,Amazon Web Services <no-reply-aws@amazon.com>,Attention required on case 6559329691: Limit I...,"Hello, We haven't heard back from you regard...",text/plain,Productivity,1,0,0,0,0,0,0,0,0,Productivity,Amazon Web Services <no-reply-aws@amazon.com> ...,empty


In [3]:
df.isnull().sum()

id               0
uid              0
from_            0
subject          0
msg              0
content_type     0
tags             0
Productivity     0
Entertainment    0
Finance          0
Other            0
Events           0
Travel           0
Shopping         0
Social           0
Personal         0
first_tag        0
text             0
second_tag       0
dtype: int64

In [4]:
df.columns

Index(['id', 'uid', 'from_', 'subject', 'msg', 'content_type', 'tags',
       'Productivity', 'Entertainment', 'Finance', 'Other', 'Events', 'Travel',
       'Shopping', 'Social', 'Personal', 'first_tag', 'text', 'second_tag'],
      dtype='object')

In [5]:
df['first_tag'].value_counts()

Finance          5428
Entertainment    1590
Shopping          967
Personal          963
Other             452
Productivity      262
Social            137
Travel             34
Events             28
Name: first_tag, dtype: int64

### Train model

In [42]:
train, test = train_test_split(df, stratify=df['first_tag'])

train.shape, test.shape

((7395, 19), (2466, 19))

In [43]:
train['first_tag'].value_counts(normalize=True)

Finance          0.550507
Entertainment    0.161190
Shopping         0.098039
Personal         0.097634
Other            0.045842
Productivity     0.026504
Social           0.013928
Travel           0.003516
Events           0.002840
Name: first_tag, dtype: float64

In [44]:
test['first_tag'].value_counts(normalize=True)

Finance          0.550284
Entertainment    0.161395
Shopping         0.098135
Personal         0.097729
Other            0.045823
Productivity     0.026764
Social           0.013788
Travel           0.003244
Events           0.002839
Name: first_tag, dtype: float64

In [45]:
X_train = train['text']
y_train = train['first_tag'].values

X_test = test['text']
y_test = test['first_tag'].values

In [28]:
y_train.shape

(7395,)

In [46]:
vect = TfidfVectorizer(stop_words='english')

X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [29]:
encoder = ce.OrdinalEncoder()

y_train = encoder.fit_transform(y_train)
y_test = encoder.transform(y_test)

In [47]:
model = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=42)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.7822384428223844

In [31]:
y_train.shape

(7395, 1)

In [48]:
preds = model.predict(X_test)

pd.DataFrame({'text': test['text'],
              'tags': test['tags'],
              'truth': y_test,
              'predictions': preds})

Unnamed: 0,text,tags,truth,predictions
2643,contests@fantasygames.go.com ESPN Eliminator C...,Entertainment,Entertainment,Finance
1042,"""PokerStars Support"" <support@pokerstars.com> ...",Entertainment,Entertainment,Entertainment
536,"""PokerStars Home Games"" <yourclub@starsaccount...",Entertainment,Entertainment,Entertainment
4192,Amazon Payments <no-reply@amazon.com> Payment ...,Finance,Finance,Finance
2244,Brooklyn College Career Center <careernews@bro...,"Productivity, Events",Productivity,Finance
...,...,...,...,...
9709,Hazon <offers@jblasts.com> Pinot & Pomegranate...,Other,Other,Finance
4534,Amazon Payments <no-reply@amazon.com> Payment ...,Finance,Finance,Finance
9516,Dropbox <no-reply@dropbox.com> Download Dropbo...,Productivity,Productivity,Finance
3331,Amazon Payments <no-reply@amazon.com> You rece...,Finance,Finance,Finance


#### Model attains 72.5% accuracy with giving a single tag

In [49]:
joblib.dump(model, 'model1.joblib')

['model1.joblib']

In [50]:
joblib.dump(vect, 'vect1.joblib')

['vect1.joblib']

# Using larger dataset

In [100]:
df = pd.read_csv('../Data/Master_emails.csv')

print(df.shape)

df.head()

(11550, 7)


Unnamed: 0.1,Unnamed: 0,Content_Type,From,Message,Subject,Tags,UID
0,0,text/plain,<grangepayments@westernunionspeedpay.com>,"Dear AVRAHAM JACOBSOHN, This is to confirm th...",Grange Payment Confirmation,Finance,31780
1,1,text/plain,Chase <no.reply.alerts@chase.com>,This is an Alert to help manage your account ...,Your Debit Card Transaction,Finance,31779
2,2,text/plain,Amazon Web Services <no-reply-aws@amazon.com>,Please let us know if we helped resolve your i...,Resolved 6559329691: Limit Increase: SageMaker,Productivity,31738
3,3,text/plain,Lambda Labs <noreply@github.com>,Youve been added to the Labs 18 - Tagger team ...,Bernie Durfee added you to the Lambda Labs tea...,Productivity,31693
4,4,text/plain,Amazon Web Services <no-reply-aws@amazon.com>,"Hello, We haven't heard back from you regard...",Attention required on case 6559329691: Limit I...,Productivity,31684


In [101]:
# get ride of unnecessary columns
df = df[['From', 'Message', 'Subject', 'Tags', 'UID']]

df.shape

(11550, 5)

In [102]:
# look for nulls
# save the 50 tagless, but in another df
# replace nan msg and subject with '' 
df.isnull().sum()

From        0
Message     6
Subject    50
Tags       50
UID         0
dtype: int64

In [103]:
# separate the 50 untagged for later testing.
untagged_test = df[df['Tags'].isna()]

untagged_test.shape

(50, 5)

In [104]:
# replace nan with ''
df[['Subject', 'Message']] = df[['Subject', 'Message']].replace(np.nan, '')

df.isnull().sum()

From        0
Message     0
Subject     0
Tags       50
UID         0
dtype: int64

In [105]:
# drop where Tag is nan
df = df.dropna(subset=['Tags'])
df.isnull().sum()

From       0
Message    0
Subject    0
Tags       0
UID        0
dtype: int64

In [106]:
# text column that joins the 3 textual columns
# this is the data for the model
df['text'] = df['From'] + ' ' + df['Subject'] + ' ' + df['Message']

In [107]:
df.head()

Unnamed: 0,From,Message,Subject,Tags,UID,text
0,<grangepayments@westernunionspeedpay.com>,"Dear AVRAHAM JACOBSOHN, This is to confirm th...",Grange Payment Confirmation,Finance,31780,<grangepayments@westernunionspeedpay.com> Gran...
1,Chase <no.reply.alerts@chase.com>,This is an Alert to help manage your account ...,Your Debit Card Transaction,Finance,31779,Chase <no.reply.alerts@chase.com> Your Debit C...
2,Amazon Web Services <no-reply-aws@amazon.com>,Please let us know if we helped resolve your i...,Resolved 6559329691: Limit Increase: SageMaker,Productivity,31738,Amazon Web Services <no-reply-aws@amazon.com> ...
3,Lambda Labs <noreply@github.com>,Youve been added to the Labs 18 - Tagger team ...,Bernie Durfee added you to the Lambda Labs tea...,Productivity,31693,Lambda Labs <noreply@github.com> Bernie Durfee...
4,Amazon Web Services <no-reply-aws@amazon.com>,"Hello, We haven't heard back from you regard...",Attention required on case 6559329691: Limit I...,Productivity,31684,Amazon Web Services <no-reply-aws@amazon.com> ...


In [108]:
# The tags are funky still
df['Tags'].value_counts()

Finance                    5604
Entertainment              1505
Shopping                   1065
Personal, Other             685
Other                       514
Productivity                514
Personal, Productivity      382
Personal                    299
Social                      146
Productivity, Events        130
Entertainment, Finance      102
Personal, Shopping           95
Personal, Events             95
Events                       95
Travel                       92
Personal, Finance            69
Personal, Travel             25
Personal, Entertainment      19
Shopping, Finance            13
Entertainment, Shopping      10
Events, Productivity          8
Travel, Finance               5
Personal, Social              4
Shopping, Entertainment       4
Social, Productivity          4
Productivity, Finance         4
Security                      3
Shopping, Productivity        2
Finance, Travel               2
Finance, Shopping             1
Finance, Productivity         1
Events, 

In [109]:
# THIS IS AN ASIDE FOR WHEN WE FIGURE OUT HOT TO MULTI_LABEL THE OUTPUT
# This also has problems if data was dropped after loaded into df due to indexing mismatches
# creates new df of one-hot-encoded tags
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
mlb_result = mlb.fit_transform([str(df.iloc[i]['Tags']).split(', ') for i in range(len(df))])
#df_final = pd.concat([df['text'],pd.DataFrame(mlb_result,columns=list(mlb.classes_))],axis=1)
mlb_df = pd.DataFrame(mlb_result,columns=list(mlb.classes_))
mlb_df.head()

Unnamed: 0,Entertainment,Events,Finance,Other,Personal,Productivity,Security,Shopping,Social,Travel
0,0,0,1,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0


In [110]:
# concatenate the 2 dfs together
one_hot_df = pd.concat([df, mlb_df], axis=1)

In [112]:
one_hot_df.isna().sum()

From             50
Message          50
Subject          50
Tags             50
UID              50
text             50
Entertainment    50
Events           50
Finance          50
Other            50
Personal         50
Productivity     50
Security         50
Shopping         50
Social           50
Travel           50
dtype: int64

In [113]:
one_hot_df[one_hot_df['text'].isna()]

Unnamed: 0,From,Message,Subject,Tags,UID,text,Entertainment,Events,Finance,Other,Personal,Productivity,Security,Shopping,Social,Travel
1153,,,,,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2691,,,,,,,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
7647,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7648,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7649,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7650,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7651,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7652,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7653,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7654,,,,,,,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [114]:
df[df['Tags'].str.contains('Personal')]

Unnamed: 0,From,Message,Subject,Tags,UID,text
38,"""Macdonald, Stuart"" <sjmac@ku.edu>","Searching PubMed for ""bcf.ku.edu"" I found the ...",Re: Autism Genetic DataBase,"Personal, Productivity",30026,"""Macdonald, Stuart"" <sjmac@ku.edu> Re: Autism ..."
107,Raizy Jacobovitch <eisenbergerraizy@gmail.com>,Hi! Heres a list of the words: Week of June 10...,DR spelling words,"Personal, Productivity",28741,Raizy Jacobovitch <eisenbergerraizy@gmail.com>...
121,Raizy Jacobovitch <eisenbergerraizy@gmail.com>,"Heres the list: Week of May 6: Benefit, delic...",DR words,"Personal, Productivity",28153,Raizy Jacobovitch <eisenbergerraizy@gmail.com>...
137,Raizy Jacobovitch <eisenbergerraizy@gmail.com>,Calcium Calculate Captain Challenge Contaminat...,Spelling words for this week,"Personal, Productivity",27237,Raizy Jacobovitch <eisenbergerraizy@gmail.com>...
144,Tova Rottenberg <tova@alephbeis.org>,SGkgUmFpenksDQpUaGFua3MgZm9yIGFza2luZy4gWWVzLC...,Nest Cam,"Personal, Productivity",27019,Tova Rottenberg <tova@alephbeis.org> Nest Cam ...
...,...,...,...,...,...,...
11519,"""Brown, Joshua J MSgt USAF 120 AW (USA)"" <josh...","Samuel, Just wanted to check in with yo...",RE: [Non-DoD Source] Cyber/Comms.,"Personal, Productivity",2371,"""Brown, Joshua J MSgt USAF 120 AW (USA)"" <josh..."
11522,"""DEHLER, DEREK B MSgt USAF AFRC 943 RQG/RS"" <d...",Samuel I received the docs you sent. Did you m...,RE: [Non-DoD Source] Samuel Hepner Cover Letter,"Personal, Productivity",3029,"""DEHLER, DEREK B MSgt USAF AFRC 943 RQG/RS"" <d..."
11525,"""DEHLER, DEREK B MSgt USAF AFRC 943 RQG/RS"" <d...","Samuel, Pursue the Guard and if it doesnt work...",RE: [Non-DoD Source] Samuel Hepner Cover Letter,"Personal, Productivity",3145,"""DEHLER, DEREK B MSgt USAF AFRC 943 RQG/RS"" <d..."
11527,norman hepner <normstormin@gmail.com>,Go ahead and reach back out to MSgt Dehler and...,Re: [Non-DoD Source] Samuel Hepner Cover Letter,"Personal, Productivity",3229,norman hepner <normstormin@gmail.com> Re: [Non...


In [115]:
df['tag_list'] = df['Tags'].str.split(', ')

In [120]:
personal_emails = df[df['Tags'].str.contains('Personal')]

In [121]:
personal_emails[personal_emails['tag_list'].map(len)==1]

Unnamed: 0,From,Message,Subject,Tags,UID,text,tag_list
611,Daniel Stuhlman <ddstuhlman@earthlink.net>,My email program went crazy. It didn't realiz...,Re: Mazel Tov --Opps,Personal,12079,Daniel Stuhlman <ddstuhlman@earthlink.net> Re:...,[Personal]
612,Daniel Stuhlman <ddstuhlman@earthlink.net>,"Mazel tov. Donnie At 07:44 AM 01/06/2015, Wa...",Re: Mazel Tov,Personal,12078,Daniel Stuhlman <ddstuhlman@earthlink.net> Re:...,[Personal]
870,noreply@profiles.google.com,Moshe Caplan reached you via your Google+ prof...,Fwd: Tour,Personal,8169,noreply@profiles.google.com Fwd: Tour Moshe Ca...,[Personal]
872,Sim <simsolomon@gmail.com>,Got an American line in Israel 3477613867,,Personal,7905,Sim <simsolomon@gmail.com> Got an American li...,[Personal]
878,Simcha Solomon <simsolomon@gmail.com>,nice.. u better or b makin money so u coood ta...,Re:,Personal,7415,Simcha Solomon <simsolomon@gmail.com> Re: nice...,[Personal]
...,...,...,...,...,...,...,...
11497,ERIN HEPNER <hephaus@yahoo.com>,"I want to, do you? Erin Hepner Begin f...",Fwd: Thanksgiving?,Personal,49044,ERIN HEPNER <hephaus@yahoo.com> Fwd: Thanksgiv...,[Personal]
11499,norman hepner <normstormin@gmail.com>,"Im a yes Sent from my iPhone > On Nov 7,...",Re: Thanksgiving?,Personal,49047,norman hepner <normstormin@gmail.com> Re: Than...,[Personal]
11500,"""Brown, Joshua J MSgt USAF (US)"" <joshua.j.bro...",Here you go. =20 You can also google tha...,RE: [Non-DoD Source] Re: Air Guard,Personal,1484,"""Brown, Joshua J MSgt USAF (US)"" <joshua.j.bro...",[Personal]
11501,"""Brown, Joshua J MSgt USAF (US)"" <joshua.j.bro...","Samuel, I just sent a PDF with no link...",RE: [Non-DoD Source] Info about open position,Personal,1492,"""Brown, Joshua J MSgt USAF (US)"" <joshua.j.bro...",[Personal]


In [126]:
mask = df['Tags'].str.contains('Personal') & df['tag_list'].map(len)==1

df = df[~mask]

In [132]:
df['first_tag'] = [tags[0] if tags[0]!='Personal' else tags[1] for tags in df['tag_list']]

In [133]:
df['first_tag']

0              Finance
1              Finance
2         Productivity
3         Productivity
4         Productivity
             ...      
11545    Entertainment
11546     Productivity
11547           Events
11548     Productivity
11549    Entertainment
Name: first_tag, Length: 11201, dtype: object

In [134]:
df[df['first_tag'] == 'Personal']

Unnamed: 0,From,Message,Subject,Tags,UID,text,tag_list,first_tag


In [176]:
df

Unnamed: 0,From,Message,Subject,Tags,UID,text,tag_list,first_tag
0,<grangepayments@westernunionspeedpay.com>,"Dear AVRAHAM JACOBSOHN, This is to confirm th...",Grange Payment Confirmation,Finance,31780,<grangepayments@westernunionspeedpay.com> Gran...,[Finance],Finance
1,Chase <no.reply.alerts@chase.com>,This is an Alert to help manage your account ...,Your Debit Card Transaction,Finance,31779,Chase <no.reply.alerts@chase.com> Your Debit C...,[Finance],Finance
2,Amazon Web Services <no-reply-aws@amazon.com>,Please let us know if we helped resolve your i...,Resolved 6559329691: Limit Increase: SageMaker,Productivity,31738,Amazon Web Services <no-reply-aws@amazon.com> ...,[Productivity],Productivity
3,Lambda Labs <noreply@github.com>,Youve been added to the Labs 18 - Tagger team ...,Bernie Durfee added you to the Lambda Labs tea...,Productivity,31693,Lambda Labs <noreply@github.com> Bernie Durfee...,[Productivity],Productivity
4,Amazon Web Services <no-reply-aws@amazon.com>,"Hello, We haven't heard back from you regard...",Attention required on case 6559329691: Limit I...,Productivity,31684,Amazon Web Services <no-reply-aws@amazon.com> ...,[Productivity],Productivity
...,...,...,...,...,...,...,...,...
11545,"""Medium Daily Digest"" <noreply@medium.com>",Today's highlights Understanding Random For...,Understanding Random Forest | Tony Yiu in Towa...,Entertainment,3693,"""Medium Daily Digest"" <noreply@medium.com> Und...",[Entertainment],Entertainment
11546,"""Glassdoor Jobs"" <noreply@glassdoor.com>",...,You look like a good fit for the job at Procte...,Productivity,3702,"""Glassdoor Jobs"" <noreply@glassdoor.com> You l...",[Productivity],Productivity
11547,Amazon Web Services <aws-marketing-email-repli...,Thank you for attending AWS Machine Learning W...,Thank you for attending AWS Machine Learning W...,"Events, Productivity",3706,Amazon Web Services <aws-marketing-email-repli...,"[Events, Productivity]",Events
11548,"""no-reply-aws@amazon.com"" <no-reply-aws@amazon...","Hello again, I hope you're having a nice...",RE:[CASE 6570793521] Limit Increase: SageMaker,Productivity,3721,"""no-reply-aws@amazon.com"" <no-reply-aws@amazon...",[Productivity],Productivity


In [177]:
df.to_csv('updated_emails_fri_nov22.csv', index=False)

In [4]:
df = pd.read_csv('updated_emails_fri_nov22.csv')

In [5]:
train, test = train_test_split(df, stratify=df['first_tag'])

train.shape, test.shape

((8400, 8), (2801, 8))

In [6]:
train['first_tag'].value_counts(normalize=True)

Finance          0.506905
Entertainment    0.146071
Other            0.107024
Shopping         0.105238
Productivity     0.092024
Events           0.017738
Social           0.013810
Travel           0.010952
Security         0.000238
Name: first_tag, dtype: float64

In [7]:
test['first_tag'].value_counts(normalize=True)

Finance          0.506962
Entertainment    0.146019
Other            0.107105
Shopping         0.105320
Productivity     0.092110
Events           0.017851
Social           0.013567
Travel           0.010710
Security         0.000357
Name: first_tag, dtype: float64

In [8]:
X_train = train['text']
y_train = train['first_tag']

X_test = test['text']
y_test = test['first_tag']

In [9]:
vect = TfidfVectorizer(stop_words='english')

X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)

In [181]:
model = RandomForestClassifier(n_estimators=500, max_depth=5)

model.fit(X_train, y_train)

model.score(X_test, y_test)

0.6590503391645841

In [174]:
from sklearn.model_selection import GridSearchCV

In [11]:
#X = df['text']
#y = df['first_tag']

#X_train = vect.fit_transform(X_train)

param_distributions = {
    'n_estimators': [250, 500, 750, 1000],
    'max_depth': [3,6,9,None]
}

model = RandomForestClassifier()

grid = GridSearchCV(model, param_distributions, n_jobs=-1, cv=3, verbose=10)

grid.fit(X_train, y_train)

Fitting 3 folds for each of 16 candidates, totalling 48 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:   12.1s
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:   25.0s
[Parallel(n_jobs=-1)]: Done  17 tasks      | elapsed:   43.6s
[Parallel(n_jobs=-1)]: Done  24 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done  46 out of  48 | elapsed: 11.5min remaining:   30.1s
[Parallel(n_jobs=-1)]: Done  48 out of  48 | elapsed: 14.1min finished


GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [12]:
grid.best_score_

0.9248809523809524

In [13]:
grid.best_params_

{'max_depth': None, 'n_estimators': 1000}

In [19]:
model = RandomForestClassifier(n_estimators=1000, max_depth=None)
model.fit(X_train, y_train)
model.score(X_test, y_test)

0.9357372367011781

In [187]:
preds = grid.predict(X_test)

preds_df = pd.DataFrame({'msg': test['text'],
              'truth': y_test,
              'preds': preds})

In [189]:
preds_df.preds.value_counts(normalize=True)

Finance          0.501250
Entertainment    0.138165
Other            0.124241
Productivity     0.108176
Shopping         0.100321
Social           0.012853
Events           0.008211
Travel           0.006783
Name: preds, dtype: float64

In [152]:
untagged_test.head()

Unnamed: 0,From,Message,Subject,Tags,UID
1153,sneuman@coltown.com,VG54LiBOaWNlIHlvIGhlYXIgZnJvbSB5b3UuIEhvcGUgYW...,Re:,,98
2691,Carie Buckholts <cariebuckholts273@hotmail.com>,\n1985 2400ft - office -- large warehouse wor...,adams moving service fast affordable professi...,,20244
7647,Extended Stay America <extendedstayamerica@exp...,"Dear AVRAHAM JACOBSOHN,\n\nWe were delighted t...",Your recent stay at Extended Stay America Fish...,,28805
7648,Extended Stay America <extendedstayamerica@exp...,"Dear AVRAHAM JACOBSOHN,\n\nRecently, we sent y...",There's still time to rate your stay at Extend...,,28880
7649,Amazon Payments <noreply@amazon.com>,"Greetings from Amazon Payments,\n\nYour Amazon...",Your Amazon Payments Monthly Statement,,28954


In [20]:
joblib.dump(model, 'model1.joblib')
joblib.dump(vect, 'vect1.joblib')

['vect1.joblib']

In [155]:
untagged_test.isna().sum()

From        0
Message     0
Subject     2
Tags       50
UID         0
text        2
dtype: int64

In [156]:
untagged_test['Subject'] = untagged_test['Subject'].replace(np.nan, '')

untagged_test.isna().sum()

From        0
Message     0
Subject     0
Tags       50
UID         0
text        2
dtype: int64

In [158]:
untagged_test['text'] = untagged_test['From'] + ' ' + untagged_test['Subject'] + ' ' + untagged_test['Message']

X = vect.transform(untagged_test['text'])

preds = grid.predict(X)

In [159]:
untagged_test['Tags'] = preds

In [160]:
untagged_test

Unnamed: 0,From,Message,Subject,Tags,UID,text
1153,sneuman@coltown.com,VG54LiBOaWNlIHlvIGhlYXIgZnJvbSB5b3UuIEhvcGUgYW...,Re:,Other,98,sneuman@coltown.com Re: VG54LiBOaWNlIHlvIGhlYX...
2691,Carie Buckholts <cariebuckholts273@hotmail.com>,\n1985 2400ft - office -- large warehouse wor...,adams moving service fast affordable professi...,Other,20244,Carie Buckholts <cariebuckholts273@hotmail.com...
7647,Extended Stay America <extendedstayamerica@exp...,"Dear AVRAHAM JACOBSOHN,\n\nWe were delighted t...",Your recent stay at Extended Stay America Fish...,Other,28805,Extended Stay America <extendedstayamerica@exp...
7648,Extended Stay America <extendedstayamerica@exp...,"Dear AVRAHAM JACOBSOHN,\n\nRecently, we sent y...",There's still time to rate your stay at Extend...,Other,28880,Extended Stay America <extendedstayamerica@exp...
7649,Amazon Payments <noreply@amazon.com>,"Greetings from Amazon Payments,\n\nYour Amazon...",Your Amazon Payments Monthly Statement,Finance,28954,Amazon Payments <noreply@amazon.com> Your Amaz...
7650,Amazon Payments <no-reply@amazon.com>,"Greetings from Amazon Payments,\n\n\nYour paym...",Payment completed,Finance,29450,Amazon Payments <no-reply@amazon.com> Payment ...
7651,Amazon Payments <no-reply@amazon.com>,"Greetings from Amazon Payments,\n\n\nYou recei...",You received $1000.00 from AJ Stern sent via A...,Finance,29451,Amazon Payments <no-reply@amazon.com> You rece...
7652,Amazon Payments <noreply@amazon.com>,"Hello,\n\nAs an Amazon Payments account holder...",Your Amazon Payments Quarterly Notice,Finance,29453,Amazon Payments <noreply@amazon.com> Your Amaz...
7653,Amazon Payments <noreply@amazon.com>,"Hello Avraham Jacobsohn,\n\nWe wanted to let y...","Your summary of payments for Sep 30, 2013",Finance,29454,Amazon Payments <noreply@amazon.com> Your summ...
7654,"""Chave Jacobsohn"" <chave@designsbyfmc.com>",\n\n-----Original Message-----\nFrom: dcutler6...,FW: Mazal Tov!!! Mandelman Engagement - Vort I...,Other,29682,"""Chave Jacobsohn"" <chave@designsbyfmc.com> FW:..."


In [161]:
import joblib

In [162]:
joblib.dump(vect, 'vect1.joblib')
joblib.dump(model, 'model1.joblib')

['model1.joblib']

In [163]:
untagged_test['Tags'].value_counts(normalize=True)

Other            0.48
Finance          0.28
Entertainment    0.20
Productivity     0.04
Name: Tags, dtype: float64

In [166]:
X_test = vect.transform(X_test)
pred = grid.predict(X_test)

pd.DataFrame({'text': test['text'],
              'truth': y_test,
              'preds': preds})

AttributeError: lower not found