In [1]:
import pandas as pd
import numpy as np
import dateparser
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
%matplotlib inline

In [2]:
#Read train_clean.csv and test_clean.csv
df_train = pd.read_csv('train_clean.csv', index_col = 0)
df_test = pd.read_csv('test_clean.csv', index_col = 0)

In [86]:
#Concatenate data together for feature engineering
n = len(df_train)
temp = pd.concat([df_train, df_test], axis=0, sort=False).drop(['label'], axis=1)

#Seperate the 'mail_type' column
temp['mail_type'] = temp['mail_type'].str.strip()
temp['mail_type_1'] = temp['mail_type'].str.split('/',expand=True)[0]
temp['mail_type_2'] = temp['mail_type'].str.split('/',expand=True)[1]
temp = temp.drop('mail_type', axis=1)

#Transform the 'org' and 'tld' columns
temp['org_tld'] = temp['tld'] + '.' + temp['org']
temp['org_tld'] = temp['org_tld'].str.strip()
temp['org_tld'] = temp['org_tld'].str.lower()
temp['org_tld'] = temp['org_tld'].str.split('.')
temp = temp.drop(['org', 'tld'], axis =1)
temp['org_tld'] = temp['org_tld'].fillna('Nan')

mlb = MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(temp['org_tld']),
                   columns=mlb.classes_,
                   index=temp.index)

temp = pd.concat([temp, res], axis=1)
temp = temp.drop('org_tld', axis=1)

#Add numerical labels
catogorical = ['mail_type_1', 'mail_type_2', 'timezone']
temp[catogorical] = temp[catogorical].apply(lambda x: pd.factorize(x)[0])

#Add 'org_freq' and 'tld_freq' columns
for col in catogorical:
    encoding = temp.groupby(col).size()/len(temp)
    temp[col + '_freq'] = temp[col].map(encoding)
    
#Drop apparently non-important features, and date
drop_c = ['bcced', 'designation', 'salutations', 'date', 'mail_type_1_freq', 'mail_type_1']
temp = temp.drop(drop_c, axis=1)

#Drop features that have value 1 for only one row
t = temp.sum()
temp = temp.drop(t[t==1].index, axis=1)

In [87]:
#Split the data to X, y and X_test

X = temp[:n]
X_test = temp[n:]
y = df_train['label']

In [88]:
#Cross Validation

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 100)

print(cross_validate(model, X, y, cv=5, scoring='f1_macro', return_train_score=True))

{'fit_time': array([6.82508755, 6.99499154, 6.67917228, 7.30579615, 7.12291861]), 'score_time': array([0.20388293, 0.16690469, 0.16490555, 0.1929028 , 0.16789842]), 'test_score': array([0.95367512, 0.95272775, 0.94964305, 0.95025944, 0.95100211]), 'train_score': array([1., 1., 1., 1., 1.])}


In [89]:
#Model Training
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [91]:
#Generate our predictions
y_test = model.predict(X_test).astype(int)
res = pd.DataFrame({'ID': np.arange(len(y_test)),
                    'label': y_test})
res.to_csv('submission.csv', index=False)