In [1]:
import pandas as pd
import numpy as np
import dateparser
import matplotlib.pyplot as plt
from sklearn.preprocessing import MultiLabelBinarizer
%matplotlib inline

In [2]:
#Read train_clean.csv and test_clean.csv
df_train = pd.read_csv('train_clean.csv', index_col = 0)
df_test = pd.read_csv('test_clean.csv', index_col = 0)

In [3]:
#Concatenate data together for feature engineering
n = len(df_train)
temp2 = pd.concat([df_train, df_test], axis=0, sort=False).drop(['label'], axis=1)
temp = temp2

#Seperate the 'mail_type' column
temp['mail_type'] = temp['mail_type'].str.strip()
temp['mail_type_1'] = temp['mail_type'].str.split('/',expand=True)[0]
temp['mail_type_2'] = temp['mail_type'].str.split('/',expand=True)[1]
temp = temp.drop('mail_type', axis=1)

#Transform the 'org' and 'tld' columns
temp['org_tld'] = temp['tld'] + '.' + temp['org']
temp['org_tld'] = temp['org_tld'].str.strip()
temp['org_tld'] = temp['org_tld'].str.lower()
temp['org_tld'] = temp['org_tld'].str.split('.')
temp = temp.drop(['org', 'tld'], axis =1)
temp['org_tld'] = temp['org_tld'].fillna('N')

mlb = MultiLabelBinarizer()
res = pd.DataFrame(mlb.fit_transform(temp['org_tld']),
                   columns=mlb.classes_,
                   index=temp.index)

temp = pd.concat([temp, res], axis=1)
temp = temp.drop('org_tld', axis=1)

#Add numerical labels
catogorical = ['mail_type_1', 'mail_type_2', 'timezone']
temp[catogorical] = temp[catogorical].apply(lambda x: pd.factorize(x)[0])

#Add 'org_freq' and 'tld_freq' columns
for col in catogorical:
    encoding = temp.groupby(col).size()/len(temp)
    temp[col + '_freq'] = temp[col].map(encoding)

#Add the frequency of the 'tld' and 'org'
freq = temp[temp.columns[19:-3]].sum(axis=1)
temp = pd.concat([temp,freq], axis=1)

#Drop apparently non-important features, and date
drop_c = ['bcced', 'designation', 'salutations', 'date', 'mail_type_1_freq', 'mail_type_1']
temp = temp.drop(drop_c, axis=1)

#Drop features that have value 1 for only one row
t1 = temp[:n].sum()
t2 = temp[n:].sum()
temp = temp.drop(set(t2[t2==0].index) | set(t1[t1==0].index), axis=1)

#Drop duplicate columns in the training set
col_non_dup = (temp[:n].T.drop_duplicates().T).columns
temp = temp[col_non_dup]

In [4]:
#Split the data to X, y and X_test

X = temp[:n]
X_test = temp[n:]
y = df_train['label']

In [5]:
#Cross Validation

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

#250 & 200 before droppinp duplicate columns
#250 & 175 after droppinp duplicate columns
model = RandomForestClassifier(n_estimators = 250, class_weight='balanced', max_features = 175)

print(cross_validate(model, X, y, cv=5, scoring='f1_macro', return_train_score=True))

{'fit_time': array([54.96952987, 47.87690592, 45.09794831, 48.46701407, 46.00493383]), 'score_time': array([0.55400205, 0.45932221, 0.3886199 , 0.40511489, 0.38913417]), 'test_score': array([0.95542942, 0.95766601, 0.95357298, 0.95547597, 0.95325909]), 'train_score': array([1., 1., 1., 1., 1.])}


In [6]:
#Model Training
model.fit(X, y)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=None, max_features=175,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=250,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [7]:
#Generate our predictions
y_test = model.predict(X_test).astype(int)
res = pd.DataFrame({'ID': np.arange(len(y_test)),
                    'label': y_test})
res.to_csv('submission.csv', index=False)