In [1]:
import pandas as pd
import numpy as np
import dateparser

In [15]:
#load the original data set
df = pd.read_csv('train.csv', index_col = 0)
#df = pd.read_csv('test.csv', index_col = 0)

In [16]:
#Obtain clean date

#1. There are no missing values in the date column, but several formats exist:
#(a) Sun, 28 Aug 2016 00:17:44 +0000
#(b) 11-MAR-2018 20:40:58
#(c) Mon, 8 Apr 2013 09:59:21
#(d) 25 Apr 2019 07:13:43 -0400
#...
#2. There are also incorrect time zones values, '+0580', which I changed to '0530'

df['date'] = df['date'].str.replace(r' \(.*\)', '')
df['date'] = df['date'].str.rstrip()
df['date'] = df['date'].str.replace('-0000', '+0000')
df['date'] = df['date'].str.replace('0580', '0530')
df['date'] = df['date'].apply(dateparser.parse, languages=['en'])

In [17]:
#Generate year, month, day, day of the week, hour, minute, second and timezone from date
df['year'] = df['date'].apply(lambda x:x.year)
df['month'] = df['date'].apply(lambda x:x.month)
df['day'] = df['date'].apply(lambda x:x.day)
df['hour'] = df['date'].apply(lambda x:x.hour)
df['minute'] = df['date'].apply(lambda x:x.minute)
df['second'] = df['date'].apply(lambda x:x.second)
df['weekday'] = df['date'].apply(lambda x:x.weekday())
df['timezone'] = df['date'].apply(lambda x:x.tzname())

In [18]:
#Merge all the upper case and lower case mailtype calsses by lowering-case the whole column
#e.g. multipart/alternative and Multipart/Alternative
df['mail_type'] = df['mail_type'].str.lower()

In [19]:
#Save df to train_clean.csv
df.to_csv('train_clean.csv')
#df.to_csv('test_clean.csv')

In [21]:
#A quick and dirty model before feature engineering

#Read train_clean.csv and test_clean.csv
df_train = pd.read_csv('train_clean.csv', index_col = 0)
df_test = pd.read_csv('test_clean.csv', index_col = 0)

#Create dummpy variables and split the data to X, y and X_test
n = len(df_train)
temp = pd.concat([df_train, df_test], axis=0).drop(['date', 'label'], axis=1)
temp_with_dummies = pd.get_dummies(temp, drop_first=True)
X = temp_with_dummies[:n]
X_test = temp_with_dummies[n:]
y = df_train['label']


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [27]:
#Cross Validation

from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators = 30)

print(cross_validate(model, X, y, cv=5))

{'fit_time': array([2.79915881, 2.74349189, 2.72964191, 2.66462827, 2.74367952]), 'score_time': array([0.08395147, 0.12191725, 0.08394647, 0.09393978, 0.08995247]), 'test_score': array([0.94104628, 0.9418277 , 0.94122383, 0.93941224, 0.93757551]), 'train_score': array([0.99969804, 0.99974839, 0.99974839, 0.99969807, 0.9996981 ])}


In [29]:
#Model Training
model.fit(X, y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=30, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [36]:
#Generate our predictions
y_test = model.predict(X_test)
res = pd.DataFrame({'ID': np.arange(len(y_test)),
                    'label': y_test})
res.to_csv('submission.csv', index=False)