In [1]:
import pandas as pd
import numpy as np
from scipy import stats
import datetime
import seaborn as sns
from catboost import CatBoostClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

print('Train Shape:', train.shape)
print('Test Shape:', test.shape)
train.head()

Train Shape: (80176, 14)
Test Shape: (34365, 13)


Unnamed: 0.1,Unnamed: 0,date,org,tld,ccs,bcced,mail_type,images,urls,salutations,designation,chars_in_subject,chars_in_body,label
0,0,"Mon, 6 Nov 2017 11:13:45 +0100",reply,ebay.in,0,0,multipart/alternative,35,120,0,0,49.0,80027,2
1,1,"Wed, 14 Feb 2018 11:00:16 -0000",edm,efinmail.com,0,0,multipart/alternative,1,7,0,0,107.0,2961,1
2,2,"Wed, 6 Jul 2016 19:53:37 +0000",usebackpack,com,0,0,text/html,4,17,0,0,35.0,25149,1
3,3,"Fri, 11 Oct 2019 11:25:40 +0200",granular,ai,0,0,multipart/mixed,0,0,0,0,15.0,635296,1
4,4,"Tue, 07 Nov 2017 11:07:18 +0000 (UTC)",github,com,1,0,multipart/alternative,2,11,0,0,49.0,2355,1


In [3]:
df = pd.concat([train, test])
df.shape

(114541, 14)

In [4]:
labels = {0: 'Updates', 1: 'Personal', 2: 'Promotions', 3: 'Forums', 4: 'Purchases', 5: 'Travel', 6: 'Spam', 7: 'Social'}
df['label_type'] = df.label.map(labels)
df.label_type.value_counts()

Personal      37195
Updates       17995
Forums        10727
Promotions     8567
Social         5042
Purchases       357
Spam            152
Travel          141
Name: label_type, dtype: int64

In [5]:
# #### Date Correction
df['date_length'] = df.date.str.len()
df.groupby('date_length').agg({'date': ['min', 'max', 'count']})

Unnamed: 0_level_0,date,date,date
Unnamed: 0_level_1,min,max,count
date_length,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
18,26 Dec 13 18:48:01,26 Dec 13 18:48:01,1
20,11-MAR-2018 20:40:58,11-MAR-2018 20:40:58,2
24,"Fri, 5 Jul 2013 13:30:59","Wed, 4 Sep 2013 19:18:20",12
25,1 Apr 2014 12:59:18 -0000,"Wed, 21 Mar 2018 17:50:26",681
26,01 Apr 2018 18:50:30 +0530,"Thu, 28 Aug 2014 14:38:35",2312
28,"Fri, 9 Nov 2012 06:11:45 GMT","Wed, 6 Feb 2013 16:40:36 GMT",6
29,"Fri, 16 Jun 2017 08:44:58 GMT","Wed, 28 Oct 2020 09:20:57 GMT",130
30,"Fri, 1 Apr 2016 00:30:09 +0000","Wed, 9 Sep 2020 19:00:30 +0000",18207
31,"Fri, 2 Aug 2019 12:00:13 +0000","Wed, 31 Oct 2018 20:09:58 +0000",73143
32,"Sun, 08 Sep 2019 18:42:09 +0000","Sun, 08 Sep 2019 18:42:09 +0000",2


In [6]:
df['local_date'] = df.date

# Removing Day Name
for day in ['Mon,', 'Tue,', 'Wed,', 'Thu,', 'Fri,', 'Sat,', 'Sun,']:
    df['local_date'] = df.local_date.apply(lambda x: x.replace(day, ''))

In [7]:
# Removing Timezones
df['local_date'] = df.local_date.apply(lambda x: x.split('+')[0]) 
df['local_date'] = df.local_date.apply(lambda x: x.split('-0')[0])
df['local_date'] = df.local_date.apply(lambda x: x.split('GMT')[0])


In [8]:
# Other Corrections
df['local_date'] = df.local_date.apply(lambda x: ' '.join(x.split())) # removing duplicate spaces
df['local_date'] = df.local_date.apply(lambda x: x.replace('-', ' ')) # removing dashes in date
df['local_date'] = df.local_date.str.strip() # removing extra spaces from ends


In [9]:
df['date_length'] = df.local_date.str.len()
df.groupby('date_length').agg({'local_date': ['min', 'max', 'count']})

Unnamed: 0_level_0,local_date,local_date,local_date
Unnamed: 0_level_1,min,max,count
date_length,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
18,26 Dec 13 18:48:01,26 Dec 13 18:48:01,1
19,1 Apr 2014 06:00:32,9 Sep 2020 19:00:30,22628
20,01 Apr 2014 02:56:09,31 Oct 2019 20:17:55,91912


In [10]:
df.loc[df.local_date == '26 Dec 13 18:48:01', 'local_date'] = '26 Dec 2013 18:48:01'
df.local_date = pd.to_datetime(df.local_date, format = '%d %b %Y %H:%M:%S')
df = df.drop('date_length', axis = 1)

In [11]:
# Time Zones
df['timezone'] = df.date
df['timezone'] = df.timezone.apply(lambda x: ('+' + x.split('+')[1]) if '+' in x else x)
df['timezone'] = df.timezone.apply(lambda x: ('-' + x.split('-')[1]) if '-' in x else x)
df['timezone'] = df.timezone.apply(lambda x: '+0000' if ('GMT' in x) or ('-0000' in x) else x)
df['timezone'] = df.timezone.apply(lambda x: (x.split('(')[0]) if '(' in x else x)
df['timezone'] = df.timezone.apply(lambda x: 'unspecified' if ':' in x else x)
df['timezone'] = df.timezone.apply(lambda x: 'unspecified' if '-MAR' in x else x)
df['timezone'] = df.timezone.str.strip()
np.sort(df['timezone'].unique()).tolist()

['+0000',
 '+0100',
 '+0200',
 '+0300',
 '+0330',
 '+0400',
 '+0430',
 '+0500',
 '+0530',
 '+0545',
 '+0580',
 '+0600',
 '+0700',
 '+0800',
 '+0900',
 '+1000',
 '+1100',
 '-0300',
 '-0400',
 '-0500',
 '-0600',
 '-0700',
 '-0800',
 'unspecified']

In [12]:
#Defining a function for EDA
def color_format(df, formatting = None):
    palette = sns.light_palette('seagreen', as_cmap = True)
    df = df.style.background_gradient(cmap = palette).format(formatting, na_rep = '').highlight_null(null_color = 'white')
    return(df)

In [13]:
color_format(df.groupby('label_type')['timezone'].value_counts(normalize = True).unstack().transpose(), "{:.0%}")

label_type,Forums,Personal,Promotions,Purchases,Social,Spam,Travel,Updates
timezone,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
+0000,11%,42%,60%,80%,84%,81%,10%,55%
+0100,1%,2%,2%,1%,0%,1%,,2%
+0200,1%,3%,4%,6%,0%,,19%,2%
+0300,0%,0%,0%,,,,,0%
+0330,,0%,,,,,,
+0400,0%,0%,,,,,,0%
+0430,,0%,,,,,,
+0500,0%,,,,,,,
+0530,79%,38%,14%,7%,,15%,62%,12%
+0545,0%,0%,,,,,,


In [14]:
df['day'] = df.local_date.dt.day
df['weekday'] = df.local_date.dt.weekday
df['weekend'] = np.where(df.local_date.dt.weekday.isin([5,6]), 1, 0)
df['month'] = df.local_date.dt.month
df['hour'] = df.local_date.dt.hour
df['minutes'] = df.local_date.dt.minute
df['seconds'] = df.local_date.dt.second

color_format(df.groupby('label_type')['day'].value_counts(normalize = True).unstack().transpose(), "{:.0%}")

label_type,Forums,Personal,Promotions,Purchases,Social,Spam,Travel,Updates
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,4%,3%,3%,4%,3%,2%,6%,4%
2,4%,3%,3%,4%,3%,3%,,3%
3,4%,3%,3%,5%,3%,1%,2%,3%
4,4%,3%,3%,4%,3%,3%,3%,3%
5,4%,4%,3%,3%,3%,1%,1%,3%
6,4%,4%,3%,3%,3%,4%,1%,3%
7,4%,5%,3%,3%,4%,3%,1%,6%
8,4%,4%,3%,4%,4%,1%,4%,3%
9,4%,4%,3%,2%,3%,5%,4%,4%
10,4%,3%,4%,3%,4%,2%,3%,3%


In [15]:
color_format(df.groupby('label_type')['weekday'].value_counts(normalize = True).unstack().transpose(), "{:.0%}")

label_type,Forums,Personal,Promotions,Purchases,Social,Spam,Travel,Updates
weekday,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,17%,16%,13%,18%,18%,16%,23%,14%
1,19%,16%,17%,12%,16%,22%,23%,19%
2,17%,15%,16%,13%,17%,12%,11%,14%
3,18%,16%,18%,12%,14%,9%,10%,16%
4,16%,17%,17%,16%,13%,18%,20%,16%
5,6%,11%,11%,15%,11%,13%,4%,13%
6,5%,9%,9%,14%,10%,10%,9%,9%


In [16]:
color_format(df.groupby('label_type')['weekend'].value_counts(normalize = True).unstack().transpose(), "{:.0%}")

label_type,Forums,Personal,Promotions,Purchases,Social,Spam,Travel,Updates
weekend,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,88%,80%,81%,72%,79%,77%,87%,78%
1,12%,20%,19%,28%,21%,23%,13%,22%


In [17]:
color_format(df.groupby('label_type')['hour'].value_counts(normalize = True).unstack().transpose(), "{:.0%}")

label_type,Forums,Personal,Promotions,Purchases,Social,Spam,Travel,Updates
hour,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,2%,4%,7%,1%,2%,3%,,3%
1,1%,3%,4%,1%,4%,3%,4%,3%
2,1%,2%,3%,1%,4%,2%,2%,3%
3,1%,3%,4%,1%,3%,2%,3%,5%
4,1%,3%,3%,2%,4%,4%,,5%
5,1%,3%,4%,4%,4%,14%,1%,5%
6,1%,3%,5%,4%,4%,8%,3%,6%
7,2%,3%,5%,7%,6%,4%,1%,5%
8,3%,4%,5%,7%,11%,3%,5%,5%
9,7%,5%,5%,6%,10%,9%,3%,6%


In [18]:
df['promotion_hour'] = np.where(df.hour.isin([0]), 1, 0)
df['spam_hour'] = np.where(df.hour.isin([5]), 1, 0)
df['travel_hour'] = np.where(df.hour.isin([10]), 1, 0)

In [19]:
color_format(df.groupby('label_type')['month'].value_counts(normalize = True).unstack().transpose(), "{:.0%}")

label_type,Forums,Personal,Promotions,Purchases,Social,Spam,Travel,Updates
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,10%,8%,7%,6%,7%,,2%,8%
2,8%,7%,8%,3%,8%,,2%,7%
3,8%,8%,10%,12%,8%,,1%,8%
4,7%,9%,8%,9%,8%,,9%,9%
5,5%,8%,9%,9%,9%,,13%,7%
6,5%,8%,8%,17%,9%,,14%,7%
7,6%,10%,9%,7%,9%,,18%,11%
8,13%,9%,10%,11%,10%,,11%,10%
9,12%,9%,11%,6%,10%,10%,7%,10%
10,11%,9%,10%,7%,9%,90%,12%,10%


In [20]:
#creating a feature for Spam emails
df['spam_month'] = np.where(df.month.isin([10]), 1, 0)


# #### Org Classification


#reading a csv org_classification file created to classify orgs into labels they appeared most in
org_class = pd.read_csv('org_classification.csv')
df = pd.merge(df, org_class, how = 'left', on = 'org')
df['org_class'].fillna('Others', inplace = True)

In [21]:
# Train Data
df[~df['label'].isna()]['org_class'].value_counts(normalize = True)

# Test Data
df[df['label'].isna()]['org_class'].value_counts(normalize = True)

Others        0.304787
Personal      0.298327
Updates       0.140754
Social        0.105660
Promotions    0.066521
Purchases     0.051564
Forums        0.022639
Spam          0.005849
Travel        0.003899
Name: org_class, dtype: float64

In [22]:
color_format(df.groupby('label_type')['org_class'].value_counts(normalize = True).unstack().transpose(), "{:.0%}")


label_type,Forums,Personal,Promotions,Purchases,Social,Spam,Travel,Updates
org_class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Forums,12%,1%,,,,,,1%
Others,4%,30%,45%,15%,8%,11%,31%,47%
Personal,84%,37%,1%,2%,1%,3%,23%,4%
Promotions,,6%,31%,0%,0%,,,3%
Purchases,,5%,15%,80%,,17%,,4%
Social,,9%,1%,,91%,,,1%
Spam,,0%,2%,,,68%,,1%
Travel,,0%,0%,,,,45%,1%
Updates,,12%,5%,3%,,,,38%


In [23]:
#cleaning mail_type
df['mail_type'] = df.mail_type.str.lower().str.strip()
df['mail_type'] = df.mail_type.fillna('unspecified')
color_format(df.groupby('label_type')['mail_type'].value_counts(normalize = True).unstack().transpose(), "{:.0%}")


label_type,Forums,Personal,Promotions,Purchases,Social,Spam,Travel,Updates
mail_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
multipart/alternative,74%,74%,77%,74%,95%,76%,17%,63%
multipart/idm,,0%,,,,,,
multipart/mixed,11%,10%,7%,11%,3%,5%,47%,5%
multipart/related,7%,3%,1%,,,,2%,1%
multipart/report,,0%,,,,,,
multipart/signed,,0%,,,,,,
text/calendar,,0%,,,,,,
text/html,5%,9%,16%,14%,1%,15%,31%,26%
text/plain,3%,4%,0%,1%,0%,3%,3%,5%
unspecified,,0%,,,,,,0%


In [24]:
#creating a feature website by combining org and tld and taking out frequencies for website, mail_type, orgs, and tld
df.loc[:,"website"]=df.loc[:,"org"].fillna('NA')  + df.loc[:,"tld"].fillna('NA')
org_freq=df.groupby("website").size()/len(df)
df.loc[:,"website_freq"]=df.loc[:,"website"].map(org_freq)


mail_freq=df.groupby("mail_type").size()/len(df)
df.loc[:,"mail_type_freq"]=df.loc[:,"mail_type"].map(mail_freq)


df.loc[:,"org_freq"]=df.loc[:,"org"].fillna('NA')
org_freq=df.groupby("org_freq").size()/len(df)
df.loc[:,"org_freq"]=df.loc[:,"org_freq"].map(org_freq)


df.loc[:,"tld_freq"]=df.loc[:,"tld"].fillna('NA')
org_freq=df.groupby("tld_freq").size()/len(df)
df.loc[:,"tld_freq"]=df.loc[:,"tld_freq"].map(org_freq)


In [25]:
#defining a feature to identify travel mails
df['travel_mail_type'] = np.where(df.mail_type.isin(['multipart/mixed']), 1, 0)


# #### Characters in Subject & Body

# In[25]:


df['chars_in_subject'] = df.chars_in_subject.fillna(0)
df['chars_in_body'] = df.chars_in_body.fillna(0)

In [26]:
#OneHotEncoding
df = pd.get_dummies(df, columns = ['org_class', 'timezone', 'mail_type'])
df.columns

Index(['Unnamed: 0', 'date', 'org', 'tld', 'ccs', 'bcced', 'images', 'urls',
       'salutations', 'designation', 'chars_in_subject', 'chars_in_body',
       'label', 'label_type', 'local_date', 'day', 'weekday', 'weekend',
       'month', 'hour', 'minutes', 'seconds', 'promotion_hour', 'spam_hour',
       'travel_hour', 'spam_month', 'website', 'website_freq',
       'mail_type_freq', 'org_freq', 'tld_freq', 'travel_mail_type',
       'org_class_Forums', 'org_class_Others', 'org_class_Personal',
       'org_class_Promotions', 'org_class_Purchases', 'org_class_Social',
       'org_class_Spam', 'org_class_Travel', 'org_class_Updates',
       'timezone_+0000', 'timezone_+0100', 'timezone_+0200', 'timezone_+0300',
       'timezone_+0330', 'timezone_+0400', 'timezone_+0430', 'timezone_+0500',
       'timezone_+0530', 'timezone_+0545', 'timezone_+0580', 'timezone_+0600',
       'timezone_+0700', 'timezone_+0800', 'timezone_+0900', 'timezone_+1000',
       'timezone_+1100', 'timezone_-0300',

In [27]:
#function to split training data into training set and validation set
def train_val_split(data):

    val_set = []
    for label in range(0, 8):
        val_data = data[data['label'] == label].sample(frac = 0.2, random_state = 0)
        val_set.append(val_data)
    
    val = pd.concat(val_set)
    train = data[~(data.index.isin(val.index))]
    
    return(train, val)

In [28]:
#creating a list which mentions which features to remove
remove_features = [
    'label',
    'mail_type_multipart/signed',
    'mail_type_unspecified',
    'mail_type_multipart/report',
    'mail_type_text/calendar',
    'mail_type_multipart/idm',
    'timezone_+0330',
    'timezone_+0430',
    'timezone_+0580',
    'timezone_+0545',
    'timezone_+0600',
    'timezone_+1000',
    'timezone_+0700',
    'timezone_+0400',
    'timezone_+0900',
    'timezone_-0300',
    'timezone_+0500',
    'timezone_+1100',
    'timezone_+0300',
    'timezone_+0800',
    'timezone_-0400',
    'timezone_+0100',
    'timezone_-0500',
    'timezone_-0600',
    'timezone_-0700',
    'timezone_+0200',
    'timezone_-0800',
    'designation', 'weekend', 'promotion_hour', 'travel_hour',
       'mail_type_text/plain', 'spam_hour', 'mail_type_multipart/related',
       'bcced', 'timezone_unspecified', 'mail_type_multipart/idm', 'mail_type_multipart/related',
       'mail_type_multipart/report', 'mail_type_multipart/signed',
       'mail_type_text/calendar',
       'mail_type_text/plain', 'mail_type_unspecified',
    'chars_in_subject','chars_in_body',
    'seconds','minutes',
    'travel_mail_type','spam_month'
    


]

In [29]:
train_df, val_df = train_val_split(data = df[~df['label'].isna()])
print('Train Data:', train_df.shape)
print('Validation Data:', val_df.shape)

features = list(set(df.select_dtypes(include = np.number).columns)- set(remove_features))

X, y, X_val, y_val = train_df[features], train_df['label'], val_df[features], val_df['label']

print('Train Data:', X.shape)
print('Validation Data:', X_val.shape)

Train Data: (64143, 75)
Validation Data: (16033, 75)
Train Data: (64143, 27)
Validation Data: (16033, 27)


In [30]:
#training the first model only on label 1
df_copy = train_df.copy()
df_copy.loc[df_copy['label']!=1,'label']  = 0
df_copy['label'].value_counts()


clf_class_1 = CatBoostClassifier(n_estimators = 1000,
                       learning_rate = 0.05,
                       rsm = 0.2, ## Analogous to colsample_bytree
                       random_state=0,
                         max_depth =6,
                         verbose = 0,
                        auto_class_weights = 'SqrtBalanced',
                        eval_metric = 'F1'
                        
                                 
#                                  L2, Cosine, NewtonL2, NewtonCosine
                     )
clf_class_1.fit(df_copy[features], df_copy['label'])

y_pred_train = (clf_class_1.predict_proba(df_copy[features])[:,1] >= 0.5).astype(int) 

print('\nTrain F1 Score:', f1_score(df_copy['label'], y_pred_train, average = 'macro'))
print('\nTrain Report: \n', classification_report(df_copy['label'], y_pred_train))
#print('Test Report: \n', classification_report(y_val, y_pred_val))


Train F1 Score: 0.6567796104289905

Train Report: 
               precision    recall  f1-score   support

         0.0       0.66      0.76      0.71     34387
         1.0       0.67      0.56      0.61     29756

    accuracy                           0.66     64143
   macro avg       0.66      0.66      0.66     64143
weighted avg       0.66      0.66      0.66     64143



In [31]:
#Training the second class on all other labels except 1
df_oth_class = train_df[~(train_df['label'].isin([1]))].copy()

clf_oth_class = CatBoostClassifier(n_estimators = 1000,
                       learning_rate = 0.05,
                       rsm = 0.2, ## Analogous to colsample_bytree
                       random_state=0,
                         max_depth =6,
                         verbose = 0,
                         auto_class_weights = 'SqrtBalanced',
                         
                     )
clf_oth_class.fit(df_oth_class[features], df_oth_class['label'])

y_pred_train = clf_oth_class.predict(df_oth_class[features])

print('\nTrain F1 Score:', f1_score(df_oth_class['label'], y_pred_train, average = 'macro'))
print('\nTrain Report: \n', classification_report(df_oth_class['label'], y_pred_train))


Train F1 Score: 0.8146154914537732

Train Report: 
               precision    recall  f1-score   support

         0.0       0.93      0.89      0.91     14396
         2.0       0.86      0.84      0.85      6854
         3.0       0.99      0.99      0.99      8582
         4.0       0.45      0.98      0.62       286
         5.0       0.54      1.00      0.70       113
         6.0       0.51      1.00      0.67       122
         7.0       0.94      0.99      0.97      4034

    accuracy                           0.92     34387
   macro avg       0.74      0.96      0.81     34387
weighted avg       0.92      0.92      0.92     34387



In [33]:
#testing the data on validation set using the two stacked models
val_predict = []
for i in val_df.index:
    class1 = clf_class_1.predict(val_df[val_df.index==i][features])[0]
    if class1 ==1:
        val_predict.append(class1)
    else:
        oth_class = clf_oth_class.predict(val_df[val_df.index==i][features])[0][0]
        val_predict.append(oth_class)

In [34]:
print('Validation F1 Score:', f1_score(y_val, val_predict, average = 'macro'))
print('Test Report: \n', classification_report(y_val, val_predict))

Validation F1 Score: 0.5279218696507064
Test Report: 
               precision    recall  f1-score   support

         0.0       0.57      0.62      0.59      3599
         1.0       0.60      0.50      0.55      7439
         2.0       0.48      0.61      0.54      1713
         3.0       0.66      0.55      0.60      2145
         4.0       0.28      0.77      0.41        71
         5.0       0.20      0.39      0.27        28
         6.0       0.46      0.87      0.60        30
         7.0       0.54      0.87      0.67      1008

    accuracy                           0.57     16033
   macro avg       0.47      0.65      0.53     16033
weighted avg       0.58      0.57      0.57     16033



In [35]:
feature_importance = pd.Series(clf_class_1.feature_importances_, index = features).sort_values(ascending = False).round(2)
feature_importance

org_freq                           9.91
website_freq                       9.61
tld_freq                           8.58
urls                               7.61
org_class_Forums                   6.65
images                             6.55
ccs                                6.13
month                              6.10
org_class_Personal                 5.33
Unnamed: 0                         4.22
timezone_+0000                     3.80
hour                               3.29
mail_type_text/html                2.75
timezone_+0530                     2.66
mail_type_freq                     2.61
day                                2.21
org_class_Updates                  2.19
salutations                        2.01
weekday                            1.81
mail_type_multipart/alternative    1.73
org_class_Others                   1.51
mail_type_multipart/mixed          1.00
org_class_Spam                     0.52
org_class_Promotions               0.36
org_class_Social                   0.33


In [36]:
feature_importance = pd.Series(clf_oth_class.feature_importances_, index = features).sort_values(ascending = False).round(2)
feature_importance

website_freq                       10.22
org_freq                            8.35
urls                                8.26
month                               6.88
images                              6.58
tld_freq                            5.37
org_class_Social                    5.17
org_class_Others                    5.15
timezone_+0000                      4.96
hour                                3.94
org_class_Purchases                 3.79
day                                 3.13
salutations                         3.09
timezone_+0530                      2.98
org_class_Updates                   2.97
weekday                             2.47
Unnamed: 0                          2.42
mail_type_multipart/alternative     2.36
org_class_Personal                  2.08
mail_type_freq                      1.97
ccs                                 1.63
mail_type_text/html                 1.52
org_class_Promotions                1.52
mail_type_multipart/mixed           1.10
org_class_Spam  

In [37]:
#Predicting test data set
test_df = df[df['label'].isna()]


val_predict = []
for i in test_df.index:
    class1 = clf_class_1.predict(test_df[test_df.index==i][features])[0]
    if class1 ==1:
        val_predict.append(class1)
    else:
        oth_class = clf_oth_class.predict(test_df[test_df.index==i][features])[0][0]
        val_predict.append(oth_class)

In [38]:
len(val_predict)

34365

In [39]:
pred_df = pd.DataFrame(val_predict, columns=['label'])
pred_df['label'] = pred_df['label'].astype(int)
#pred_df.to_csv("stack_cat_boost_submission.csv", index=True, index_label='Id')


In [40]:
pred_df['label'].unique()

array([1, 0, 7, 2, 3, 4, 5, 6])