In [1]:
# plotting inline but with a non-gui backend
#import matplotlib as mpl; mpl.use('Agg')
%matplotlib inline

# importing necessary modules
import re
import time
import random
import numpy as np
import scipy as sp
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
#from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_predict, StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import OneHotEncoder
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype


# turning off automatic plot showing, and setting style
plt.style.use('bmh')
plt.ioff()

In [2]:
df = pd.read_csv('./data.csv')

In [3]:
df.head()

Unnamed: 0,record_id,city_key,event_time,weekday,category_key,rptcatg,req_id,session_group
0,5c1699b0c3658c25008e1192,city_chennai_v2,2018-12-17 00:00:08,weekday,professional_bathroom_cleaning,Cleaning - Other,5c1699d714bd522300fae954,3022
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,2018-12-17 00:00:15,weekday,electricians,EPC,5c1699ed70fa4f2500d66e7c,3303
2,5c1699df4fb96a2400166132,city_mumbai_v2,2018-12-17 00:00:55,weekday,salon_at_home,Salon at Home,,3330
3,5c1699f405cbce26002ff832,city_kolkata_v2,2018-12-17 00:01:16,weekday,salon_at_home,Salon at Home,,3330
4,5c1699fb02f88224006901f5,city_chennai_v2,2018-12-17 00:01:23,weekday,plumbers,EPC,5c169a6c8c954c2200693538,3303


#### conversion  = function(category, day_of_week, no_of_slots_shown)

In [4]:
# req_id conversion
df['req_id'].fillna(0, inplace=True)
df.loc[df['req_id'] != 0, 'req_id'] = 1
df.head()

Unnamed: 0,record_id,city_key,event_time,weekday,category_key,rptcatg,req_id,session_group
0,5c1699b0c3658c25008e1192,city_chennai_v2,2018-12-17 00:00:08,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,2018-12-17 00:00:15,weekday,electricians,EPC,1,3303
2,5c1699df4fb96a2400166132,city_mumbai_v2,2018-12-17 00:00:55,weekday,salon_at_home,Salon at Home,0,3330
3,5c1699f405cbce26002ff832,city_kolkata_v2,2018-12-17 00:01:16,weekday,salon_at_home,Salon at Home,0,3330
4,5c1699fb02f88224006901f5,city_chennai_v2,2018-12-17 00:01:23,weekday,plumbers,EPC,1,3303


In [5]:
df['category_key'].unique()

array(['professional_bathroom_cleaning', 'electricians', 'salon_at_home',
       'plumbers', 'pest_control', 'ro_repair', 'massage_for_men',
       'spa_at_home', 'professional_home_cleaning', 'carpenters',
       'refrigerator_repair', 'professional_kitchen_cleaning',
       'washing_machine_repair', 'professional_sofa_cleaning',
       'microwave_repair', 'ac_service_repair',
       'professional_carpet_cleaning', 'car_cleaning', 'mens_grooming'],
      dtype=object)

In [6]:
df['rptcatg'].unique()

array(['Cleaning - Other', 'EPC', 'Salon at Home', 'Pest Control',
       'Appliance Repair', 'Massage for Men', 'Spa for Women',
       'Full Home Cleaning', 'Mens Grooming'], dtype=object)

In [7]:
df['weekday'].unique()

array(['weekday', 'friday', 'saturday', 'sunday'], dtype=object)

In [8]:
df['session_group'].unique()

array([3022, 3303, 3330, 3230, 3333, 3233, 3030, 3122, 3223, 3130, 3203,
       3220, 3001, 3222, 3003, 3102, 3023, 3133, 3033, 3123, 3103, 2233,
       2033, 2133, 2003, 2103, 2122, 2123, 2022, 2023, 2001, 2203, 2002,
       2223, 2202, 2113, 2013, 2213, 1133, 1033, 1012, 1023, 1022, 1013,
       1002, 1103, 1123, 1003, 1001,   33,    3,   23,   22,   13,    2,
         12,   20,   32,    1, 3010, 3113, 3120, 3020, 3013, 3213, 3002,
       2230, 2232, 2222, 2010, 2012, 2200, 1032, 1100, 1102, 1132, 1020,
       1112, 1011, 3132, 3323, 2032, 2102, 2020, 2112, 2120, 1120, 1010,
         30,   21, 3232, 3100, 3032, 3332, 3202, 3200, 2132, 2210, 1122,
       1113,   11,   10, 3212, 3012, 2110, 2100, 1021, 3112, 3101, 2021,
       2011, 2121, 1121, 2201, 2030, 1130, 1030, 3021, 3300, 2220, 3011,
       3201, 2212,   31, 3231, 1131, 1101, 1031, 3322, 3111, 3121, 2130,
       2111, 2101,    0, 2221, 2211, 1111, 1000, 1110, 3210, 3313, 2231,
       3302, 2000, 2131, 3320, 3131, 2031, 3221, 33

In [9]:
df.head()

Unnamed: 0,record_id,city_key,event_time,weekday,category_key,rptcatg,req_id,session_group
0,5c1699b0c3658c25008e1192,city_chennai_v2,2018-12-17 00:00:08,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,2018-12-17 00:00:15,weekday,electricians,EPC,1,3303
2,5c1699df4fb96a2400166132,city_mumbai_v2,2018-12-17 00:00:55,weekday,salon_at_home,Salon at Home,0,3330
3,5c1699f405cbce26002ff832,city_kolkata_v2,2018-12-17 00:01:16,weekday,salon_at_home,Salon at Home,0,3330
4,5c1699fb02f88224006901f5,city_chennai_v2,2018-12-17 00:01:23,weekday,plumbers,EPC,1,3303


In [10]:
df.head()

Unnamed: 0,record_id,city_key,event_time,weekday,category_key,rptcatg,req_id,session_group
0,5c1699b0c3658c25008e1192,city_chennai_v2,2018-12-17 00:00:08,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,2018-12-17 00:00:15,weekday,electricians,EPC,1,3303
2,5c1699df4fb96a2400166132,city_mumbai_v2,2018-12-17 00:00:55,weekday,salon_at_home,Salon at Home,0,3330
3,5c1699f405cbce26002ff832,city_kolkata_v2,2018-12-17 00:01:16,weekday,salon_at_home,Salon at Home,0,3330
4,5c1699fb02f88224006901f5,city_chennai_v2,2018-12-17 00:01:23,weekday,plumbers,EPC,1,3303


In [11]:
# Pulling hours from day time

def pulling_day_category(df):
    df['event_time'][0].split(' ')[1]
    df['time'] = df['event_time'].str.split(' ').str[1]
    df['hour_based_daytime'] = df['time'].str.split(':').str[0]
    df['hour_based_daytime'] = pd.to_numeric(df['hour_based_daytime'])
    bins = [-1, 10, 12, 15, 18, 20, 24]
    labels = ["morning","b_noon","noon", "b_evening", "evening", "night"]
    df['day_category'] = pd.cut(df['hour_based_daytime'], bins=bins, labels=labels)
    df.drop('hour_based_daytime', axis=1,inplace=True)
    df.drop('time', axis=1,inplace=True)
    df.drop('event_time', axis=1, inplace=True)

In [12]:
df.head()

Unnamed: 0,record_id,city_key,event_time,weekday,category_key,rptcatg,req_id,session_group
0,5c1699b0c3658c25008e1192,city_chennai_v2,2018-12-17 00:00:08,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,2018-12-17 00:00:15,weekday,electricians,EPC,1,3303
2,5c1699df4fb96a2400166132,city_mumbai_v2,2018-12-17 00:00:55,weekday,salon_at_home,Salon at Home,0,3330
3,5c1699f405cbce26002ff832,city_kolkata_v2,2018-12-17 00:01:16,weekday,salon_at_home,Salon at Home,0,3330
4,5c1699fb02f88224006901f5,city_chennai_v2,2018-12-17 00:01:23,weekday,plumbers,EPC,1,3303


In [13]:
pulling_day_category(df)

In [14]:
df.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,req_id,session_group,day_category
0,5c1699b0c3658c25008e1192,city_chennai_v2,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022,morning
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,weekday,electricians,EPC,1,3303,morning
2,5c1699df4fb96a2400166132,city_mumbai_v2,weekday,salon_at_home,Salon at Home,0,3330,morning
3,5c1699f405cbce26002ff832,city_kolkata_v2,weekday,salon_at_home,Salon at Home,0,3330,morning
4,5c1699fb02f88224006901f5,city_chennai_v2,weekday,plumbers,EPC,1,3303,morning


In [15]:
df['day_category'].dtypes

CategoricalDtype(categories=['morning', 'b_noon', 'noon', 'b_evening', 'evening',
                  'night'],
                 ordered=True)

In [16]:
#df['city_key'] = df.city_key.astype('category')
#df['weekday'] = df.weekday.astype('category')
#df['category_key'] = df.category_key.astype('category')
#df['rptcatg'] = df.rptcatg.astype('category')

def convert_cat(df):
    for n,c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

In [17]:
convert_cat(df)

In [18]:
df.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,req_id,session_group,day_category
0,5c1699b0c3658c25008e1192,city_chennai_v2,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022,morning
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,weekday,electricians,EPC,1,3303,morning
2,5c1699df4fb96a2400166132,city_mumbai_v2,weekday,salon_at_home,Salon at Home,0,3330,morning
3,5c1699f405cbce26002ff832,city_kolkata_v2,weekday,salon_at_home,Salon at Home,0,3330,morning
4,5c1699fb02f88224006901f5,city_chennai_v2,weekday,plumbers,EPC,1,3303,morning


In [19]:
df['day_category'].dtypes

CategoricalDtype(categories=['morning', 'b_noon', 'noon', 'b_evening', 'evening',
                  'night'],
                 ordered=True)

In [20]:
df['city_key'].dtypes

CategoricalDtype(categories=['city_ahmedabad_v2', 'city_bangalore_v2',
                  'city_chandigarh_v2', 'city_chennai_v2', 'city_delhi_v2',
                  'city_hyderabad_v2', 'city_jaipur_v2', 'city_kolkata_v2',
                  'city_mumbai_v2', 'city_pune_v2'],
                 ordered=True)

In [21]:
df['weekday'].dtypes

CategoricalDtype(categories=['friday', 'saturday', 'sunday', 'weekday'], ordered=True)

In [22]:
df['rptcatg'].dtypes

CategoricalDtype(categories=['Appliance Repair', 'Cleaning - Other', 'EPC',
                  'Full Home Cleaning', 'Massage for Men', 'Mens Grooming',
                  'Pest Control', 'Salon at Home', 'Spa for Women'],
                 ordered=True)

In [23]:
df['category_key'].dtypes

CategoricalDtype(categories=['ac_service_repair', 'car_cleaning', 'carpenters',
                  'electricians', 'massage_for_men', 'mens_grooming',
                  'microwave_repair', 'pest_control', 'plumbers',
                  'professional_bathroom_cleaning',
                  'professional_carpet_cleaning', 'professional_home_cleaning',
                  'professional_kitchen_cleaning',
                  'professional_sofa_cleaning', 'refrigerator_repair',
                  'ro_repair', 'salon_at_home', 'spa_at_home',
                  'washing_machine_repair'],
                 ordered=True)

In [24]:
df.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,req_id,session_group,day_category
0,5c1699b0c3658c25008e1192,city_chennai_v2,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022,morning
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,weekday,electricians,EPC,1,3303,morning
2,5c1699df4fb96a2400166132,city_mumbai_v2,weekday,salon_at_home,Salon at Home,0,3330,morning
3,5c1699f405cbce26002ff832,city_kolkata_v2,weekday,salon_at_home,Salon at Home,0,3330,morning
4,5c1699fb02f88224006901f5,city_chennai_v2,weekday,plumbers,EPC,1,3303,morning


In [25]:
df.shape

(816530, 8)

In [26]:
# class imbalance
df['req_id'].value_counts()/816530

0    0.543902
1    0.456098
Name: req_id, dtype: float64

# One hot encoding

In [28]:
df.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,req_id,session_group,day_category
0,5c1699b0c3658c25008e1192,city_chennai_v2,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022,morning
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,weekday,electricians,EPC,1,3303,morning
2,5c1699df4fb96a2400166132,city_mumbai_v2,weekday,salon_at_home,Salon at Home,0,3330,morning
3,5c1699f405cbce26002ff832,city_kolkata_v2,weekday,salon_at_home,Salon at Home,0,3330,morning
4,5c1699fb02f88224006901f5,city_chennai_v2,weekday,plumbers,EPC,1,3303,morning


In [29]:
from sklearn.preprocessing import LabelEncoder
le_city_key = LabelEncoder()

df['city_key_encoded'] = le_city_key.fit_transform(df.city_key)

In [30]:
df.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,req_id,session_group,day_category,city_key_encoded
0,5c1699b0c3658c25008e1192,city_chennai_v2,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022,morning,3
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,weekday,electricians,EPC,1,3303,morning,8
2,5c1699df4fb96a2400166132,city_mumbai_v2,weekday,salon_at_home,Salon at Home,0,3330,morning,8
3,5c1699f405cbce26002ff832,city_kolkata_v2,weekday,salon_at_home,Salon at Home,0,3330,morning,7
4,5c1699fb02f88224006901f5,city_chennai_v2,weekday,plumbers,EPC,1,3303,morning,3


In [31]:
from sklearn.preprocessing import OneHotEncoder
city_key_ohe = OneHotEncoder()
weekday_ohe = OneHotEncoder()
category_key_ohe = OneHotEncoder()
rptcatg_key_ohe = OneHotEncoder()


X = city_key_ohe.fit_transform(df.city_key_encoded.values.reshape(-1,1)).toarray()
Y = weeke.fit_transform(df.city_key_encoded.values.reshape(-1,1)).toarray()
Z = city_key_ohe.fit_transform(df.city_key_encoded.values.reshape(-1,1)).toarray()
W = city_key_ohe.fit_transform(df.city_key_encoded.values.reshape(-1,1)).toarray()

In [33]:
dfOneHot = pd.DataFrame(X, columns = ["City_key_"+str(int(i)) for i in range(X.shape[1])])
df = pd.concat([df, dfOneHot], axis=1)

In [34]:
df.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,req_id,session_group,day_category,city_key_encoded,City_key_0,City_key_1,City_key_2,City_key_3,City_key_4,City_key_5,City_key_6,City_key_7,City_key_8,City_key_9
0,5c1699b0c3658c25008e1192,city_chennai_v2,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022,morning,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,weekday,electricians,EPC,1,3303,morning,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,5c1699df4fb96a2400166132,city_mumbai_v2,weekday,salon_at_home,Salon at Home,0,3330,morning,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,5c1699f405cbce26002ff832,city_kolkata_v2,weekday,salon_at_home,Salon at Home,0,3330,morning,7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,5c1699fb02f88224006901f5,city_chennai_v2,weekday,plumbers,EPC,1,3303,morning,3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0


## Cool. The data is tidy with only numeric columns. The proportion of the minority class is 45%.

In [57]:
from pandas.api.types import is_numeric_dtype

def fix_missing(df, col, name, na_dict):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name+'_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict

def numericalize(df, col, name, max_n_cat):
    if not is_numeric_dtype(col) and ( max_n_cat is None or len(col.cat.categories)>max_n_cat):
        df[name] = pd.Categorical(col).codes+1


def convert_cat_num(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial.keys()) > 0:
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

In [58]:
df, y, nas = convert_cat_num(df, 'req_id')

In [59]:
df.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,session_group,day_category
0,3,4,4,10,2,3022,1
1,6,9,4,4,3,3303,1
2,12,9,4,17,8,3330,1
3,17,8,4,17,8,3330,1
4,19,4,4,9,3,3303,1


In [60]:
# Test
df['category_key'].unique()

array([10,  4, 17,  9,  8, 16,  5, 18, 12,  3, 15, 13, 19, 14,  7,  1, 11,
        2,  6])

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 816530 entries, 0 to 816529
Data columns (total 7 columns):
record_id        816530 non-null int32
city_key         816530 non-null int8
weekday          816530 non-null int8
category_key     816530 non-null int8
rptcatg          816530 non-null int8
session_group    816530 non-null int64
day_category     816530 non-null int8
dtypes: int32(1), int64(1), int8(5)
memory usage: 13.2 MB


In [62]:
df.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,session_group,day_category
0,3,4,4,10,2,3022,1
1,6,9,4,4,3,3303,1
2,12,9,4,17,8,3330,1
3,17,8,4,17,8,3330,1
4,19,4,4,9,3,3303,1


In [63]:
# getting design matrix and target
X = df.copy().drop(['record_id','city_key', 'day_category'], axis=1)

In [64]:
X.head()

Unnamed: 0,weekday,category_key,rptcatg,session_group
0,4,10,2,3022
1,4,4,3,3303
2,4,17,8,3330
3,4,17,8,3330
4,4,9,3,3303


In [65]:
y

array([1, 1, 0, ..., 1, 1, 0])

# SPLIT

In [66]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 546869  # same as Kaggle's test set size
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((269661, 7), (269661,), (546869, 7))

# MODELING

In [67]:
# validation process
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)

### Vanilla Extremely Randomized Trees Forest

Let us try now the `ExtraTreesClassifier` from sklearn. We first search for the best hiperparameters via random search:

In [None]:
# dataframe with experiment results
results_df = pd.DataFrame()

# random search of parameters
N_TRIALS = 20
for trial_id in range(N_TRIALS):
    
    # choosing parameters
    params = {'n_estimators': 100,
              'class_weight': np.random.choice([None,'balanced','balanced_subsample'], 1)[0],
              'min_samples_leaf': np.random.randint(10, 100),
              'max_features': np.random.uniform(0.1, 1.00),
              'bootstrap': np.random.choice([True,False], 1)[0],
              'n_jobs':-1}
    
    # printing the parameters
    print(params)
    
    # configuring the model
    et = ExtraTreesClassifier(**params)
    
    # running CV
    preds = cross_val_predict(et, X_train, y_train, cv=skf, method='predict_proba')
    
    # evaluating
    result = roc_auc_score(y_train, preds[:,1])
    print(result)
    print('')
    
    # saving to dict
    temp_results = pd.concat([pd.DataFrame(params, index=[trial_id]), pd.DataFrame({'result':result}, index=[trial_id])], axis=1)
    results_df = pd.concat([results_df, temp_results])

{'n_estimators': 100, 'class_weight': 'balanced', 'min_samples_leaf': 41, 'max_features': 0.9067408319024074, 'bootstrap': True, 'n_jobs': -1}
0.6283007134525966

{'n_estimators': 100, 'class_weight': 'balanced_subsample', 'min_samples_leaf': 23, 'max_features': 0.8673335726889143, 'bootstrap': False, 'n_jobs': -1}


In [None]:
# let us check the results and sort them by auc
results_df.sort_values('result', ascending=False)

In [None]:
preds[:,1][2000:2005]

In [None]:
preds

In [None]:
y_train[2000:2005]