In [60]:
import os, sys
import numpy as np
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import re

In [4]:
def add_datepart(df, fldname, drop=True, time=False, errors="raise"):
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True, errors=errors)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear',
            'Is_month_end', 'Is_month_start', 'Is_quarter_end', 'Is_quarter_start', 'Is_year_end', 'Is_year_start']
    if time: attr = attr + ['Hour', 'Minute', 'Second']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

In [5]:
df_raw = pd.read_csv('data.csv', low_memory=False, 
                     parse_dates=["event_time"])

In [6]:
# Data Dec 17,2018 to Jan 30,2019
# Train Model on 17th to Jan 13

# predict values for Jan 14 - Jan 20
# what happends to data from 20th - 30th

In [7]:
df_raw.head()

Unnamed: 0,record_id,city_key,event_time,weekday,category_key,rptcatg,req_id,session_group
0,5c1699b0c3658c25008e1192,city_chennai_v2,2018-12-17 00:00:08,weekday,professional_bathroom_cleaning,Cleaning - Other,5c1699d714bd522300fae954,3022
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,2018-12-17 00:00:15,weekday,electricians,EPC,5c1699ed70fa4f2500d66e7c,3303
2,5c1699df4fb96a2400166132,city_mumbai_v2,2018-12-17 00:00:55,weekday,salon_at_home,Salon at Home,,3330
3,5c1699f405cbce26002ff832,city_kolkata_v2,2018-12-17 00:01:16,weekday,salon_at_home,Salon at Home,,3330
4,5c1699fb02f88224006901f5,city_chennai_v2,2018-12-17 00:01:23,weekday,plumbers,EPC,5c169a6c8c954c2200693538,3303


In [8]:
display(df_raw.tail().T)

Unnamed: 0,816525,816526,816527,816528,816529
record_id,5c44bde3e55c0a2500c63f15,5c44bde52225782400b89bcf,5c44bdecdaae8f2400a22225,5c44bdf966fee926000b3a9d,5c44be1ca48ea124006e00d2
city_key,city_hyderabad_v2,city_mumbai_v2,city_bangalore_v2,city_mumbai_v2,city_delhi_v2
event_time,2019-01-20 23:58:51,2019-01-20 23:58:53,2019-01-20 23:59:00,2019-01-20 23:59:13,2019-01-20 23:59:48
weekday,sunday,sunday,sunday,sunday,sunday
category_key,salon_at_home,salon_at_home,salon_at_home,microwave_repair,salon_at_home
rptcatg,Salon at Home,Salon at Home,Salon at Home,Appliance Repair,Salon at Home
req_id,,5c44be32fb1ebb2700dbc7b0,5c44be719daac92400f7d277,5c44be21d248a42500c783b8,
session_group,33,33,33,20,33


In [9]:
# Checking null values
display(df_raw.isnull().sum().sort_index()/len(df_raw))

category_key     0.000000
city_key         0.000000
event_time       0.000000
record_id        0.000000
req_id           0.543902
rptcatg          0.000000
session_group    0.000000
weekday          0.000000
dtype: float64

In [10]:
add_datepart(df_raw, 'event_time')
display(df_raw.head().T)

Unnamed: 0,0,1,2,3,4
record_id,5c1699b0c3658c25008e1192,5c1699b72dd2b92500f5f400,5c1699df4fb96a2400166132,5c1699f405cbce26002ff832,5c1699fb02f88224006901f5
city_key,city_chennai_v2,city_mumbai_v2,city_mumbai_v2,city_kolkata_v2,city_chennai_v2
weekday,weekday,weekday,weekday,weekday,weekday
category_key,professional_bathroom_cleaning,electricians,salon_at_home,salon_at_home,plumbers
rptcatg,Cleaning - Other,EPC,Salon at Home,Salon at Home,EPC
req_id,5c1699d714bd522300fae954,5c1699ed70fa4f2500d66e7c,,,5c169a6c8c954c2200693538
session_group,3022,3303,3330,3330,3303
event_timeYear,2018,2018,2018,2018,2018
event_timeMonth,12,12,12,12,12
event_timeWeek,51,51,51,51,51


In [11]:
df_raw.shape

(816530, 20)

In [12]:
df_raw['session_group'].head()

0    3022
1    3303
2    3330
3    3330
4    3303
Name: session_group, dtype: int64

In [13]:
df_raw['req_id'].fillna(0, inplace=True)

In [14]:
df_raw.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,req_id,session_group,event_timeYear,event_timeMonth,event_timeWeek,event_timeDay,event_timeDayofweek,event_timeDayofyear,event_timeIs_month_end,event_timeIs_month_start,event_timeIs_quarter_end,event_timeIs_quarter_start,event_timeIs_year_end,event_timeIs_year_start,event_timeElapsed
0,5c1699b0c3658c25008e1192,city_chennai_v2,weekday,professional_bathroom_cleaning,Cleaning - Other,5c1699d714bd522300fae954,3022,2018,12,51,17,0,351,False,False,False,False,False,False,1545004808
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,weekday,electricians,EPC,5c1699ed70fa4f2500d66e7c,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004815
2,5c1699df4fb96a2400166132,city_mumbai_v2,weekday,salon_at_home,Salon at Home,0,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004855
3,5c1699f405cbce26002ff832,city_kolkata_v2,weekday,salon_at_home,Salon at Home,0,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004876
4,5c1699fb02f88224006901f5,city_chennai_v2,weekday,plumbers,EPC,5c169a6c8c954c2200693538,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004883


In [15]:
df_raw.loc[df_raw['req_id'] != 0, 'req_id'] = 1

In [16]:
df_raw.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,req_id,session_group,event_timeYear,event_timeMonth,event_timeWeek,event_timeDay,event_timeDayofweek,event_timeDayofyear,event_timeIs_month_end,event_timeIs_month_start,event_timeIs_quarter_end,event_timeIs_quarter_start,event_timeIs_year_end,event_timeIs_year_start,event_timeElapsed
0,5c1699b0c3658c25008e1192,city_chennai_v2,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022,2018,12,51,17,0,351,False,False,False,False,False,False,1545004808
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,weekday,electricians,EPC,1,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004815
2,5c1699df4fb96a2400166132,city_mumbai_v2,weekday,salon_at_home,Salon at Home,0,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004855
3,5c1699f405cbce26002ff832,city_kolkata_v2,weekday,salon_at_home,Salon at Home,0,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004876
4,5c1699fb02f88224006901f5,city_chennai_v2,weekday,plumbers,EPC,1,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004883


In [24]:
from pandas.api.types import is_string_dtype

def train_cats(df):
    for n,c in df.items():
        if is_string_dtype(c): df[n] = c.astype('category').cat.as_ordered()

In [25]:
train_cats(df_raw)

In [26]:
df_raw.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,req_id,session_group,event_timeYear,event_timeMonth,event_timeWeek,event_timeDay,event_timeDayofweek,event_timeDayofyear,event_timeIs_month_end,event_timeIs_month_start,event_timeIs_quarter_end,event_timeIs_quarter_start,event_timeIs_year_end,event_timeIs_year_start,event_timeElapsed
0,5c1699b0c3658c25008e1192,city_chennai_v2,weekday,professional_bathroom_cleaning,Cleaning - Other,1,3022,2018,12,51,17,0,351,False,False,False,False,False,False,1545004808
1,5c1699b72dd2b92500f5f400,city_mumbai_v2,weekday,electricians,EPC,1,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004815
2,5c1699df4fb96a2400166132,city_mumbai_v2,weekday,salon_at_home,Salon at Home,0,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004855
3,5c1699f405cbce26002ff832,city_kolkata_v2,weekday,salon_at_home,Salon at Home,0,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004876
4,5c1699fb02f88224006901f5,city_chennai_v2,weekday,plumbers,EPC,1,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004883


In [48]:
# Feature Format
# os.makedirs('tmp', exist_ok=True)
# df_raw.to_feather('tmp/uc-data')

In [49]:
# Read feather format
# df_raw = pd.read_feather('tmp/uc-data')

In [57]:
from pandas.api.types import is_numeric_dtype

def fix_missing(df, col, name, na_dict):
    if is_numeric_dtype(col):
        if pd.isnull(col).sum() or (name in na_dict):
            df[name+'_na'] = pd.isnull(col)
            filler = na_dict[name] if name in na_dict else col.median()
            df[name] = col.fillna(filler)
            na_dict[name] = filler
    return na_dict

def numericalize(df, col, name, max_n_cat):
    if not is_numeric_dtype(col) and ( max_n_cat is None or len(col.cat.categories)>max_n_cat):
        df[name] = pd.Categorical(col).codes+1


def proc_df(df, y_fld=None, skip_flds=None, ignore_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    if not ignore_flds: ignore_flds=[]
    if not skip_flds: skip_flds=[]
    if subset: df = get_sample(df,subset)
    else: df = df.copy()
    ignored_flds = df.loc[:, ignore_flds]
    df.drop(ignore_flds, axis=1, inplace=True)
    if preproc_fn: preproc_fn(df)
    if y_fld is None: y = None
    else:
        if not is_numeric_dtype(df[y_fld]): df[y_fld] = pd.Categorical(df[y_fld]).codes
        y = df[y_fld].values
        skip_flds += [y_fld]
    df.drop(skip_flds, axis=1, inplace=True)

    if na_dict is None: na_dict = {}
    else: na_dict = na_dict.copy()
    na_dict_initial = na_dict.copy()
    for n,c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if len(na_dict_initial.keys()) > 0:
        df.drop([a + '_na' for a in list(set(na_dict.keys()) - set(na_dict_initial.keys()))], axis=1, inplace=True)
    if do_scale: mapper = scale_vars(df, mapper)
    for n,c in df.items(): numericalize(df, c, n, max_n_cat)
    df = pd.get_dummies(df, dummy_na=True)
    df = pd.concat([ignored_flds, df], axis=1)
    res = [df, y, na_dict]
    if do_scale: res = res + [mapper]
    return res

In [58]:
df, y, nas = proc_df(df_raw, 'req_id')

In [59]:
df.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,session_group,event_timeYear,event_timeMonth,event_timeWeek,event_timeDay,event_timeDayofweek,event_timeDayofyear,event_timeIs_month_end,event_timeIs_month_start,event_timeIs_quarter_end,event_timeIs_quarter_start,event_timeIs_year_end,event_timeIs_year_start,event_timeElapsed
0,3,4,4,10,2,3022,2018,12,51,17,0,351,False,False,False,False,False,False,1545004808
1,6,9,4,4,3,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004815
2,12,9,4,17,8,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004855
3,17,8,4,17,8,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004876
4,19,4,4,9,3,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004883


In [61]:
m = RandomForestClassifier(n_jobs=-1)
m.fit(df, y)
m.score(df,y)



0.9576390334709074

# Splitting train and test

In [65]:
df[df.event_timeDay >= 14].count()

record_id                     546869
city_key                      546869
weekday                       546869
category_key                  546869
rptcatg                       546869
session_group                 546869
event_timeYear                546869
event_timeMonth               546869
event_timeWeek                546869
event_timeDay                 546869
event_timeDayofweek           546869
event_timeDayofyear           546869
event_timeIs_month_end        546869
event_timeIs_month_start      546869
event_timeIs_quarter_end      546869
event_timeIs_quarter_start    546869
event_timeIs_year_end         546869
event_timeIs_year_start       546869
event_timeElapsed             546869
dtype: int64

In [66]:
df.shape

(816530, 19)

In [67]:
def split_vals(a,n): return a[:n].copy(), a[n:].copy()

n_valid = 546869  # same as Kaggle's test set size
n_trn = len(df)-n_valid
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

X_train.shape, y_train.shape, X_valid.shape

((269661, 19), (269661,), (546869, 19))

In [77]:
def cross_entropy(x, y, epsilon=1e-12):
    """
    Computes cross entropy between targets (encoded as one-hot vectors)
    and predictions. 
    Input: predictions (N, k) ndarray
           targets (N, k) ndarray        
    Returns: scalar
    """
    x = np.clip(x, epsilon, 1. - epsilon)
    N = x.shape[0]
    ce = -np.sum(y*np.log(x+1e-9))/N
    return ce
#predictions = np.array([[0.25,0.25,0.25,0.25],
#                        [0.01,0.01,0.01,0.96]])
#targets = np.array([[0,0,0,1],
#                  [0,0,0,1]])
#ans = 0.71355817782  #Correct answer
#x = cross_entropy(predictions, targets)
#print(np.isclose(x,ans))

In [None]:
def print_score(m):
    res = [cross_entropy(m.predict(X_train), y_train), cross_entropy(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)

In [80]:
# Random Forest Classifier
m = RandomForestClassifier(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)



CPU times: user 5.11 s, sys: 287 ms, total: 5.39 s
Wall time: 1.34 s
[0.6690948000670042, 5.436862385938188, 0.9592117510503929, 0.5431209302410632]


In [83]:
X_valid.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,session_group,event_timeYear,event_timeMonth,event_timeWeek,event_timeDay,event_timeDayofweek,event_timeDayofyear,event_timeIs_month_end,event_timeIs_month_start,event_timeIs_quarter_end,event_timeIs_quarter_start,event_timeIs_year_end,event_timeIs_year_start,event_timeElapsed
269661,47045,2,4,18,9,33,2018,12,51,18,1,352,False,False,False,False,False,False,1545177524
269662,47047,2,4,17,8,33,2018,12,51,18,1,352,False,False,False,False,False,False,1545177534
269663,47048,6,4,3,3,23,2018,12,51,18,1,352,False,False,False,False,False,False,1545177536
269664,47051,8,4,5,5,33,2018,12,51,18,1,352,False,False,False,False,False,False,1545177570
269665,47052,9,4,18,9,33,2018,12,51,18,1,352,False,False,False,False,False,False,1545177574


In [85]:
y_valid

array([1, 0, 1, ..., 1, 1, 0])

In [89]:
predictions = X_valid.values
targets = y_valid

preds = m.predict(predictions)
(preds.T==targets).mean()

0.5431209302410632

In [106]:
preds

array([1, 0, 0, ..., 0, 0, 0])

In [107]:
y_valid

array([1, 0, 1, ..., 1, 1, 0])

In [91]:
preds.shape

(546869,)

In [92]:
m = RandomForestClassifier(n_estimators = 40, n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

CPU times: user 21.5 s, sys: 859 ms, total: 22.4 s
Wall time: 3.38 s
[0.051256027109835206, 5.145695955699478, 0.9967626019335388, 0.5465641680183005]


In [95]:
predictions = X_valid.values
targets = y_valid

preds = m.predict(predictions)
(preds.T==targets).mean()

0.5450299797574922

# Feature Importance

In [100]:
feature_importances = pd.DataFrame(m.feature_importances_,
                                   index = X_train.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

In [105]:
feature_importances

Unnamed: 0,importance
record_id,0.459393
event_timeElapsed,0.334402
city_key,0.08225
session_group,0.056452
category_key,0.030078
rptcatg,0.021857
event_timeDayofyear,0.003938
event_timeDayofweek,0.003908
event_timeDay,0.00386
weekday,0.001906


In [121]:
to_drop = ['event_timeDayofyear', 'event_timeDayofweek','event_timeDay', 'weekday', 'event_timeWeek', 'event_timeYear',
'event_timeMonth', 'event_timeIs_quarter_end', 'event_timeIs_year_start','event_timeIs_month_start', 
'event_timeIs_quarter_start', 'event_timeIs_month_end', 'event_timeIs_year_end']

In [122]:
len(to_drop)

13

In [123]:
df_keep = X_train.copy()

In [125]:
df_keep.head()

Unnamed: 0,record_id,city_key,weekday,category_key,rptcatg,session_group,event_timeYear,event_timeMonth,event_timeWeek,event_timeDay,event_timeDayofweek,event_timeDayofyear,event_timeIs_month_end,event_timeIs_month_start,event_timeIs_quarter_end,event_timeIs_quarter_start,event_timeIs_year_end,event_timeIs_year_start,event_timeElapsed
0,3,4,4,10,2,3022,2018,12,51,17,0,351,False,False,False,False,False,False,1545004808
1,6,9,4,4,3,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004815
2,12,9,4,17,8,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004855
3,17,8,4,17,8,3330,2018,12,51,17,0,351,False,False,False,False,False,False,1545004876
4,19,4,4,9,3,3303,2018,12,51,17,0,351,False,False,False,False,False,False,1545004883


In [126]:
df_keep.drop(to_drop, axis=1,inplace=True)

In [127]:
df_keep.head()

Unnamed: 0,record_id,city_key,category_key,rptcatg,session_group,event_timeElapsed
0,3,4,10,2,3022,1545004808
1,6,9,4,3,3303,1545004815
2,12,9,17,8,3330,1545004855
3,17,8,17,8,3330,1545004876
4,19,4,9,3,3303,1545004883


In [136]:
frt = RandomForestClassifier(n_estimators = 40, n_jobs=-1)
%time frt.fit(df_keep, y_train)
frt.score(df_keep, y_train)

CPU times: user 27.6 s, sys: 790 ms, total: 28.4 s
Wall time: 4.13 s


0.9959208042690637

In [109]:
# Logistic Regression
from sklearn.linear_model import LogisticRegression

logi = LogisticRegression(C=0.1, dual=True)
%time logi.fit(X_train, y_train);
print_score(logi)



CPU times: user 9.54 s, sys: 138 ms, total: 9.68 s
Wall time: 9.73 s
[4.780412236880623, 8.055124597052732, 0.4917136701265663, 0.5224487034372034]


In [110]:
# Naive Bayes Classifer
from sklearn.naive_bayes import MultinomialNB
mul_nb = MultinomialNB(class_prior=[0.25, 0.5])
%time mul_nb.fit(X_train, y_train);
print_score(mul_nb)

CPU times: user 314 ms, sys: 76.1 ms, total: 390 ms
Wall time: 372 ms
[5.376503699794646, 1.9233505547273724, 0.5234275627547179, 0.4855276126458073]


# Neural Network to get better prediction
# NBSVM++ Navive Bias SVM