In [1]:
import numpy as np
import pandas as pd
from bokeh.io import show , output_notebook
from bokeh.plotting import figure
from bokeh.models import ColumnDataSource ,HoverTool
from bokeh.layouts import row , column , widgetbox
from bokeh.models.widgets import Tabs , Panel
from bokeh.application.handlers import FunctionHandler
from bokeh.application import Application
from ipywidgets import interact

import category_encoders as ce
from sklearn.model_selection import StratifiedKFold
from catboost import CatBoostClassifier
from sklearn import metrics


output_notebook()

In [2]:
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
train_df = pd.read_csv("./Data/train.csv")
test_df = pd.read_csv("./Data/test.csv")
samp_submission = pd.read_csv("./Data/sample_submission.csv")


In [4]:
train_df.shape

(600000, 25)

In [5]:
train_df.head(10)

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,Bassoon,de4c57ee2,a64bc7ddf,598080a91,0256c7a4b,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,Theremin,2bb3c3e5c,3a3a936e8,1dddb8473,52ead350c,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,Bassoon,b574c9841,708248125,5ddc9a726,745b909d1,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,Theremin,673bdf1f6,23edb8da3,3a33ef960,bdaa56dd1,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,,777d1ac2c,3a7975e46,bc9cc2a94,,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0
5,5,0.0,,1.0,T,N,Red,Triangle,Lion,China,Bassoon,a2e1bf0b1,ae6737c29,8c30b9b0b,690411ac0,05afc0f8b,2.0,Expert,Hot,b,Q,wa,3.0,4.0,0
6,6,0.0,0.0,0.0,F,N,Red,Triangle,Hamster,Costa Rica,Bassoon,87a5be0d7,cdc35bd00,1cba571fa,b8e63cace,4d3766412,1.0,Grandmaster,Cold,c,R,rg,5.0,6.0,0
7,7,0.0,0.0,1.0,T,N,Red,Triangle,Axolotl,Finland,Bassoon,104aee31d,2a50808ba,81d67e1bb,bd9643a20,a651dec43,3.0,Expert,Cold,b,Y,PS,1.0,1.0,0
8,8,0.0,0.0,0.0,F,N,Blue,Polygon,Hamster,Russia,Oboe,024efa364,a4a81ab45,429114096,94c5fd40c,,1.0,Novice,Boiling Hot,c,N,mX,6.0,3.0,0
9,9,0.0,0.0,,F,Y,Red,Polygon,Hamster,Finland,Theremin,9fa084b36,e7aa94f40,56d35c774,0279391c5,79b29d54c,3.0,Contributor,Lava Hot,n,I,OZ,1.0,8.0,1


In [6]:
train_df.isna().sum()

id            0
bin_0     17894
bin_1     18003
bin_2     17930
bin_3     18014
bin_4     18047
nom_0     18252
nom_1     18156
nom_2     18035
nom_3     18121
nom_4     18035
nom_5     17778
nom_6     18131
nom_7     18003
nom_8     17755
nom_9     18073
ord_0     18288
ord_1     18041
ord_2     18075
ord_3     17916
ord_4     17930
ord_5     17713
day       17952
month     17988
target        0
dtype: int64

In [7]:
train_df.columns

Index(['id', 'bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4', 'nom_0', 'nom_1',
       'nom_2', 'nom_3', 'nom_4', 'nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9',
       'ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5', 'day', 'month',
       'target'],
      dtype='object')

In [8]:
train_df.iloc[:,:].nunique()

id        600000
bin_0          2
bin_1          2
bin_2          2
bin_3          2
bin_4          2
nom_0          3
nom_1          6
nom_2          6
nom_3          6
nom_4          4
nom_5       1220
nom_6       1519
nom_7        222
nom_8        222
nom_9       2218
ord_0          3
ord_1          5
ord_2          6
ord_3         15
ord_4         26
ord_5        190
day            7
month         12
target         2
dtype: int64

In [9]:
cols = ['bin_0' , 'bin_1','bin_2','bin_3' ,'bin_4','ord_0','ord_1','ord_2','ord_3','ord_4',
        'nom_0','nom_1','nom_2','nom_3','nom_4','target' , 'day','month']
train_dfS = train_df.loc[:,cols].copy()


In [10]:
def Uniques(col):
    
    x = [str(i) for i in train_dfS.loc[:,col].unique() if not pd.isnull(i)]
    return x


In [11]:
import bokeh
from bokeh.models import Select
def modify_doc(doc):
    
    
    def create_figure():
        
        current_feature_name = feature_name.value
        targets = sorted(Uniques(current_feature_name))
        source = ColumnDataSource(data = {
            
            'x' : targets,
            'y' : train_dfS.loc[:,current_feature_name].value_counts().to_list(),
            'color' : bokeh.palettes.plasma(len(targets))
        })
        #print(source.data)
        plot = figure(x_range = targets,title = "Categorical Embedding -II" , plot_height = 500 , plot_width = 500)
        plot.vbar(x = 'x' , top = 'y' , color = 'color' , width = 0.5 , source = source,legend_field = 'x')
        plot.xaxis.axis_label = current_feature_name
        plot.yaxis.axis_label = "Counts"
        plot.legend.orientation = 'horizontal'
        plot.legend.location = 'top_right'
        plot.left[0].formatter.use_scientific = False
        plot.add_tools(HoverTool(tooltips = [('Counts' , '@y')]))
        #show(plot)
        return plot
        
    def update_plot(attr , old , new):
        
        layout.children[1] = create_figure()
        
    
    #Controls
    feature_name = Select(title = "Categorical Columns" , options = cols , value = cols[0])
    feature_name.on_change('value' , update_plot)
    p = create_figure()
    layout = row(widgetbox(feature_name) , p)
    doc.add_root(layout)

handler = FunctionHandler(modify_doc)
app = Application(handler)
        
        
        
        
        

In [12]:
doc = app.create_document()

In [13]:
show(app)

In [14]:
#Let's fill in the missing data using mode

for col in train_df.columns:
    
    train_df[col].fillna(train_df[col].mode()[0] , inplace = True)
    

for col in test_df.columns:
    
    test_df[col].fillna(test_df[col].mode()[0] , inplace = True)

In [15]:
test_df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month
0,600000,0.0,0.0,0.0,F,Y,Blue,Polygon,Axolotl,Finland,Piano,52f6dd16c,147d704e4,8d857a0a1,ca9ad1d4b,fced9e114,3.0,Novice,Boiling Hot,f,U,oU,3.0,9.0
1,600001,0.0,0.0,0.0,F,Y,Red,Circle,Lion,Russia,Bassoon,691ebeae8,8653dcc2e,67a8d4ebb,060a21580,7ca8775da,1.0,Novice,Cold,n,N,Fl,2.0,8.0
2,600002,0.0,0.0,0.0,F,Y,Blue,Circle,Axolotl,Russia,Theremin,81f792c16,6cdda499e,69403e18c,165e81a00,5940334c9,1.0,Expert,Warm,i,N,DN,2.0,6.0
3,600003,1.0,0.0,0.0,F,N,Red,Polygon,Axolotl,Costa Rica,Bassoon,c9134205b,acbca4827,cb681246b,77d41330d,6fbdeefc8,1.0,Expert,Hot,m,B,AG,1.0,6.0
4,600004,0.0,0.0,1.0,F,Y,Red,Circle,Hamster,Finland,Theremin,f0f100f57,6f800b9af,cd9feb5c6,2218d9dfe,2a27c8fde,1.0,Contributor,Lava Hot,o,J,DT,3.0,3.0


In [16]:
#importing catgeory encoders library

def EncodeMapings(df):
    
    #Encoding for training set
    df_encoded = df.copy()
    
    df_encoded['bin_3'] = df_encoded['bin_3'].apply(lambda x : 0 if x == 'F' else 1)
    df_encoded['bin_4'] = df_encoded['bin_4'].apply(lambda x : 0 if x == 'N' else 1)
    
    df_encoded.ord_1.replace(to_replace = ['Novice', 'Contributor','Expert', 'Master', 'Grandmaster'],
                         value = [0, 1, 2, 3, 4], inplace = True)

    df_encoded.ord_2.replace(to_replace = ['Freezing', 'Cold', 'Warm', 'Hot','Boiling Hot', 'Lava Hot'],
                         value = [0, 1, 2, 3, 4, 5], inplace = True)

    df_encoded.ord_3.replace(to_replace = ['a', 'b', 'c', 'd', 'e', 'f', 'g','h', 'i', 'j', 'k', 'l', 'm', 'n', 'o'],
                         value = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], inplace = True)

    df_encoded.ord_4.replace(to_replace = ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I','J', 'K', 'L', 'M', 'N', 'O', 
                                     'P', 'Q', 'R','S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'],
                         value = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 
                                  22, 23, 24, 25], inplace = True)
    
    return df_encoded
    
   
    


    
    
    

In [17]:
train_dfP = EncodeMapings(train_df)
test_dfP = EncodeMapings(test_df)

In [19]:
nom_col = ['nom_0','nom_1' ,'nom_2','nom_3','nom_4']
    
        
for col in nom_col:
            woe_enc = ce.WOEEncoder()
        
        
            train_dfP[f'{col}_woe'] = woe_enc.fit_transform(train_dfP[col] , train_dfP.target)
            test_dfP[f'{col}_woe'] = woe_enc.transform(test_dfP[col])
    
 #Using Leave one outencoder for high cardinality data
    
    
high_card = ['nom_5' , 'nom_6' , 'nom_7','nom_8','nom_9','ord_5'] 
        
        
for col in high_card:
            loo_enc = ce.LeaveOneOutEncoder()
        
        
            train_dfP[f'{col}_loo'] = loo_enc.fit_transform(train_dfP[col] , train_dfP.target)
            test_dfP[f'{col}_loo'] = loo_enc.transform(test_dfP[col])
    
train_dfP.drop(['nom_0','nom_1' ,'nom_2','nom_3','nom_4','nom_5' , 'nom_6' , 'nom_7','nom_8','nom_9','ord_5'],
                   inplace = True , axis = 1)
test_dfP.drop(['nom_0','nom_1' ,'nom_2','nom_3','nom_4','nom_5' , 'nom_6' , 'nom_7','nom_8','nom_9','ord_5'],
                   inplace = True , axis = 1)

In [28]:
print("Training Set Shape: {}".format(train_dfP.shape))
print("Test set Shape: {}".format(test_dfP.shape))

Training Set Shape: (600000, 25)
Test set Shape: (400000, 24)


In [32]:
train_dfP.to_csv("train_dfP" , index = False)
test_dfP.to_csv("test_dfP", index = False)

In [29]:
import gc
del train_df , train_dfS ,test_df
gc.collect()

331

In [30]:
y = train_dfP.target.values
X = train_dfP.drop(['target','id'] , axis = 1).values
test_dfP.drop(['id'] , inplace = True , axis = 1)

In [33]:

skf = StratifiedKFold(n_splits = 5 ,random_state = 42 ,shuffle = True)

model = CatBoostClassifier(iterations=600,
                              learning_rate=0.01,
                              depth=5,
                              bootstrap_type='Bernoulli',
                              loss_function='Logloss',
                              subsample=0.9,
                              eval_metric='AUC',
                              metric_period=50,
                              allow_writing_files=False)


oof_y = []
oof_pred = []

scores = []

for train_idx, test_idx in skf.split(X,y):
    
    X_train , X_val = X[train_idx] , X[test_idx]
    y_train , y_val = y[train_idx] , y[test_idx]
    
    model.fit(X_train , y_train , eval_set = (X_val , y_val))
    
    pred = model.predict_proba(X_val)[:,1]
    
    oof_y.append(y_val)
    oof_pred.append(pred)
    score = metrics.roc_auc_score(y_val , pred)
    print("Fold Score :{}".format(score))
    scores.append(score)
    

    
    
    
    
    
    

0:	test: 0.6793385	best: 0.6793385 (0)	total: 204ms	remaining: 2m 1s
50:	test: 0.7354176	best: 0.7354176 (50)	total: 7.27s	remaining: 1m 18s
100:	test: 0.7667188	best: 0.7667188 (100)	total: 14.5s	remaining: 1m 11s
150:	test: 0.7831765	best: 0.7831765 (150)	total: 22.3s	remaining: 1m 6s
200:	test: 0.7990722	best: 0.7990722 (200)	total: 30.2s	remaining: 59.9s
250:	test: 0.8164614	best: 0.8164614 (250)	total: 37s	remaining: 51.5s
300:	test: 0.8324709	best: 0.8324709 (300)	total: 43.9s	remaining: 43.6s
350:	test: 0.8457630	best: 0.8457630 (350)	total: 50.8s	remaining: 36.1s
400:	test: 0.8542386	best: 0.8542386 (400)	total: 58.3s	remaining: 28.9s
450:	test: 0.8644136	best: 0.8644136 (450)	total: 1m 5s	remaining: 21.8s
500:	test: 0.8747558	best: 0.8747558 (500)	total: 1m 14s	remaining: 14.7s
550:	test: 0.8820841	best: 0.8820841 (550)	total: 1m 22s	remaining: 7.31s
599:	test: 0.8882726	best: 0.8882726 (599)	total: 1m 28s	remaining: 0us

bestTest = 0.8882726173
bestIteration = 599

Fold Score

In [34]:
print("Mean Auc_roc Score : {}".format(sum(scores) / skf.n_splits))

Mean Auc_roc Score : 0.8902848642683117


In [37]:
#plot model feature importances

feature_names = [col for col in train_dfP.columns if col not in ['target' , 'id']]
source = ColumnDataSource(data = {'x' : feature_names,
                                  'y' : model.feature_importances_,
                                  'color' : bokeh.palettes.turbo(len(feature_names))
                                 })

plot = figure(x_range = feature_names , title= "Feature Importance" , plot_height = 1500 , plot_width = 1500)
plot.vbar(x = 'x' , top = 'y' ,color  =  'color' , source = source  , width = 0.5)
#plot.legend.orientation = 'horizontal'
#plot.legend.location = 'top_right'
plot.left[0].formatter.use_scientific = False
plot.add_tools(HoverTool(tooltips = [('Value' , '@y')]))
plot.xaxis.axis_label = 'Features'
plot.yaxis.axis_label = 'Values'

show(plot)


In [39]:
df_test = test_dfP.values

In [45]:
test_preds = model.predict_proba(df_test)[:,1]


In [51]:
test_preds


array([0.19657312, 0.11148602, 0.1265723 , ..., 0.36857387, 0.23015979,
       0.19778562])

In [52]:
samp_submission.drop('targets' ,inplace = True , axis = 1)


In [54]:
samp_submission['target'] = test_preds

In [56]:
samp_submission.to_csv("Submission_baseline.csv" , index  =False)

# Neural Network submission 


In [None]:
#importing required keras libraries

