In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import scipy as sp
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns

# Preprocessing, modelling and evaluating
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_score, KFold
from xgboost import XGBClassifier
import xgboost as xgb

In [2]:
# Lift the pandas display limits so the wide, long frames in this notebook are
# rendered without truncation.
for option_name in ('display.max_rows', 'display.max_columns'):
    pd.set_option(option_name, 1000)

In [3]:
# Shell escape: list the working directory to see which data files are available.
! ls

[31m## Missing Values- Feature Engineering.ipynb[m[m
Feature Encoding .ipynb
[31mOutliers.ipynb[m[m
[31mREADME.md[m[m
Untitled.ipynb
Untitled1.ipynb
Untitled2.ipynb
loan.csv
[31mmercedes.csv[m[m
[31mmobile_dataset.csv[m[m
[31mmonthly-milk-production-pounds.csv[m[m
[31mtest.csv[m[m
[31mtitanic.csv[m[m
[31mtrain.csv[m[m


In [4]:
# Read train.csv (path is relative to the working directory) into `df`,
# the frame that every later cell reads and overwrites.
df = pd.read_csv('train.csv')

In [5]:
# Preview the first rows of the raw frame, before any encoding is applied.
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,T,Y,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,T,Y,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,F,Y,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,F,Y,Red,Trapezoid,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,F,N,Red,Trapezoid,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [6]:
def resumetable(df):
    """Print the shape of ``df`` and return a one-row-per-column summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to describe. Any index is accepted; sample values are taken by
        row position, not by index label.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with the columns ``Name``, ``dtypes``,
        ``Missing`` (number of nulls), ``Uniques`` (number of distinct
        non-null values) and ``First Value`` / ``Second Value`` /
        ``Third Value`` (the values in the first three rows). A sample column
        is filled with ``None`` when ``df`` has too few rows to supply it.
    """
    print(f"Dataset Shape: {df.shape}")

    # Build the summary in one go instead of slicing and then assigning into
    # the slice, which can trigger SettingWithCopyWarning and which relied on
    # reset_index() naming its new column 'index'.
    summary = pd.DataFrame({
        'Name': list(df.columns),
        'dtypes': df.dtypes.values,
        'Missing': df.isnull().sum().values,
        'Uniques': df.nunique().values,
    })

    # Sample by position (iloc): label lookups such as df.loc[0] raise
    # KeyError whenever the index is not the default 0..n-1 range.
    sample_columns = ['First Value', 'Second Value', 'Third Value']
    for position, column_label in enumerate(sample_columns):
        if position < len(df):
            summary[column_label] = df.iloc[position].values
        else:
            summary[column_label] = None

    return summary

In [7]:
# Summarise dtype, null count, cardinality and the first three values of every column.
resumetable(df)

Dataset Shape: (300000, 25)


Unnamed: 0,Name,dtypes,Missing,Uniques,First Value,Second Value,Third Value
0,id,int64,0,300000,0,1,2
1,bin_0,int64,0,2,0,0,0
2,bin_1,int64,0,2,0,1,0
3,bin_2,int64,0,2,0,0,0
4,bin_3,object,0,2,T,T,F
5,bin_4,object,0,2,Y,Y,Y
6,nom_0,object,0,3,Green,Green,Blue
7,nom_1,object,0,6,Triangle,Trapezoid,Trapezoid
8,nom_2,object,0,6,Snake,Hamster,Lion
9,nom_3,object,0,6,Finland,Russia,Russia


##### Method 1: Label Encoding

In [13]:
# Frequency of each bin_3 value.
# NOTE(review): the recorded output shows 0/1 although the raw column holds
# T/F, so the frame had presumably already been label-encoded when this cell
# last ran. A fresh top-to-bottom run would list T/F here instead.
df.bin_3.value_counts()

1    153535
0    146465
Name: bin_3, dtype: int64

In [15]:
# Frequency of each bin_4 value.
# NOTE(review): the recorded output shows 0/1 although the raw column holds
# Y/N, so the frame had presumably already been label-encoded when this cell
# last ran. A fresh top-to-bottom run would list Y/N here instead.
df.bin_4.value_counts()

1    191633
0    108367
Name: bin_4, dtype: int64

In [16]:
from sklearn.preprocessing import LabelEncoder

# Turn the two string-valued binary columns into integer codes. One encoder
# object is reused and refitted for each column, so after this cell it holds
# the mapping learned from bin_4 only.
labelEncoder = LabelEncoder()

for binary_col in ('bin_3', 'bin_4'):
    df[binary_col] = labelEncoder.fit_transform(df[binary_col].values.ravel())



#### One-Hot Encoding

In [17]:
# Inspect the frame after label encoding: bin_3 and bin_4 now hold integer codes.
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,nom_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0,0,0,1,1,Green,Triangle,Snake,Finland,Bassoon,50f116bcf,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0
1,1,0,1,0,1,1,Green,Trapezoid,Hamster,Russia,Piano,b3b4d25d0,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0
2,2,0,0,0,0,1,Blue,Trapezoid,Lion,Russia,Theremin,3263bdce5,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0
3,3,0,1,0,0,1,Red,Trapezoid,Snake,Canada,Oboe,f12246592,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1
4,4,0,0,0,0,0,Red,Trapezoid,Lion,Canada,Oboe,5b0f5acd5,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0


In [18]:
# Shape before one-hot encoding, for comparison with the shape afterwards.
df.shape

(300000, 25)

In [19]:
# Category frequencies of nom_0, one of the columns one-hot encoded in the next cell.
df.nom_0.value_counts()

Green    127341
Blue      96166
Red       76493
Name: nom_0, dtype: int64

In [20]:
# One-hot encode the low-cardinality nominal columns. drop_first=True omits the
# indicator for the first level of each column, so k levels become k-1 columns.
# dtype is pinned to uint8 (the pandas < 2.0 default) so the indicators stay
# 0/1 integers on pandas >= 2.0, where the default became bool.
# NOTE: this cell consumes nom_0..nom_4, so re-running it without reloading
# `df` raises a KeyError.
df = pd.get_dummies(
    data=df,
    columns=["nom_0", "nom_1", "nom_2", "nom_3", "nom_4"],
    drop_first=True,
    dtype="uint8",
)

In [27]:
# Inspect the frame after one-hot encoding.
# NOTE(review): this cell carries execution count 27 and its recorded output
# already shows numeric nom_5 values, i.e. it was last run after the
# target-encoding cells further down rather than in document order.
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_5,nom_6,nom_7,nom_8,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target,nom_0_Green,nom_0_Red,nom_1_Polygon,nom_1_Square,nom_1_Star,nom_1_Trapezoid,nom_1_Triangle,nom_2_Cat,nom_2_Dog,nom_2_Hamster,nom_2_Lion,nom_2_Snake,nom_3_China,nom_3_Costa Rica,nom_3_Finland,nom_3_India,nom_3_Russia,nom_4_Oboe,nom_4_Piano,nom_4_Theremin
0,0,0,0,0,1,1,0.358134,3ac1b8814,68f6ad3e9,c389000ab,2f4cb3d51,2,Grandmaster,Cold,h,D,kr,2,2,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0
1,1,0,1,0,1,1,0.388889,fbcb50fc1,3b6dd5612,4cd920251,f83c56c21,1,Grandmaster,Hot,a,A,bF,7,8,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,1,0
2,2,0,0,0,0,1,0.274564,0922e3cb8,a6a36f527,de9c9f684,ae6800dd0,1,Expert,Lava Hot,h,R,Jc,7,2,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1
3,3,0,1,0,0,1,0.234872,50d7ad46a,ec69236eb,4ade6ab69,8270f0d71,1,Grandmaster,Boiling Hot,i,D,kW,2,1,1,0,1,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0
4,4,0,0,0,0,0,0.312438,1fe17a1fd,04ddac2be,cb43ab175,b164b72a7,1,Grandmaster,Freezing,a,R,qP,7,8,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0


In [22]:
# Shape after one-hot encoding: the five nominal columns were replaced by
# twenty indicator columns (25 - 5 + 20 = 40).
df.shape

(300000, 40)

#### Target Encoding


In [23]:
# Target (mean) encoding table for nom_5: the mean of `target` within each
# category, which for a 0/1 target is the share of positive rows. A single
# grouped mean replaces the original sum/count over two identical groupbys.
# NOTE(review): the table is computed on the full frame, so every row's own
# label feeds its encoded value — consider out-of-fold encoding or smoothing
# before using this feature for model validation.
target = df.groupby('nom_5')['target'].mean()

In [25]:
# Display the per-category value that will replace each nom_5 level.
target

nom_5
005dd4ce3    0.258303
037bd73d8    0.325731
05950689f    0.274924
05d5943a3    0.308231
06eeaf0aa    0.348364
075ceb58b    0.268675
077fd9465    0.340747
0870880f6    0.297491
0870b0a5d    0.327116
09a4ad97d    0.195335
09ed0a686    0.229075
0b3bec656    0.362525
0bdf8165a    0.256912
0de4acd31    0.255259
0de5598a9    0.280353
0dee9b39a    0.343976
0eb46e992    0.266904
0ef5c1879    0.228395
115a252ba    0.425177
116f7e3e2    0.337512
1305d6e77    0.320053
159a7306f    0.312731
176809a41    0.362069
17a3709ae    0.391304
185ba0a59    0.336914
19db35594    0.314933
1bd1068d9    0.238095
1e6cb96e8    0.221228
1f1702d2f    0.316456
1fd0233cd    0.195122
200009fc3    0.429577
20b10a832    0.239266
2979f0d45    0.333191
29e7f8525    0.370690
2a0e95ba1    0.315315
2cac4af40    0.371960
2cadfed8e    0.280035
2cc9e16b9    0.270987
2d61990e2    0.279778
2e7f4d636    0.272834
2ff007c26    0.375285
30a15b6bd    0.397658
30a530eab    0.293763
321bf770e    0.304762
3263bdce5    0.274564
3271

In [26]:
# Substitute every nom_5 category with its entry in the `target` table.
# The result is assigned back as a plain array, i.e. by row position.
nom_5_encoded = df['nom_5'].replace(target)
df['nom_5'] = nom_5_encoded.values

In [28]:
# Check the encoded column: each distinct encoded value with the number of rows carrying it.
df.nom_5.value_counts()

0.370225    2801
0.348364    2750
0.358739    2729
0.239266    2725
0.416268    2717
0.246868    2714
0.293091    2692
0.332836    2683
0.309648    2674
0.270857    2673
0.361049    2670
0.289899    2663
0.276291    2653
0.327925    2650
0.233068    2643
0.244790    2639
0.246768    2630
0.375285    2630
0.261003    2613
0.306185    2603
0.369657    2597
0.358134    2594
0.201241    2579
0.344587    2577
0.274471    2554
0.334381    2545
0.362525    2535
0.274564    2524
0.397658    2477
0.287342    2457
0.243365    2449
0.337855    2433
0.425177    2399
0.267670    2391
0.227292    2389
0.378299    2387
0.343976    2349
0.279199    2346
0.333191    2344
0.293763    2325
0.280035    2289
0.379280    2278
0.374005    2262
0.314798    2230
0.190220    2229
0.382366    2223
0.323397    2214
0.312731    2168
0.255259    2139
0.292315    2121
0.408790    2116
0.228504    2105
0.300809    2101
0.241297    2097
0.310295    2069
0.276773    2045
0.328117    2045
0.314933    2029
0.306356    20