For EDA, model selection and how information about the df_metedata pickle object click [here](https://www.kaggle.com/batofgotham/eda-and-feature-selection?scriptVersionId=28684443)

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Any results you write to the current directory are saved as output.

/kaggle/input/porto-seguro-safe-driver-prediction/sample_submission.csv
/kaggle/input/porto-seguro-safe-driver-prediction/test.csv
/kaggle/input/porto-seguro-safe-driver-prediction/train.csv
/kaggle/input/pssdpickledfmetedatapickle/df_metedata_pickle


In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [3]:
input_path = '/kaggle/input/porto-seguro-safe-driver-prediction/'
df = pd.read_csv(input_path+'train.csv')
df_test = pd.read_csv(input_path+'test.csv')

In [4]:
id_test = df_test['id']

In [5]:
df.shape

(595212, 59)

In [6]:
target = df['target']

In [7]:
target.value_counts()

0    573518
1     21694
Name: target, dtype: int64

- As it is imbalanced dataset we have to synthesize for new data points 

In [8]:
df.drop(columns=['target'],inplace=True)

### Getting the Metadata Dataframe

In [9]:
import pickle
df_metadata = pickle.load(open('/kaggle/input/pssdpickledfmetedatapickle/df_metedata_pickle','rb'))

In [10]:
df_metadata

Unnamed: 0,DTypes,Dropped,Missing
id,int64,True,7.0
ps_ind_01,Ordinal,False,0.0
ps_ind_02_cat,Categorical,False,1.0
ps_ind_03,int64,False,4.423318
ps_ind_04_cat,Categorical,False,0.0
ps_ind_05_cat,Categorical,False,0.0
ps_ind_06_bin,Categorical,False,0.0
ps_ind_07_bin,Categorical,False,0.0
ps_ind_08_bin,Categorical,False,0.0
ps_ind_09_bin,Categorical,False,0.0


 - Before Synthesizing the new data lets complete the pre processing

### PreProcessing

- Dropping Stastically insignificant columns, Filling the Missing values and changing the datatypes of columns accordingly

In [11]:
def preprocessing(df):
    df.replace(to_replace=-1,value=np.nan,inplace=True)
    for col in df.columns:
        #Dropping Insignificant Columns
        if df_metadata.loc[col,'Dropped']:
            df.drop(columns=[col],inplace=True)
            continue
        #Filling Missing Values
        df[col].fillna(df_metadata.loc[col,'Missing'],inplace=True)
        #Changing the datatype of columns
        if (df_metadata.loc[col,'DTypes'] == 'Categorical') or (df_metadata.loc[col,'DTypes'] == 'Ordinal'):
            df[col] = df[col].astype('category')

In [12]:
df.shape

(595212, 58)

In [13]:
preprocessing(df)
preprocessing(df_test)

In [14]:
df.isnull().sum()

ps_ind_01        0
ps_ind_02_cat    0
ps_ind_03        0
ps_ind_04_cat    0
ps_ind_05_cat    0
ps_ind_06_bin    0
ps_ind_07_bin    0
ps_ind_08_bin    0
ps_ind_09_bin    0
ps_ind_12_bin    0
ps_ind_14        0
ps_ind_15        0
ps_ind_16_bin    0
ps_ind_17_bin    0
ps_ind_18_bin    0
ps_reg_01        0
ps_reg_02        0
ps_reg_03        0
ps_car_01_cat    0
ps_car_02_cat    0
ps_car_04_cat    0
ps_car_05_cat    0
ps_car_06_cat    0
ps_car_07_cat    0
ps_car_08_cat    0
ps_car_09_cat    0
ps_car_11_cat    0
ps_car_11        0
ps_car_12        0
ps_car_13        0
ps_car_14        0
ps_car_15        0
dtype: int64

In [15]:
df_test.isnull().sum()

ps_ind_01        0
ps_ind_02_cat    0
ps_ind_03        0
ps_ind_04_cat    0
ps_ind_05_cat    0
ps_ind_06_bin    0
ps_ind_07_bin    0
ps_ind_08_bin    0
ps_ind_09_bin    0
ps_ind_12_bin    0
ps_ind_14        0
ps_ind_15        0
ps_ind_16_bin    0
ps_ind_17_bin    0
ps_ind_18_bin    0
ps_reg_01        0
ps_reg_02        0
ps_reg_03        0
ps_car_01_cat    0
ps_car_02_cat    0
ps_car_04_cat    0
ps_car_05_cat    0
ps_car_06_cat    0
ps_car_07_cat    0
ps_car_08_cat    0
ps_car_09_cat    0
ps_car_11_cat    0
ps_car_11        0
ps_car_12        0
ps_car_13        0
ps_car_14        0
ps_car_15        0
dtype: int64

### Operating with outliers

- The Idea is to find the outliers and replace them accordingly

In [16]:
def outlier_processing(df,df_test):
    for col in df.columns:
        if df[col].dtype.name != 'category':
            first_quartile, third_quartile = np.percentile(df[col],[25,75])
            first_percetnile, ninetynine_percentile = np.percentile(df[col],[1,99])
            IQR = third_quartile - first_quartile
            lower_bound = first_quartile - (1.5*IQR)
            upper_bound = third_quartile + (1.5*IQR)
            df[col].loc[df[col]>upper_bound] = ninetynine_percentile
            df_test[col].loc[df_test[col]>upper_bound] = ninetynine_percentile
            df[col].loc[df[col]<lower_bound] = first_percetnile
            df_test[col].loc[df_test[col]<lower_bound] = first_percetnile
        

In [17]:
#outlier_processing(df,df_test)

### Encoding

- The Idea is to encode the ordinal values with Ordinal Encoder and Categorical values with OneHot Encoder - unless they are binary

In [18]:
ordinal_columns = [col for col in df.columns if df_metadata.loc[col,'DTypes'] == 'Ordinal' and df[col].nunique() > 2]

In [19]:
categorical_columns_great_2 = [col for col in df.columns if df_metadata.loc[col,'DTypes'] == 'Categorical' and df[col].nunique() > 2]

In [20]:
from sklearn.preprocessing import LabelEncoder
for col in ordinal_columns:
    label_encode = LabelEncoder()
    df[col+'label'] = label_encode.fit_transform(df[col])
    df_test[col+'label'] = label_encode.transform(df_test[col])
    df.drop(columns=[col],inplace=True)
    df_test.drop(columns=[col],inplace=True)

In [21]:
df = pd.get_dummies(df,prefix=col,columns=categorical_columns_great_2,drop_first=True)
df_test = pd.get_dummies(df_test,columns=categorical_columns_great_2,prefix=col,drop_first=True)

In [22]:
df.shape

(595212, 178)

In [23]:
df_test.shape

(892816, 178)

### Lets do the scaling

In [24]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [25]:
df_train_scale = scaler.fit_transform(df)

In [26]:
df_test_scale = scaler.transform(df_test)

## Making data in to multiple folds

In [27]:
from sklearn.model_selection import StratifiedKFold

In [28]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=17)

### Gini Custom Metric
- Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897

In [29]:
def gini(actual,pred,cmpcol = 0,sortcol = 1):
    assert( len(actual) == len(pred) )
    All = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    All = All[ np.lexsort((All[:,2], -1*All[:,1])) ]
    totAllosses = All[:,0].sum()
    giniSum = All[:,0].cumsum().sum() / totAllosses
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)

def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return 'gini', gini_score

### XGBoost

In [30]:
import xgboost as xgb

In [31]:
params = {
    'min_child_weight': 10.0,
    'objective': 'binary:logistic',
    'max_depth': 7,
    'max_delta_step': 1.8,
    'colsample_bytree': 0.4,
    'subsample': 0.8,
    'eta': 0.025,
    'gamma': 0.65,
    'num_boost_round' : 1000
    }

In [32]:
model_ensemble = []
for i,(train_index, valid_index) in enumerate(skf.split(df_train_scale,target)):
    print('[FOLD %d/%d]'%(i+1,5))
    X_train,X_valid = df_train_scale[train_index],df_train_scale[valid_index]
    y_train,y_valid = target.loc[train_index],target.loc[valid_index]
    #Convert Data in to XGBoost format
    df_train_xgb = xgb.DMatrix(X_train,y_train)
    df_valid_xgb = xgb.DMatrix(X_valid,y_valid)
    valid_list = [(df_train_xgb, 'train'), (df_valid_xgb, 'valid')]
    xgb_model = xgb.train(params, df_train_xgb, 3000, valid_list, feval=gini_xgb, maximize=True, early_stopping_rounds=70,verbose_eval=100)
    model_ensemble.append(xgb_model)

[FOLD 1/5]
[0]	train-error:0.036447	valid-error:0.036449	train-gini:0.030843	valid-gini:0.031913
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 70 rounds.
[100]	train-error:0.036447	valid-error:0.036449	train-gini:0.317252	valid-gini:0.25364
[200]	train-error:0.036447	valid-error:0.036449	train-gini:0.351991	valid-gini:0.26577
[300]	train-error:0.036447	valid-error:0.036449	train-gini:0.379853	valid-gini:0.272672
[400]	train-error:0.036441	valid-error:0.036449	train-gini:0.400003	valid-gini:0.275406
[500]	train-error:0.036439	valid-error:0.036449	train-gini:0.416764	valid-gini:0.276221
[600]	train-error:0.036437	valid-error:0.036449	train-gini:0.432025	valid-gini:0.276335
Stopping. Best iteration:
[569]	train-error:0.036437	valid-error:0.036449	train-gini:0.427559	valid-gini:0.276575

[FOLD 2/5]
[0]	train-error:0.036447	valid-error:0.036449	train-gini:0.039955	valid-gini:0.021712
Multiple eval metric

### Predictions

In [33]:
predict_proba = 0
df_test_xgb = xgb.DMatrix(df_test_scale)
for i, model in enumerate(model_ensemble):
    print('[FOLD %d/%d Prediciton:]'%(i+1,5))
    predictions = xgb_model.predict(df_test_xgb)
    predict_proba += predictions
predict_proba = predict_proba/5

[FOLD 1/5 Prediciton:]
[FOLD 2/5 Prediciton:]
[FOLD 3/5 Prediciton:]
[FOLD 4/5 Prediciton:]
[FOLD 5/5 Prediciton:]


## Submition

In [34]:
submit = pd.DataFrame({'id':id_test,'target':predict_proba})
submit.to_csv('xgb_porto.csv',index=False) 
submit.head()

Unnamed: 0,id,target
0,0,0.026637
1,1,0.024244
2,2,0.022462
3,3,0.016507
4,4,0.035768
