Notebook02 for Safe Driver Prediction

Timeline: 2017/10/13

Goals: Using the xgboost model to make predictions

I. Import Packages, define functions and import files

In [34]:
# Data Manipulation
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Training
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
import xgboost as xgb

# display
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [35]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

# Create an XGBoost-compatible metric from Gini
def gini_xgb(preds, dtrain):
    labels = dtrain.get_label()
    gini_score = gini_normalized(labels, preds)
    return [('gini', gini_score)]

In [36]:
# Import files
train_df = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/train.csv')
test_df = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/test.csv')
submission_df = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/sample_submission.csv')

II. Data manipulation

In [37]:
# Pick out columns with specific keyword inside
def select_cols(df,description):
    get_cols = [col for col in df.columns if description in col]
    return df[get_cols]

# Remove -1 in the code and replace with N/A
def recover_na(df):
    df = df.replace(-1, np.NaN)
    return df

In [38]:
# Select columns with specific data type (w/o price)
cat_cols = select_cols(train_df,'cat')
bin_cols = select_cols(train_df,'bin')
cont_cols = train_df.select_dtypes(include=['float64'])
temp_cols = [col for col in train_df.columns if ('cat' not in col) and ('bin' not in col) and (train_df[col].dtype != float) 
            and ('id' not in col) and ('target' not in col)]
ord_cols = train_df[temp_cols]

# Select columns with specific category
ind_cols = select_cols(train_df,'ind')
reg_cols = select_cols(train_df,'reg')
car_cols = select_cols(train_df,'car')
calc_cols = select_cols(train_df,'calc')

# Recover the NA
train_copy = recover_na(train_df)

In [39]:
#Dropping columns with 'ps_calc_'
col_to_drop = train_df.columns[train_df.columns.str.startswith('ps_calc_')]
train_df = train_df.drop(col_to_drop, axis=1)  
test_df = test_df.drop(col_to_drop, axis=1)

In [40]:
# Preparing for training
id_train = train_df['id'].values
id_test = test_df['id'].values

y = train_df['target']
X = train_df.drop(['target', 'id'], axis=1)
y_valid_pred = 0*y
X_test = test_df.drop(['id'], axis=1)
y_test_pred = 0
sub=test['id'].to_frame()
sub['target']=0

III. Training:

In [41]:
# Set up folds
K = 5
kf = KFold(n_splits = K, random_state = 1, shuffle = True)

In [44]:
# Run Training and CV
for i, (train_index, test_index) in enumerate(kf.split(train_df)):
    
    # Create data for this fold
    y_train, y_valid = y.iloc[train_index], y.iloc[test_index]
    X_train, X_valid = X.iloc[train_index,:], X.iloc[test_index,:]
    print( "\nFold ", i)
    
    # Convert data into XGBoost format
    d_train = xgb.DMatrix(X_train, y_train)
    d_valid = xgb.DMatrix(X_valid, y_valid)
    d_test = xgb.DMatrix(X_test)
    
    # This is the data xgboost will test on after eachboosting round
    watchlist = [(d_train, 'train'), (d_valid, 'valid')]

    # Train the model. Pass in a max of 10,000 rounds (with early stopping after 100)
    # and the custom metric (maximize=True tells xgb that higher metric is better)
    mdl = xgb.train(params, d_train, 10000, watchlist, early_stopping_rounds=100, feval=gini_xgb, maximize=True, verbose_eval=10)
    
    sub['target'] += xgb.predict(xgb.DMatrix(test_df[features].values), 
                        ntree_limit=xgb_model.best_ntree_limit+50) / (kfold)


Fold  0
Training 0 :
[0]	train-error:0.036439	valid-error:0.03639	train-gini:0.203415	valid-gini:0.201296
Multiple eval metrics have been passed: 'valid-gini' will be used for early stopping.

Will train until valid-gini hasn't improved in 100 rounds.
[100]	train-error:0.036447	valid-error:0.036407	train-gini:0.289093	valid-gini:0.258655
[200]	train-error:0.036447	valid-error:0.036407	train-gini:0.32	valid-gini:0.267934
[300]	train-error:0.036443	valid-error:0.036407	train-gini:0.353338	valid-gini:0.277362


KeyboardInterrupt: 

In [None]:
# Create submission file
sub.to_csv('xgb1.csv', index=False)

In [None]:
sub.describe()

Insight:<br>
First xgboost model with LB score 0.281, and first working model.