Notebook03 for Safe Driver Prediction

Timeline: 2017/10/18 12:34
Cleaned: 2017/10/23 18:25

Goals: Use RandomForestRegression

I. Import Packages, define functions and import files

In [1]:
# Data Manipulation
import pandas as pd
import numpy as np

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# display
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# Import files:
train_df = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/train.csv')
test_df = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/test.csv')
submission_df = pd.read_csv('/Users/maxji/Desktop/Kaggle/0SafeDriver/data/sample_submission.csv')

II. Data manipulation

In [3]:
# Pick out columns with specific keyword inside
def select_cols(df,description):
    get_cols = [col for col in df.columns if description in col]
    return df[get_cols]

# Remove -1 in the code and replace with N/A
def recover_na(df):
    df = df.replace(-1, np.NaN)
    return df
    

In [4]:
# Select columns with specific data type (w/o price)
cat_cols = select_cols(train_df,'cat')
bin_cols = select_cols(train_df,'bin')
cont_cols = train_df.select_dtypes(include=['float64'])
temp_cols = [col for col in train_df.columns if ('cat' not in col) and ('bin' not in col) and (train_df[col].dtype != float) 
            and ('id' not in col) and ('target' not in col)]
ord_cols = train_df[temp_cols]

# Select columns with specific category
ind_cols = select_cols(train_df,'ind')
reg_cols = select_cols(train_df,'reg')
car_cols = select_cols(train_df,'car')
calc_cols = select_cols(train_df,'calc')

In [5]:
train_recover = recover_na(train_df)

In [6]:
# Define the gini metric - from https://www.kaggle.com/c/ClaimPredictionChallenge/discussion/703#5897
def gini(actual, pred, cmpcol = 0, sortcol = 1):
    assert( len(actual) == len(pred) )
    all = np.asarray(np.c_[ actual, pred, np.arange(len(actual)) ], dtype=np.float)
    all = all[ np.lexsort((all[:,2], -1*all[:,1])) ]
    totalLosses = all[:,0].sum()
    giniSum = all[:,0].cumsum().sum() / totalLosses
    
    giniSum -= (len(actual) + 1) / 2.
    return giniSum / len(actual)
 
def gini_normalized(a, p):
    return gini(a, p) / gini(a, a)

In [7]:
# Define the function to copy the entries with target 1; Details in Notebook01
def copy(train_df,k):
    targetachieved = train_df['target']==1
    df_copy = train_df[targetachieved]
    train_df1 = train_df.append([df_copy]*3,ignore_index=True)
    return train_df1

In [8]:
# One-hot encoding
cat_features = [a for a in train_df.columns if a.endswith('cat')]
for column in cat_features:
    temp = pd.get_dummies(pd.Series(train_df[column]))
    train_df = pd.concat([train_df,temp],axis=1)
    train_df = train_df.drop([column],axis=1)
    
for column in cat_features:
    temp = pd.get_dummies(pd.Series(test_df[column]))
    test_df = pd.concat([test_df,temp],axis=1)
    test_df = test_df.drop([column],axis=1)

In [9]:
print(train_df.shape,test_df.shape)

(595212, 229) (892816, 228)


III. Training

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import StratifiedKFold

# Initialize CV
nrounds=2000 
kfold = 5
skf = StratifiedKFold(n_splits=kfold, random_state=0)

# Prepare data for training
train_df1 = copy(train_df,3)
X = train_df1.drop(['target', 'id'], axis=1)
y = train_df1['target']
y =y+0.01
X=X.values
y=y.values

#LB 0.255, cv 0.30+
"""parameters = {'n_estimators': 298, 
                  'max_depth': 10, 
                  'max_features': 60, # 30%-50% of number of features
                  'min_samples_split': 2,
                  'min_samples_leaf': 81,
                  'bootstrap': False
                }"""

#LB 0.263, cv 0.37
parameters = {'n_estimators': 300, 
                  'max_depth': 20, 
                  'max_features': 50, # 30%-50% of number of features
                  'min_samples_split': 10,
                  'min_samples_leaf': 50,
                  'bootstrap': True
                }


# Training
for i, (train_index, test_index) in enumerate(skf.split(X, y)):
    print(' rf kfold: {}  of  {} : '.format(i+1, kfold))
    X_train, X_valid = X[train_index], X[test_index]
    y_train, y_valid = y[train_index], y[test_index]

    # Choose the type of classifier. 
    clf = RandomForestRegressor(**parameters)
    clf.fit(X_train, y_train)

    # Make predictions
    predictions = clf.predict(X_valid)
    print(gini_normalized(y_valid,predictions))


NameError: name 'copy' is not defined

In [14]:
# Make predictions and output
ids = test_df['id']
predictions = clf.predict(test_df.drop('id', axis=1))
output = pd.DataFrame({ 'id' : ids, 'target': predictions })
output.to_csv('driver-predictions-3.csv', index = False)

In [15]:
#output.describe()

Unnamed: 0,id,target
count,892816.0,892816.0
mean,744153.5,0.138078
std,429683.0,0.046379
min,0.0,0.055006
25%,372021.8,0.105562
50%,744307.0,0.130014
75%,1116308.0,0.159685
max,1488026.0,0.418243


Insight:
RandomForestRegression is much better than the Classifier model in Notebook01, with a LB score of 0.263.
However, this is far from the results of xgboost(0.281).