# Import

In [1]:
import pandas as pd
import numpy as np
import math
from sklearn import preprocessing
from dateutil import parser
import warnings
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

# Load Dataset

In [2]:
# Raw train / test frames, read from the CSV files next to the notebook
data_original_train = pd.read_csv("train_auto.csv")
data_original_test = pd.read_csv("test_auto.csv")

# Show up to 50 columns and 50 rows when a frame is displayed
pd.set_option('display.max_columns', 50)
pd.set_option('display.max_rows', 50)

In [None]:
# Preview the first rows of the training frame
data_original_train.head()

Unnamed: 0,INDEX,TARGET_FLAG,TARGET_AMT,KIDSDRIV,AGE,HOMEKIDS,YOJ,INCOME,PARENT1,HOME_VAL,MSTATUS,SEX,EDUCATION,JOB,TRAVTIME,CAR_USE,BLUEBOOK,TIF,CAR_TYPE,RED_CAR,OLDCLAIM,CLM_FREQ,REVOKED,MVR_PTS,CAR_AGE,URBANICITY
0,1,0,0.0,0,60.0,0,11.0,"$67,349",No,$0,z_No,M,PhD,Professional,14,Private,"$14,230",11,Minivan,yes,"$4,461",2,No,3,18.0,Highly Urban/ Urban
1,2,0,0.0,0,43.0,0,11.0,"$91,449",No,"$257,252",z_No,M,z_High School,z_Blue Collar,22,Commercial,"$14,940",1,Minivan,yes,$0,0,No,0,1.0,Highly Urban/ Urban
2,4,0,0.0,0,35.0,1,10.0,"$16,039",No,"$124,191",Yes,z_F,z_High School,Clerical,5,Private,"$4,010",4,z_SUV,no,"$38,690",2,No,3,10.0,Highly Urban/ Urban
3,5,0,0.0,0,51.0,0,14.0,,No,"$306,251",Yes,M,<High School,z_Blue Collar,32,Private,"$15,440",7,Minivan,yes,$0,0,No,0,6.0,Highly Urban/ Urban
4,6,0,0.0,0,50.0,0,,"$114,986",No,"$243,925",Yes,z_F,PhD,Doctor,36,Private,"$18,000",1,z_SUV,no,"$19,217",2,Yes,3,17.0,Highly Urban/ Urban


In [None]:
# (rows, columns) of the training frame
data_original_train.shape

(8161, 26)

In [38]:
# (rows, columns) of the test frame
data_original_test.shape

(2141, 26)

In [None]:
##### Categorical data:
# KIDSDRIV (0, 1, 2, 3, 4)
# HOMEKIDS (0, 1, 2, 3, 4, 5)
# PARENT1 (yes / no)
# MSTATUS (yes / no)
# SEX (z_F, M)
# EDUCATION (z_High School: 2330; Bachelors: 2242; Masters: 1658; <High School: 1203; PhD: 728)
# JOB (z_Blue Collar: 1825; Clerical: 1271; Professional: 1117; Manager: 988; Lawyer: 835; Student: 712; Home Maker: 641; Doctor: 46)
# CAR_USE (private / commercial)
# CAR_TYPE (z_SUV: 2294; Minivan: 2145; Pickup: 1389; Sports Car: 907; Van: 750; Panel Truck: 676)
# RED_CAR (yes / no)
# CLM_FREQ (0-5)
# REVOKED (yes / no)
# URBANICITY (urban / rural)


###### Numerical data:
# TARGET_AMT (6008 zeros; every other value is a large amount)
# AGE (46-80) -- NOTE: the preview above contains ages 35 and 43, so this range needs re-checking
# YOJ (0-23)
# INCOME
# HOME_VAL (2000 zeros)
# TRAVTIME (integer)
# BLUEBOOK
# TIF (1-25)
# OLDCLAIM (5009 zeros)
# MVR_PTS (integer; 0-13)
# CAR_AGE (-3 to 28)

In [37]:
# Column dtypes of the training frame ('object' columns still hold strings)
data_original_train.dtypes

INDEX            int64
TARGET_FLAG      int64
TARGET_AMT     float64
KIDSDRIV         int64
AGE            float64
HOMEKIDS         int64
YOJ            float64
INCOME         float64
PARENT1         object
HOME_VAL        object
MSTATUS         object
SEX             object
EDUCATION       object
JOB             object
TRAVTIME         int64
CAR_USE         object
BLUEBOOK        object
TIF              int64
CAR_TYPE        object
RED_CAR         object
OLDCLAIM        object
CLM_FREQ         int64
REVOKED         object
MVR_PTS          int64
CAR_AGE        float64
URBANICITY      object
dtype: object

# Clean Data

### Convert 'fake string' columns to float

We clean up the currency formatting (the leading '$' and the ',' separators) and turn the 'nan' placeholders into real np.nan, so that all the values can be interpreted and converted to float.

In [87]:
# Turn the currency-formatted strings (e.g. "$67,349") into floats.
# The ',' is a thousands separator, so it is dropped rather than turned into a
# decimal point ("$67,349" must become 67349.0, not 67.349). Missing entries are
# masked back to real NaN instead of relying on the string 'nan'. Going through
# str keeps the cell safe to re-run on columns that are already numeric.
for field in ['OLDCLAIM', 'HOME_VAL', 'BLUEBOOK', 'INCOME']:
    raw_values = data_original_train[field]
    stripped = raw_values.astype(str).str.replace(r'[$,]', '', regex=True)
    data_original_train[field] = pd.to_numeric(stripped.where(raw_values.notna())).astype(float)

In [88]:
# Turn the currency-formatted strings (e.g. "$67,349") into floats.
# The ',' is a thousands separator, so it is dropped rather than turned into a
# decimal point ("$67,349" must become 67349.0, not 67.349). Missing entries are
# masked back to real NaN instead of relying on the string 'nan'. Going through
# str keeps the cell safe to re-run on columns that are already numeric.
for field in ['OLDCLAIM', 'HOME_VAL', 'BLUEBOOK', 'INCOME']:
    raw_values = data_original_test[field]
    stripped = raw_values.astype(str).str.replace(r'[$,]', '', regex=True)
    data_original_test[field] = pd.to_numeric(stripped.where(raw_values.notna())).astype(float)

### Encode string columns using one-hot-encoder

This can be costly in terms of memory. We will first try to encode every categorical column; if we face memory issues when running our classification algorithm, we will drop some columns.

In [48]:
from sklearn.preprocessing import OneHotEncoder

# Dense one-hot encoder that ignores categories it did not see during fit.
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and 1.4
# removed the old name, so try the new keyword first and fall back to the old
# one on releases that do not know it yet.
try:
    enc = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    enc = OneHotEncoder(handle_unknown='ignore', sparse=False)

In [55]:
# One-hot encode the string-valued columns of the training frame
data_original_train = pd.get_dummies(data_original_train)

In [89]:
# One-hot encode the test frame, then align it on the training frame's columns
# (assumes the training frame has already been encoded). A category absent from
# the test split gets an all-zero indicator column and a category unknown to the
# training split is dropped, so both frames expose the same columns in the same
# order.
data_original_test = (
    pd.get_dummies(data_original_test)
    .reindex(columns=data_original_train.columns, fill_value=0)
)

### Handling missing values

As the sklearn implementation of Random Forest and other ML algorithms does not automatically take care of missing values, we have to handle them before fitting any ML algorithm.

In [61]:
# Number of missing values in each column of the training frame
data_original_train.isna().sum()

INDEX                                 0
TARGET_FLAG                           0
TARGET_AMT                            0
KIDSDRIV                              0
AGE                                   6
HOMEKIDS                              0
YOJ                                 454
INCOME                              445
HOME_VAL                            464
TRAVTIME                              0
BLUEBOOK                              0
TIF                                   0
OLDCLAIM                              0
CLM_FREQ                              0
MVR_PTS                               0
CAR_AGE                             510
PARENT1_No                            0
PARENT1_Yes                           0
MSTATUS_Yes                           0
MSTATUS_z_No                          0
SEX_M                                 0
SEX_z_F                               0
EDUCATION_<High School                0
EDUCATION_Bachelors                   0
EDUCATION_Masters                     0


In [62]:
# Total number of missing cells in the training frame
data_original_train.isna().sum().sum()

1879

The above output shows us that between 510 and 1879 rows contain missing values (510 is the largest per-column count, 1879 the total number of missing cells), which corresponds to 6% to 23% of the data. This being a significant proportion of the dataset, we cannot afford to delete the affected rows and thus have to find an ad-hoc solution for each field containing missing values.

- AGE/YOJ

From our understanding of the different features, interpolating these features from the other features would not be sufficiently relevant.

We therefore choose to fill the NaN values of each of these columns with the median of that column.

In [71]:
# Replace the missing AGE / YOJ entries with the median of the respective column
for column in ('AGE', 'YOJ'):
    column_median = data_original_train[column].median()
    data_original_train[column] = data_original_train[column].fillna(column_median)

In [90]:
# Fill the missing AGE / YOJ entries of the test frame with the medians of the
# *training* frame: imputation statistics are estimated on the training data
# only, so the test split goes through the same transformation as the data the
# model is fitted on.
for column in ('AGE', 'YOJ'):
    train_median = data_original_train[column].median()
    data_original_test[column] = data_original_test[column].fillna(train_median)

- INCOME/HOME_VAL/CAR_AGE

From our understanding of the different features, interpolating these features is relevant.

We therefore choose to use the pandas `interpolate` method.

In [77]:
# Linear interpolation only fills forward by default, which leaves a NaN at the
# very top of a column untouched; limit_direction='both' also fills such leading
# gaps from the nearest valid value.
data_original_train = data_original_train.interpolate(method='linear', limit_direction='both')

# The columns targeted by this step must be complete before any model is fitted
assert not data_original_train[['INCOME', 'HOME_VAL', 'CAR_AGE']].isna().any().any()

In [91]:
# Linear interpolation only fills forward by default, which leaves a NaN at the
# very top of a column untouched; limit_direction='both' also fills such leading
# gaps from the nearest valid value.
data_original_test = data_original_test.interpolate(method='linear', limit_direction='both')

# The columns targeted by this step must be complete before predicting
assert not data_original_test[['INCOME', 'HOME_VAL', 'CAR_AGE']].isna().any().any()

### TARGET_AMT

This field is entirely NaN in the test set. As it has 'TARGET' in its name, we can infer that it is entirely dependent on the other variables and thus independent from TARGET_FLAG; we therefore choose to train our model on the dataset excluding the TARGET_AMT field.

(Not sure about this)

# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [105]:
# Columns that are not predictors: the two targets, plus INDEX, which looks like
# a row identifier (1, 2, 4, 5, 6, ... in the preview) and would only let the
# trees split on the row numbering.
NON_FEATURE_COLUMNS = ['INDEX', 'TARGET_FLAG', 'TARGET_AMT']

# Training features and binary label
X = data_original_train.drop(columns=NON_FEATURE_COLUMNS)
y = data_original_train['TARGET_FLAG']

# Shallow forest with a fixed seed so the fit is reproducible
clf = RandomForestClassifier(max_depth=2, random_state=0)
clf.fit(X, y)

# Build the test features from exactly the columns the model was fitted on, in
# the same order; an indicator column that the test split lacks is filled with 0.
X_test = data_original_test.reindex(columns=X.columns, fill_value=0)

# Predicted TARGET_FLAG for every row of the test frame
Y_test = clf.predict(X_test)