## Mise en place

In [64]:
import pandas as pd
import numpy as np
import gini

from util_data import DataSet

from sklearn.neighbors import KNeighborsRegressor

# added
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing

data = DataSet()

In [47]:
training_set = data.get_training_set(25000).as_matrix()
testing_set = data.get_training_set(15000).as_matrix()

Ytr, Xtr = training_set[:,1],training_set[:,2:]
Ytst, Xtst = testing_set[:,1],testing_set[:,2:]

## Analyse avec panda

In [48]:
# I think we can use panda as an alternative for analysis, since it helps us gain more insights into the data
train = pd.read_csv("./data/train.csv",delimiter=',')
test = pd.read_csv("./data/test.csv", delimiter=',')

In [49]:
# very pretty summary of each feature
train.describe()

Unnamed: 0,id,target,ps_ind_01,ps_ind_02_cat,ps_ind_03,ps_ind_04_cat,ps_ind_05_cat,ps_ind_06_bin,ps_ind_07_bin,ps_ind_08_bin,...,ps_calc_11,ps_calc_12,ps_calc_13,ps_calc_14,ps_calc_15_bin,ps_calc_16_bin,ps_calc_17_bin,ps_calc_18_bin,ps_calc_19_bin,ps_calc_20_bin
count,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,...,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0,595212.0
mean,743803.6,0.036448,1.900378,1.358943,4.423318,0.416794,0.405188,0.393742,0.257033,0.163921,...,5.441382,1.441918,2.872288,7.539026,0.122427,0.62784,0.554182,0.287182,0.349024,0.153318
std,429367.8,0.187401,1.983789,0.664594,2.699902,0.493311,1.350642,0.488579,0.436998,0.370205,...,2.332871,1.202963,1.694887,2.746652,0.327779,0.483381,0.497056,0.452447,0.476662,0.360295
min,7.0,0.0,0.0,-1.0,0.0,-1.0,-1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,371991.5,0.0,0.0,1.0,2.0,0.0,0.0,0.0,0.0,0.0,...,4.0,1.0,2.0,6.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,743547.5,0.0,1.0,1.0,4.0,0.0,0.0,0.0,0.0,0.0,...,5.0,1.0,3.0,7.0,0.0,1.0,1.0,0.0,0.0,0.0
75%,1115549.0,0.0,3.0,2.0,6.0,1.0,0.0,1.0,1.0,0.0,...,7.0,2.0,4.0,9.0,0.0,1.0,1.0,1.0,1.0,0.0
max,1488027.0,1.0,7.0,4.0,11.0,1.0,6.0,1.0,1.0,1.0,...,19.0,10.0,13.0,23.0,1.0,1.0,1.0,1.0,1.0,1.0


In [50]:
# another util that helps look at the type of each feature
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 595212 entries, 0 to 595211
Data columns (total 59 columns):
id                595212 non-null int64
target            595212 non-null int64
ps_ind_01         595212 non-null int64
ps_ind_02_cat     595212 non-null int64
ps_ind_03         595212 non-null int64
ps_ind_04_cat     595212 non-null int64
ps_ind_05_cat     595212 non-null int64
ps_ind_06_bin     595212 non-null int64
ps_ind_07_bin     595212 non-null int64
ps_ind_08_bin     595212 non-null int64
ps_ind_09_bin     595212 non-null int64
ps_ind_10_bin     595212 non-null int64
ps_ind_11_bin     595212 non-null int64
ps_ind_12_bin     595212 non-null int64
ps_ind_13_bin     595212 non-null int64
ps_ind_14         595212 non-null int64
ps_ind_15         595212 non-null int64
ps_ind_16_bin     595212 non-null int64
ps_ind_17_bin     595212 non-null int64
ps_ind_18_bin     595212 non-null int64
ps_reg_01         595212 non-null float64
ps_reg_02         595212 non-null float64
ps_re

In [51]:
# take out 2 columns "id" and "target", group by target
# among all customers, 21694 claim an insurance, while 573518 do not
train[['target','id']].groupby(['target'], as_index=False).count()

Unnamed: 0,target,id
0,0,573518
1,1,21694


In [52]:
# separate data and label
X = train.drop(['id','target'],axis=1) # drop id and target from X, since "id" wouldn't do much help for prediction
Y = train['target']

In [53]:
# we are told that features ending with 'cat' are categorical ones, we'd try to extract which features are indeed categorical
X.filter(like='cat').columns
# if needed, add .shape at the end to count the number of categorical features (17)

Index(['ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat', 'ps_car_01_cat',
       'ps_car_02_cat', 'ps_car_03_cat', 'ps_car_04_cat', 'ps_car_05_cat',
       'ps_car_06_cat', 'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
       'ps_car_10_cat', 'ps_car_11_cat'],
      dtype='object')

In [54]:
# do the same to extract binary features (14)
X.filter(like='bin').columns

Index(['ps_ind_06_bin', 'ps_ind_07_bin', 'ps_ind_08_bin', 'ps_ind_09_bin',
       'ps_ind_10_bin', 'ps_ind_11_bin', 'ps_ind_12_bin', 'ps_ind_13_bin',
       'ps_ind_16_bin', 'ps_ind_17_bin', 'ps_ind_18_bin', 'ps_calc_15_bin',
       'ps_calc_16_bin', 'ps_calc_17_bin', 'ps_calc_18_bin', 'ps_calc_19_bin',
       'ps_calc_20_bin'],
      dtype='object')

In [55]:
# deal with missing values
# we are told that there are, indeed, missing values in the dataset, and those values are denoted by -1

train02 = train.replace(-1,np.nan) # replace all missing values (-1) by Nan
train02[train02.isnull().any(axis=1)].shape # get all rows that has Nan values

# there are 470 281 rows with missing values

(470281, 59)

In [56]:
X_test = test.drop(['id'], axis=1)

In [60]:
logreg = LogisticRegression()
logreg.fit(X, Y)
Y_pred = logreg.predict(X_test)
acc_log = round(logreg.score(X, Y) * 100, 2)
acc_log
# I receive an accuracy of 96.36 here which is very strange 
# something must be so wrong accuracy couldn't be so high :))

96.359999999999999

In [61]:
gini.gini_visualization(Ytst,Y_pred,True)
# I don't really understand why gini doesn't run here ?
# maybe if the code is running fine on your machine could you please test it :-) ?

NameError: name 'predictions' is not defined

In [65]:
# compute feature importance
X_scaled = preprocessing.scale(X)

clf = ExtraTreesClassifier()
clf.fit(X_scaled,Y)
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
print("Feature ranking:")

for f in range(X.shape[1]):
    print("%d. feature %d of feature %s (%f)" % (f + 1, indices[f], X.columns[f], importances[indices[f]]))

Feature ranking:
1. feature 34 of feature ps_ind_01 (0.030619)
2. feature 50 of feature ps_ind_02_cat (0.030014)
3. feature 46 of feature ps_ind_03 (0.029646)
4. feature 47 of feature ps_ind_04_cat (0.029359)
5. feature 49 of feature ps_ind_05_cat (0.029200)
6. feature 2 of feature ps_ind_06_bin (0.028719)
7. feature 43 of feature ps_ind_07_bin (0.028684)
8. feature 14 of feature ps_ind_08_bin (0.028659)
9. feature 38 of feature ps_ind_09_bin (0.028607)
10. feature 20 of feature ps_ind_10_bin (0.028531)
11. feature 37 of feature ps_ind_11_bin (0.028405)
12. feature 19 of feature ps_ind_12_bin (0.028346)
13. feature 39 of feature ps_ind_13_bin (0.028343)
14. feature 42 of feature ps_ind_14 (0.028144)
15. feature 44 of feature ps_ind_15 (0.028130)
16. feature 45 of feature ps_ind_16_bin (0.027623)
17. feature 40 of feature ps_ind_17_bin (0.027206)
18. feature 35 of feature ps_ind_18_bin (0.027196)
19. feature 41 of feature ps_reg_01 (0.027078)
20. feature 48 of feature ps_reg_02 (0.02681

## Entrainement et Prédiction

In [59]:
clf = KNeighborsRegressor(n_neighbors=15,weights='distance')
clf.fit(Xtr,Ytr)

Ypred = clf.predict(Xtst)

## Analyse

In [None]:
gini.gini_visualization(Ytst,Ypred,True)