In [35]:
import pandas as pd
import numpy as np
import gini

from util_data import DataSet

# added
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt

# preprocessing
from sklearn.preprocessing import normalize

data = DataSet()

In [36]:
# I think we can use panda as an alternative for analysis, since it helps us gain more insights into the data

train = data.get_training_set()
test = data.get_testing_set()

## Finding the best model

Let's try a few models with standard parameters to find the best regressor

In [37]:
ListOfFunction = [LogisticRegression,AdaBoostRegressor,BaggingRegressor,GradientBoostingRegressor,RandomForestRegressor]

In [38]:
# separate data and label
X = train.drop(['id','target'],axis=1) # drop id and target from X, since "id" wouldn't do much help for prediction
Y = train['target'].as_matrix()
X =X.fillna(X.mean())

We take a training set of size 200 000 (more than 30% of the set)

In [39]:
for func in ListOfFunction:
    rfc = func()
    rfc.fit(X[:200000],Y[:200000])
    Y_pred = rfc.predict_proba(X[-50000:])
    print(func.__name__)
    gini.gini_visualization(Y[-50000:],Y_pred,True)

LogisticRegression
Gini: -0.106, Max. Gini: 0.482, Normalized Gini: -0.221


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

Clearly Gradient Boosting seems best by now

## Test on Gradient Boosting

Let's try to remove some unuseful features

In [None]:
# When there is calc we remove (see first_examples notebook)
customX = X.drop(X.filter(like='calc').columns,axis=1)

rfc = GradientBoostingRegressor()
rfc.fit(X[:200000],Y[:200000])
Y_pred = rfc.predict(X[-50000:])
print(func.__name__)
gini.gini_visualization(Y[-50000:],Y_pred,True)

Let's make change in the number of estimators

In [None]:
# When there is calc we remove (see first_examples notebook)
customX = X.drop(X.filter(like='calc').columns,axis=1)

numbers = [10,20,40,60,80,100,150,200,300,400,500]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators = i)
    rfc.fit(X[:200000],Y[:200000])
    Y_pred = rfc.predict(X[-50000:])
    print(func.__name__ + " for "+str(i)+" classifiers")
    results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

In [None]:
print("Summary")
plt.plot(numbers,results)
plt.xlabel('Number of classifiers')
plt.ylabel('Gini')
plt.show()

## More precised tests

Try with unuseful features and normalization

In [None]:
#Case 1 : Feature removal + Normalization
customX = X.drop(X.filter(like='calc').columns,axis=1)
customX = normalize(customX)

numbers = [60,80,100,150]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators = i)
    rfc.fit(customX[:200000],Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    print(func.__name__ + " for "+str(i)+" classifiers")
    results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

In [None]:
# Case 2 : No feature removal + Normalization
customX = normalize(X)

numbers = [60,80,100,150] # these seems to be the best parameters
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators = i)
    rfc.fit(customX[:200000],Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    print(func.__name__ + " for "+str(i)+" classifiers")
    results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

In [None]:
# Case 3 : Feature removal + No normalization
customX = X.drop(X.filter(like='calc').columns,axis=1)

numbers = [60,80,100,150]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators = i)
    rfc.fit(customX[:200000],Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    print(func.__name__ + " for "+str(i)+" classifiers")
    results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

In [None]:
# Case 4 : No feature removal, no normalization
customX = X

numbers = [60,80,100,150]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators = i)
    rfc.fit(customX[:200000],Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    print(func.__name__ + " for "+str(i)+" classifiers")
    results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

## More features removal ?

In [None]:
correlated_features = ["ps_reg_03","ps_car_13"]
lacunar_features = ["ps_car_03_cat","ps_car_05_cat"]
func = GradientBoostingRegressor

In [None]:
customX = X.drop(X.filter(like='calc').columns,axis=1).drop(correlated_features,axis=1)

numbers = [60,80,100,150,200]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators = i)
    rfc.fit(customX[:200000],Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    print(func.__name__ + " for "+str(i)+" classifiers")
    results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

In [None]:
customX = X.drop(X.filter(like='calc').columns,axis=1).drop(lacunar_features,axis=1)

numbers = [60,80,100,150,200]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators = i)
    rfc.fit(customX[:200000],Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    print(func.__name__ + " for "+str(i)+" classifiers")
    results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

In [None]:
customX = X.drop(X.filter(like='calc').columns,axis=1).drop(correlated_features,axis=1).drop(lacunar_features,axis=1)

numbers = [60,80,100,150,200]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators = i)
    rfc.fit(customX[:200000],Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    print(func.__name__ + " for "+str(i)+" classifiers")
    results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

More and More ?

In [None]:
customX = X.drop(X.filter(like='calc').columns,axis=1).drop(correlated_features,axis=1)
disbalance = []

for col in customX.filter(like='bin').columns:
    if(train[[col,'id']].groupby([col], as_index=False).count().loc[1]['id']< 20000 or train[[col,'id']].groupby([col], as_index=False).count().loc[0]['id']< 20000):
        print(col)
        disbalance.append(col)

In [None]:
customX = X.drop(X.filter(like='calc').columns,axis=1).drop(correlated_features,axis=1).drop(lacunar_features,axis=1).drop(disbalance,axis=1)

numbers = [150]
results = []
func = GradientBoostingRegressor

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators = i)
    rfc.fit(customX[:200000],Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    print(func.__name__ + " for "+str(i)+" classifiers")
    results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

In [40]:
rk = [1, 1, 1, 1, 1, 4, 1, 1, 3, 1, 1, 1, 1, 6, 1, 1, 1, 2, 1, 1, 1, 1, 1,
       5, 1, 1, 1, 1, 1]

customX = X.drop(X.filter(like='calc').columns,axis=1).drop(correlated_features,axis=1).drop(lacunar_features,axis=1).drop(disbalance,axis=1)

        


In [41]:

numbers = [1,3,6]
numbers_c = [100,130,150,170,190,230]
results = []
func = GradientBoostingRegressor

for j in numbers_c:
    selector = RFECV(rfc,step = 1)
    selector.fit(customX[:200000], Y[:200000])
    rk = selector.ranking_
    for i in numbers:
        highrk = []
        for col,r in zip(customX.columns,rk):
            if r>i:
                highrk.append(col)
        custom_X = X.drop(X.filter(like='calc').columns,axis=1).drop(correlated_features,axis=1).drop(lacunar_features,axis=1).drop(disbalance,axis=1).drop(highrk,axis=1)
        rfc = GradientBoostingRegressor(n_estimators = j)
        rfc.fit(custom_X[:200000],Y[:200000])
        Y_pred = rfc.predict(custom_X[-50000:])
        print(func.__name__ + " for "+str(j)+" classifiers")
        results.append(gini.gini_visualization(Y[-50000:],Y_pred,False))

GradientBoostingRegressor for 100 classifiers
Gini: 0.122, Max. Gini: 0.482, Normalized Gini: 0.253
GradientBoostingRegressor for 100 classifiers
Gini: 0.122, Max. Gini: 0.482, Normalized Gini: 0.253
GradientBoostingRegressor for 100 classifiers
Gini: 0.122, Max. Gini: 0.482, Normalized Gini: 0.253
GradientBoostingRegressor for 130 classifiers
Gini: 0.124, Max. Gini: 0.482, Normalized Gini: 0.257
GradientBoostingRegressor for 130 classifiers
Gini: 0.124, Max. Gini: 0.482, Normalized Gini: 0.257
GradientBoostingRegressor for 130 classifiers
Gini: 0.124, Max. Gini: 0.482, Normalized Gini: 0.257
GradientBoostingRegressor for 150 classifiers
Gini: 0.124, Max. Gini: 0.482, Normalized Gini: 0.258
GradientBoostingRegressor for 150 classifiers
Gini: 0.123, Max. Gini: 0.482, Normalized Gini: 0.255
GradientBoostingRegressor for 150 classifiers
Gini: 0.125, Max. Gini: 0.482, Normalized Gini: 0.259
GradientBoostingRegressor for 170 classifiers
Gini: 0.124, Max. Gini: 0.482, Normalized Gini: 0.257
