In [1]:
import pandas as pd
import numpy as np
import gini

from util_data import DataSet

# added
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostRegressor, BaggingRegressor, GradientBoostingRegressor, RandomForestRegressor
from sklearn.feature_selection import RFECV
import matplotlib.pyplot as plt

# preprocessing
from sklearn.preprocessing import normalize

# Project-local data accessor (from util_data); the cells below read the
# training and testing tables through it.
data = DataSet()

In [2]:
# Load both tables up front; they are handled with pandas below because it
# makes it easier to explore the data and gain insight into it.

train, test = data.get_training_set(), data.get_testing_set()

## Finding the best model

Let's try a few models with their default parameters to find the best regressor.

In [3]:
# Candidate estimator classes to compare; each one is instantiated with its
# default parameters when it is evaluated.
ListOfFunction = [
    LogisticRegression,
    AdaBoostRegressor,
    BaggingRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
]

In [5]:
# Separate the features from the label.
# 'id' is dropped together with 'target' because a row identifier does not
# help the prediction.
features_raw = train.drop(['id', 'target'], axis=1)

# Series.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same NumPy array on every pandas version.
Y = train['target'].values

# Impute missing values with the mean of their column. Building X from
# `features_raw` (instead of overwriting X in place) keeps this cell idempotent.
X = features_raw.fillna(features_raw.mean())

We train on the first 200,000 rows (more than 30% of the training set) and evaluate on the last 50,000 rows.

In [5]:
# Fit every candidate on the first 200,000 rows and score it on the last
# 50,000 rows.
# NOTE: the loop variable keeps the name `func` because later cells in this
# notebook read `func.__name__`.
for func in ListOfFunction:
    rfc = func()
    rfc.fit(X[:200000], Y[:200000])
    if hasattr(rfc, "predict_proba"):
        # Classifier: predict_proba returns one column per class, so keep only
        # the second column as a 1-D score.
        # NOTE(review): assumes 'target' is 0/1 so that column 1 is the
        # positive class - confirm.
        Y_pred = rfc.predict_proba(X[-50000:])[:, 1]
    else:
        # Regressors do not implement predict_proba; predict() already yields
        # a 1-D continuous score.
        Y_pred = rfc.predict(X[-50000:])
    print(func.__name__)
    gini.gini_visualization(Y[-50000:], Y_pred, True)

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

Gradient Boosting clearly seems to be the best model so far.

## Test on Gradient Boosting

Let's try removing some features that are not useful.

In [None]:
# Remove every feature whose name contains 'calc' (see first_examples notebook).
customX = X.drop(X.filter(like='calc').columns, axis=1)

rfc = GradientBoostingRegressor()
# Fit and evaluate on the reduced matrix; fitting on the full X would make
# this experiment identical to the baseline.
rfc.fit(customX[:200000], Y[:200000])
Y_pred = rfc.predict(customX[-50000:])
# Name the model actually fitted here rather than a variable left over from
# another cell.
print(type(rfc).__name__)
gini.gini_visualization(Y[-50000:], Y_pred, True)

Let's vary the number of estimators.

In [None]:
# Remove every feature whose name contains 'calc' (see first_examples notebook).
customX = X.drop(X.filter(like='calc').columns, axis=1)

# Numbers of boosting stages to try, and the Gini value obtained for each one
# (kept in the same order so they can be plotted against each other).
numbers = [10, 20, 40, 60, 80, 100, 150, 200, 300, 400, 500]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    # Fit and evaluate on the reduced matrix, not on the full X.
    rfc.fit(customX[:200000], Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    # Name the model actually fitted here rather than a variable left over
    # from another cell.
    print(type(rfc).__name__ + " for " + str(i) + " classifiers")
    results.append(gini.gini_visualization(Y[-50000:], Y_pred, False))

In [None]:
print("Summary")
# Plot the collected Gini values against the estimator counts that produced them.
fig, ax = plt.subplots()
ax.plot(numbers, results)
ax.set_xlabel('Number of classifiers')
ax.set_ylabel('Gini')
plt.show()

## More precise tests

Try every combination: with and without removing the non-useful features, and with and without normalization.

In [None]:
# Case 1: feature removal + normalization
customX = X.drop(X.filter(like='calc').columns, axis=1)
# NOTE(review): sklearn's normalize() defaults to axis=1, i.e. it rescales each
# row (sample) to unit L2 norm, not each column - confirm this is intended.
# The result is a NumPy array, no longer a DataFrame.
customX = normalize(customX)

numbers = [60, 80, 100, 150]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    rfc.fit(customX[:200000], Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    # Label the output with the model fitted in this loop instead of a `func`
    # variable left over from another cell.
    print(type(rfc).__name__ + " for " + str(i) + " classifiers")
    results.append(gini.gini_visualization(Y[-50000:], Y_pred, False))

In [None]:
# Case 2: no feature removal + normalization
# NOTE(review): sklearn's normalize() defaults to axis=1, i.e. it rescales each
# row (sample) to unit L2 norm, not each column - confirm this is intended.
customX = normalize(X)

numbers = [60, 80, 100, 150]  # these seem to be the best parameters
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    rfc.fit(customX[:200000], Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    # Label the output with the model fitted in this loop instead of a `func`
    # variable left over from another cell.
    print(type(rfc).__name__ + " for " + str(i) + " classifiers")
    results.append(gini.gini_visualization(Y[-50000:], Y_pred, False))

In [None]:
# Case 3: feature removal + no normalization
customX = X.drop(X.filter(like='calc').columns, axis=1)

numbers = [60, 80, 100, 150]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    rfc.fit(customX[:200000], Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    # Label the output with the model fitted in this loop instead of a `func`
    # variable left over from another cell.
    print(type(rfc).__name__ + " for " + str(i) + " classifiers")
    results.append(gini.gini_visualization(Y[-50000:], Y_pred, False))

In [None]:
# Case 4: no feature removal, no normalization (baseline on the full X)
customX = X

numbers = [60, 80, 100, 150]
results = []

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    rfc.fit(customX[:200000], Y[:200000])
    Y_pred = rfc.predict(customX[-50000:])
    # Label the output with the model fitted in this loop instead of a `func`
    # variable left over from another cell.
    print(type(rfc).__name__ + " for " + str(i) + " classifiers")
    results.append(gini.gini_visualization(Y[-50000:], Y_pred, False))

## More feature removal?

In [None]:
# Column groups that the following cells try dropping.
# NOTE(review): judging by the names, the first group is features correlated
# with others and the second is features with many missing values - the
# analysis behind these lists is not shown here.
correlated_features = [
    "ps_reg_03",
    "ps_car_13",
]
lacunar_features = [
    "ps_car_03_cat",
    "ps_car_05_cat",
]

# Estimator class whose name is printed by the sweeps below.
func = GradientBoostingRegressor

In [None]:
# Drop the 'calc' columns first, then the correlated features.
without_calc = X.drop(X.filter(like='calc').columns, axis=1)
customX = without_calc.drop(correlated_features, axis=1)

numbers = [60, 80, 100, 150, 200]
results = []

# First 200,000 rows are used for fitting, last 50,000 rows for scoring.
fit_rows = slice(None, 200000)
eval_rows = slice(-50000, None)

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    rfc.fit(customX[fit_rows], Y[fit_rows])
    Y_pred = rfc.predict(customX[eval_rows])
    print(func.__name__ + " for "+str(i)+" classifiers")
    score = gini.gini_visualization(Y[eval_rows], Y_pred, False)
    results.append(score)

In [None]:
# Drop the 'calc' columns first, then the lacunar features.
without_calc = X.drop(X.filter(like='calc').columns, axis=1)
customX = without_calc.drop(lacunar_features, axis=1)

numbers = [60, 80, 100, 150, 200]
results = []

# First 200,000 rows are used for fitting, last 50,000 rows for scoring.
fit_rows = slice(None, 200000)
eval_rows = slice(-50000, None)

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    rfc.fit(customX[fit_rows], Y[fit_rows])
    Y_pred = rfc.predict(customX[eval_rows])
    print(func.__name__ + " for "+str(i)+" classifiers")
    score = gini.gini_visualization(Y[eval_rows], Y_pred, False)
    results.append(score)

In [None]:
# Drop the 'calc' columns, then the correlated features, then the lacunar ones.
without_calc = X.drop(X.filter(like='calc').columns, axis=1)
without_correlated = without_calc.drop(correlated_features, axis=1)
customX = without_correlated.drop(lacunar_features, axis=1)

numbers = [60, 80, 100, 150, 200]
results = []

# First 200,000 rows are used for fitting, last 50,000 rows for scoring.
fit_rows = slice(None, 200000)
eval_rows = slice(-50000, None)

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    rfc.fit(customX[fit_rows], Y[fit_rows])
    Y_pred = rfc.predict(customX[eval_rows])
    print(func.__name__ + " for "+str(i)+" classifiers")
    score = gini.gini_visualization(Y[eval_rows], Y_pred, False)
    results.append(score)

Can we remove even more features?

In [None]:
customX = X.drop(X.filter(like='calc').columns, axis=1).drop(correlated_features, axis=1)
disbalance = []

# A binary column is considered unbalanced when either of its two values
# (0 or 1) appears in fewer than this many training rows.
min_rows_per_value = 20000

for col in customX.filter(like='bin').columns:
    # Count rows per value once per column (the count is reused for both
    # values). Reindexing on [0, 1] gives a value that never occurs a count
    # of 0 instead of raising a KeyError.
    rows_per_value = train.groupby(col)['id'].count().reindex([0, 1], fill_value=0)
    if rows_per_value.min() < min_rows_per_value:
        print(col)
        disbalance.append(col)

In [None]:
# Apply every removal decided so far, one group of columns at a time:
# 'calc' columns, correlated features, lacunar features, unbalanced binaries.
customX = X.drop(X.filter(like='calc').columns, axis=1)
for dropped_group in (correlated_features, lacunar_features, disbalance):
    customX = customX.drop(dropped_group, axis=1)

numbers = [60, 80, 100, 150, 200]
results = []

# First 200,000 rows are used for fitting, last 50,000 rows for scoring.
fit_rows = slice(None, 200000)
eval_rows = slice(-50000, None)

for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    rfc.fit(customX[fit_rows], Y[fit_rows])
    Y_pred = rfc.predict(customX[eval_rows])
    print(func.__name__ + " for "+str(i)+" classifiers")
    score = gini.gini_visualization(Y[eval_rows], Y_pred, False)
    results.append(score)

In [None]:
# Same reduced matrix as before: 'calc', correlated, lacunar and unbalanced
# binary columns removed.
customX = X.drop(X.filter(like='calc').columns, axis=1)
for dropped_group in (correlated_features, lacunar_features, disbalance):
    customX = customX.drop(dropped_group, axis=1)
disbalance2 = []

# Collect the categorical columns in which one single category covers more
# than 510,000 training rows. The scan runs over the 'cat' columns of the
# full X, not of customX.
for col in X.filter(like='cat').columns:
    rows_per_category = train[[col, 'id']].groupby([col], as_index=False).count().id
    largest_category = max(rows_per_category)
    if largest_category > 510000:
        print(col)
        disbalance2.append(col)

In [None]:
# Reduced matrix: 'calc', correlated, lacunar and unbalanced binary columns
# removed.
customX = X.drop(X.filter(like='calc').columns, axis=1)
for dropped_group in (correlated_features, lacunar_features, disbalance):
    customX = customX.drop(dropped_group, axis=1)

numbers = [150]
results = []

# This time the model is fitted on every row and scored on those same rows,
# so the Gini reported here is an in-sample figure, not a validation score.
for i in numbers:
    rfc = GradientBoostingRegressor(n_estimators=i)
    rfc.fit(customX, Y)
    Y_pred = rfc.predict(customX)
    print(func.__name__ + " for "+str(i)+" classifiers")
    score = gini.gini_visualization(Y, Y_pred, False)
    results.append(score)

In [None]:
# Build the submission file from the most recently fitted model (`rfc`).

# Apply exactly the same column removals to the test set as to the training
# matrix, so the model sees the same feature set.
testX = (
    test.drop(["id"], axis=1)
    .drop(X.filter(like='calc').columns, axis=1)
    .drop(correlated_features, axis=1)
    .drop(lacunar_features, axis=1)
    .drop(disbalance, axis=1)
)
# The training features were imputed with the training column means, so the
# test features get the same treatment; otherwise predict() fails on NaN input.
# fillna() aligns on column names, so the means of dropped columns are ignored.
testX = testX.fillna(X.mean())
Y_pred = rfc.predict(testX)

# A regressor can output negative values; clip them to 0 before writing.
res = pd.DataFrame(
    {"id": test.id.values, "target": np.clip(Y_pred, 0, None)},
    columns=["id", "target"],
)
res.to_csv("prediction.csv", index=False)


In [7]:
# Recursive feature elimination with cross-validation over all of X,
# removing one feature per step, using a 150-stage gradient boosting model.
rfc = GradientBoostingRegressor(n_estimators=150)
selector = RFECV(estimator=rfc, step=1)
selector.fit(X, Y)


RFECV(cv=None,
   estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.1, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_split=1e-07,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=150,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False),
   n_jobs=1, scoring=None, step=1, verbose=0)

In [8]:
# Rank assigned to each feature by the fitted selector, one entry per column
# of X; per the scikit-learn documentation, rank 1 marks the selected features.
selector.ranking_

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  3,  5,  1,  1,  1,  1,  1,  1,
       16,  1,  1,  1,  1,  6,  1,  1, 12,  1,  1,  1,  1, 13,  1,  1,  1,
        1,  1,  1,  8,  1,  9,  4,  1,  7,  1,  1, 10,  1,  1,  2,  1,  1,
       14, 15, 18, 17, 11, 19])