# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest

from sklearn.model_selection import train_test_split as split_TT
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import f_regression, mutual_info_regression, SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor

from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor

from sklearn.preprocessing import QuantileTransformer
from sklearn.ensemble import AdaBoostRegressor





In [2]:
# Raw training features and targets, read from CSV files in the working directory.
x_train_origin =  pd.read_csv("x_train_wo_outlier_KNN.csv")
y_train_origin = pd.read_csv("y_train_wo_outlier.csv")

# Test features; the 'id' column becomes the index so it is not treated as a feature.
x_test =  pd.read_csv("x_test.csv", delimiter=",", index_col='id')

# Formatting
# No `on=` is passed, so the merge joins on every column name the two frames share
# (presumably only 'id' -- verify against the CSV headers); 'id' is then dropped.
train_data = pd.merge(left=x_train_origin, right=y_train_origin, how='inner').drop(columns=['id'])
# Feature matrix without 'id', and the target taken from column 'y'.
x_train = x_train_origin.drop(columns=['id'])
y = y_train_origin['y']

# Data preprocessing

### Missing Values Imputation

In [3]:
# Impute missing values with per-column medians computed on the training set only,
# and reuse those same medians for the test set. Filling the test set with its own
# medians would apply a different transformation to train and test data.
train_medians = x_train.median()
x_train = x_train.fillna(train_medians)
x_test = x_test.fillna(train_medians)
print(x_train.shape)

(1090, 832)


### Outlier drop

###### We use Isolation Forest

In [43]:
# Flag outliers with an Isolation Forest and keep only the rows predicted as inliers
# (predict() returns 1 for inliers and -1 for outliers).
# NOTE(review): this cell needs a `y_train` aligned row-for-row with `x_train`; reading the
# notebook top to bottom, `y_train` is first assigned in the MODEL section -- confirm the
# intended execution order.
clf = IsolationForest(contamination='auto', random_state=323)
# Fit and predict on the same plain array so both calls see the same input type.
clf.fit(x_train.values)
yesOrNoOutlier = clf.predict(x_train.values)

print("Num lines before drop : ", x_train.shape)
print("Num lines before drop : ", y_train.shape)

# Select inliers with a positional boolean mask. Dropping rows one at a time by the
# enumerate() counter treats a position as an index label, which removes the wrong rows
# (or raises KeyError) as soon as the index is not 0..n-1, e.g. when this cell is re-run.
inlier_mask = yesOrNoOutlier == 1
x_train = x_train.loc[inlier_mask]
y_train = y_train.loc[inlier_mask]

print("Num lines after drop : ", x_train.shape)
print("Num lines after drop : ", y_train.shape)

Num lines before drop :  (1090, 60)
Num lines before drop :  (1090,)
Num lines after drop :  (1028, 60)
Num lines after drop :  (1028,)


### Feature Selection

##### We define all feature selection functions below

In [4]:
def Scale(X):
    """Return ``X`` transformed by a QuantileTransformer fitted on ``X`` itself.

    A new transformer with default settings is created on every call, so nothing
    fitted on one dataset is carried over to another call.
    """
    return QuantileTransformer().fit_transform(X)

#Correlation
def CorrSelector(xy_data, threshold, pairplot_threshold=0.38):
    """Select the columns whose absolute correlation with column 'y' exceeds a threshold.

    Parameters
    ----------
    xy_data : DataFrame holding the features together with a 'y' column.
    threshold : columns with |corr(column, y)| > threshold are selected.
    pairplot_threshold : columns with |corr(column, y)| > pairplot_threshold are
        drawn in the regression pair plot (defaults to the previously hard-coded 0.38).

    Returns
    -------
    (top_features, top_corr) : the index of selected column names and the
        correlation matrix restricted to those columns.

    Side effects: shows a heatmap of ``top_corr`` and draws a seaborn pair plot.
    """
    corr = xy_data.corr()
    # Take the absolute value of the correlation BEFORE comparing. The previous
    # abs(corr['y'] > threshold) applied abs() to a boolean mask, so strongly
    # negative correlations were never selected.
    abs_corr_with_y = abs(corr['y'])
    top_features = corr.index[abs_corr_with_y > threshold]

    ##Displays best features correlation as a table
    #plt.subplots(figsize=(12, 8))
    top_corr = xy_data[top_features].corr()
    sns.heatmap(top_corr, annot=True)
    plt.show()

    #Displays best features as graphs
    col = corr.index[abs_corr_with_y > pairplot_threshold]
    sns.set(style='ticks')
    sns.pairplot(xy_data[col], kind='reg')

    return top_features, top_corr


#Remove Low Variance
def HighVarSelector(X_train, X_test, threshold=.95 * (1 - .95)):
    """Drop low-variance features, fitting the filter on the training set only.

    Parameters
    ----------
    X_train : training features; the variance filter is fitted on these.
    X_test : test features; the columns kept for X_train are kept here too.
    threshold : value passed to ``VarianceThreshold(threshold=...)``. The default
        is the previously hard-coded ``.95 * (1 - .95)``.

    Returns
    -------
    (train_frame, test_frame) : two new DataFrames built from the filtered arrays.
        They are created without explicit index or column labels, so the original
        labels are not carried over.
    """
    selector = VarianceThreshold(threshold=threshold)

    filtered_X_train = selector.fit_transform(X_train)
    filtered_X_test = selector.transform(X_test)
    return pd.DataFrame(filtered_X_train), pd.DataFrame(filtered_X_test)


#KBest Selector with mutual information regression (removes similar data)
def MutualInfoSelector(X_train,X_test,y,num_features):
    """Keep the ``num_features`` best features ranked by mutual information with ``y``.

    The SelectKBest selector is fitted on (X_train, y) and then applied to X_test.
    Returns (train_frame, test_frame), two new DataFrames built from the selected
    arrays (no index or column labels are passed on).
    """
    def seeded_mutual_info(features, target):
        #TODO remove random_state
        return mutual_info_regression(features, target, random_state=50)

    selector = SelectKBest(score_func=seeded_mutual_info, k=num_features)
    train_selected = pd.DataFrame(selector.fit_transform(X_train, y))
    test_selected = pd.DataFrame(selector.transform(X_test))
    return train_selected, test_selected


#GradientBoost
def GBoostSelector(X_train,X_test,y):
    """Select features with a fitted GradientBoostingRegressor via SelectFromModel.

    The regressor is fitted on ``X_train.values`` / ``y.values``; a bar plot of the
    features whose importance exceeds 0.006 is drawn; then the prefit model drives
    SelectFromModel for both X_train and X_test.

    Returns (train_frame, test_frame), two new DataFrames built from the selected arrays.
    """
    #TODO check params, remove random_state
    booster_params = dict(max_depth=20, n_estimators=300, min_samples_leaf=15,
                          min_samples_split=10, random_state=50)
    GBoost = GradientBoostingRegressor(**booster_params)
    GBoost.fit(X_train.values, y.values)

    # Importance table: one row per feature position, most important first.
    ranked = pd.DataFrame(data=GBoost.feature_importances_, columns=['value'])
    ranked['feature'] = np.asarray(ranked.index)
    ranked = ranked.sort_values(axis=0, by='value', ascending=False)
    #TODO vary this param in order to find optimal cut
    shown = ranked[ranked['value'] > 0.006]

    # The plot is informational only; the cut above does not affect the selection below.
    sns.barplot(x=shown['feature'], y=shown['value'])

    gboost_selector = SelectFromModel(GBoost, prefit=True)
    train_selected, test_selected = (
        pd.DataFrame(gboost_selector.transform(part)) for part in (X_train, X_test)
    )
    return train_selected, test_selected

#Recursive Feature Elimination
def RFE_selector(X_train,X_test,y,num_features):
    """Recursive feature elimination driven by a GradientBoostingRegressor.

    Parameters
    ----------
    X_train, X_test : feature sets; the selector is fitted on X_train and then
        applied to X_test.
    y : training target.
    num_features : number of features to keep (``n_features_to_select``).

    Returns
    -------
    (rfe_support, rfe_feature, train_frame, test_frame)
        rfe_support : boolean mask from ``get_support()``.
        rfe_feature : names of the kept columns when X_train has ``.columns``,
            otherwise the kept column positions.
        train_frame, test_frame : new DataFrames built from the reduced arrays.
    """
    estimator = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=4, min_samples_leaf=15, min_samples_split=10,random_state=50)
    # step=0.1 is a fraction: each round removes 10% of the features (see sklearn RFE docs).
    rfe_selector = RFE(estimator=estimator, n_features_to_select=num_features, step=0.1, verbose=1)
    x_train_rfe = rfe_selector.fit_transform(X_train, y)
    x_test_rfe = rfe_selector.transform(X_test)
    rfe_support = rfe_selector.get_support()
    # Read the names from the X_train argument. The previous code used the notebook-global
    # `x_train`, which only matched when the caller happened to pass that same variable.
    if hasattr(X_train, 'columns'):
        rfe_feature = X_train.columns[rfe_support].tolist()
    else:
        rfe_feature = [position for position, kept in enumerate(rfe_support) if kept]
    return rfe_support, rfe_feature, pd.DataFrame(x_train_rfe),pd.DataFrame(x_test_rfe)


#LightGBM
def LightGBM(X_train,X_test,y,num_features):
    """Select features with SelectFromModel wrapped around an LGBMClassifier.

    ``num_features`` is accepted but not used (the ``max_features`` variant is
    commented out below). ``X_train`` must expose ``.columns``.

    NOTE(review): the estimator is a classifier, so ``y`` is treated as class labels --
    confirm this is intended for the target used in this notebook.

    Returns (support_mask, kept_column_names, train_frame, test_frame).
    """
    print("GBM")
    lgbm_params = dict(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
                       reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)
    selector = SelectFromModel(LGBMClassifier(**lgbm_params))
    #selector = SelectFromModel(LGBMClassifier(**lgbm_params), max_features=num_features)
    selector.fit(X_train, y)

    keep_mask = selector.get_support()
    kept_names = X_train.columns[keep_mask].tolist()

    train_selected = pd.DataFrame(selector.transform(X_train))
    test_selected = pd.DataFrame(selector.transform(X_test))
    return keep_mask, kept_names, train_selected, test_selected

#Chi squared test
def Chi_selector(X,y,num_features):
    """Pick the ``num_features`` best columns of ``X`` according to a chi-squared test.

    ``X`` is min-max scaled before scoring (the chi2 test requires normalized
    values); the names are read from ``X.columns``.

    Returns (support_mask, kept_column_names).
    """
    print("Chi")
    # Normalize values as required for the chi2 test
    scaled = MinMaxScaler().fit_transform(X)
    selector = SelectKBest(chi2, k=num_features)
    selector.fit(scaled, y)
    keep_mask = selector.get_support()
    return keep_mask, X.columns[keep_mask].tolist()



#Lasso : SelectFromModel (L1 norm)
def LassoModel(X,y,num_features):
    """Select at most ``num_features`` columns with an L1-penalised LogisticRegression.

    ``X`` is min-max scaled, then SelectFromModel is fitted on the scaled data;
    the names are read from ``X.columns``.

    NOTE(review): LogisticRegression is a classifier, so ``y`` is treated as class
    labels -- confirm this is intended for the target used in this notebook.

    Returns (support_mask, kept_column_names).
    """
    print("Lasso")
    scaled = MinMaxScaler().fit_transform(X)
    l1_model = LogisticRegression(penalty="l1",solver='liblinear')
    selector = SelectFromModel(l1_model, max_features=num_features)
    selector.fit(scaled, y)
    keep_mask = selector.get_support()
    return keep_mask, X.columns[keep_mask].tolist()


#Tree-based : SelectFromModel
def TreebasedModel(X,y,num_features):
    """Select at most ``num_features`` columns using a 100-tree RandomForestClassifier.

    SelectFromModel is fitted directly on ``X`` (no scaling); the names are read
    from ``X.columns``.

    NOTE(review): the forest is a classifier, so ``y`` is treated as class labels --
    confirm this is intended for the target used in this notebook.

    Returns (support_mask, kept_column_names).
    """
    print("TBM")
    forest = RandomForestClassifier(n_estimators=100)
    selector = SelectFromModel(forest, max_features=num_features)
    selector.fit(X, y)
    keep_mask = selector.get_support()
    return keep_mask, X.columns[keep_mask].tolist()


### Feature selection takes place here

In [5]:
# Feature-selection pipeline: HighVarSelector -> MutualInfoSelector -> RFE_selector,
# then export of the reduced train/test feature sets to CSV.
# NOTE(review): x_train / x_test are reassigned by every stage, so re-running this cell
# without first re-running the loading cells applies the selection to already-reduced data.

#Set display options
pd.set_option('display.max_rows', 100)

#Init variables (number of retained top features)
# `num_features` and `feature_name` are only referenced by the commented-out
# experiments further down in this cell.
num_features=60
feature_name = x_train.columns.tolist()

print ("Shape before Feature Selection: {} ".format(x_train.shape))
print ("Test shape before Feature Selection: {} ".format(x_test.shape))

### Experiment log: pipeline variant -> name of the exported "dev" dataset
### HighVar -> MutualInfo -> GBoost dev2
### HighVar -> MutualInfo -> RFE dev3
### dev4 MutualInfo only
### dev5 GBoost only
### dev6 RFE only
### dev7 scale -> mutualinfo -> LightGBM -> gboost -> rfe
### dev8 scale -> var -> mutualinfo600 -> rfe 80
### dev9 scale -> var -> mutual info700 -> rfe 250
### dev10 highvar 0.8 -> mutualinfo -> RFE 100


###Scaling (disabled)
#x_train = Scale(x_train)
#x_test = Scale(x_test)
#print ("Train shape after Scaling: {} ".format(x_train.shape))
#print ("Test shape after Scaling: {} ".format(x_test.shape))

###Correlation not used
#top_feat, top_corr = CorrSelector(train_data,0.3)
#print ("Shape after CorrSelector: {} ".format(x_train.shape))

###HighVar
# Drops low-variance columns; the filter is fitted on x_train and applied to x_test.
x_train, x_test = HighVarSelector(x_train,x_test)
print ("Train shape after HighVarSelector: {} ".format(x_train.shape))
print ("Test shape after HighVarSelector: {} ".format(x_test.shape))

###Mutual Info
# Keeps the 600 features with the highest mutual information with y.
x_train, x_test = MutualInfoSelector(x_train, x_test, y,600)
print ("Train shape after MutualInfoSelector: {} ".format(x_train.shape))
print ("Test shape after MutualInfoSelector: {} ".format(x_test.shape))

#LightGBM (disabled)
#lgbm_support,_,x_train,x_test = LightGBM(x_train,x_test,y,num_features) #num features unused
#print ("Train shape after LGBMSelector: {} ".format(x_train.shape))
#print ("Test shape after LGBMSelector: {} ".format(x_test.shape))

###GBoost (disabled)
#x_train,x_test = GBoostSelector(x_train,x_test, y)
#print ("Train shape after GBoostSelector: {} ".format(x_train.shape))
#print ("Test shape after GBoostSelector: {} ".format(x_test.shape))


###RFESelector
# Recursive feature elimination down to 100 features; the returned feature-name list is discarded.
rfe_support,_,x_train,x_test = RFE_selector(x_train,x_test,y,100)
print ("Train shape after RFESelector: {} ".format(x_train.shape))
print ("Test shape after RFESelector: {} ".format(x_test.shape))




# Export the reduced feature sets; the file suffix names the experiment (see log above).
x_train.to_csv('x_train_dev10.csv', index=False)
x_test.to_csv('x_test_dev10.csv', index=False)


###CHI

###RFE


###Lasso


###TreeBased


###LightGBM



#chi_support,_ = Chi_selector(X,y,num_features)
#rfe_support,_ = RFE_selector(X,y,num_features)
#embeded_lr_support,_ = LassoModel(X,y,num_features)
#embeded_rf_support,_ = TreebasedModel(X,y,num_features)
#embeded_lgb_support,_ = LightGBM(X,y,num_features)

# put all selection together
#feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})
# count the selected times for each feature
#feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
#feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
#feature_selection_df.index = range(1, len(feature_selection_df)+1)
#feature_selection_df.head(num_features)
#Count the selected times for each feature and sorts them

Shape before Feature Selection: (1090, 832) 
Test shape before Feature Selection: (776, 832) 
Train shape after HighVarSelector: (1090, 721) 
Test shape after HighVarSelector: (776, 721) 
Train shape after MutualInfoSelector: (1090, 600) 
Test shape after MutualInfoSelector: (776, 600) 
Fitting estimator with 600 features.
Fitting estimator with 540 features.
Fitting estimator with 480 features.
Fitting estimator with 420 features.
Fitting estimator with 360 features.
Fitting estimator with 300 features.
Fitting estimator with 240 features.
Fitting estimator with 180 features.
Fitting estimator with 120 features.
Train shape after RFESelector: (1090, 100) 
Test shape after RFESelector: (776, 100) 


# MODEL

In [62]:
from sklearn.metrics import r2_score

#loading different data
# Replaces the in-memory x_train / x_test with a previously exported feature set
# (the "dev6" files); y_train is the 'y' column of the training-target CSV.
x_train =  pd.read_csv("x_train_dev6.csv")
y_train = pd.read_csv("y_train_wo_outlier.csv")['y']

x_test = pd.read_csv("x_test_dev6.csv")

# Sanity check: features and targets should have the same number of rows.
print(x_train.shape)
print(y_train.shape)

(1090, 60)
(1090,)


In [63]:
# Flag outliers with an Isolation Forest and keep only the rows predicted as inliers
# (predict() returns 1 for inliers and -1 for outliers).
clf = IsolationForest(contamination='auto', random_state=323)
# Fit and predict on the same plain array so both calls see the same input type.
clf.fit(x_train.values)
yesOrNoOutlier = clf.predict(x_train.values)

print("Num lines before drop : ", x_train.shape)
print("Num lines before drop : ", y_train.shape)

# Select inliers with a positional boolean mask. Dropping rows one at a time by the
# enumerate() counter treats a position as an index label, which removes the wrong rows
# (or raises KeyError) as soon as the index is not 0..n-1, e.g. when this cell is re-run.
inlier_mask = yesOrNoOutlier == 1
x_train = x_train.loc[inlier_mask]
y_train = y_train.loc[inlier_mask]

print("Num lines after drop : ", x_train.shape)
print("Num lines after drop : ", y_train.shape)

Num lines before drop :  (1090, 60)
Num lines before drop :  (1090,)
Num lines after drop :  (1028, 60)
Num lines after drop :  (1028,)


In [64]:
n_folds = 10

def r2_cv(model):
    """Return the per-fold R^2 scores of ``model`` under shuffled K-fold cross-validation.

    Uses the notebook-level ``x_train`` / ``y_train`` and ``n_folds`` folds.

    Parameters
    ----------
    model : estimator passed unchanged to ``cross_val_score``.

    Returns
    -------
    Whatever ``cross_val_score`` returns for ``scoring="r2"`` (one score per fold).
    """
    # Pass the KFold splitter itself. Calling .get_n_splits() on it yields only the integer
    # fold count, so cross_val_score would build its own folds and the shuffle/random_state
    # settings would be silently discarded.
    kf = KFold(n_folds, shuffle=True, random_state=42)
    r2 = cross_val_score(model, x_train.values, y_train.values, scoring="r2", cv=kf)
    return r2

### Splitting the dataset

In [65]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the training rows for local evaluation.
# NOTE(review): the hold-out is stored in x_test / y_test, replacing the x_test loaded from
# CSV above, and x_train / y_train are overwritten too, so re-running this cell splits the
# already-split data again.
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.33, random_state=5)

#When producing the NN submission ("rendu") we train on x_train y_train and get output from x_test

# Trying out multiple regressors

### Gradient Boosting

In [66]:
###Determine best params
# Exhaustive search over the grid below with 10-fold cross-validation, scored with R^2.
grid={
    "n_estimators":[100,300],
    "learning_rate":[0.1],
    "max_depth":list(range(3, 8)),
    "max_features":[0.1, 0.3, 0.5, 0.8]}
GBoost = GradientBoostingRegressor(min_samples_leaf=15, min_samples_split=10, random_state =5)
logreg_cv=GridSearchCV(GBoost,grid,cv=10, scoring='r2', verbose=2,n_jobs=6)
logreg_cv.fit(x_train.values, y_train.values)

# best_score_ is the mean cross-validated score of the best candidate under scoring='r2',
# so it is reported as R^2 rather than as an "accuracy".
print("tuned hyperparameters :(best parameters) ",logreg_cv.best_params_)
print("best CV r2 :",logreg_cv.best_score_)

Fitting 10 folds for each of 40 candidates, totalling 400 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    2.0s


WorkerInterrupt: 

In [67]:
# Best grid-search parameters recorded per feature set; "accuracy" in these notes is the
# value printed from GridSearchCV.best_score_ (scoring='r2').
#Params dev6 : 'learning_rate': 0.1, 'max_depth': 3, 'max_features': 0.1, 'n_estimators': 300}, accuracy : 0.5343144785026357
#run 2 : 
#Params dev9 : 'learning_rate': 0.1, 'max_depth': 4, 'max_features': 0.1, 'n_estimators': 300}, accuracy : 0.5726071179087822
#Params dev3 : 'learning_rate': 0.1, 'max_depth': 5, 'max_features': 0.3, 'n_estimators': 300 , accuracy : 0.6028442048688526
#params dev10 :'learning_rate': 0.1, 'max_depth': 6, 'max_features': 0.5, 'n_estimators': 300} accuracy : 0.5754784812686855

gBoost = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1,
                                   max_depth=4, max_features=0.1 ,
                                   min_samples_leaf=15, min_samples_split=10,
                                   random_state =50)

# Fit on the training rows, then predict both sets; the train predictions are reused
# later as stacking inputs for the neural net.
gBoost.fit(x_train.values, y_train.values)
y_gBoost_train = gBoost.predict(x_train.values)
y_gBoost_test = gBoost.predict(x_test.values)  # kept for the submission

# R^2 on x_test / y_test.
print(r2_score(y_test, y_gBoost_test))

0.5700541242391217


### Random Forest

In [8]:
###Determine best params
# Exhaustive search over the grid below with 10-fold cross-validation, scored with R^2.
grid={
    "n_estimators":[100,300],
    "max_depth":list(range(11, 15)),
    "max_features":[0.1,0.3,0.5,0.7]
}
# max_features='sqrt' is only a starting value: the grid sets max_features for every candidate.
rf = RandomForestRegressor(random_state=5, max_features='sqrt')
rf_cv=GridSearchCV(rf,grid,cv=10, scoring='r2', verbose=2,n_jobs = 6)
rf_cv.fit(x_train.values, y_train.values)

# best_score_ is the mean cross-validated score of the best candidate under scoring='r2',
# so it is reported as R^2 rather than as an "accuracy".
print("tuned hyperparameters :(best parameters) ",rf_cv.best_params_)
print("best CV r2 :",rf_cv.best_score_)

Fitting 10 folds for each of 32 candidates, totalling 320 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:    3.3s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:   36.9s
[Parallel(n_jobs=6)]: Done 320 out of 320 | elapsed:  1.5min finished


tuned hpyerparameters :(best parameters)  {'max_depth': 12, 'max_features': 0.3, 'n_estimators': 100}
accuracy : 0.5903424228525121


In [69]:
# Best grid-search parameters recorded per feature set; the trailing numbers are the
# values printed from GridSearchCV.best_score_ (scoring='r2').
#Params ok 
#Params dev6 :{'max_depth': 13, 'max_features': 0.3, 'n_estimators': 300},0.5118924822902385
#run2 : 
#Params dev9 :{'max_depth': 13, 'max_features': 0.3, 'n_estimators': 300}, 0.5266540952907695
#Params dev3 : (best parameters)  {'max_depth': 13, 'max_features': 1.0, 'n_estimators': 300},accuracy : 0.5763962183900193
#dev3run2 : (best parameters)  {'max_depth': 12, 'max_features': 0.6, 'n_estimators': 300} accuracy : 0.5764937773744069
#Params dev10 :(best parameters)  {'max_depth': 13, 'max_features': 0.7, 'n_estimators': 300,0.5423908837250347

random_forest = RandomForestRegressor(max_depth=12, max_features=0.3, n_estimators=100, random_state=5)
random_forest.fit(x_train.values, y_train.values)
# Feature importances of the fitted forest (not read again in the visible cells).
f_i = random_forest.feature_importances_

# Predictions on both sets; y_rf_test is scored below.
y_rf_train = random_forest.predict(x_train.values)
y_rf_test = random_forest.predict(x_test.values)

# R^2 on x_test / y_test.
print(r2_score(y_test, y_rf_test))

0.5471111614636097


### XGBoost

In [9]:
###Determining best params
# Exhaustive search over the grid below with 10-fold cross-validation, scored with R^2.
grid={"max_depth":list(range(5, 12)),
      "learning_rate":[0.1,0.3,1]
     }
model_xgb = xgb.XGBRegressor(n_estimators=300, random_state =5, nthread = -1)
model_xgb_cv=GridSearchCV(model_xgb, grid, cv=10, scoring='r2', verbose=2,n_jobs = 6)
# Fit on plain arrays, like every other model in this notebook (the final XGB model is
# also fitted on x_train.values).
model_xgb_cv.fit(x_train.values, y_train.values)

# best_score_ is the mean cross-validated score of the best candidate under scoring='r2',
# so it is reported as R^2 rather than as an "accuracy".
print("tuned hyperparameters :(best parameters) ",model_xgb_cv.best_params_)
print("best CV r2 :",model_xgb_cv.best_score_)

Fitting 10 folds for each of 21 candidates, totalling 210 fits


[Parallel(n_jobs=6)]: Using backend LokyBackend with 6 concurrent workers.
[Parallel(n_jobs=6)]: Done  29 tasks      | elapsed:   10.4s
[Parallel(n_jobs=6)]: Done 150 tasks      | elapsed:   40.5s
[Parallel(n_jobs=6)]: Done 210 out of 210 | elapsed:   46.1s finished


tuned hpyerparameters :(best parameters)  {'learning_rate': 0.1, 'max_depth': 6}
accuracy : 0.6058820857673277


In [70]:
# Best grid-search parameters recorded per feature set; "accuracy" in these notes is the
# value printed from GridSearchCV.best_score_ (scoring='r2').
#Params ok
#Params dev6 :best parameters)  {'max_depth': 6},0.5360371775608017
#Params dev9 : 'max_depth': 5} accuracy : 0.49125569249095324
#Params dev3 : (best parameters)  {'max_depth': 6},accuracy : 0.6015225706221253
#params dev10 : (best parameters)  {'learning_rate': 0.1, 'max_depth': 5} accuracy : 0.5596718320787533


model_xgb = xgb.XGBRegressor(learning_rate=0.1, max_depth=6, 
                              n_estimators=300,
                             random_state =5, nthread = -1)
# Fit on the training rows, then predict both sets; the train predictions are reused
# later as stacking inputs for the neural net.
model_xgb.fit(x_train.values, y_train.values)
y_xgb_train = model_xgb.predict(x_train.values)
y_xgb_test = model_xgb.predict(x_test.values)

# R^2 on x_test / y_test.
print(r2_score(y_test, y_xgb_test))

0.5684779590252672


### Hist Gradient Boosting not used cause n < 10000

In [16]:
###Determining best params 
#grid={"max_depth":[i for i in range(14, 20)]}

#grid = {
# "max_iter": [100,300],
# "learning_rate": [0.1],
# "max_depth" : [3,4,5,6,7,8,9,10],
# }

#est = HistGradientBoostingRegressor(random_state=50)
#logreg_cv=GridSearchCV(est,grid,cv=10, scoring='r2', verbose=2,n_jobs = 8)
#logreg_cv.fit(x_train.values, y_train.values)

#print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
#print("accuracy :",logreg_cv.best_score_)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:    1.7s
[Parallel(n_jobs=8)]: Done 160 out of 160 | elapsed:   30.1s finished


tuned hpyerparameters :(best parameters)  {'learning_rate': 0.1, 'max_depth': 5, 'max_iter': 100}
accuracy : 0.6153539098768438


In [71]:
# Best grid-search parameters recorded per feature set / run; "accuracy" in these notes is
# the value printed from GridSearchCV.best_score_ (scoring='r2').
#Params OK
#Params dev6 : 'max_depth': 16} accuracy : 0.49125569249095324
#run2: (best parameters)  {'learning_rate': 0.1, 'max_depth': 10, 'max_iter': 100} accuracy : 0.6130383995857118
#run3:{'learning_rate': 0.1, 'max_depth': 7, 'max_iter': 100} accuracy : 0.6144288715016057
#run4:learning_rate': 0.1, 'max_depth': 5, 'max_iter': 100} accuracy : 0.6153539098768438
#Params dev9 : 'max_depth': 15} accuracy : 0.5562018718799511
#Params dev3 : (best parameters)  {'max_depth': 14} accuracy : 0.5795827146160243
#params dev10 :best parameters)  {'max_depth': 17} accuracy : 0.5541621105073512

hgBoost = HistGradientBoostingRegressor(max_depth=5,learning_rate=0.1 ,random_state=50, max_iter = 100)

# Fit on the training rows, then predict both sets; the train predictions are reused
# later as stacking inputs for the neural net.
hgBoost.fit(x_train.values, y_train.values)
y_hgbr_train = hgBoost.predict(x_train.values)
y_hgbr_test = hgBoost.predict(x_test.values)

# R^2 on x_test / y_test.
print(r2_score(y_test, y_hgbr_test))

0.5768517933056773


### Adaboost

In [24]:
###Determining best params
#grid={"max_depth":[i for i in range(14, 20)]}

# Single-candidate grid; it is only consumed by the commented-out search below.
grid = {"learning_rate": [3],
       "n_estimators":[2500]}

# Grid search disabled; the values above are hard-coded into the AdaBoost model in the next cell.
#est = AdaBoostRegressor(random_state=50)
#logreg_cv=GridSearchCV(est,grid,cv=10, scoring='r2', verbose=2,n_jobs = 8)
#logreg_cv.fit(x_train.values, y_train.values)

#print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
#print("accuracy :",logreg_cv.best_score_)

Fitting 10 folds for each of 16 candidates, totalling 160 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  25 tasks      | elapsed:   31.0s
[Parallel(n_jobs=8)]: Done 160 out of 160 | elapsed:  2.0min finished


tuned hpyerparameters :(best parameters)  {'learning_rate': 3, 'n_estimators': 2500}
accuracy : 0.5843089821947789


In [72]:
# AdaBoost regressor with the parameters listed in the grid cell above.
ada = AdaBoostRegressor(random_state=5, n_estimators=2500, learning_rate=3)
# Fit on the training rows, then predict both sets; the train predictions are reused
# later as stacking inputs for the neural net.
ada.fit(x_train.values, y_train.values)
y_ada_train = ada.predict(x_train.values)
y_ada_test = ada.predict(x_test.values)
# R^2 on x_test / y_test.
print("Adaboost score ",r2_score(y_test, y_ada_test))


Adaboost score  0.573958174379655


# Neural Net

In [56]:
# Build the neural-net (stacking) inputs: one column per base model, in the order
# [AdaBoost, HistGradientBoosting, GradientBoosting, XGBoost]. Random-forest
# predictions are intentionally left out.
# NOTE(review): the training meta-features are the base models' predictions on the same
# rows they were fitted on -- confirm this in-sample stacking is intended.

# reshape(-1, 1) turns each 1-D prediction vector into a column using its OWN length.
# The previous code sized the XGBoost column from y_rf_train / y_rf_test, which made this
# cell depend on the random-forest cell even though its predictions are not used here.
y_gBoost_train = y_gBoost_train.reshape(-1, 1)
y_xgb_train = y_xgb_train.reshape(-1, 1)
y_hgBoost_train = y_hgbr_train.reshape(-1, 1)
y_adaboost_train = y_ada_train.reshape(-1, 1)

tmp_train = np.concatenate((y_adaboost_train, y_hgBoost_train, y_gBoost_train), axis=1)
x_nntrain = np.concatenate((tmp_train, y_xgb_train), axis=1)


# Same layout for the test-side predictions.
y_gBoost_test = y_gBoost_test.reshape(-1, 1)
y_xgb_test = y_xgb_test.reshape(-1, 1)
y_hgBoost_test = y_hgbr_test.reshape(-1, 1)
y_adaboost_test = y_ada_test.reshape(-1, 1)

tmp_test = np.concatenate((y_adaboost_test, y_hgBoost_test, y_gBoost_test), axis=1)
x_nntest = np.concatenate((tmp_test, y_xgb_test), axis=1)

In [57]:
#from sklearn.neural_network import MLPClassifier

#param_grid = [
#        {
#            'activation' : ['identity', 'logistic', 'tanh', 'relu'],
#            'solver' : ['lbfgs', 'sgd', 'adam'],
#            'hidden_layer_sizes': [
#             (1,),(2,),(3,),(4,),(5,),(6,),(7,),(8,),(9,),(10,),(11,), (12,),(13,),(14,),(15,),(16,),(17,),(18,),(19,),(20,),(21,)
#             ]
#        }
#       ]
#clf = GridSearchCV(MLPClassifier(max_iter = 1000), param_grid, cv=10,
#                           scoring='r2')
#clf.fit(x_nntrain, y_train.values)


#print("Best parameters set found on development set:")
#print(clf.best_params_)

In [58]:
from sklearn.neural_network import MLPRegressor

# Meta-model: an MLP trained on the stacked base-model predictions (x_nntrain).
nnet = MLPRegressor(random_state=50,max_iter = 1000)
nnet.fit(x_nntrain, y_train.values)

# Predictions of the stacked model from the test-side base-model predictions.
y_nnet_test = nnet.predict(x_nntest)
# Scoring disabled; it needs y_test, which is only assigned by the train/validation split cell.
#print("NN score",r2_score(y_test, y_nnet_test))

### Test Results

### Simple Average of multiple regressors

In [59]:
#loading different data

#from sklearn.metrics import r2_score
#from sklearn.model_selection import train_test_split

#x_train =  pd.read_csv("x_train_dev3.csv")
#x_test =  pd.read_csv("x_test_dev3.csv")

#y_train = pd.read_csv("y_train.csv")['y']


#x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.33, random_state=5)

#When doing rendu NN we train on x_train y_train and get output from x_test



In [75]:

from sklearn.ensemble import VotingRegressor
# Earlier ensemble variants, kept for reference:
#votingRegressor = VotingRegressor([('hist',hgBoost),('AdaBoost',ada)])
#votingRegressor = VotingRegressor([('hist',hgBoost), ('model_xgb', model_xgb),('GBoost',gBoost),('Adaboost',ada)])
# Current ensemble: all five base regressors configured in the cells above.
votingRegressor = VotingRegressor([('hist',hgBoost),('est', random_forest), ('model_xgb', model_xgb),('GBoost',gBoost),('Adaboost',ada)])



votingRegressor.fit(x_train.values, y_train.values)
y_test_predict = votingRegressor.predict(x_test.values)  # final values (used for the submission)
# R^2 on x_test / y_test.
print(r2_score(y_test, y_test_predict))



0.589974681100122


In [61]:
# Build the submission table from the ensemble predictions: an 'id' column taken from the
# row position followed by the prediction column 'y'.
# NOTE(review): ids are simply 0..n-1 -- confirm that matches the ids expected for x_test.
y_test_df = pd.DataFrame(data=y_test_predict)
#y_test_df = pd.DataFrame(data=y_nnet_test)
y_test_df.columns = ["y"]
y_test_df.insert(0, "id", y_test_df.index)

# Quick range check of ids and predictions before writing the file.
for column_summary in (y_test_df.min(), y_test_df.max()):
    print(column_summary)

y_test_df.to_csv("y_predict_sub26.csv", index=False)

id     0.000000
y     49.555981
dtype: float64
id    775.000000
y      85.736298
dtype: float64


In [None]:
#0.57 sub18
#sub20 voting dev3 test 0.615 (nn 0.614)
#sub20 best params
#sub21 dev 3 best params voting reg 0.612845281037929
#sub22 dev10 
#sub23 dev6 + outlier nn
#sub24 dev6 + outlier votingregr
#sub25 dev6 votingregr without random forest, with ada
#sub26 tuned hyperparams, everything but random forest

In [85]:
# Compare earlier submission files with each other: R^2 between two prediction vectors
# shows how closely the two submissions agree.
subx =  pd.read_csv("y_predict_sub11.csv")['y']
sub20 =  pd.read_csv("y_predict_sub20.csv")['y']
sub25 =  pd.read_csv("y_predict_sub25.csv")['y']
sub26 =  pd.read_csv("y_predict_sub26.csv")['y']


# Agreement of each later submission with sub11.
print(r2_score(subx, sub20))
print(r2_score(subx, sub25))
print(r2_score(subx, sub26))

# Agreement of sub26 and sub20 with sub25.
print(r2_score(sub25, sub26))
print(r2_score(sub25, sub20))

# The stray bare names `sub2520` and `sub` that used to end this cell were removed:
# neither is defined anywhere, so they raised NameError after the prints.





0.8671166507391169
0.8883234977122186
0.8862075169479747
0.9884511418426016
0.9071230852896086
