# Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.ensemble import IsolationForest

from sklearn.model_selection import train_test_split as split_TT
from sklearn import linear_model
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.feature_selection import f_regression, mutual_info_regression, SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor

from xgboost import XGBRegressor
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.neural_network import MLPRegressor

from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor



In [2]:
x_train_origin =  pd.read_csv("x_train.csv")
y_train_origin = pd.read_csv("y_train.csv")

x_test =  pd.read_csv("x_test.csv", delimiter=",", index_col='id')

#formatting
train_data = pd.merge(left=x_train_origin, right=y_train_origin, how='inner').drop(columns=['id'])
x_train = x_train_origin.drop(columns=['id'])
y = y_train_origin['y']

# Data preprocessing

### Missing Values Imputation

In [16]:
x_train = x_train.fillna(x_train.median())
x_test = x_test.fillna(x_test.median())

### Outlier drop

###### We use Isolation Forest

In [22]:
clf = IsolationForest(contamination='auto', random_state=323)
clf.fit(x_train)
yesOrNoOutlier = clf.predict(x_train.values)

print("Num lines before drop : ", x_train.shape)

for num,line in enumerate(yesOrNoOutlier):
    if not line == 1:
        x_train = x_train.drop([num])

print("Num lines after drop : ", x_train.shape)

Num lines before drop :  (1191, 832)
Num lines after drop :  (1191, 832)


### Feature Selection

##### We define all feature selection functions below

In [6]:
#Correlation
def CorrSelector(xy_data,threshold):
    corr = xy_data.corr()
    top_features = corr.index[abs(corr['y']>threshold)]
    
    ##Displays best features correlation as a table
    #plt.subplots(figsize=(12, 8))
    top_corr = xy_data[top_features].corr()
    sns.heatmap(top_corr, annot=True)
    plt.show()
    
    #Displays best features as graphs
    col = corr.index[abs(corr['y']>0.38)]
    sns.set(style='ticks')
    sns.pairplot(xy_data[col],kind='reg')
    
    return top_features, top_corr


#Remove Low Variance
def HighVarSelector(X_train, X_test):
    threshold = VarianceThreshold(threshold=(.90 * (1 - .90)))
    
    filtered_X_train = threshold.fit_transform(X_train)
    filtered_X_test = threshold.transform(X_test)
    return pd.DataFrame(filtered_X_train), pd.DataFrame(filtered_X_test)


#KBest Selector with mutual information regression (removes similar data)
def MutualInfoSelector(X_train,X_test,y,num_features):
    #TODO remove random_state
    k_best = SelectKBest(score_func=lambda X,y:mutual_info_regression(X, y, random_state=50), k=num_features)
    X_train_kbest = k_best.fit_transform(X_train,y)
    X_test_kbest = k_best.transform(X_test)
    return pd.DataFrame(X_train_kbest),pd.DataFrame(X_test_kbest)


#GradientBoost
def GBoostSelector(X_train,X_test,y):
    #TODO check params, remove random_state
    GBoost = GradientBoostingRegressor(max_depth=20, n_estimators=300, min_samples_leaf=15, min_samples_split=10, random_state =50)
    GBoost.fit(X_train.values, y.values)
    
    #Top features selection, sorts them by importance
    top_df = pd.DataFrame(data=GBoost.feature_importances_, columns=['value'])
    top_df['feature'] = np.asarray(top_df.index)
    top_df = top_df.sort_values(axis=0, by='value', ascending=False)
    #TODO vary this param in order to find optimal cut
    top_df = top_df[top_df['value'] > 0.006]

    #Displays best features and their score
    sns.barplot(x=top_df['feature'], y=top_df['value'])
    
    gboost_selector = SelectFromModel(GBoost, prefit=True)
    X_train_bestfeatures = gboost_selector.transform(X_train)
    X_test_bestfeatures = gboost_selector.transform(X_test)
    
    return pd.DataFrame(X_train_bestfeatures),pd.DataFrame(X_test_bestfeatures)


#Chi squared test
def Chi_selector(X,y,num_features):
    print("Chi")
    #Normalize values as required for Chi2 test
    X_norm = MinMaxScaler().fit_transform(X)
    chi_selector = SelectKBest(chi2, k=num_features)
    chi_selector.fit(X_norm, y)
    chi_support = chi_selector.get_support()
    chi_feature = X.loc[:,chi_support].columns.tolist()
    return chi_support, chi_feature


#Recursive Feature Elimination
def RFE_selector(X_train,X_test,y,num_features):
    estimator = GradientBoostingRegressor(n_estimators=200, learning_rate=0.1, max_depth=3, min_samples_leaf=15, min_samples_split=10,random_state=50)
    rfe_selector = RFE(estimator=estimator, n_features_to_select=num_features, step=0.1, verbose=1)
    x_train_rfe = rfe_selector.fit_transform(X_train, y)
    x_test_rfe = rfe_selector.transform(X_test)
    rfe_support = rfe_selector.get_support()
    rfe_feature = x_train.loc[:,rfe_support].columns.tolist()
    return rfe_support, rfe_feature, pd.DataFrame(x_train_rfe),pd.DataFrame(x_test_rfe)


#Lasso : SelectFromModel (L1 norm)
def LassoModel(X,y,num_features):
    print("Lasso")
    X_norm = MinMaxScaler().fit_transform(X)
    embeded_lr_selector = SelectFromModel(LogisticRegression(penalty="l1",solver='liblinear'), max_features=num_features)
    embeded_lr_selector.fit(X_norm, y)
    embeded_lr_support = embeded_lr_selector.get_support()
    embeded_lr_feature = X.loc[:,embeded_lr_support].columns.tolist()
    return embeded_lr_support, embeded_lr_feature


#Tree-based : SelectFromModel
def TreebasedModel(X,y,num_features):
    print("TBM")
    embeded_rf_selector = SelectFromModel(RandomForestClassifier(n_estimators=100), max_features=num_features)
    embeded_rf_selector.fit(X, y)
    embeded_rf_support = embeded_rf_selector.get_support()
    embeded_rf_feature = X.loc[:,embeded_rf_support].columns.tolist()
    return embeded_rf_support, embeded_rf_feature


#LightGBM
def LightGBM(X,y,num_features):
    print("GBM")
    lgbc=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)
    embeded_lgb_selector = SelectFromModel(lgbc, max_features=num_features)
    embeded_lgb_selector.fit(X, y)
    embeded_lgb_support = embeded_lgb_selector.get_support()
    embeded_lgb_feature = X.loc[:,embeded_lgb_support].columns.tolist()
    return embeded_lgb_support, embeded_lgb_feature

### Feature selection takes place here

In [7]:
#Set display options
pd.set_option('display.max_rows', 100)

#Init variables (number of retained top features)
num_features=600
feature_name = x_train.columns.tolist()

print ("Shape before Feature Selection: {} ".format(x_train.shape))
print ("Test shape before Feature Selection: {} ".format(x_test.shape))

### HighVar -> MutualInfo -> GBoost dev2
### HighVar -> MutualInfo -> RFE dev3


###Correlation not used
#top_feat, top_corr = CorrSelector(train_data,0.3)
#print "Shape after CorrSelector: {} ".format(x_train.shape)

###HighVar
x_train, x_test = HighVarSelector(x_train,x_test)
print ("Train shape after HighVarSelector: {} ".format(x_train.shape))
print ("Test shape after HighVarSelector: {} ".format(x_test.shape))

###Mutual Info
x_train, x_test = MutualInfoSelector(x_train, x_test, y,num_features)
print ("Train shape after MutualInfoSelector: {} ".format(x_train.shape))
print ("Test shape after MutualInfoSelector: {} ".format(x_test.shape))

###GBoost
#x_train,x_test = GBoostSelector(x_train,x_test, y)
print ("Train shape after GBoostSelector: {} ".format(x_train.shape))
print ("Test shape after GBoostSelector: {} ".format(x_test.shape))

#RFESelector
rfe_support,_,x_train,x_test = RFE_selector(x_train,x_test,y,60)
print ("Train shape after RFESelector: {} ".format(x_train.shape))
print ("Test shape after RFESelector: {} ".format(x_test.shape))



x_train.to_csv('x_train_dev3.csv', index=False)
x_test.to_csv('x_test_dev3.csv', index=False)


###CHI

###RFE


###Lasso


###TreeBased


###LightGBM



#chi_support,_ = Chi_selector(X,y,num_features)
#rfe_support,_ = RFE_selector(X,y,num_features)
#embeded_lr_support,_ = LassoModel(X,y,num_features)
#embeded_rf_support,_ = TreebasedModel(X,y,num_features)
#embeded_lgb_support,_ = LightGBM(X,y,num_features)

# put all selection together
#feature_selection_df = pd.DataFrame({'Feature':feature_name, 'Pearson':cor_support, 'Chi-2':chi_support, 'RFE':rfe_support, 'Logistics':embeded_lr_support,'Random Forest':embeded_rf_support, 'LightGBM':embeded_lgb_support})
# count the selected times for each feature
#feature_selection_df['Total'] = np.sum(feature_selection_df, axis=1)
# display the top 100
#feature_selection_df = feature_selection_df.sort_values(['Total','Feature'] , ascending=False)
#feature_selection_df.index = range(1, len(feature_selection_df)+1)
#feature_selection_df.head(num_features)
#Count the selected times for each feature and sorts them

Shape before Feature Selection: (1212, 832) 
Test shape before Feature Selection: (776, 832) 
Train shape after HighVarSelector: (1212, 666) 
Test shape after HighVarSelector: (776, 666) 
Train shape after MutualInfoSelector: (1212, 600) 
Test shape after MutualInfoSelector: (776, 600) 
Train shape after GBoostSelector: (1212, 600) 
Test shape after GBoostSelector: (776, 600) 
Fitting estimator with 600 features.
Fitting estimator with 540 features.
Fitting estimator with 480 features.
Fitting estimator with 420 features.
Fitting estimator with 360 features.
Fitting estimator with 300 features.
Fitting estimator with 240 features.
Fitting estimator with 180 features.
Fitting estimator with 120 features.
Train shape after RFESelector: (1212, 60) 
Test shape after RFESelector: (776, 60) 


# MODEL

In [25]:
from sklearn.metrics import r2_score

#loading different data
x_train =  pd.read_csv("x_train_dev3.csv")
y_train = pd.read_csv("y_train.csv")['y']

x_test = pd.read_csv("x_test_dev3.csv")

In [26]:
n_folds = 10

def r2_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(x_train.values)
    r2= cross_val_score(model, x_train.values, y_train.values, scoring="r2", cv = kf, )
    return(r2)

### Splitting the dataset

In [5]:
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.33, random_state=5)

#When doing rendu NN we train on x_train y_train and get output from x_test

# Trying out multiple regressors

### Gradient Boosting

In [6]:
###Determine best params
#grid={"n_estimators":[300], "learning_rate":[0.1], "max_depth":[i for i in range(3, 15)], "max_features":[1.0, 0.1, 0.3]}
#GBoost = GradientBoostingRegressor(min_samples_leaf=15, min_samples_split=10, random_state =5)
#logreg_cv=GridSearchCV(GBoost,grid,cv=10, scoring='r2', verbose=2)
#logreg_cv.fit(x_train.values, y_train.values)

#print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
#print("accuracy :",logreg_cv.best_score_)

In [27]:
#Params OK
gBoost = GradientBoostingRegressor(n_estimators=300, learning_rate=0.1,
                                   max_depth=10, max_features=0.1 ,
                                   min_samples_leaf=15, min_samples_split=10,
                                   random_state =50)

gBoost.fit(x_train.values, y_train.values)
y_gBoost_train = gBoost.predict(x_train.values)
y_gBoost_test = gBoost.predict(x_test.values)  #pour le rendu on garde ca

#print(r2_score(y_test, y_gBoost_test))

### Random Forest

In [8]:
###Determine best params
#grid={"n_estimators":[300], "max_depth":[i for i in range(3, 15)], "max_features":[1.0, 0.1, 0.3]}
#rf = RandomForestRegressor(random_state=5, max_features='sqrt')
#rf_cv=GridSearchCV(rf,grid,cv=10, scoring='r2', verbose=2)
#rf_cv.fit(x_train.values, y_train.values)

#print("tuned hpyerparameters :(best parameters) ",rf_cv.best_params_)
#print("accuracy :",rf_cv.best_score_)

In [28]:
#Params ok 
random_forest = RandomForestRegressor(max_depth=14, max_features=0.3, n_estimators=300, random_state=50)
random_forest.fit(x_train.values, y_train.values)
f_i = random_forest.feature_importances_

y_rf_train = random_forest.predict(x_train.values)
y_rf_test = random_forest.predict(x_test.values)

#print(r2_score(y_test, y_rf_test))

### XGBoost

In [10]:
###Determining best params
#grid={"max_depth":[i for i in range(5, 10)]}
#model_xgb = xgb.XGBRegressor(n_estimators=300, learning_rate=0.1, random_state =5, nthread = -1)
#model_xgb_cv=GridSearchCV(model_xgb, grid, cv=10, scoring='r2', verbose=2)
#model_xgb_cv.fit(x_train, y_train.values)
#
#print("tuned hpyerparameters :(best parameters) ",model_xgb_cv.best_params_)
#print("accuracy :",model_xgb_cv.best_score_)

In [29]:
#Params ok
model_xgb = xgb.XGBRegressor(learning_rate=0.1, max_depth=6, 
                              n_estimators=300,
                             random_state =50, nthread = -1)
model_xgb.fit(x_train.values, y_train.values)
y_xgb_train = model_xgb.predict(x_train.values)
y_xgb_test = model_xgb.predict(x_test.values)

#print(r2_score(y_test, y_xgb_test))

### Hist Gradient Boosting

In [12]:
###Determining best params
#grid={"max_depth":[i for i in range(14, 20)]}
#est = HistGradientBoostingRegressor(random_state=50)
#logreg_cv=GridSearchCV(est,grid,cv=10, scoring='r2', verbose=2)
#logreg_cv.fit(x_train.values, y_train.values)
#
#print("tuned hpyerparameters :(best parameters) ",logreg_cv.best_params_)
#print("accuracy :",logreg_cv.best_score_)

In [30]:
#Params OK
hgBoost = HistGradientBoostingRegressor(max_depth=16, random_state=50)

hgBoost.fit(x_train.values, y_train.values)
y_hgbr_test = hgBoost.predict(x_test.values)

#print(r2_score(y_test, y_hgbr_test))

# Neural Net

In [31]:
y_gBoost_train = y_gBoost_train.reshape((y_gBoost_train.shape[0], 1))
y_rf_train = y_rf_train.reshape((y_rf_train.shape[0], 1))
y_xgb_train = y_xgb_train.reshape((y_rf_train.shape[0], 1))

tmp_train = np.concatenate((y_gBoost_train, y_rf_train),axis=1)
x_nntrain = np.concatenate((tmp_train, y_xgb_train), axis=1)


y_gBoost_test = y_gBoost_test.reshape((y_gBoost_test.shape[0], 1))
y_rf_test = y_rf_test.reshape((y_rf_test.shape[0], 1))
y_xgb_test = y_xgb_test.reshape((y_rf_test.shape[0], 1))

tmp_test = np.concatenate((y_gBoost_test, y_rf_test),axis=1)
x_nntest = np.concatenate((tmp_test, y_xgb_test), axis=1)

In [32]:
from sklearn.neural_network import MLPRegressor

nnet = MLPRegressor(random_state=50)
nnet.fit(x_nntrain, y_train.values)

y_nnet_test = nnet.predict(x_nntest)
#print(r2_score(y_test, y_nnet_test))

### Test Results

### Simple Average of multiple regressors

In [22]:
#loading different data$
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

x_train =  pd.read_csv("x_train_dev3.csv")
x_test =  pd.read_csv("x_test_dev3.csv")

y_train = pd.read_csv("y_train.csv")['y']


#x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.33, random_state=5)

#When doing rendu NN we train on x_train y_train and get output from x_test



In [23]:
from sklearn.ensemble import VotingRegressor
votingRegressor = VotingRegressor([('est', hgBoost), ('model_xgb', model_xgb), ('GBoost', gBoost)])

votingRegressor.fit(x_train.values, y_train.values)
y_test_predict = votingRegressor.predict(x_test.values)  # dernieres val
#print(r2_score(y_test, y_test_predict))

In [33]:
#y_test_df = pd.DataFrame(data=y_test_predict)

y_test_df = pd.DataFrame(data=y_nnet_test)
y_test_df.columns = ["y"]
y_test_df["id"] = y_test_df.index
y_test_df = y_test_df[["id", "y"]]

print(y_test_df.min())
print(y_test_df.max())

y_test_df.to_csv("y_predict_sub8.csv", index=False)

id     0.000000
y     48.468132
dtype: float64
id    775.000000
y      87.905548
dtype: float64
