In [1]:
pip install xgboost

In [1]:
import numpy as np
import pandas as pd
import random

from sklearn import datasets
from sklearn import linear_model
from sklearn import naive_bayes
from sklearn import neighbors
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import ExtraTreesRegressor, IsolationForest
from sklearn.ensemble import RandomForestRegressor, StackingRegressor, StackingRegressor
from sklearn.linear_model import LinearRegression, Ridge, RidgeCV, ElasticNet, Lasso, LassoCV

from sklearn.feature_selection import SelectFromModel
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_regression
import xgboost as xgb
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.decomposition import PCA
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl


plt.style.use('fivethirtyeight') 
mpl.rcParams['lines.linewidth'] = 2
# mpl.rcParams['axes.labelsize'] = 14
# mpl.rcParams['xtick.labelsize'] = 12
# mpl.rcParams['ytick.labelsize'] = 12
mpl.rcParams['text.color'] = 'k'

%matplotlib inline

In [30]:
X_train = pd.read_csv("X_train.csv")
y = pd.read_csv('y_train.csv')
X_test = pd.read_csv('X_test.csv')

In [31]:
for i in X_test.keys():
    X_train[i].fillna(X_train[i].median(), inplace = True)
for i in X_test.keys():
    X_test[i].fillna(X_test[i].median(), inplace = True)

In [32]:
# outlier detection amonge features by interquartile range
outliers_percentage = []
variables = []
for k, v in X_train.items():
    Q1 = v.quantile(0.25)
    Q3 = v.quantile(0.75)
    IRQ = Q3 - Q1
    v_col = v[(v <= Q1 - 1.5 * IRQ) | (v >= Q3 + 1.5 * IRQ)]
    perc = np.shape(v_col)[0] * 100.0 / np.shape(X_train)[0]
    outliers_percentage.append(perc)
    variables.append(k)
#     print("Column %s outliers = %.2f%%" % (k, perc))

outliers_inTrain = pd.DataFrame({'Variable':variables, '% Outliers':outliers_percentage })
outliers_inTrain.sort_values(by=["% Outliers"],ascending=False)

Unnamed: 0,Variable,% Outliers
630,x629,100.000000
194,x193,100.000000
340,x339,100.000000
298,x297,100.000000
86,x85,13.366337
...,...,...
384,x383,0.000000
383,x382,0.000000
381,x380,0.000000
380,x379,0.000000


In [33]:
# outlier elemination of features
X_train = X_train.drop(['id', 'x629','x193','x339','x297'], axis=1)
X_test = X_test.drop(['id', 'x629','x193','x339','x297'], axis=1)
y = y.drop(['id'], axis=1)

In [34]:
# outlier detection of data
x = X_train.values
clf = IsolationForest(random_state=0,contamination=.1).fit(x)
prediction = clf.predict(x)
idx = np.where(prediction<0)
# eleminate outliers 
x[idx[0],:]=np.nan
# knn imputation
imp= KNNImputer(n_neighbors=2)
x_clean = imp.fit_transform(x)
X_train = pd.DataFrame(data = x_clean, columns=X_train.keys())

In [35]:
# LSE 
# a = sm.OLS(y, X_train).fit()

In [36]:
# Baseline performance with simple linear regression
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

seed = 1234
kfolds = KFold(10,shuffle=True,random_state=seed)
model = linear_model.LinearRegression()
model.fit(X_train , y)
reg_scores = cross_val_score(model, X_train , y, cv=kfolds)
yp = model.predict(X_train )

print("R2=%.3f"%(r2_score(y, yp)))
print("10-fold Crossvalidation: R2 Mean %.3f StdDev %.3f"%(reg_scores.mean(),reg_scores.std()))

R2=0.063
10-fold Crossvalidation: R2 Mean 0.041 StdDev 0.059


In [51]:
# apply the procedure to take the best k variables based on mutual_info_regression
feature_selection_univariate_model = SelectKBest(mutual_info_regression, k=100)

# fit the feature selection model and select the four variables
X_selected_features_univariate = feature_selection_univariate_model.fit_transform(X_train ,y['y'])

mask = feature_selection_univariate_model.get_support() #list of booleans
print("Reduced data set shape = ",X_selected_features_univariate.shape)
print("     Selected features = ",X_train .keys()[mask])
print("      Deleted Features = ", X_train .keys()[~mask])

Reduced data set shape =  (1212, 100)
     Selected features =  Index(['x7', 'x30', 'x41', 'x52', 'x68', 'x75', 'x76', 'x81', 'x85', 'x87',
       'x101', 'x129', 'x134', 'x139', 'x140', 'x148', 'x170', 'x180', 'x183',
       'x184', 'x192', 'x196', 'x204', 'x210', 'x214', 'x222', 'x223', 'x224',
       'x227', 'x234', 'x239', 'x240', 'x247', 'x262', 'x282', 'x285', 'x289',
       'x292', 'x300', 'x301', 'x302', 'x334', 'x352', 'x360', 'x375', 'x382',
       'x384', 'x387', 'x390', 'x408', 'x417', 'x441', 'x442', 'x447', 'x448',
       'x457', 'x464', 'x465', 'x466', 'x471', 'x473', 'x482', 'x494', 'x496',
       'x498', 'x504', 'x508', 'x514', 'x523', 'x537', 'x559', 'x578', 'x579',
       'x583', 'x590', 'x597', 'x599', 'x619', 'x648', 'x661', 'x662', 'x669',
       'x672', 'x685', 'x686', 'x711', 'x717', 'x723', 'x726', 'x757', 'x759',
       'x772', 'x776', 'x785', 'x789', 'x791', 'x792', 'x800', 'x819', 'x823'],
      dtype='object')
      Deleted Features =  Index(['x0', 'x1', 'x

In [56]:
# the best result(6387) piplien --> imputation by median, outlier detection of features,
# eleminate outlier features, outlier detection on data by isolation forest, impute by knn, mutual info reg feature selection,
# extra trees regressor
etr = ExtraTreesRegressor(n_estimators=100, random_state=0).fit(X_train.values[: ,mask], y['y'].values)
yp = etr.predict(X_train.values[: ,mask])
print("R2=%.3f"%(r2_score(y['y'], yp)))

R2=0.851


In [48]:
scores = cross_val_score(etr, X_train, y['y'],cv=10)
print("10-fold Crossvalidation R2: Mean %.3f StdDev %.3f"%(scores.mean(),scores.std()))

10-fold Crossvalidation R2: Mean 0.417 StdDev 0.048


In [50]:
yp = etr.predict(X_test.values[: ,mask])
idx = [float(i) for i in range(len(yp))]
result = pd.DataFrame({'id':idx, 'y':yp})
result.to_csv('result.csv',index=False)

array([70.51, 70.93, 71.61, 68.05, 71.06, 73.38, 71.57, 71.48, 80.06,
       78.7 , 65.37, 78.9 , 71.08, 74.67, 61.56, 83.41, 69.58, 75.1 ,
       68.24, 71.58, 69.79, 68.57, 75.12, 76.57, 62.59, 66.41, 58.43,
       72.41, 64.32, 64.77, 58.6 , 69.72, 62.76, 66.93, 75.36, 64.71,
       72.45, 58.61, 59.46, 67.41, 73.36, 77.61, 73.77, 78.64, 77.56,
       70.02, 66.76, 72.72, 76.09, 63.95, 73.84, 69.28, 66.2 , 79.55,
       75.57, 74.54, 72.81, 75.46, 66.4 , 62.65, 60.12, 68.98, 65.43,
       62.96, 73.41, 59.99, 75.15, 78.23, 64.97, 74.18, 69.91, 76.92,
       62.66, 59.44, 66.7 , 76.89, 72.32, 60.06, 83.77, 84.2 , 70.45,
       75.09, 71.86, 73.59, 63.04, 67.67, 55.12, 71.32, 77.91, 79.88,
       68.66, 65.33, 76.7 , 66.15, 67.61, 66.18, 67.81, 67.62, 68.64,
       75.98, 76.5 , 62.44, 76.86, 79.48, 78.21, 66.11, 64.23, 64.55,
       72.53, 61.93, 70.59, 66.19, 76.25, 65.46, 68.62, 74.14, 77.82,
       75.52, 71.82, 69.07, 75.04, 79.33, 60.56, 59.11, 75.93, 66.87,
       61.55, 74.98,

In [57]:
# best result(6374) pipline: same as extra trees regressor
model_ridge = RidgeCV(alphas = [.1, .9, 10, 50, 100], cv=KFold(10, shuffle=True, random_state=12345678)).fit(X_train.values[:,mask], y['y'])
ridge_scores = cross_val_score(model_ridge, X_train.values[:,mask], y['y'], cv=kfolds)
yp = model_ridge.predict(X_train.values[:,mask])
print("R2=%.3f"%(r2_score(y['y'], yp)))
print("10-fold Crossvalidation R2: Mean %.3f StdDev %.3f"%(ridge_scores.mean(),ridge_scores.std()))

R2=0.356
10-fold Crossvalidation R2: Mean 0.205 StdDev 0.114


In [58]:
yp = model_ridge.predict(X_test.values[: ,mask])
idx = [float(i) for i in range(len(yp))]
result = pd.DataFrame({'id':idx, 'y':yp})
result.to_csv('result.csv',index=False)

In [115]:
# poor result (.0269)!
model_lasso = LassoCV(alphas = [.1, .9, 10, 50, 100],cv=KFold(10, shuffle=True, random_state=12345678), max_iter=10000, tol=0.001).fit(X_train.values[:,mask], y['y'].values)
model_lasso.fit(X_train.values[:,mask], y['y'].values)
yp = model_lasso.predict(X_train.values[:,mask])
lasso_scores = cross_val_score(model_lasso, X_train.values[:,mask], y['y'].values, cv=kfolds)
print("R2=%.3f"%(r2_score(y['y'], yp)))
print("10-fold Crossvalidation R2: Mean %.3f StdDev %.3f"%(lasso_scores.mean(),lasso_scores.std()))

R2=0.311
10-fold Crossvalidation R2: Mean 0.256 StdDev 0.114


In [96]:
yp = reg.predict(X_test.values[: ,mask])
idx = [float(i) for i in range(len(yp))]
result = pd.DataFrame({'id':idx, 'y':yp})
result.to_csv('result.csv',index=False)

In [59]:
xgbr = xgb.XGBRegressor(verbosity=0)
xgbr.fit(X_train, y['y'])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             gamma=0, gpu_id=-1, importance_type=None,
             interaction_constraints='', learning_rate=0.300000012,
             max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=8,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=0)

In [60]:
# The best result(.5857) pipline: imputation by median, outlier detection of features, eleminate outlier features,
score = xgbr.score(X_train, y['y'])  
print("Training score: ", score)

Training score:  0.8505395143252663


In [61]:
scores = cross_val_score(xgbr, X_train, y['y'],cv=10)
print("10-fold Crossvalidation R2: Mean %.3f StdDev %.3f"%(scores.mean(),scores.std()))

10-fold Crossvalidation R2: Mean 0.356 StdDev 0.049


In [29]:
yp = xgbr.predict(X_test)

In [30]:
idx = [float(i) for i in range(len(yp))]
result = pd.DataFrame({'id':idx, 'y':yp})
result.to_csv('result.csv',index=False)

In [70]:
# best result (.5296) pipline: same as extra trees regressor
regr = RandomForestRegressor(max_depth=2, random_state=0)
regr.fit(X_train.values[:,mask], y['y'])
score = regr.score(X_train.values[:,mask], y['y']) 
print("Training score: ", score)

Training score:  0.3436620650077785


In [117]:
scores = cross_val_score(regr, X_train.values[:,mask], y['y'],cv=10)
print("10-fold Crossvalidation R2: Mean %.3f StdDev %.3f"%(scores.mean(),scores.std()))

10-fold Crossvalidation R2: Mean 0.298 StdDev 0.062


In [98]:
# with estimator [ridge, randomfores] and final extratreesregressor --> .48
# with estimator [ridge, extratreesregressor] and final xgbr --> .5
estimators = [
     ('lr', ExtraTreesRegressor(n_estimators=100, random_state=0)),
     ('svr', xgbr),
     ('rd', model_ridge)]
stack_reg = StackingRegressor(
      estimators=estimators,
      final_estimator= model)
stack_reg.fit(X_train.values[:,mask], y['y'])
score = stack_reg.score(X_train.values[:,mask], y['y'])
print("Training score: ", score)

Training score:  0.8318315207295708


In [93]:
scores = cross_val_score(stack_reg, X_train.values[:,mask], y['y'],cv=10)
print("10-fold Crossvalidation R2: Mean %.3f StdDev %.3f"%(scores.mean(),scores.std()))

10-fold Crossvalidation R2: Mean 0.458 StdDev 0.057


In [99]:
yp = stack_reg.predict(X_test.values[: ,mask])
idx = [float(i) for i in range(len(yp))]
result = pd.DataFrame({'id':idx, 'y':yp})
result.to_csv('result.csv',index=False)

In [78]:
from sklearn.neural_network import MLPRegressor

mlp = MLPRegressor(random_state=1, max_iter=500).fit(X_train.values[:,mask], y['y'])
yp = mlp.predict(X_train.values[:,mask])
score = mlp.score(X_train.values[:,mask], y['y'])
print("Training score: ", score)

Training score:  -8.642678081760631e+31


In [29]:
# xgboost with l1 regularization
dm = xgb.DMatrix(data = X_train, label = y)
params = {'objective':'reg:linear', 'max_depth':'4'}
l1_params = [.01, .1]
rmse_l1 = []
for reg in l1_params:
    params['alpha'] = reg
    cv_results = xgb.cv(dtrain = dm, params=params, nfold = 10, num_boost_round = 10,
                        metrics = 'rmse', as_pandas = True, seed = 123)
    rmse_l1.append(cv_results['test-rmse-mean'].tail(1).values[0])
print('best rmse as a function of l1:')
print(pd.DataFrame(list(zip(l1_params, rmse_l1)), columns = ['l1','rmse']))

best rmse as a function of l1:
     l1      rmse
0  0.01  7.916308
1  0.10  8.001649


In [1]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import (RationalQuadratic, Exponentiation)
kernel = CompoundKernel([WhiteKernel(noise_level=3.0), RBF(length_scale=2.0)])
gpr = GaussianProcessRegressor(kernel=kernel,
         random_state=0).fit(X_train.values[:,mask], y['y'])
score =  gpr.score(X_train.values[:,mask], y['y'])
print("Training score: ", score)

In [70]:
scores = cross_val_score(gpr, X_train.values[:,mask], y['y'],cv=10)
print("10-fold Crossvalidation R2: Mean %.3f StdDev %.3f"%(scores.mean(),scores.std()))



10-fold Crossvalidation R2: Mean -68217245571488464.000 StdDev 137323722037408768.000


