In [4]:
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import SelectFwe
from tpot.builtins import StackingEstimator
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import Lasso
from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVR
from sklearn.linear_model import LassoLarsCV
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline, make_union
from sklearn.feature_selection import f_regression
from sklearn.model_selection import train_test_split
from tpot.export_utils import set_param_recursive
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import numpy as np
import re
import joblib
import optuna
from joblib import dump, load
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor


In [5]:
df_clean=pd.read_csv("df_clean_v2.csv", index_col=0)
df_gd=pd.get_dummies(df_clean)


y=df_gd['Global_Sales']
X=df_gd.drop('Global_Sales', axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y,  test_size=0.2, random_state=42)


In [None]:
parameters_lr = {
    'fit_intercept': [True, False],
    'positive' : [True, False],
    'copy_X': [True, False],
    'n_jobs': [-1, 10]
}

lr=LinearRegression()
grid_search = GridSearchCV(lr, parameters_lr, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params_lr = grid_search.best_params_
dump(best_params_lr, 'best_params_lr.joblib')  
best_score_lr = -grid_search.best_score_ # Négation pour obtenir la valeur de MSE
dump(best_score_lr, 'best_score_lr.joblib')  

lr_final = LinearRegression(**best_params_lr)
lr_final.fit(X_train, y_train)

y_pred_lr = lr_final.predict(X_test)
mse_lr = mean_squared_error(y_test, y_pred_lr)
score_lr=lr_final.score(X_test, y_test)
print(y_pred_lr)
print(mse_lr)
print(score_lr)


In [None]:
parameters_knn = {
    'n_neighbors': [3, 5, 7],
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'leaf_size' : range(1, 100),
    'metric' : ['minkowski', 'euclidean', 'manhattan', 'chebyshev']            
}

knn=KNeighborsRegressor()
grid_search = GridSearchCV(knn, parameters_knn, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params_knn = grid_search.best_params_
dump(best_params_knn, 'best_params_knn.joblib')  
best_score_knn = -grid_search.best_score_ # Négation pour obtenir la valeur de MSE
dump(best_score_knn, 'best_score_knn.joblib')  

knn_final = KNeighborsRegressor(**best_params_knn)
knn_final.fit(X_train, y_train)

y_pred_knn = knn_final.predict(X_test)
mse_knn = mean_squared_error(y_test, y_pred_knn)
score_knn=lr_final.score(X_test, y_test)
print(y_pred_knn)
print(mse_knn)
print(score_knn)
          


In [None]:
parameters_lass = {
    'alpha': [0.1, 0.5, 1.0],
    'fit_intercept': [True, False],
    'normalize': [True, False],
    'max_iter': [1000, 2000, 3000],
    'precompute' : [True, False],
    'copy_X' : [True, False],
    'positive' : [True, False],
    'selection' : ['cyclic', 'random']
}

lass=Lasso()
grid_search = GridSearchCV(lass, parameters_lass, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params_lass = grid_search.best_params_
dump(best_params_lass, 'best_params_lass.joblib') 
best_score_lass = -grid_search.best_score_ # Négation pour obtenir la valeur de MSE
dump(best_score_lass, 'best_score_lass.joblib')  

lass_final = Lasso(**best_params_lass)
lass_final.fit(X_train, y_train)

y_pred_lass = lass_final.predict(X_test)
mse_lass = mean_squared_error(y_test, y_pred_lass)
score_lass=lass_final.score(X_test, y_test)
print(y_pred_lass)
print(mse_lass)
print(score_lass)

In [None]:
lasso_cv = LassoLarsCV(cv=5)
lasso_cv.fit(X_train, y_train)

best_alpha_lasso_cv = lasso_cv.alpha_
best_score_lasso_cv = lasso_cv.mse_path_.mean(axis=1).min()
dump(best_score_lasso_cv, 'best_score_lasso_cv.joblib')  
best_params_lasso_cv = lasso_cv.get_params()
dump(best_params_lasso_cv, 'best_params_lasso_cv.joblib') 


y_pred_lasso_cv = lasso_cv.predict(X_test)
mse_lasso_cv = mean_squared_error(y_test, y_pred_lasso_cv)
score_lasso_cv=lasso_cv.score(X_test, y_test)
print(y_pred_lasso_cv)
print(mse_lasso_cv)
print(score_lasso_cv)
print(best_score_lasso_cv)
print(best_alpha_lasso_cv)



In [6]:
parameters_line = {
    'epsilon': [0.1, 0.2, 0.5],
    'C': [0.1, 1, 10],
    'fit_intercept': [True, False],
    'max_iter': [1000, 2000, 3000],
    'loss' : ['epsilon_insensitive', 'squared_epsilon_insensitive'],
    'dual' : [True, False],
    'intercept_scaling' : [0.1, 1, 10]
}
 
line=LinearSVR()
grid_search = GridSearchCV(line, parameters_line, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params_line = grid_search.best_params_
dump(best_params_line, 'best_params_line.joblib') 
best_score_line = -grid_search.best_score_ # Négation pour obtenir la valeur de MSE
dump(best_score_line, 'best_score_line.joblib')  

line_final = LinearSVR(**best_params_line)
line_final.fit(X_train, y_train)

y_pred_line = line_final.predict(X_test)
mse_line = mean_squared_error(y_test, y_pred_line)
score_line=line_final.score(X_test, y_test)
print(y_pred_line)
print(mse_line)
print(score_line)



[ 0.22729933  1.5894658   1.81160522 ... -0.08706129  0.05635669
  1.05600272]
1.5247494330446563
0.28344427567247366


810 fits failed out of a total of 3240.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
810 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\debor\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\debor\anaconda3\lib\site-packages\sklearn\svm\_classes.py", line 518, in fit
    self.coef_, self.intercept_, n_iter_ = _fit_liblinear(
  File "c:\Users\debor\anaconda3\lib\site-packages\sklearn\svm\_base.py", line 1223, in _fit_liblinear
    solver_type = _get_liblinear_solver_type(multi_class, penalty, loss, dual)
  File "c:\Users\debor\anaconda3\lib\site-packages\sklearn\svm\_base.py", lin

In [7]:
parameters_svr  = {
    'C': [0.1, 1, 10],
    'epsilon': [0.1, 0.2, 0.5],
    'kernel': ['linear', 'rbf', 'poly']
}
 
svr=SVR()
grid_search = GridSearchCV(svr, parameters_svr , scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params_svr = grid_search.best_params_
dump(best_params_svr , 'best_params_svr .joblib') 
best_score_svr  = -grid_search.best_score_ # Négation pour obtenir la valeur de MSE
dump(best_score_svr , 'best_score_svr .joblib')  

svr_final = SVR(**best_params_svr)
svr_final.fit(X_train, y_train)

y_pred_svr  = svr_final.predict(X_test)
mse_svr = mean_squared_error(y_test, y_pred_svr)
score_svr=svr_final.score(X_test, y_test)
print(y_pred_svr)
print(mse_svr)
print(score_svr)

[ 1.47475102  0.57674098  1.35625524 ... -0.16493313 -0.72071828
  0.50411532]
1.730337722622221
0.18682809562422398


In [8]:
parameters_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', 'auto', None],
    'criterion' : ['squared_error', 'absolute_error', 'friedman_mse', 'poisson'],
    'min_weight_fraction_leaf' : [0.0, 0.1, 0.2]
    
}

rf=RandomForestRegressor()
grid_search = GridSearchCV(rf, parameters_rf, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params_rf = grid_search.best_params_
dump(best_params_rf, 'best_params_rf.joblib') 
best_score_rf = -grid_search.best_score_ # Négation pour obtenir la valeur de MSE
dump(best_score_rf, 'best_score_rf.joblib')  

rf_final = RandomForestRegressor(**best_params_rf)
rf_final.fit(X_train, y_train)

y_pred_rf = rf_final.predict(X_test)
mse_rf = mean_squared_error(y_test, y_pred_rf)
score_rf=rf_final.score(X_test, y_test)
print(y_pred_rf)
print(mse_rf)
print(score_rf)


In [None]:
parameters_dt = {
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
 
svr=DecisionTreeRegressor()
grid_search = GridSearchCV(dt, parameters_dt , scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params_dt = grid_search.best_params_
dump(best_params_dt , 'best_params_dt .joblib') 
best_score_dt  = -grid_search.best_score_ # Négation pour obtenir la valeur de MSE
dump(best_score_dt , 'best_score_dt .joblib')  

dt_final = DecisionTreeRegressor(**best_params_dt)
dt_final.fit(X_train, y_train)

y_pred_dt  = dt_final.predict(X_test)
mse_dt = mean_squared_error(y_test, y_pred_dt)
score_dt=dt_final.score(X_test, y_test)
print(y_pred_dt)
print(mse_dt)
print(score_dt)

In [None]:
parameters_ab = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.1, 0.5, 1.0]
}
 
ab=AdaBoostRegressor()
grid_search = GridSearchCV(ab, parameters_ab , scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
best_params_ab = grid_search.best_params_
dump(best_params_ab , 'best_params_ab .joblib') 
best_score_ab  = -grid_search.best_score_ # Négation pour obtenir la valeur de MSE
dump(best_score_ab , 'best_score_ab .joblib')  

ab_final = AdaBoostRegressor(**best_params_ab)
ab_final.fit(X_train, y_train)

y_pred_ab  = ab_final.predict(X_test)
mse_ab = mean_squared_error(y_test, y_pred_ab)
score_ab=ab_final.score(X_test, y_test)
print(y_pred_ab)
print(mse_ab)
print(score_ab)

NameError: name 'X_train' is not defined

In [None]:
'''
Ridge Regression : Ridge
ElasticNet : ElasticNet
** Support Vector Regression (SVR) : SVR
Gradient Boosting Regression : GradientBoostingRegressor
** Decision Tree Regression : DecisionTreeRegressor
Bayesian Ridge Regression : BayesianRidge
Passive Aggressive Regression : PassiveAggressiveRegressor
Extra Trees Regression : ExtraTreesRegressor
** AdaBoost Regression : AdaBoostRegressor
Gaussian Process Regression : GaussianProcessRegressor
XGBoost Regression : XGBRegressor
LightGBM Regression : LGBMRegressor
CatBoost Regression : CatBoostRegressor
'''  