In [220]:
# Librairies
import pandas as pd
import numpy as np
from prophet import Prophet
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score

import pickle
import time
from sklearn.model_selection import GridSearchCV


In [186]:

df = pd.read_csv('bus_trafic_clean.csv')
df = df.astype({"horodatage": "datetime64",
                "horodatage_maj": "datetime64",
                "Heure_estimee_de_passage_a_L_arret": "datetime64",
                "date_heure": "datetime64",
                "date": "datetime64",
                "date_heure": "datetime64",
                "numero_de_parc_du_vehicule": "category"
                })

In [187]:
# sample for tests 
df = df.sample(frac=0.1, random_state=1)
df.shape

(73941, 62)

## Préparation des données

In [188]:
#Sélection des variables
df_RLM = df[['date',
             'ecart_horaire_en_secondes',
             'nom_de_la_ligne',
             'etat_SAE_du_vehicule',
             'month',
             'day',
             'jour_semaine',
             'OPINION']]

# transfo saisonnalité 
df_RLM['day'] = (df_RLM['day']) * (1/31)
df_RLM['month'] = (df_RLM['month'] - 7) * (1/5)
Days = {'Lundi': 0, 'Mardi': 1/6, 'Mercredi': 2/6, 'Jeudi': 3/6, 'Vendredi': 4/6, 'Samedi': 5/6, 'Dimanche': 6/6}
df_RLM['jour_semaine'] = df_RLM['jour_semaine'].map(Days)

# différence en secondes entre l'heure estimée de passage et l'horodatage
df_RLM['diff_estimee'] = (
    df['Heure_estimee_de_passage_a_L_arret'] - df['horodatage']).dt.total_seconds() / 60

# différence en secondes entre l'horodatage et sa mise à jour
df_RLM['diff_maj'] = (
    df['horodatage_maj'] - df['horodatage']).dt.total_seconds() / 60

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_RLM['day'] = (df_RLM['day']) * (1/31)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_RLM['month'] = (df_RLM['month'] - 7) * (1/5)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_RLM['jour_semaine'] = df_RLM['jour_semaine'].map(Days)
A value is trying to be set on a copy of a slice from a Da

On veut prédire le trafic le jour suivant, cela implique de grouper nos données par jour.

...

 il y a donc en entrée les données de la veille (voir les données des jours d'avant),
	et en sortie la moyenne des écarts à l'horaire. (puisque les requetes à l'API ne sont pas constantes)
	
On ignore donc la saisonnalité journalières et horaires

Il y a peu de données, on ne peut donc pas utiliser un LSTM qui aurait été adapté. ( à noter que pour une problématique de prédiction des écarts par bus au cours de la journée, il aurait montré ses qualités)
	

In [189]:
# Aggrégation par jour et par ligne
df_group = df_RLM.groupby(['date','nom_de_la_ligne']).agg(
    {'ecart_horaire_en_secondes' : 'mean',
     'diff_estimee' : 'mean',
     'diff_maj' : 'mean',
     'month' : 'first',
     'day' : 'first',
     'jour_semaine' : 'first',
     'OPINION' : 'first'
    }).merge((df_RLM
  .groupby(["date",'nom_de_la_ligne', 'etat_SAE_du_vehicule'])
  .size()
  .unstack('etat_SAE_du_vehicule', fill_value=0)
  .add_prefix("nombre_etat_")
), on=['date','nom_de_la_ligne'], how='left')

# Les valeurs d'écarts prochaines pour chaque ligne
df_group['next_ecart'] = df_group.groupby('nom_de_la_ligne')['ecart_horaire_en_secondes'].shift()
df_group.dropna(inplace=True)    

df_group.reset_index(inplace=True, level=['nom_de_la_ligne'])

df_group.groupby(['date']).agg(
    {'nom_de_la_ligne' : "nunique"}).describe()

Unnamed: 0,nom_de_la_ligne
count,126.0
mean,26.166667
std,8.512344
min,2.0
25%,27.0
50%,29.0
75%,31.0
max,34.0


Problème : certains jours, il n'y a que très peu de lignes de bus actives. et certaines lignes de bus n'ont que très peu de données.
Gardons uniquement les 15 lignes de bus les plus actives.

In [190]:
lignes_keep = df_RLM['nom_de_la_ligne'].value_counts().index[:15]
df_group = df_group[df_group['nom_de_la_ligne'].isin(lignes_keep)]

In [191]:
# One Hot Encoding
df_group = pd.get_dummies(df_group, columns=['OPINION','nom_de_la_ligne'])
df_group.shape

(1500, 39)

In [202]:
df_group.head()

Unnamed: 0_level_0,ecart_horaire_en_secondes,diff_estimee,diff_maj,month,day,jour_semaine,nombre_etat_DEV,nombre_etat_DEVP,nombre_etat_HC,nombre_etat_HL,...,nom_de_la_ligne_BOUCHEMAINE <> Z I EST,nom_de_la_ligne_CIRCULAIRE VERNEAU GARE EUROPE,nom_de_la_ligne_ESPACE ANJOU <> EVENTARD,nom_de_la_ligne_HOPITAL <> MONTREUIL JUIGNE,nom_de_la_ligne_LAC MAINE <> STE GEMMES CL ANJOU,nom_de_la_ligne_M-MARCILLE <> ST AUBIN LA SALLE,nom_de_la_ligne_MURS ERIGNE <> ADEZIERE SALETTE,nom_de_la_ligne_PONTS CE <> AQUAVITA H. RECULEE,nom_de_la_ligne_ST LEZIN SORGES <> SCHWEITZER,nom_de_la_ligne_ST SYLVAIN BANCHAIS <>TRELAZE
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2019-08-06,106.243902,1.453252,-0.010569,0.2,0.193548,0.166667,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
2019-08-06,10.526316,-73.858772,-0.004386,0.2,0.193548,0.166667,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-08-06,49.648649,0.791441,-0.005856,0.2,0.193548,0.166667,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-08-06,210.698413,0.461111,-0.006349,0.2,0.193548,0.166667,0,2,0,0,...,0,0,0,0,0,0,0,0,0,0
2019-08-06,39.481481,0.67963,-0.011111,0.2,0.193548,0.166667,1,0,0,0,...,1,0,0,0,0,0,0,0,0,0


# Modèle Prophet

Dans un premier temps, essayons de prédire la variable expliqué avec uniquement la variable date. 
Pour cela, nous allons utiliser le modèle Prophet de Facebook. Ce modèle est basé sur la décomposition de la série temporelle en trois composantes : tendance, saisonnalité et bruit. Il est donc particulièrement adapté à la prédiction de séries temporelles à condition qu'il y ait un lien entre la variable à expliqué et la date.

In [210]:
m = Prophet()

df_prophet = df_group['next_ecart'].reset_index()
#df_prophet = df_prophet.drop_duplicates(subset='date', keep='first')

df_prophet = df_prophet.rename(
    columns={'date': 'ds', 'next_ecart': 'y'})

In [211]:
train , test = train_test_split(df_prophet, test_size=0.2, random_state=0)
m.fit(train)

22:52:52 - cmdstanpy - INFO - Chain [1] start processing
22:52:52 - cmdstanpy - INFO - Chain [1] done processing


<prophet.forecaster.Prophet at 0x236eee7af10>

Nous allons utilisés comme indicateurs de performances : MSE, MAE et MAP.
Il est important de pouvoir évaluer nos différents modèles de prédiction par rapport à une prédiction naive. 

Pour réaliser nos prédictions nous avons de nombreuses variables de différents types, de très nombreuses variables qualitatives ainsi qu'une donnée GPS et des dates. 

##### Prédiction par la moyenne

In [212]:
mean_pred = pd.Series([train['y'].mean()] * len(test))
y = test['y']

print(
    "r2_score : ", r2_score(y, mean_pred))
print(
    'mean_squared_error : ', mean_squared_error(y, mean_pred))
print(
    'mean_absolute_error : ', mean_absolute_error(y, mean_pred))
print(
    'mean_absolute_percentage_error', mean_absolute_percentage_error(y, mean_pred))

r2_score :  -0.0025563559317427487
mean_squared_error :  4746.572923750909
mean_absolute_error :  54.8440815369568
mean_absolute_percentage_error 1.6085141991834717


##### Prédiction par modèle Prophet

In [213]:
predictions = m.predict(test[['ds']])
predictions = predictions['yhat']
y = test['y']

print(
    "r2_score : ", r2_score(y, predictions))
print(
    'mean_squared_error : ', mean_squared_error(y, predictions))
print(
    'mean_absolute_error : ', mean_absolute_error(y, predictions))
print(
    'mean_absolute_percentage_error', mean_absolute_percentage_error(y, predictions))


r2_score :  -0.29990182629457074
mean_squared_error :  6154.346113032118
mean_absolute_error :  62.920970382232674
mean_absolute_percentage_error 1.709110375789912


En comparant les résultats de la prédiction par la moyenne et par le modèle Prophet, on peut voir que le modèle Prophet a des performances excécrables. 

Cela est dû au fait que l'horodatage ne contient peu ou pas d'information sur la variable à expliqué ou que la connaissance n'est disponible que par combinaisons avec d'autres variables.

Continuons dans les modèles explicables avec une régression linéaire multiple qui implique un lien linéaire entre la variable à expliqué et les variables explicatives.

# Modèle de Régression linéaire Multiple

Nous avons limité lors du croisement des données, en effet nos données ont été collectés en 2019, il en résulte une incompatibilité avec les données GTFS de parcours des lignes ainsi que les coordonnées des arrêts.

In [204]:
Xtrain , Xtest , Ytrain , Ytest = train_test_split(df_group.drop('next_ecart', axis=1), df_group['next_ecart'], test_size = 0.2, random_state = 0)

collones = df_group.columns.drop('next_ecart')

In [205]:
poly = PolynomialFeatures(degree=2, interaction_only=True)
poly.fit(Xtrain)
Xtrain_poly = poly.transform(Xtrain)
Xtest_poly = poly.transform(Xtest)
Xtrain_poly.shape


(1200, 742)

Nous avons essayer de tirer les liens polynomiales entre nos variables et la variable à expliquer par combinaisons de nos variables entre elles et avec elles même. Cependant, il en ressort 742 collones à cause de nos variables qualitative mis en one-hot-encoding. 
Ce qui détruirait l'explicabilité, alors que cela est l'objectif d'avoir un modèle simple.

In [206]:
#On remets les noms de collones
Xtrain = pd.DataFrame(Xtrain, columns= collones)
Xtest = pd.DataFrame(Xtest, columns= collones)

Nous avons en entrée la matrix Xtrain et un vecteur Ytrain pour chaque variable $X[:, i]$ calculons $\rho_i = \frac{(X[:, i] - mean(X[:, i])) * (y - mean(y))}{std(X[:, i]) * std(y)} $ auquel on associe la F-statistique $F_i = \frac{\rho_i^2}{1 - \rho_i^2}*(n-2)$

In [207]:
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectKBest

# On affiche les 10 variables les plus liés à la variable à expliquer
select = SelectKBest(f_regression, k=10)
select.fit(Xtrain, Ytrain)

features = zip(list(Xtrain.columns[select.get_support(indices=True)]), list(
    select.scores_[select.get_support(indices=True)]))
print('Meilleures variables avec p-value: %s' % list(features))




Meilleures variables avec p-value: [('ecart_horaire_en_secondes', 220.53481462930077), ('month', 112.72298794660337), ('jour_semaine', 63.78693877278675), ('nombre_etat_LIGN', 35.77269397539561), ('nom_de_la_ligne_ARDENNE <> ROSERAIE', 35.14449799376397), ('nom_de_la_ligne_BELLE BEILLE <> MONPLAISIR', 54.722573969381266), ('nom_de_la_ligne_CIRCULAIRE VERNEAU GARE EUROPE', 36.8186182570096), ('nom_de_la_ligne_ESPACE ANJOU <> EVENTARD', 54.96109961485969), ('nom_de_la_ligne_HOPITAL <> MONTREUIL JUIGNE', 23.592628219527068), ('nom_de_la_ligne_M-MARCILLE <> ST AUBIN LA SALLE', 24.0753360568993)]


  corr /= X_norms


On remarque des variables cohérentes avec la variable à estimé. Mais les p-values sont plutot faible, le lien linéaire n'est pas très fort.

In [218]:
#rename des var à l'arrache.
X_train = Xtrain 
X_test = Xtest 
y_train = Ytrain 
y_test = Ytest

Copie des précédents modèles.

In [222]:
df_result = pd.DataFrame(columns=["model", "CV", "R2", "MSE", "MAE", "MAPE", "Temps d'execution"])

In [223]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

time_start = time.time()

lr= LinearRegression()

lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["lr", "false", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

R2:  0.2969591273325679
MSE:  3328.525873632424
MAE:  43.84763702570485
MAPE:  1.1314362366377335


  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [224]:
time_start = time.time()

param_grid = {'fit_intercept': [True, False], 'normalize': [True, False], 'copy_X': [True, False]}
grid = GridSearchCV(LinearRegression(), param_grid, refit = True, verbose = 3, cv=5)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["lr_grid", "true", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5] END copy_X=True, fit_intercept=True, normalize=True;, score=0.384 total time=   0.0s
[CV 2/5] END copy_X=True, fit_intercept=True, normalize=True;, score=0.367 total time=   0.0s
[CV 3/5] END copy_X=True, fit_intercept=True, normalize=True;, score=0.294 total time=   0.0s
[CV 4/5] END copy_X=True, fit_intercept=True, normalize=True;, score=0.277 total time=   0.0s
[CV 5/5] END copy_X=True, fit_intercept=True, normalize=True;, score=0.364 total time=   0.0s
[CV 1/5] END copy_X=True, fit_intercept=True, normalize=False;, score=0.384 total time=   0.0s
[CV 2/5] END copy_X=True, fit_intercept=True, normalize=False;, score=0.367 total time=   0.0s
[CV 3/5] END copy_X=True, fit_intercept=True, normalize=False;, score=0.294 total time=   0.0s
[CV 4/5] END copy_X=True, fit_intercept=True, normalize=False;, score=0.291 total time=   0.0s
[CV 5/5] END copy_X=True, fit_intercept=True, normalize=False;, score=0.364 total time=   

  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [225]:
from sklearn.linear_model import Ridge

time_start = time.time()

ridge = Ridge()
ridge.fit(X_train, y_train)
y_pred = ridge.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["ridge", "false", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

R2:  0.2987127920302455
MSE:  3320.223200847912
MAE:  43.82298998164725
MAPE:  1.1332837827842692


  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [226]:
time_start = time.time()

param_grid = {'alpha': [0.1, 1, 10, 100, 1000], 'fit_intercept': [True, False], 'normalize': [True, False], 'copy_X': [True, False]}
grid = GridSearchCV(Ridge(), param_grid, refit = True, verbose = 3, cv=5)
grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["ridge_grid", "true", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

Fitting 5 folds for each of 40 candidates, totalling 200 fits
[CV 1/5] END alpha=0.1, copy_X=True, fit_intercept=True, normalize=True;, score=0.382 total time=   0.0s
[CV 2/5] END alpha=0.1, copy_X=True, fit_intercept=True, normalize=True;, score=0.365 total time=   0.0s
[CV 3/5] END alpha=0.1, copy_X=True, fit_intercept=True, normalize=True;, score=0.299 total time=   0.0s
[CV 4/5] END alpha=0.1, copy_X=True, fit_intercept=True, normalize=True;, score=0.302 total time=   0.0s
[CV 5/5] END alpha=0.1, copy_X=True, fit_intercept=True, normalize=True;, score=0.365 total time=   0.0s
[CV 1/5] END alpha=0.1, copy_X=True, fit_intercept=True, normalize=False;, score=0.384 total time=   0.0s
[CV 2/5] END alpha=0.1, copy_X=True, fit_intercept=True, normalize=False;, score=0.367 total time=   0.0s
[CV 3/5] END alpha=0.1, copy_X=True, fit_intercept=True, normalize=False;, score=0.294 total time=   0.0s
[CV 4/5] END alpha=0.1, copy_X=True, fit_intercept=True, normalize=False;, score=0.291 total ti

  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [227]:
from sklearn.ensemble import RandomForestRegressor

time_start = time.time()

clf = RandomForestRegressor(n_estimators=100)
clf.fit(X_train, y_train)

y_pred = clf.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["RandomForestRegressor", "false", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

Pkl_Filename = "Model_1_full_regressor.pkl"  
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(clf, file)

R2:  0.3891897225577031
MSE:  2891.8628935945594
MAE:  41.68675298921537
MAPE:  0.9637400196020594


  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [228]:
time_start = time.time()

param_grid = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

rf = RandomForestRegressor()
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid,
                            cv = 3, n_jobs = -1, verbose = 2)

grid_search.fit(X_train, y_train)

print(grid_search.best_params_)

y_pred = grid_search.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["RandomForestRegressor", "true", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

Fitting 3 folds for each of 288 candidates, totalling 864 fits
{'bootstrap': True, 'max_depth': 80, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 200}
R2:  0.36183166269988787
MSE:  3021.3888054289896
MAE:  42.55982814989449
MAPE:  1.1614147625279794


  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [229]:
from sklearn.linear_model import SGDRegressor

time_start = time.time()

sgdr = SGDRegressor().fit(X_train, y_train)

y_pred = sgdr.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["SGDRegressor", "false", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

R2:  -1.2186674511121594e+23
MSE:  5.7697444061681624e+26
MAE:  21286156267745.273
MAPE:  326333234940.31714


  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [230]:
# grid search SGDRegressor
from sklearn.model_selection import GridSearchCV

time_start = time.time()

param_grid = {
    'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2', 'elasticnet'],
    'loss': ['huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'eta0': [0.01, 0.1, 1, 10, 100]
}

sgdr = SGDRegressor()
grid_search = GridSearchCV(estimator = sgdr, param_grid = param_grid,
                            cv = 3, n_jobs = -1, verbose = 2)
    
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["SGDRegressor", "true", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

Fitting 3 folds for each of 1260 candidates, totalling 3780 fits
R2:  0.3253784463790411
MSE:  3193.975462077692
MAE:  43.15036966686141
MAPE:  1.1513421440501064


  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [231]:
from sklearn.neural_network import MLPRegressor

time_start = time.time()

mlp = MLPRegressor(hidden_layer_sizes=(100, 100, 100), max_iter=1000, random_state=42)

mlp.fit(X_train, y_train)
y_pred = mlp.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["MLPRegressor", "false", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

R2:  0.2217601392447387
MSE:  3684.553221760332
MAE:  45.49565690973741
MAPE:  1.1560683330668629


  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [232]:
time_start = time.time()

param_grid = {'hidden_layer_sizes': [(100, 100, 100), (100, 100, 100, 100), (100, 100, 100, 100, 100)],
                'max_iter': [1000, 2000, 3000],
                'random_state': [42]}
grid = GridSearchCV(MLPRegressor(), param_grid, refit=True, verbose=3, n_jobs=-1)

grid.fit(X_train, y_train)
y_pred = grid.predict(X_test)

print("R2: ", r2_score(y_test, y_pred))
print("MSE: ", mean_squared_error(y_test, y_pred))
print("MAE: ", mean_absolute_error(y_test, y_pred))
print("MAPE: ", mean_absolute_percentage_error(y_test, y_pred))

time_end = time.time()

tab = ["MLPRegressor", "true", r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred), mean_absolute_error(y_test, y_pred), mean_absolute_percentage_error(y_test, y_pred), time_end - time_start]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
R2:  0.1590477448876796
MSE:  3981.4631672986675
MAE:  48.88192704503004
MAPE:  1.2766710536999524


  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [238]:
mean_pred = pd.Series([y_train.mean()] * len(y_test))

tab = ["Mean", "false", r2_score(y, mean_pred), mean_squared_error(y, mean_pred), mean_absolute_error(y, mean_pred), mean_absolute_percentage_error(y, mean_pred), 0]
df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)

  df_result = df_result.append(pd.Series(tab, index=df_result.columns), ignore_index=True)


In [233]:
df_result["Temps d'execution"] = df_result["Temps d'execution"].apply(lambda x: round(x, 2))

# export result to csv
df_result.to_csv("result.csv", index=False)


In [239]:
df_result

Unnamed: 0,model,CV,R2,MSE,MAE,MAPE,Temps d'execution
0,lr,False,0.2969591,3328.526,43.84764,1.131436,0.01
1,lr_grid,True,0.2969591,3328.526,43.84764,1.131436,0.15
2,ridge,False,0.2987128,3320.223,43.82299,1.133284,0.01
3,ridge_grid,True,0.3045661,3292.511,43.76037,1.153935,0.66
4,RandomForestRegressor,False,0.3891897,2891.863,41.68675,0.96374,0.56
5,RandomForestRegressor,True,0.3618317,3021.389,42.55983,1.161415,79.52
6,SGDRegressor,False,-1.218667e+23,5.769744e+26,21286160000000.0,326333200000.0,0.02
7,SGDRegressor,True,0.3253784,3193.975,43.15037,1.151342,11.2
8,MLPRegressor,False,0.2217601,3684.553,45.49566,1.156068,1.39
9,MLPRegressor,True,0.1590477,3981.463,48.88193,1.276671,25.01
