In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import preprocessing
from sklearn.model_selection import train_test_split, cross_val_score, cross_validate
from sklearn.metrics import mean_squared_error

from datetime import datetime
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Data preparation.

df = pd.read_csv("./Data/Donnees_Quotidiennes.csv", index_col=0, parse_dates=['Date'], sep=";")

# Keep only the rows of region code 11. The explicit .copy() detaches the
# filtered frame so the column assignment below does not raise
# SettingWithCopyWarning.
df = df[df['Code INSEE région'] == 11].copy()

# Integer day key in yymmdd form (e.g. 2020-01-31 -> 200131).
df['Jour'] = df['Date'].dt.strftime("%y%m%d").astype(int)

# One-hot encode the day key.
# NOTE(review): the yymmdd key is different for every calendar date, so this
# produces one indicator column per date rather than a day-of-week or seasonal
# feature — confirm this is the intended encoding.
df = df.join(pd.get_dummies(df['Jour'], prefix='D'))
df = df.drop(['Région', 'Jour', 'Code INSEE région'], axis=1)

# Use the date as the index.
df = df.set_index('Date')

In [None]:
# Standardize the columns ['Consommation (MW)', 'Température (°C)'].
# The scaler is fitted exactly once and then applied with transform();
# fitting and then calling fit_transform() would simply refit it a second time.
# NOTE(review): the scaler is fitted on the whole frame, before the train/test
# split performed later in the notebook — confirm this leakage is acceptable.
colonnes_normalisees = ['Consommation (MW)', 'Température (°C)']
scaller = preprocessing.StandardScaler().fit(df[colonnes_normalisees])
df[colonnes_normalisees] = scaller.transform(df[colonnes_normalisees])

# Split into the target (kept as a single-column DataFrame) and the
# explanatory variables (every remaining column).
target = df[['Consommation (MW)']]
data = df.drop(['Consommation (MW)'], axis=1)

In [None]:
# Hold out the last 1% of the rows as the test set. shuffle=False preserves the
# row order (chronological if the frame is sorted by date — TODO confirm);
# random_state has no effect when shuffling is disabled.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.01,
    random_state=42,
    shuffle=False,
)

In [None]:
from sklearn.linear_model import LassoCV

# Fit a Lasso with a single fixed alpha (0.001), using 42 cross-validation folds.
# The target is passed as a 1-D Series: LassoCV expects y of shape (n_samples,)
# and would otherwise convert the single-column DataFrame with a
# DataConversionWarning.
lasso_model = LassoCV(alphas=[0.001], cv=42).fit(X_train, y_train['Consommation (MW)'])

print(lasso_model.coef_)

In [None]:
from sklearn.linear_model import lasso_path

# Regularization strengths at which the coefficient path is evaluated.
mes_alphas = (0.001, 0.01, 0.02, 0.025, 0.05, 0.1, 0.25, 0.5, 0.8, 1.0)

# lasso_path returns (alphas, coefficients, dual gaps); only the first two
# are needed here.
alpha_path, coefs_lasso = lasso_path(
    X_train,
    y_train['Consommation (MW)'],
    alphas=mes_alphas,
)[:2]

coefs_lasso.shape

In [None]:
import matplotlib.cm as cm

plt.figure(figsize=(10, 7))

# Each row of coefs_lasso is drawn as one dashed curve against alpha_path.
for trajectoire in coefs_lasso:
    plt.plot(alpha_path, trajectoire, '--')

plt.title('Lasso path')
plt.xlabel('Alpha')
plt.ylabel('Coefficients')
plt.show()

Tous les coefficients sont nuls.

In [None]:
from sklearn.linear_model import LassoCV

# 10-fold cross-validated Lasso over scikit-learn's automatic alpha grid.
# The target is passed as a 1-D Series: LassoCV expects y of shape (n_samples,)
# and would otherwise convert the single-column DataFrame with a
# DataConversionWarning.
model_lasso = LassoCV(cv=10).fit(X_train, y_train['Consommation (MW)'])

alphas = model_lasso.alphas_

plt.figure(figsize=(10, 8))

# One dotted curve per CV fold: mse_path_ has shape (n_alphas, n_folds).
plt.plot(alphas, model_lasso.mse_path_, ':')

# Solid black curve: MSE averaged over the folds for each alpha.
plt.plot(alphas, model_lasso.mse_path_.mean(axis=1), 'k',
         label='Moyenne', linewidth=2)

# Vertical dashed line at the alpha retained by cross-validation.
plt.axvline(model_lasso.alpha_, linestyle='--', color='k',
            label='alpha: estimation CV')

plt.legend()

plt.xlabel('Alpha')
plt.ylabel('Mean square error')
plt.title('Mean square error pour chaque échantillon ')
plt.show()

In [None]:
# Regularization strength (alpha) selected by the cross-validated Lasso model.
model_lasso.alpha_

In [None]:
# Build a one-column table holding the intercept followed by the estimated
# coefficient of every explanatory variable in `data`.

# Intercept first, then the coefficients, as a single flat array.
coeffs = np.r_[model_lasso.intercept_, model_lasso.coef_]

# Matching row labels: 'intercept' followed by the feature names.
feats = ['intercept', *data.columns]

valeurs = pd.DataFrame({'valeur estimée': coeffs}, index=feats)

In [None]:
# Preview the first rows of the `valeurs` table (intercept, then coefficients).
valeurs.head()

In [None]:
# Report the cross-validated alpha and the model's score() (R² for a
# regressor) on the training and test sets.
alpha_cv = model_lasso.alpha_
score_train = model_lasso.score(X_train, y_train)
score_test = model_lasso.score(X_test, y_test)

print("alpha sélectionné par c-v :", alpha_cv)
print("score train :", score_train)
print("score test :", score_test)

In [None]:
# Mean squared error (MSE, not its square root) on the train and test sets.
# Both the target and the predictions are on the standardized scale here.

lasso_pred_train = model_lasso.predict(X_train)
lasso_pred_test = model_lasso.predict(X_test)

# MSE is symmetric in its two arguments; they are given in scikit-learn's
# conventional (y_true, y_pred) order.
mse_train = mean_squared_error(y_train, lasso_pred_train)
mse_test = mean_squared_error(y_test, lasso_pred_test)

print("mse train:", mse_train)
print("mse test:", mse_test)


In [None]:
# Mean and standard deviation learned by the scaler for its first column,
# 'Consommation (MW)'; they are used to map predictions back to MW.
moyenne, ecart = scaller.mean_[0], scaller.scale_[0]

print("moyenne :", moyenne)
print("ecart-type", ecart)

In [None]:
# Display the raw test-set predictions (still on the standardized scale).
lasso_pred_test

In [None]:
# Collect the test-set predictions in a plain list (replaces the manual
# append loop and the no-op mid-cell expression).
lasso_values = list(lasso_pred_test)

# Wrap the predictions in a DataFrame aligned on the test index.
# NOTE(review): this rebinds `lasso_model`, which earlier in the notebook
# refers to a fitted LassoCV estimator — consider a distinct name.
lasso_model = pd.DataFrame({'predits': lasso_values}, index=X_test.index)

In [None]:
# Undo the standardization so that values are back in their original units.
# Consumption uses the scaler's first column statistics (moyenne / ecart),
# temperature uses the statistics of the scaler's last column.
reel_mw = np.round((y_test['Consommation (MW)'] * ecart) + moyenne)
predits_mw = np.round((lasso_model['predits'] * ecart) + moyenne)
temperature = (X_test['Température (°C)'] * scaller.scale_[-1]) + scaller.mean_[-1]

lasso_model_1 = pd.DataFrame(
    {'Reel': reel_mw, 'predits': predits_mw, 'T': temperature},
    index=X_test.index,
)
lasso_model_1.head()

In [None]:
# Observed vs. predicted consumption over the test period.
fig, ax = plt.subplots(figsize=[15, 8])
ax.plot(lasso_model_1.index, lasso_model_1['Reel'], label="Valeur Réelle")
ax.plot(lasso_model_1.index, lasso_model_1['predits'], label="Prédiction")
ax.legend()
plt.xticks(rotation=45);

In [None]:
# Scores returned by the model's score() method (R² for a regressor).
r2_train = model_lasso.score(X_train, y_train)
r2_test = model_lasso.score(X_test, y_test)

print("train score: ", r2_train)
print("test score: ", r2_test)

In [None]:
# Imports used by the interactive Bokeh chart.
from math import pi
from bokeh.plotting import figure, output_notebook, show, ColumnDataSource
from bokeh.models import Legend, DatetimeTickFormatter, formatters, HoverTool, LinearAxis, Range1d

# Configure Bokeh to render its output inline in the notebook.
output_notebook()

In [None]:
# Interactive chart: observed vs. predicted consumption (left axis) and
# temperature (right axis) over the test period.


def _padded_range(low, high, margin):
    """Return a Range1d covering [low, high] widened by a relative margin.

    The padding is computed from absolute values so that negative bounds
    (e.g. sub-zero temperatures) are pushed outwards instead of being clipped.
    For positive bounds this equals low * (1 - margin) and high * (1 + margin).
    """
    return Range1d(start=low - abs(low) * margin, end=high + abs(high) * margin)


# Data source; the plots below reference the frame's index through the
# 'Date' column name.
source = ColumnDataSource(lasso_model_1)

# Toolbar tools.
TOOLS = "crosshair,pan,wheel_zoom,box_zoom,reset"

# Relative padding added around the data on both y axes.
y_overlimit = 0.05

# NOTE(review): plot_width / plot_height were removed in Bokeh 3 in favour of
# width / height — adjust if the environment is upgraded.
p = figure(plot_width=600, plot_height=400,
           title="Prédiction avec le modèle Lasso Quotidien",
           x_axis_label='Date', x_axis_type="datetime",
           y_axis_label='Consommation Moyenne',
           toolbar_location="below",
           tools=TOOLS)

p.title.text_color = "darkblue"
p.title.text_font = "times"
p.title.text_font_size = "20px"
p.title.align = 'center'

# Observed values.
p.line(x='Date', y='Reel', color="navy", legend_label="Valeurs réelles", source=source)
p.circle(x='Date', y='Reel', color="navy", fill_color='white', size=8, source=source)

# Predicted values.
p.line(x='Date', y='predits', color="red", legend_label="Valeurs predites", source=source)
p.circle(x='Date', y='predits', color="red", fill_color='white', size=8, source=source)

# Left y axis: cover both the observed and the predicted series so that
# predictions outside the observed range remain visible.
consommation = lasso_model_1[['Reel', 'predits']]
p.y_range = _padded_range(consommation.min().min(), consommation.max().max(), y_overlimit)

p.xaxis.major_label_orientation = pi / 4
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None
p.xaxis.ticker.desired_num_ticks = 29

# Right y axis: temperature on its own range.
y_column2_range = "T" + "_range"
p.extra_y_ranges = {
    y_column2_range: _padded_range(
        lasso_model_1['T'].min(),
        lasso_model_1['T'].max(),
        y_overlimit,
    )
}
p.add_layout(LinearAxis(y_range_name=y_column2_range), "right")

p.line(x='Date', y='T', color="grey", legend_label="T (C°)", y_range_name=y_column2_range, source=source)
p.circle(x='Date', y='T', color="grey", fill_color='white', size=8, y_range_name=y_column2_range, source=source)

# Clicking a legend entry hides the corresponding glyphs.
p.legend.location = "top_center"
p.legend.click_policy = 'hide'

# Hover tooltips: 'predits' holds the predictions and 'Reel' the observed
# values (the two labels were previously swapped).
p.add_tools(HoverTool(
    tooltips=[('Date', '@Date{%Y-%m-%d}'),
              ('Prédiction', '@predits{0.00}'),
              ('Valeur réelle', '@Reel{0.00}'),
              ('C°', "@T{0.00}")],
    formatters={'@Date': 'datetime'}
))

show(p);


In [None]:
# Root mean squared error (RMSE): square root of the mean of the squared
# residuals.

def rmse(predictions, targets):
    """Return the root mean squared error between predictions and targets.

    Both arguments must support element-wise subtraction and exponentiation
    and expose a ``.mean()`` method (e.g. NumPy arrays or pandas Series).
    """
    residuals = predictions - targets
    mean_squared_error_value = (residuals ** 2).mean()
    return np.sqrt(mean_squared_error_value)

# RMSE between the rescaled predictions and the rescaled observed values.
rmse(lasso_model_1.predits, lasso_model_1.Reel)

Plus la racine de l'erreur quadratique moyenne (RMSE) est proche de 0, plus les prédictions sont précises.

Nous pourrons donc comparer ce résultat avec le RMSE des autres modèles.

In [None]:
# Compute and display the MAPE (mean absolute percentage error) between the
# rescaled observed values and the rescaled predictions.
# NOTE(review): the ratio is undefined if an observed value equals 0 —
# presumably consumption is never 0; confirm.
y_true, y_pred = np.array(lasso_model_1.Reel), np.array(lasso_model_1.predits)
MAPE = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# The acronym expands to "Percentage", not "Prediction".
print("Mean Absolute Percentage Error : %0.2f%%" % MAPE)

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedKFold
from sklearn.linear_model import Lasso

# 10-fold cross-validation repeated 3 times, with a fixed seed.
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=42)

model = Lasso()

# Candidate alphas 0.01, 0.02, ..., 0.99. alpha=0 is left out: scikit-learn
# advises against Lasso with alpha=0 (plain least squares; the coordinate
# descent solver does not converge well). Integer steps divided by 100 avoid
# floating-point drift in the grid values.
grid = {'alpha': np.arange(1, 100) / 100}

search = GridSearchCV(model, grid, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)

results = search.fit(X_train, y_train)

# The scorer returns the negated MAE (higher is better), so the sign is
# flipped to report the actual mean absolute error.
print('MAE: %.3f' % -results.best_score_)
print('Config: %s' % results.best_params_)