In [1]:
from pprint import pformat

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Data preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

# Models
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Model evaluation
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from scipy.stats import spearmanr

# Cross-validation and model search
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold

# Project-local helper used to report test-set results for a fitted search
from print_test_results import print_test_results

# Ridge

In [2]:
# Per-country result store: one independent, initially empty dict for
# France ('fr') and one for Germany ('de'), filled in by the cells below.
results = {country: {} for country in ('fr', 'de')}

## France

In [3]:
# Load the French dataset; the 'ID' column is used as the row index.
data_fr = pd.read_csv(
    'data/Data_FR.csv',
    index_col='ID',
)
# Trailing expression: display the (rows, columns) tuple as the cell output.
data_fr.shape

(680, 39)

In [4]:
# Display the column labels of the French dataset (the features plus 'TARGET').
data_fr.columns

Index(['DE_CONSUMPTION', 'FR_CONSUMPTION', 'DE_FR_EXCHANGE', 'FR_DE_EXCHANGE',
       'DE_NET_EXPORT', 'FR_NET_EXPORT', 'DE_NET_IMPORT', 'FR_NET_IMPORT',
       'DE_GAS', 'FR_GAS', 'DE_COAL', 'FR_COAL', 'DE_HYDRO', 'FR_HYDRO',
       'DE_NUCLEAR', 'FR_NUCLEAR', 'DE_SOLAR', 'FR_SOLAR', 'DE_WINDPOW',
       'FR_WINDPOW', 'DE_LIGNITE', 'DE_RESIDUAL_LOAD', 'FR_RESIDUAL_LOAD',
       'DE_RAIN', 'FR_RAIN', 'DE_WIND', 'FR_WIND', 'DE_TEMP', 'FR_TEMP',
       'GAS_RET', 'COAL_RET', 'CARBON_RET', 'TARGET', 'FR_PROD_RENEWABLE',
       'DE_PROD_RENEWABLE', 'DE_PROD_FOSIL', 'FR_PROD_FOSSIL',
       'FR_CONS_RENEWABLE', 'DE_CONS_RENEWABLE'],
      dtype='object')

In [5]:
# Split the French frame into the regression target and the feature matrix.
y = data_fr['TARGET']
# Every column except 'TARGET' is kept as a feature.
X = data_fr.drop(columns='TARGET')

In [6]:
# Ridge regression on the French data: tune the imputation strategy, the
# feature scaler and the regularisation strength with a 5-fold cross-validated
# grid search, then report the refitted best pipeline on a held-out 20 % split.

pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    # Placeholder step: the actual scaler is always supplied by param_grid.
    ('scaler', 'passthrough'),
    ('model', Ridge())
])

param_grid = {
    'imputer__strategy': ['constant', 'mean', 'median'],
    # fill_value only matters for the 'constant' strategy; None keeps the
    # SimpleImputer default (0 for numeric data).
    'imputer__fill_value': [None],
    'scaler' : [StandardScaler(), MinMaxScaler(), RobustScaler()],
    # Log-spaced alphas between 1e-3 and 1e3 (np.logspace default: 50 points).
    # NOTE(review): the recorded run selected alpha=1000.0, the upper edge of
    # this grid -- consider widening the range.
    'model__alpha': np.logspace(-3, 3).tolist()
}

# Multi-metric scoring: both metrics are recorded during the search, but only
# the one named by `refit` is used to pick and refit the best estimator.
scoring = {
    'r2' : 'r2',
    'neg_root_mean_squared_error': 'neg_root_mean_squared_error'
}

# Must be one of the keys of `scoring`.
refit = 'neg_root_mean_squared_error'

# Fixed random_state so the train/test split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search = GridSearchCV(pipeline, param_grid, scoring=scoring, cv=5, refit=refit, n_jobs=-1)
grid_search.fit(X_train, y_train)
# Report on the held-out split and keep whatever the helper returns for France.
results['fr'] = print_test_results(grid_search, X_test, y_test)

Best estimator: 
  ('imputer', SimpleImputer(strategy='constant'))
  ('scaler', StandardScaler())
  ('model', Ridge(alpha=1000.0))
rmse: 1.41
r2: 0.005
Adjusted R-squared: -0.385
spearman: 0.256
Done


## Allemagne

In [7]:
# Load the German dataset; the 'ID' column is used as the row index.
data_de = pd.read_csv(
    'data/Data_DE.csv',
    index_col='ID',
)
# Trailing expression: display the (rows, columns) tuple as the cell output.
data_de.shape

(596, 39)

In [8]:
# Display the column labels of the German dataset (the features plus 'TARGET').
data_de.columns

Index(['DE_CONSUMPTION', 'FR_CONSUMPTION', 'DE_FR_EXCHANGE', 'FR_DE_EXCHANGE',
       'DE_NET_EXPORT', 'FR_NET_EXPORT', 'DE_NET_IMPORT', 'FR_NET_IMPORT',
       'DE_GAS', 'FR_GAS', 'DE_COAL', 'FR_COAL', 'DE_HYDRO', 'FR_HYDRO',
       'DE_NUCLEAR', 'FR_NUCLEAR', 'DE_SOLAR', 'FR_SOLAR', 'DE_WINDPOW',
       'FR_WINDPOW', 'DE_LIGNITE', 'DE_RESIDUAL_LOAD', 'FR_RESIDUAL_LOAD',
       'DE_RAIN', 'FR_RAIN', 'DE_WIND', 'FR_WIND', 'DE_TEMP', 'FR_TEMP',
       'GAS_RET', 'COAL_RET', 'CARBON_RET', 'TARGET', 'FR_PROD_RENEWABLE',
       'DE_PROD_RENEWABLE', 'DE_PROD_FOSIL', 'FR_PROD_FOSSIL',
       'FR_CONS_RENEWABLE', 'DE_CONS_RENEWABLE'],
      dtype='object')

In [9]:
# Split the German frame into the regression target and the feature matrix.
# Note: this rebinds the X / y names previously holding the French data.
y = data_de['TARGET']
# Every column except 'TARGET' is kept as a feature.
X = data_de.drop(columns='TARGET')

In [10]:
# Ridge regression on the German data: tune the imputation strategy, the
# feature scaler and the regularisation strength with a 5-fold cross-validated
# grid search, then report the refitted best pipeline on a held-out 20 % split.

pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    # Placeholder step: the actual scaler is always supplied by param_grid.
    ('scaler', 'passthrough'),
    ('model', Ridge())
])

param_grid = {
    'imputer__strategy': ['constant', 'mean', 'median'],
    # fill_value only matters for the 'constant' strategy; None keeps the
    # SimpleImputer default (0 for numeric data).
    'imputer__fill_value': [None],
    'scaler' : [StandardScaler(), MinMaxScaler(), RobustScaler()],
    # Log-spaced alphas between 1e-3 and 1e3 (np.logspace default: 50 points).
    'model__alpha': np.logspace(-3, 3).tolist()
}

# Multi-metric scoring: both metrics are recorded during the search, but only
# the one named by `refit` is used to pick and refit the best estimator.
scoring = {
    'r2' : 'r2',
    'neg_root_mean_squared_error': 'neg_root_mean_squared_error'
}

# Must be one of the keys of `scoring`.
refit = 'neg_root_mean_squared_error'

# Fixed random_state so the train/test split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search = GridSearchCV(pipeline, param_grid, scoring=scoring, cv=5, refit=refit, n_jobs=-1)
grid_search.fit(X_train, y_train)
# Report on the held-out split and keep whatever the helper returns for Germany.
results['de'] = print_test_results(grid_search, X_test, y_test)

Best estimator: 
  ('imputer', SimpleImputer(strategy='constant'))
  ('scaler', MinMaxScaler())
  ('model', Ridge(alpha=8.286427728546842))
rmse: 1.045
r2: 0.041
Adjusted R-squared: -0.409
spearman: 0.358
Done


# Conclusion