In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pformat

# Prétraitement des données
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

# Modèles
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Évaluation des modèles
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from scipy.stats import spearmanr

# Validation croisée et recherche de modèle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold

from print_test_results import get_test_results, print_test_results, get_comparison_table
from IPython.display import Markdown

# RandomForestRegressor

In [2]:
pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', None),
    ('model', RandomForestRegressor())
])

param_grid = {
    'imputer__strategy': ['constant', 'mean', 'median'],
    'imputer__fill_value': [None],
    'scaler' : [None, StandardScaler(), MinMaxScaler(), RobustScaler()],
    'model__max_depth': range(1, 21),
    'model__min_samples_split': range(2, 11),
    'model__min_samples_leaf': range(1, 11)
}

scoring = {
    'r2' : 'r2',
    'neg_root_mean_squared_error': 'neg_root_mean_squared_error'
}

refit = 'r2'
cv = 5
n_iter = 30

test_results = {
    'fr':{},
    'de':{}
}

## France

In [3]:
data_fr = pd.read_csv('data/Data_FR.csv', index_col='ID')
data_fr.shape

(680, 39)

In [4]:
X = data_fr.drop('TARGET', axis=1)
y = data_fr['TARGET']

In [5]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search = RandomizedSearchCV(pipeline, param_grid, scoring=scoring,n_iter=n_iter, cv=cv, refit=refit, n_jobs=-1, verbose=1, random_state=42)
grid_search.fit(X_train, y_train)

test_results['fr'] = get_test_results(grid_search, X_test, y_test)
print_test_results(test_results['fr'], refit, title='RandomForestRegressor FR')

Fitting 5 folds for each of 30 candidates, totalling 150 fits

RandomForestRegressor FR
 Best estimator: 
  imputer : SimpleImputer(strategy='constant')
  scaler : StandardScaler()
  model : RandomForestRegressor(max_depth=1, min_samples_leaf=6, min_samples_split=4)
 Metrics:
  rmse: 1.426
  r2: -0.018
  r2 ajusté: -0.417
  spearman: 0.082
Wall time: 15.8 s


## Allemagne

In [6]:
data_de = pd.read_csv('data/Data_DE.csv', index_col='ID')
data_de.shape

(596, 39)

In [7]:
X = data_de.drop('TARGET', axis=1)
y = data_de['TARGET']

In [8]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search = RandomizedSearchCV(pipeline, param_grid, scoring=scoring, n_iter=n_iter, cv=cv, refit=refit, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

test_results['de'] = get_test_results(grid_search, X_test, y_test)
print_test_results(test_results['de'], refit, title='RandomForestRegressor DE')

Fitting 5 folds for each of 30 candidates, totalling 150 fits

RandomForestRegressor DE
 Best estimator: 
  imputer : SimpleImputer(strategy='constant')
  scaler : None
  model : RandomForestRegressor(max_depth=1, min_samples_leaf=9, min_samples_split=10)
 Metrics:
  rmse: 1.035
  r2: 0.06
  r2 ajusté: -0.381
  spearman: 0.327
Wall time: 9.78 s


# Conclusion

In [9]:
markdown_table = get_comparison_table(test_results, title='RandomForestRegressor')
Markdown(markdown_table)

| RandomForestRegressor      | FR                                       | DE                                       |
|-------------|-----------------------------------------|-----------------------------------------|
| Imputer     | SimpleImputer(strategy='constant')| SimpleImputer(strategy='constant')|
| Scaler      | StandardScaler() | None |
| Model       | RandomForestRegressor(max_depth=1, min_samples_leaf=6, min_samples_split=4)  | RandomForestRegressor(max_depth=1, min_samples_leaf=9, min_samples_split=10)  |
| RMSE        | 1.426            | 1.035            |
| R2          | -0.018              | 0.06              |
| R2 ajusté   | -0.417       | -0.381       |
| Spearman    | 0.082        | 0.327        |