In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pformat

# Prétraitement des données
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

# Modèles
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Évaluation des modèles
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from scipy.stats import spearmanr

# Validation croisée et recherche de modèle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold

from print_test_results import get_test_results, print_test_results, get_comparison_table
from IPython.display import Markdown
from sklearn.feature_selection import RFECV

# KNeighborsRegressor

In [2]:
# Shared search configuration, reused unchanged for both countries.
#
# The 'scaler' and 'selection' steps start as None (a None step is skipped by
# Pipeline); the hyper-parameter search swaps concrete transformers in through
# `param_grid`.
pipeline = Pipeline([
    ('imputer', SimpleImputer()),
    ('scaler', None),
    ('selection', None),
    ('model', KNeighborsRegressor())
])

# Search space sampled by RandomizedSearchCV.
param_grid = {
    'imputer__strategy': ['constant', 'mean', 'median'],
    # With strategy='constant', fill_value=None lets SimpleImputer fall back to
    # its documented default (0 for numeric data).
    'imputer__fill_value': [None],
    'scaler' : [None, StandardScaler(), MinMaxScaler(), RobustScaler()],
    # Seed the tree used by RFECV so the selected features (and therefore the
    # reported scores) are reproducible across kernel restarts.
    'selection' : [None, RFECV(DecisionTreeRegressor(random_state=42))],
    'model__n_neighbors': range(1, 21),
    # 'minkowski' is intentionally not listed: with KNeighborsRegressor's
    # default p=2 it is the same distance as 'euclidean', so it would only
    # spend random-search draws on duplicate candidates.
    'model__metric': ['euclidean', 'manhattan']
}

# Metrics computed on every CV fold; both use scikit-learn's built-in scorer names.
scoring = {
    'r2' : 'r2',
    'neg_root_mean_squared_error': 'neg_root_mean_squared_error'
}

# Metric used to pick the best candidate and refit it on the full training set.
refit = 'r2'
# Number of cross-validation folds.
cv = 5

# Hold-out results per country, filled in by the FR and DE sections below.
test_results = {
    'fr':{},
    'de':{}
}

## France

In [3]:
# Read the French dataset, using the 'ID' column as the row index.
data_fr = pd.read_csv(
    'data/Data_FR.csv',
    index_col='ID',
)
# Cell output: (n_rows, n_columns) of the loaded frame.
data_fr.shape

(680, 39)

In [4]:
# Separate the response ('TARGET') from the predictors (every other column).
y = data_fr['TARGET']
X = data_fr.drop(columns='TARGET')

In [5]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search = RandomizedSearchCV(pipeline, param_grid, scoring=scoring, cv=cv, refit=refit, n_jobs=-1, verbose=1, error_score=0)
grid_search.fit(X_train, y_train)

test_results['fr'] = get_test_results(grid_search, X_test, y_test)
print_test_results(test_results['fr'], refit, title='KNeighborsRegressor FR')

Fitting 5 folds for each of 10 candidates, totalling 50 fits

KNeighborsRegressor FR
 Best estimator: 
  imputer : SimpleImputer()
  scaler : None
  selection : None
  model : KNeighborsRegressor(metric='euclidean', n_neighbors=20)
 Metrics:
  rmse: 1.423
  r2: -0.014
  r2 ajusté: -0.411
  spearman: 0.093
Wall time: 7.48 s


## Allemagne

In [6]:
# Read the German dataset, using the 'ID' column as the row index.
data_de = pd.read_csv(
    'data/Data_DE.csv',
    index_col='ID',
)
# Cell output: (n_rows, n_columns) of the loaded frame.
data_de.shape

(596, 39)

In [7]:
# Separate the response ('TARGET') from the predictors (every other column).
y = data_de['TARGET']
X = data_de.drop(columns='TARGET')

In [8]:
%%time

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

grid_search = RandomizedSearchCV(pipeline, param_grid, scoring=scoring, cv=cv, refit=refit, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)

test_results['de'] = get_test_results(grid_search, X_test, y_test)
print_test_results(test_results['de'], refit, title='KNeighborsRegressor DE')

Fitting 5 folds for each of 10 candidates, totalling 50 fits

KNeighborsRegressor DE
 Best estimator: 
  imputer : SimpleImputer()
  scaler : RobustScaler()
  selection : RFECV(estimator=DecisionTreeRegressor())
  model : KNeighborsRegressor(metric='manhattan', n_neighbors=19)
 Metrics:
  rmse: 1.06
  r2: 0.013
  r2 ajusté: -0.45
  spearman: 0.215
Wall time: 7.06 s


# Conclusion

In [9]:
# Build the FR-vs-DE comparison table from the collected hold-out results
# and render the returned text as Markdown in the cell output.
markdown_table = get_comparison_table(
    test_results,
    title='KNeighborsRegressor',
)
Markdown(markdown_table)

| KNeighborsRegressor      | FR                                       | DE                                       |
|-------------|--------------------------------------------|--------------------------------------------|
| Imputer     | SimpleImputer()   | SimpleImputer()   |
| Scaler      | None    | RobustScaler()    |
| Selection   | None | RFECV(estimator=DecisionTreeRegressor()) |
| Model       | KNeighborsRegressor(metric='euclidean', n_neighbors=20)     | KNeighborsRegressor(metric='manhattan', n_neighbors=19)     |
| RMSE        | 1.423               | 1.06               |
| R2          | -0.014                 | 0.013                 |
| R2 ajusté   | -0.411          | -0.45          |
| Spearman    | 0.093           | 0.215           |