In [1]:
# Package installation (the pip command below is left commented out)
# !pip install numpy pandas matplotlib scikit-learn scipy seaborn
print('Done')

Done


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pformat

# Prétraitement des données
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import Pipeline

# Modèles
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# Évaluation des modèles
from sklearn.metrics import make_scorer, r2_score, mean_squared_error
from scipy.stats import spearmanr

# Validation croisée et recherche de modèle
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold


from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import f_regression


In [3]:
# Read the three CSV inputs in one pass; each frame is indexed by its 'ID' column.
raw_data_x, raw_dataNew_x, raw_data_y = (
    pd.read_csv(csv_path, index_col='ID')
    for csv_path in ("data/Data_X.csv", "data/DataNew_X.csv", "data/Data_Y.csv")
)

In [4]:
# Join the feature frame and the target frame on 'ID' into a single working copy.
data_xy = raw_data_x.merge(raw_data_y, on='ID').copy()

In [11]:
# Remove the 'COUNTRY' column, then keep only the rows without any missing value.
data_fr = (
    data_xy
    .drop(columns='COUNTRY')
    .copy()
    .dropna()
)

In [12]:
# Separate the 'TARGET' column (y) from the remaining feature columns (X).
y = data_fr['TARGET']
X = data_fr.drop(columns='TARGET')

In [13]:
# Hold out 20% of the rows as a test set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Experiment label; the summary cell uses it to name the report file.
test_name = 'polynomial 2'
# Degree-2 polynomial feature expansion.
# NOTE(review): the model-search loop further down rebinds `feature_selection`
# as its loop variable, so this object is not what that search ends up using.
feature_selection = PolynomialFeatures(degree=2)

In [15]:
def spearmanr_scorer(y, y_pred):
    """Spearman rank correlation between targets and predictions, never failing.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        The correlation coefficient, or ``0.0`` when it is not finite
        (a notice is printed) or when computing it raises (the error is
        printed instead of being propagated).
    """
    try:
        rho, _p_value = spearmanr(y, y_pred)
        # Guard clause: a NaN/inf coefficient is reported and scored as 0.0.
        if not np.isfinite(rho):
            print("NOT FINITE")
            return 0.0
        return rho
    except Exception as e:
        print(f"Error calculating Spearman correlation: {e}")
        return 0.0

def r2_scorer(y, y_pred):
    """Return ``r2_score(y, y_pred)`` for the given targets and predictions."""
    score = r2_score(y, y_pred)
    return score

def rmse_scorer(y, y_pred):
    """Root mean squared error between targets and predictions.

    Parameters
    ----------
    y : array-like
        Observed target values.
    y_pred : array-like
        Predicted values.

    Returns
    -------
    float
        Square root of ``mean_squared_error(y, y_pred)``.
    """
    # The `squared=False` keyword of `mean_squared_error` was deprecated in
    # scikit-learn 1.4 and removed in 1.6. Taking the square root explicitly
    # gives the same RMSE and works on every version.
    return np.sqrt(mean_squared_error(y, y_pred))

# Wrap each metric function as a scikit-learn scorer. RMSE is an error measure,
# so it is registered with greater_is_better=False.
spearman_scoring = make_scorer(spearmanr_scorer, greater_is_better=True)
r2_scoring = make_scorer(r2_scorer, greater_is_better=True)
rmse_scoring = make_scorer(rmse_scorer, greater_is_better=False)

# Multi-metric scoring table passed to the grid search.
scorers = dict(
    spearmanr=spearman_scoring,
    r2=r2_scoring,
    rmse=rmse_scoring,
)

In [17]:
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.feature_selection import SelectKBest, SelectFromModel, RFE
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.linear_model import LassoCV

def create_pipelines():
    """Build one fresh four-step pipeline per candidate regressor.

    Every pipeline has the steps ``imputer``, ``feature_selection``,
    ``scaler`` and ``model``, in that order. The first three are created as
    ``None`` placeholders that the caller fills in later; only ``model`` is
    populated here.

    Returns
    -------
    dict
        Maps ``'linear'``, ``'ridge'``, ``'lasso'``, ``'knn'``,
        ``'decision_tree'`` and ``'random_forest'`` to a new ``Pipeline``.
    """
    # Same seed as the train/test split, so the randomized tree-based models
    # give the same scores on every run.
    seed = 42

    # One factory per model: each call returns a brand-new estimator, so the
    # pipelines never share a model instance.
    model_factories = {
        'linear': lambda: LinearRegression(),
        'ridge': lambda: Ridge(),
        'lasso': lambda: Lasso(),
        'knn': lambda: KNeighborsRegressor(),
        'decision_tree': lambda: DecisionTreeRegressor(random_state=seed),
        'random_forest': lambda: RandomForestRegressor(n_jobs=-1, random_state=seed),
    }

    return {
        name: Pipeline([
            ('imputer', None),
            ('feature_selection', None),
            ('scaler', None),
            ('model', make_model()),
        ])
        for name, make_model in model_factories.items()
    }

def run_cross_validation(model_name, pipeline, imputer, feature_selection, scaler):
    pipeline.named_steps['imputer'] = imputer
    pipeline.named_steps['feature_selection'] = feature_selection
    pipeline.named_steps['scaler'] = scaler

    print(f"Cross-validation for {model_name.capitalize()} model with {imputer_name}, {feature_selection_name}, and {scaler_name}...")
    rs_inner = GridSearchCV(pipeline, params[model_name], cv=5, n_jobs=-1, scoring=scorers, refit='spearmanr', verbose=4)

    %time rs_inner.fit(X_train, y_train)
    return rs_inner

def _model_input_feature_names(fitted_pipeline, input_names):
    """Return the names of the columns that reach the pipeline's final step."""
    names = np.asarray(input_names, dtype=object)
    for _, step in fitted_pipeline.steps[:-1]:
        if step is None or isinstance(step, str):
            # Placeholder step (None / 'passthrough'): columns are unchanged.
            continue
        if hasattr(step, 'get_support'):
            # Feature selectors expose a boolean mask of the columns they keep.
            names = names[step.get_support()]
        elif hasattr(step, 'get_feature_names_out'):
            # Other transformers may rename or expand columns.
            names = np.asarray(step.get_feature_names_out(names), dtype=object)
    return list(names)


def store_results(rs_inner, model_name, cv_results):
    """Summarize a fitted grid search and record its best score for each metric.

    Parameters
    ----------
    rs_inner : GridSearchCV
        Fitted multi-metric search with a refit ``best_estimator_``.
    model_name : str
        Key under which the scores are stored in ``cv_results``.
    cv_results : dict
        ``{'spearmanr': {}, 'r2': {}, 'rmse': {}}``-shaped dict, updated in
        place with ``cv_results[metric][model_name]``.

    Returns
    -------
    dict
        Keys ``best_parameters``, ``best_score``, ``metrics``, ``y_pred``
        (predictions on the notebook-global ``X_test``), ``pipeline_steps``
        and, when the model exposes ``feature_importances_`` or ``coef_``,
        ``feature_importances``.

    Notes
    -----
    Each metric's value is the mean test score of the candidate ranked first
    *for that metric*, which is not necessarily the candidate reported in
    ``best_parameters``. Scores are stored exactly as found in
    ``rs_inner.cv_results_``.
    """
    for metric in ['spearmanr', 'r2', 'rmse']:
        best_index = rs_inner.cv_results_['rank_test_'+metric].argmin()
        cv_results[metric][model_name] = rs_inner.cv_results_['mean_test_'+metric][best_index]

    best_pipeline = rs_inner.best_estimator_

    result = {
        'best_parameters': rs_inner.best_params_,
        'best_score': rs_inner.best_score_,
        'metrics': {
            'spearmanr': cv_results['spearmanr'][model_name],
            'r2': cv_results['r2'][model_name],
            'rmse': cv_results['rmse'][model_name]
        },
        'y_pred': rs_inner.predict(X_test),
        # Taken from the search's own estimator, not from a loop global.
        'pipeline_steps': [step_name for step_name, _ in best_pipeline.steps]
    }

    model = best_pipeline.named_steps['model']
    if hasattr(model, 'feature_importances_'):
        weights = model.feature_importances_
    elif hasattr(model, 'coef_'):
        weights = model.coef_
    else:
        weights = None
        print(f' No feature importance or coef found for {model_name.capitalize()}')

    if weights is not None:
        # The model only sees the columns that survive the earlier steps, so
        # its weights must be paired with those names. Zipping against all of
        # `X.columns` would mislabel them once a selector removes columns.
        feature_names = _model_input_feature_names(best_pipeline, X.columns)
        result['feature_importances'] = dict(zip(feature_names, weights))

    return result

# Candidate transformers for each pipeline step, as (label, estimator) pairs.
# The labels are what the cross-validation log line prints.
imputers = [
    ('simple_imputer', SimpleImputer()),
    # ('knn_imputer', KNNImputer())
]

feature_selections = [
    ('select_k_best', SelectKBest(f_regression)),
    ('select_from_model', SelectFromModel(LassoCV())),
    # Seeded (same value as the train/test split) so the forest-driven
    # elimination is repeatable from one run to the next.
    ('rfe', RFE(estimator=RandomForestRegressor(random_state=42)))
]

scalers = [
    ('standard_scaler', StandardScaler()),
    ('min_max_scaler', MinMaxScaler()),
    ('robust_scaler', RobustScaler())
]

# Hyper-parameter grid per model, keyed like the pipelines built by
# create_pipelines(); the `model__` prefix targets the pipeline's 'model' step.
params = {
    'linear': {},
    'ridge': {'model__alpha': [0.1, 1.0, 10.0]},
    'lasso': {'model__alpha': [0.1, 1.0, 10.0]},
    'knn': {'model__n_neighbors': [3, 5, 7]},
    'decision_tree': {'model__max_depth': [None, 5, 10]},
    'random_forest': {'model__n_estimators': [100, 200, 300]}
}

# Per-metric score table, filled in as cv_results[metric][model_name].
cv_results = {
    'spearmanr': {},
    'r2': {},
    'rmse': {}
}


pipelines = create_pipelines()
results = {}

# Try every imputer x feature-selection x scaler combination for every model
# and keep, per model, the combination with the highest refit score.
for model_name, pipeline in pipelines.items():
    for imputer_name, imputer in imputers:
        for feature_selection_name, feature_selection in feature_selections:
            for scaler_name, scaler in scalers:
                rs_inner = run_cross_validation(model_name, pipeline, imputer, feature_selection, scaler)

                # store_results writes into the dict it is given. Pass a
                # scratch dict so that a weaker combination cannot overwrite
                # the scores of the best one found so far.
                trial_cv_results = {metric: {} for metric in cv_results}
                trial = store_results(rs_inner, model_name, trial_cv_results)
                trial['transformers'] = {
                    'imputer': imputer_name,
                    'feature_selection': feature_selection_name,
                    'scaler': scaler_name,
                }
                print(f"Score: {trial['best_score']:.3f}\n")

                # Previously each iteration replaced the stored entry, so the
                # last combination tried was kept instead of the best one.
                incumbent = results.get(model_name)
                if incumbent is None or trial['best_score'] > incumbent['best_score']:
                    results[model_name] = trial
                    for metric, per_model in trial_cv_results.items():
                        cv_results[metric][model_name] = per_model[model_name]

print('Done.')


Cross-validation for Linear model with simple_imputer, select_k_best, and standard_scaler...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Wall time: 1.12 s
Score: 0.180

Cross-validation for Linear model with simple_imputer, select_k_best, and min_max_scaler...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Wall time: 871 ms
Score: 0.180

Cross-validation for Linear model with simple_imputer, select_k_best, and robust_scaler...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Wall time: 23.9 ms
Score: 0.180

Cross-validation for Linear model with simple_imputer, select_from_model, and standard_scaler...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Wall time: 27.6 ms
Score: 0.180

Cross-validation for Linear model with simple_imputer, select_from_model, and min_max_scaler...
Fitting 5 folds for each of 1 candidates, totalling 5 fits
Wall time: 20.8 ms
Score: 0.180

Cross-validation for Linear model with simple_imputer, select_from_model,

In [None]:
# common_imputer_params = {
#     'imputer__strategy': ['mean', 'median', 'constant'],
#     'imputer__fill_value': [None]
# }

# params = {
#     'linear': {
#         **common_imputer_params,
#         'model__fit_intercept': [True, False],
#     },
#     'ridge': {
#         **common_imputer_params,
#         'model__alpha': np.logspace(start=-3, stop=3).tolist()
#     },
#     'lasso': {
#         **common_imputer_params,
#         'model__alpha': np.logspace(start=-3, stop=3).tolist()
#     },
#     'knn': {
#         **common_imputer_params,
#         'scaler__k': [10],
#         'model__n_neighbors': [1, 3, 8, 15],
#         'model__weights': ['uniform', 'distance'],
#         'model__metric': ['euclidean', 'manhattan', 'minkowski']
#     },
#     'decision_tree': {
#         **common_imputer_params,
#         'model__max_depth': [3, 6, 9],
#         'model__min_samples_split': [3, 8],
#         'model__min_samples_leaf': [3, 7],
#     },
#     'random_forest': {
#         **common_imputer_params,
#         'scaler__k': [10],
#         'model__n_estimators': [10, 20],  
#         'model__max_depth': [None, 3],  
#     }
# }

# pipelines = {
#     'linear': Pipeline([('imputer', SimpleImputer()),('feature_selection', feature_selection), ('scaler', StandardScaler()), ('model', LinearRegression())]),
#     'ridge': Pipeline([('imputer', SimpleImputer()),('feature_selection', feature_selection), ('scaler', StandardScaler()), ('model', Ridge())]),
#     'lasso': Pipeline([('imputer', SimpleImputer()),('feature_selection', feature_selection), ('scaler', StandardScaler()), ('model', Lasso())]),
#     'knn': Pipeline([('imputer', SimpleImputer()),('feature_selection', None), ('scaler', SelectKBest(f_regression)), ('model', KNeighborsRegressor())]),  
#     'decision_tree': Pipeline([('imputer', SimpleImputer()),('feature_selection', feature_selection), ('scaler', StandardScaler()), ('model', DecisionTreeRegressor())]),
#     'random_forest': Pipeline([('imputer', SimpleImputer()),('feature_selection', None), ('scaler', SelectKBest(f_regression)), ('model', RandomForestRegressor(n_jobs=-1))])
# }

# cv_outer = KFold(n_splits=5, shuffle=True, random_state=42)
# results = {}
# cv_results = {'spearmanr': {}, 'r2': {}, 'rmse': {}}


# print("Starting nested cross-validation for model selection and evaluation...\n")

# for model_name, pipeline in pipelines.items():
#     print(f"Cross-validation for {model_name.capitalize()} model...")
#     rs_inner = GridSearchCV(pipeline, params[model_name], cv=cv_outer, n_jobs=-1,scoring=scorers, refit='spearmanr', verbose=4)
    
#     %time rs_inner.fit(X_train, y_train)
    
#     for metric in ['spearmanr', 'r2', 'rmse']:
#         best_index = rs_inner.cv_results_['rank_test_'+metric].argmin()
#         cv_results[metric][model_name] = rs_inner.cv_results_['mean_test_'+metric][best_index]
    
#     results[model_name] = {
#         'best_parameters': rs_inner.best_params_,
#         'best_score': rs_inner.best_score_,
#         'metrics': {
#             'spearmanr': cv_results['spearmanr'][model_name],
#             'r2': cv_results['r2'][model_name],
#             'rmse': cv_results['rmse'][model_name]
#         },
#         'y_pred': rs_inner.predict(X_test),
#         'pipeline_steps': [step[0] for step in pipeline.steps]
#     }
    
#     if hasattr(rs_inner.best_estimator_.named_steps['model'], 'feature_importances_'):
#         results[model_name]['feature_importances'] = dict(zip(X.columns, rs_inner.best_estimator_.named_steps['model'].feature_importances_))
#     elif hasattr(rs_inner.best_estimator_.named_steps['model'], 'coef_'):
#         results[model_name]['feature_importances'] = dict(zip(X.columns, rs_inner.best_estimator_.named_steps['model'].coef_))
#     else:
#         print(f' No feature importance or coef found for {model_name.capitalize()}')   
        
#     print(f"Score: {results[model_name]['best_score']:.3f}\n")
# print('Done.')

In [None]:
%%capture cap
print("Summary :\n")
# Displaying results
for model_name, pipeline in pipelines.items():
    best_parameters = results[model_name]['best_parameters']
    str_best_parameters = ' ' + pformat(best_parameters, indent=6)[1:-1] 
    print(f"{model_name.capitalize()} model:")
    print(f"   Best transformers:")
    for step in pipeline.steps:
        print(f"      {step[0]:18.20s}: {str(step[1])[:-2]}", end='\n')
    print(f"   Best parameters: \n{str_best_parameters}", end='\n')
    print(f"   Best cross-validation score: {results[model_name]['best_score']:.5f}")
    print(f"   Spearman correlation       : {results[model_name]['metrics']['spearmanr']:.5f}")
    print(f"   R2 score                   : {results[model_name]['metrics']['r2']:.8f}")
    print(f"   RMSE                       : {results[model_name]['metrics']['rmse']:.4f}\n")
    
with open(f'output/{test_name}.txt', 'w') as f:
    f.write(cap.stdout)

In [None]:
# Print, for each metric, the models ordered from highest to lowest stored score.
for metric in ('spearmanr', 'r2', 'rmse'):
    sorted_results = sorted(
        cv_results[metric].items(),
        key=lambda name_and_score: name_and_score[1],
        reverse=True,
    )
    print(f"\nModel ranking for {metric}:")
    for position, (model_name, score) in enumerate(sorted_results, start=1):
        print(f"{position}. {model_name}: {score:.4}")

In [None]:
# One predicted-vs-true scatter panel per model on a 2 x 3 grid.
fig, axs = plt.subplots(nrows=2, ncols=3, figsize=(8, 6))

for i, (model_name, result) in enumerate(results.items()):
    y_pred = result['y_pred']
    # Row-major flat indexing: panel i sits at row i // 3, column i % 3.
    panel = axs.flat[i]
    panel.scatter(y_test, y_pred)
    panel.set(xlabel='True Values', ylabel='Predictions', title=model_name)

plt.tight_layout()
plt.show()

In [None]:
# Layout of the feature-importance figure (3 x 2 panels) and the number of
# features shown in each panel.
n_rows, n_cols = 3, 2
n = 13
fig, axs = plt.subplots(nrows=n_rows, ncols=n_cols, figsize=(10, 13))
# Flatten the 2-D grid of axes so panels can be addressed by a single index.
axs = axs.flatten()

def foo(ax, model_name, results):
    """Draw a bar chart of a model's largest-magnitude feature weights on ``ax``.

    Parameters
    ----------
    ax : matplotlib Axes
        Panel to draw on.
    model_name : str
        Key into ``results``; its capitalized form becomes the panel title.
    results : dict
        Per-model result dicts; the optional ``'feature_importances'`` entry
        maps feature name to weight.

    Nothing is drawn when the model has no (or empty) ``'feature_importances'``.
    The number of bars is taken from the notebook-global ``n``.
    """
    importances = results[model_name].get('feature_importances')
    if not importances:
        return

    # Keep the n weights with the largest absolute value, then order the bars
    # by signed value for display.
    top_features = (
        pd.DataFrame.from_dict(importances, orient='index', columns=['importance'])
        .sort_values(by='importance', key=lambda column: column.abs(), ascending=False)
        .head(n)
        .sort_values('importance', ascending=False)
    )

    sns.barplot(data=top_features, y=top_features.index, x='importance', palette='RdBu', ax=ax)
    ax.set_title(f'{model_name.capitalize()}')
    ax.set_xlabel('Importance')
    ax.set_ylabel('Feature')

# Fill the first five panels, one model each; the sixth panel (KNN) only gets a
# title and no bars are drawn on it here.
for panel_index, plotted_model in enumerate(['linear', 'lasso', 'ridge', 'decision_tree', 'random_forest']):
    foo(axs[panel_index], plotted_model, results)
axs[5].set_title('Knn')

plt.tight_layout()
plt.show()