# Prétraitement des données

In [3]:
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.base import BaseEstimator
import pandas as pd
import numpy as np
from pandas import DataFrame
import mlflow
import mlflow.sklearn

In [4]:
# Single seed reused by the train/test split and by the seeded estimators
# built later in the notebook, so that a full re-run gives the same results.
random_state = 42

In [5]:
# Load the California housing data as pandas objects and hold out 20 % of
# the rows as a test set; the split is seeded with `random_state`.
housing = fetch_california_housing(as_frame=True)
X, y = housing.data, housing.target
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=random_state,
)

In [6]:
# Standardize the features.
# The scaler is fitted on the training split ONLY and the same fitted
# transformation is then applied to the test split. Fitting a second scaler
# on the test split would leak test statistics into the evaluation and put
# the test features on a different scale from the one the models were
# trained on.
scaler = StandardScaler().fit(X_train)

# `transform` returns a bare array, so rebuild DataFrames. The original
# column names and row index are kept so the scaled frames stay label-aligned
# with `y_train` / `y_test`.
X_train_scaled = pd.DataFrame(
    scaler.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test), columns=X_test.columns, index=X_test.index
)

# Utils

In [7]:
def train_model(model: BaseEstimator, params_grid: dict, X_train: DataFrame, y_train: DataFrame, nb_jobs: int = None) -> tuple[BaseEstimator,dict]:
    """Tune ``model`` by exhaustive grid search and return the best fitted estimator.

    Args:
        model: Unfitted estimator to tune.
        params_grid: Mapping of parameter name to the list of values to try.
        X_train: Training features.
        y_train: Training target values.
        nb_jobs: Forwarded to ``GridSearchCV(n_jobs=...)``; ``None`` keeps
            scikit-learn's default.

    Returns:
        A ``(best_estimator, best_params)`` tuple: the estimator refitted on
        the whole training set with the best parameter combination (lowest
        cross-validated mean squared error), and that combination as a dict.
    """
    # refit=True (also the default) makes the search retrain the winning
    # configuration on all of (X_train, y_train) once the search is done, so
    # `best_estimator_` is already fitted and must not be trained a second time.
    search = GridSearchCV(
        model,
        params_grid,
        scoring='neg_mean_squared_error',
        n_jobs=nb_jobs,
        refit=True,
    )
    search.fit(X_train, y_train)
    return search.best_estimator_, search.best_params_

In [8]:
def evalute_model(model: BaseEstimator, X_test: DataFrame, y_test: DataFrame) -> dict:
    """Score a fitted regressor on held-out data.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Features to predict on.
        y_test: True target values matching ``X_test``.

    Returns:
        Dict with the keys ``"mean_squared_error"``, ``"mean_absolute_error"``
        and ``"r2"``, each computed between ``y_test`` and the predictions.
    """
    predictions = model.predict(X_test)
    # Metric name -> scoring function; all share the (y_true, y_pred) signature.
    metric_functions = {
        "mean_squared_error": mean_squared_error,
        "mean_absolute_error": mean_absolute_error,
        "r2": r2_score,
    }
    return {
        metric_name: metric_function(y_test, predictions)
        for metric_name, metric_function in metric_functions.items()
    }

In [9]:
def log_metrics(run_name, params, metrics, tags=None)  -> None:
    """Record one MLflow run in the "Imo" experiment.

    The tracking store is the SQLite file ``mlflow.db`` (relative path).

    Args:
        run_name: Display name given to the MLflow run.
        params: Dict of parameters passed to ``mlflow.log_params``.
        metrics: Dict of metric values passed to ``mlflow.log_metrics``.
        tags: Optional dict of tags; skipped when ``None`` or empty.
    """
    db_file = "mlflow.db"
    mlflow.set_tracking_uri(f"sqlite:///{db_file}")
    mlflow.set_experiment("Imo")

    with mlflow.start_run(run_name=run_name):
        mlflow.log_params(params)
        mlflow.log_metrics(metrics)
        if not tags:
            # Nothing to tag; leaving the `with` block still closes the run.
            return
        mlflow.set_tags(tags)

In [10]:
def train_evalute_log(model: BaseEstimator, params_grid: dict, X_train: DataFrame, y_train: DataFrame,
                       X_test: DataFrame, y_test: DataFrame, run_name: str, tags: dict =None, n_jobs: int = None) -> None:
    """Grid-search ``model``, score the best estimator on the test data and log the run.

    Args:
        model: Unfitted estimator to tune.
        params_grid: Parameter grid handed to ``train_model``.
        X_train: Training features.
        y_train: Training target values.
        X_test: Held-out features used for scoring.
        y_test: Held-out target values used for scoring.
        run_name: Name of the MLflow run.
        tags: Optional tags attached to the run.
        n_jobs: Parallelism forwarded to ``train_model`` as ``nb_jobs``.
    """
    tuned_model, tuned_params = train_model(model, params_grid, X_train, y_train, nb_jobs=n_jobs)
    test_scores = evalute_model(tuned_model, X_test, y_test)
    log_metrics(run_name, params=tuned_params, metrics=test_scores, tags=tags)


# Linear Model

In [11]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

In [12]:
# Baseline: ordinary least squares on the raw (unscaled) features,
# logged with no hyper-parameters.
lr = LinearRegression().fit(X_train, y_train)
scores = evalute_model(lr, X_test, y_test)
log_metrics("LinearRegression", params={}, metrics=scores)

2025/01/15 19:38:17 INFO mlflow.store.db.utils: Creating initial MLflow database tables...
2025/01/15 19:38:17 INFO mlflow.store.db.utils: Updating database tables
INFO  [alembic.runtime.migration] Context impl SQLiteImpl.
INFO  [alembic.runtime.migration] Will assume non-transactional DDL.
INFO  [alembic.runtime.migration] Running upgrade  -> 451aebb31d03, add metric step
INFO  [alembic.runtime.migration] Running upgrade 451aebb31d03 -> 90e64c465722, migrate user column to tags
INFO  [alembic.runtime.migration] Running upgrade 90e64c465722 -> 181f10493468, allow nulls for metric values
INFO  [alembic.runtime.migration] Running upgrade 181f10493468 -> df50e92ffc5e, Add Experiment Tags Table
INFO  [alembic.runtime.migration] Running upgrade df50e92ffc5e -> 7ac759974ad8, Update run tags with larger limit
INFO  [alembic.runtime.migration] Running upgrade 7ac759974ad8 -> 89d4b8295536, create latest metrics table
INFO  [89d4b8295536_create_latest_metrics_table_py] Migration complete!
INFO  

In [13]:
# Ordinary least squares again, this time on the standardized features.
lr = LinearRegression().fit(X_train_scaled, y_train)
scores = evalute_model(lr, X_test_scaled, y_test)
log_metrics("LinearRegression_scaled", params={}, metrics=scores)

Sélection de variables pour le modèle linéaire

In [14]:
# Sequential feature selection around a linear regression, run once per
# search direction on the raw features (3-fold CV inside the selector).
mod_seq_backwark = SequentialFeatureSelector(
    LinearRegression(), n_features_to_select="auto", direction="backward", cv=3
)
mod_seq_forward = SequentialFeatureSelector(
    LinearRegression(), n_features_to_select="auto", direction="forward", cv=3
)
mod_seqs = {"backward": mod_seq_backwark, "forward": mod_seq_forward}

for mode, mod_seq in mod_seqs.items():
    # Fit the selector on the training split, then keep the same columns in
    # the test split.
    X_train_seq = mod_seq.fit_transform(X_train, y_train)
    X_test_seq = mod_seq.transform(X_test)

    # Train and score a linear model restricted to the selected columns.
    model = LinearRegression().fit(X_train_seq, y_train)
    scores = evalute_model(model, X_test_seq, y_test)

    # Record which columns were kept as a tag on the MLflow run.
    selected_features = X.columns[mod_seq.get_support()].to_list()
    log_metrics(f"LinearRegression_{mode}", {}, scores, {"features": selected_features})


In [15]:
# Sequential feature selection around a linear regression, run once per
# search direction on the standardized features (3-fold CV inside the selector).
mod_seq_backwark = SequentialFeatureSelector(
    LinearRegression(), n_features_to_select="auto", direction="backward", cv=3
)
mod_seq_forward = SequentialFeatureSelector(
    LinearRegression(), n_features_to_select="auto", direction="forward", cv=3
)
mod_seqs = {"backward": mod_seq_backwark, "forward": mod_seq_forward}

for mode, mod_seq in mod_seqs.items():
    # Fit the selector on the scaled training split, then keep the same
    # columns in the scaled test split.
    X_train_seq = mod_seq.fit_transform(X_train_scaled, y_train)
    X_test_seq = mod_seq.transform(X_test_scaled)

    # Train and score a linear model restricted to the selected columns.
    model = LinearRegression().fit(X_train_seq, y_train)
    scores = evalute_model(model, X_test_seq, y_test)

    # Record which columns were kept as a tag on the MLflow run.
    selected_features = X.columns[mod_seq.get_support()].to_list()
    log_metrics(f"LinearRegression_scaled_{mode}", {}, scores, {"features": selected_features})

In [16]:
# Regularization strengths to try: 50 log-spaced values from 1e-3 to 1e3.
params_grid = dict(alpha=np.logspace(-3, 3, num=50))

In [17]:
# Ridge regression on the raw features; searches the `params_grid` currently
# defined in the notebook.
train_evalute_log(
    model=Ridge(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name="Ridge",
)

In [18]:
# Ridge regression on the standardized features; searches the `params_grid`
# currently defined in the notebook.
train_evalute_log(
    model=Ridge(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    run_name="Ridge_scaled",
)

In [19]:
# Lasso regression on the raw features; searches the `params_grid` currently
# defined in the notebook.
train_evalute_log(
    model=Lasso(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name="Lasso",
)

In [20]:
# Lasso regression on the standardized features; searches the `params_grid`
# currently defined in the notebook.
train_evalute_log(
    model=Lasso(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    run_name="Lasso_scaled",
)

In [21]:
# Elastic-net grid: 50 log-spaced alphas (1e-3 to 1e3) crossed with 50
# evenly spaced L1 ratios (0.001 to 0.999), i.e. 2500 combinations.
params_grid = dict(
    alpha=np.logspace(-3, 3, num=50),
    l1_ratio=np.linspace(0.001, 0.999, num=50),
)

In [22]:
# Elastic net on the raw features; searches the `params_grid` currently
# defined in the notebook, using two parallel jobs.
train_evalute_log(
    model=ElasticNet(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name="ElasticNet",
    n_jobs=2,
)

In [23]:
# Elastic net on the standardized features; searches the `params_grid`
# currently defined in the notebook, using two parallel jobs.
train_evalute_log(
    model=ElasticNet(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    run_name="ElasticNet_scaled",
    n_jobs=2,
)

# Tree Models

In [24]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

In [25]:
# Tree depths to try: unlimited (None) plus every odd depth from 3 to 15.
params_grid = dict(max_depth=[None, *range(3, 16, 2)])

In [26]:
# Decision tree on the raw features; searches the `params_grid` currently
# defined in the notebook.
train_evalute_log(
    model=DecisionTreeRegressor(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name="DecisionTree",
)

In [27]:
# Decision tree on the standardized features; searches the `params_grid`
# currently defined in the notebook.
train_evalute_log(
    model=DecisionTreeRegressor(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train_scaled,
    y_train=y_train,
    X_test=X_test_scaled,
    y_test=y_test,
    run_name="DecisionTree_scaled",
)

In [28]:
# Random-forest grid: tree depth crossed with the number of trees.
params_grid = dict(
    max_depth=[None, 3, 4, 5],
    n_estimators=[75, 100, 150],
)

In [29]:
# Random forest on the raw features; searches the `params_grid` currently
# defined in the notebook, using two parallel jobs.
train_evalute_log(
    model=RandomForestRegressor(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name="RandomForest",
    n_jobs=2,
)

In [30]:
# Gradient-boosting grid: tree depth, number of boosting stages and
# learning rate (4 x 3 x 3 = 36 combinations).
params_grid = dict(
    max_depth=[None, 3, 4, 5],
    n_estimators=[75, 100, 150],
    learning_rate=[0.05, 0.1, 0.15],
)

In [62]:
# Gradient boosting on the raw features; searches the `params_grid` currently
# defined in the notebook, using two parallel jobs.
train_evalute_log(
    model=GradientBoostingRegressor(random_state=random_state),
    params_grid=params_grid,
    X_train=X_train,
    y_train=y_train,
    X_test=X_test,
    y_test=y_test,
    run_name="GradientBoosting",
    n_jobs=2,
)