In [2]:
pip install mlflow


Collecting mlflow
  Using cached mlflow-3.9.0-py3-none-any.whl.metadata (31 kB)
Collecting mlflow-skinny==3.9.0 (from mlflow)
  Using cached mlflow_skinny-3.9.0-py3-none-any.whl.metadata (32 kB)
Collecting mlflow-tracing==3.9.0 (from mlflow)
  Using cached mlflow_tracing-3.9.0-py3-none-any.whl.metadata (19 kB)
Collecting Flask-CORS<7 (from mlflow)
  Using cached flask_cors-6.0.2-py3-none-any.whl.metadata (5.3 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Using cached alembic-1.18.4-py3-none-any.whl.metadata (7.2 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Using cached docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Using cached graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting huey<3,>=2.5.4 (from mlflow)
  Using cached huey-2.6.0-py3-none-any.whl.metadata (4.3 kB)
Collecting skops<1 (from mlflow)
  Using cached skops-0.13.0-py3-none-any.whl.metadata (5.6 kB)
Collecting waitress<4 (from mlflow)
  Using cached waitress-3.0.2-p

In [7]:
import tempfile
from pathlib import Path

import mlflow
import mlflow.sklearn
import numpy as np
import pandas as pd
from sklearn.linear_model import Lasso
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# ---- Quick debug ----
def debug_lasso_coefficients(pipeline, feature_names):
    """Print a quick sanity report on the Lasso coefficients of a fitted pipeline.

    The report used to be a run of top-level statements reading `pipeline` and
    `features` from the notebook namespace; neither name is assigned at
    notebook scope in the visible cells, so the cell raised NameError on a
    fresh kernel. Passing both as arguments removes that hidden-state
    dependency. Call it after training, with the fitted pipeline returned by
    `train_lasso_model` and the feature column names used for fitting.

    Args:
        pipeline: Object exposing ``named_steps['lasso'].coef_``.
        feature_names: Sized collection of feature names; only its length is
            reported, for manual comparison with the number of coefficients.

    Returns:
        bool: True when every coefficient is an ``int``, ``float`` or
        ``np.number`` instance. False confirms a non-numeric coefficient.
    """
    coef = pipeline.named_steps['lasso'].coef_

    # NOTE(review): the accented character in the last message below looks
    # mis-encoded in this file; the text is left untouched -- confirm the file encoding.
    print("Type des coefficients :", type(coef))
    print("Longueur coef :", len(coef))
    print("Nombre de colonnes dans features :", len(feature_names))
    print("\nQuelques exemples de coef :", coef[:10])

    # Check that every coefficient is numeric.
    all_numeric = all(isinstance(x, (int, float, np.number)) for x in coef)
    print("\nTous num√©riques ?", all_numeric)

    return all_numeric

In [12]:
# Hard-coded reference residual statistics that are logged next to the measured ones.
# NOTE(review): their provenance is not shown in this notebook -- confirm they are still current.
_REFERENCE_RESIDUAL_STATS = {
    "residuals_mean_target": -0.004416,
    "residuals_median_target": 0.002235,
    "residuals_std_target": 0.122054,
}


def _prepare_features(df):
    """Build the numeric feature matrix (NaN filled with 0) and the log-transformed target."""
    features = (
        df.select_dtypes(include=[np.number])
        # errors='ignore': do not fail when 'Id' is absent from the numeric columns.
        .drop(columns=['SalePrice', 'Id'], errors='ignore')
        .fillna(0)
    )
    y = np.log(df['SalePrice'])
    return features, y


def _top_feature_importance(feature_names, coefficients, top_n=10):
    """Rank features by absolute coefficient and keep the ``top_n`` largest."""
    return (
        pd.DataFrame({'feature': feature_names, 'importance': coefficients})
        # Coerce to numeric and drop anything that could not be converted.
        .assign(importance=lambda frame: pd.to_numeric(frame['importance'], errors='coerce'))
        .dropna(subset=['importance'])
        .assign(importance=lambda frame: frame['importance'].abs())
        .nlargest(top_n, 'importance')
    )


def train_lasso_model(df):
    """Train a standardised Lasso regression and log the whole run to MLflow.

    The numeric columns of ``df`` (minus 'SalePrice' and, if present, 'Id')
    are used as features with missing values replaced by 0. The target is the
    natural log of 'SalePrice', so every error metric is expressed in log
    units. The data is split 80/20 with a fixed seed.

    Inside an MLflow run named "Lasso_Best" the function logs:
      * the Lasso hyperparameters,
      * test RMSE / R2, 5-fold cross-validated RMSE on the training split,
      * mean / median / std of the test residuals, plus the hard-coded
        reference values from ``_REFERENCE_RESIDUAL_STATS``,
      * the fitted pipeline as a model ("lasso_champion_pipeline"),
      * the ten largest absolute coefficients as 'feature_importance.csv'.

    Args:
        df: DataFrame holding a 'SalePrice' column and the feature columns.

    Returns:
        tuple: ``(pipeline, test_rmse)`` -- the fitted scikit-learn Pipeline
        and the RMSE measured on the held-out test split.

    Raises:
        KeyError: If ``df`` has no 'SalePrice' column.
    """
    # Preprocessing
    features, y = _prepare_features(df)

    X_train, X_test, y_train, y_test = train_test_split(
        features, y, test_size=0.2, random_state=42
    )

    with mlflow.start_run(run_name="Lasso_Best"):
        # Hyperparameters
        params = {
            "alpha": 0.001,
            "max_iter": 10000,
            "random_state": 42,
            "selection": "random"
        }
        mlflow.log_params(params)

        # Pipeline: scaling happens inside it, so it is re-fitted per CV fold.
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('lasso', Lasso(**params))
        ])

        # Training
        pipeline.fit(X_train, y_train)

        # Test-set metrics
        y_pred = pipeline.predict(X_test)
        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        test_r2 = r2_score(y_test, y_pred)

        # Cross-validation on the training split (scorer is negated RMSE).
        cv_scores = cross_val_score(pipeline, X_train, y_train, cv=5, scoring='neg_root_mean_squared_error')
        cv_rmse = -cv_scores.mean()

        # Residual statistics: computed once, reused for logging and printing.
        residuals = y_test - y_pred
        residuals_mean = np.mean(residuals)
        residuals_median = np.median(residuals)
        residuals_std = np.std(residuals)

        # Measured metrics first, then the reference values (same order as before).
        measured_metrics = {
            "test_rmse": test_rmse,
            "test_r2": test_r2,
            "cv_rmse": cv_rmse,
            "residuals_mean": residuals_mean,
            "residuals_median": residuals_median,
            "residuals_std": residuals_std,
        }
        for metric_name, metric_value in {**measured_metrics, **_REFERENCE_RESIDUAL_STATS}.items():
            mlflow.log_metric(metric_name, metric_value)

        # Save the fitted model
        mlflow.sklearn.log_model(pipeline, "lasso_champion_pipeline")

        # Feature importance (top 10 by absolute coefficient)
        feature_importance = _top_feature_importance(
            features.columns, pipeline.named_steps['lasso'].coef_
        )

        # Write the CSV to a throw-away directory so the working directory is
        # not polluted; the artifact keeps the name 'feature_importance.csv'.
        with tempfile.TemporaryDirectory() as tmp_dir:
            csv_path = Path(tmp_dir) / 'feature_importance.csv'
            feature_importance.to_csv(csv_path, index=False)
            mlflow.log_artifact(str(csv_path))

        # NOTE(review): some glyphs in the messages below look mis-encoded in
        # this file; the strings are left untouched -- confirm the file encoding.
        print(f"üèÜ Lasso Champion - Test RMSE: {test_rmse:.4f}, R¬≤: {test_r2:.4f}")
        print(f"   CV RMSE: {cv_rmse:.4f}")
        print(f"   R√©sidus - Moy: {residuals_mean:.6f}, Med: {residuals_median:.6f}, Std: {residuals_std:.6f}")

        return pipeline, test_rmse

In [13]:
if __name__ == "__main__":
    # Load the training data from the path relative to the notebook's working directory.
    df = pd.read_csv(filepath_or_buffer='../data/train.csv')
    # Train the model; keep the fitted pipeline and its test RMSE for later cells.
    model, rmse = train_lasso_model(df=df)

  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)


üèÜ Lasso Champion - Test RMSE: 0.1509, R¬≤: 0.8780
   CV RMSE: 0.1579
   R√©sidus - Moy: 0.001492, Med: 0.004154, Std: 0.150895
