In [2]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler , OneHotEncoder , MinMaxScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
# import mlflow
# import mlflow.sklearn

In [3]:
df = pd.read_csv('../data/gold.csv')
# Drop the variables from the dataframe
df = df.drop(['charles_river_dummy'], axis=1)

In [4]:
# Define the features and the target
X = df.drop('median_home_value', axis=1)
y = df['median_home_value']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

# Define the pipeline
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', ExtraTreesRegressor())
])

# Define the grid of hyperparameters to search
param_grid =  {
    'model__n_estimators': [100, 200, 300],
    'model__max_depth': [5, 10, 15],
    'model__min_samples_split': [2, 5, 10]
}

In [5]:
# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error')

# Fit GridSearchCV
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions
y_pred = best_model.predict(X_test)

ValueError: 'root_mean_squared_error' is not a valid scoring value. Use sklearn.metrics.get_scorer_names() to get valid options.

In [None]:
# meilleur model
print(best_model)

Pipeline(steps=[('scaler', StandardScaler()),
                ('model', ExtraTreesRegressor(max_depth=15))])


In [None]:
# # Define the features and the target
# X = df.drop('median_home_value', axis=1)
# y = df['median_home_value']

# # Split the data into training and test sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# # Define the pipeline with the best model
# pipeline = Pipeline([
#     ('scaler', StandardScaler()),
#     ('model', ExtraTreesRegressor(max_depth=15, n_estimators=300, random_state=42))
# ])

# # Fit the pipeline on the training data
# pipeline.fit(X_train, y_train)

# # Make predictions
# y_pred = pipeline.predict(X_test)

In [None]:
# Calculate metrics
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Print metrics
print("Mean Absolute Error: ", mae)
print("Root Mean Squared Error: ", rmse)
print("R2 Score: ", r2)

Mean Absolute Error:  1.9718360333192797
Root Mean Squared Error:  2.912636471579449
R2 Score:  0.8879014478744653


In [None]:
# # Log experiment with MLflow
# with mlflow.start_run():
#     mlflow.log_param("model", best_model.named_steps['model'].__class__.__name__)
#     mlflow.log_param("n_estimators", best_model.named_steps['model'].n_estimators)
#     mlflow.log_param("max_depth", best_model.named_steps['model'].max_depth)
#     mlflow.log_metric("mae", mae)
#     mlflow.log_metric("rmse", rmse)
#     mlflow.log_metric("r2", r2)
#     mlflow.sklearn.log_model(best_model, "model")