In [224]:
# Import necessary libraries
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler , OneHotEncoder , MinMaxScaler
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
import mlflow
import mlflow.sklearn
from sklearn.preprocessing import PolynomialFeatures

In [225]:
# Read the dataset from a CSV file into a DataFrame. The path is relative,
# so it is resolved against the kernel's current working directory.
df = pd.read_csv(filepath_or_buffer='../data/gold.csv')

## Remove variables

In [226]:
# Remove the 'charles_river_dummy' column; every other column is kept.
# Note: this rebinds `df`, so re-running the cell without reloading the data
# raises KeyError because the column is already gone.
df = df.drop(columns=['charles_river_dummy'])

In [227]:
# Preview the first five rows to check the remaining columns.
df.head(n=5)

Unnamed: 0,crime_rate,residential_zone_pct,business_acres_pct,nitric_oxides_concentration,average_rooms,age,distances_to_employment_centres,accessibility_to_highways,property_tax_rate,pupil_teacher_ratio,black_population,lower_status_pct,median_home_value
0,0.00632,18.0,2.31,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [228]:
# Define the features and the target.
# Everything except the target column is a candidate feature.
X = df.drop(columns='median_home_value')

# 'Unnamed: 0' is the name pandas gives to an unlabeled first CSV column,
# which is typically a row index written out by `DataFrame.to_csv`. A row
# number carries no information about the target, so it must not be used as a
# predictor. `errors='ignore'` makes this a no-op when the column is absent.
# NOTE(review): confirm against ../data/gold.csv that this column really is a
# serialized index and not a meaningful identifier.
X = X.drop(columns='Unnamed: 0', errors='ignore')

y = df['median_home_value']

# Split the data into training and test sets: 33% of the rows are held out
# for evaluation, and the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [229]:
# Build the modelling pipeline. Steps run in order:
#   1. 'PolynomialFeatures' - expand the inputs with polynomial terms up to degree 3
#   2. 'scaler'             - standardize every expanded feature
#   3. 'model'              - ExtraTrees regressor (300 trees, depth capped at 15, seeded)
pipeline = Pipeline(
    steps=[
        ('PolynomialFeatures', PolynomialFeatures(degree=3)),
        ('scaler', StandardScaler()),
        (
            'model',
            ExtraTreesRegressor(
                n_estimators=300,
                max_depth=15,
                random_state=42,
            ),
        ),
    ]
)

# Train on the training split, then predict the held-out test split.
# `Pipeline.fit` returns the fitted pipeline itself, so the calls can be chained.
y_pred = pipeline.fit(X_train, y_train).predict(X_test)

In [230]:
# Score the test-set predictions against the true target values.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))  # root of MSE, in target units
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

# Report the scores.
print("Mean Absolute Error: ", mae)
print("Root Mean Squared Error: ", rmse)
print("R2 Score: ", r2)

Mean Absolute Error:  1.889015418490074
Root Mean Squared Error:  2.751667081114745
R2 Score:  0.8999495115836048


In [231]:
# Log experiment with MLflow.
# One run records the model hyperparameters, the test-set metrics and the
# fitted pipeline itself.
with mlflow.start_run():
    # Read the hyperparameters from the estimator inside the pipeline instead
    # of repeating the literals, so the logged values cannot drift from the
    # model that is actually saved below.
    fitted_model = pipeline.named_steps['model']
    mlflow.log_params({
        "max_depth": fitted_model.max_depth,
        "n_estimators": fitted_model.n_estimators,
    })

    # Test-set scores computed in the metrics cell.
    mlflow.log_metrics({"mae": mae, "rmse": rmse, "r2": r2})

    # Persist the whole pipeline (feature expansion + scaler + regressor)
    # under the artifact path "model".
    mlflow.sklearn.log_model(pipeline, "model")