In [10]:
import json
from pathlib import Path

import dagshub
import joblib
import mlflow
import mlflow.xgboost
import pandas as pd
import xgboost as xgb
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split

In [11]:
# Name of the MLflow experiment that every run in this notebook is filed under.
experiment_name = "HousePricePrediction"

# Point MLflow at the DagsHub-hosted tracking server for this repository.
dagshub.init(
    repo_owner="AntoineD01",
    repo_name="House-prediction",
    mlflow=True,
)

# Select the experiment; the returned Experiment object is shown as the cell output.
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/d44301c7c7484c8e99fc60d12f9f3860', creation_time=1751112538551, experiment_id='0', last_update_time=1751112538551, lifecycle_stage='active', name='HousePricePrediction', tags={}>

In [12]:
# Load processed data
data_path = '../../data/processed/processed_train.csv'

# The file carries an 'Unnamed: 0' column (0, 1, 2, ... in the preview), which
# looks like a row index saved alongside the data. Left in place it would be
# used as a model feature and written to feature_columns.json, so it is dropped
# here. errors='ignore' keeps the cell working if the CSV is later regenerated
# without that column.
df = pd.read_csv(data_path).drop(columns=['Unnamed: 0'], errors='ignore')
print(f"Data shape: {df.shape}")
df.head()

Data shape: (1460, 10)


Unnamed: 0,OverallQual,GrLivArea,GarageCars,GarageArea,ExterQual,BsmtQual,KitchenQual,GarageFinish,GarageType,SalePrice
0,7,1710,2,548,2,2,2,2,1,208500
1,6,1262,2,460,3,2,3,2,1,181500
2,7,1786,2,608,2,2,2,2,1,223500
3,7,1717,3,642,3,4,2,3,5,140000
4,8,2198,3,836,2,2,2,2,1,250000


In [13]:
# Target is the sale price; every other column of df is used as a feature.
y = df['SalePrice']
X = df.drop(columns='SalePrice')

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train shape: {X_train.shape}")
print(f"Test shape: {X_test.shape}")


Train shape: (1168, 9)
Test shape: (292, 9)


In [14]:
def train_and_log_model(n_estimators, max_depth, learning_rate):
    """Train an XGBoost regressor, evaluate it, and record the run in MLflow.

    The model is fitted on the notebook-level ``X_train``/``y_train`` and scored
    on ``X_test``/``y_test``. Hyperparameters, R2 and RMSE are logged to a new
    MLflow run, and the fitted model is pickled under ``../../ml/models`` with
    the hyperparameters encoded in the file name.

    Args:
        n_estimators: Forwarded to ``xgb.XGBRegressor(n_estimators=...)``.
        max_depth: Forwarded to ``xgb.XGBRegressor(max_depth=...)``.
        learning_rate: Forwarded to ``xgb.XGBRegressor(learning_rate=...)``.

    Returns:
        The fitted ``xgb.XGBRegressor`` instance.
    """
    with mlflow.start_run():
        model = xgb.XGBRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            learning_rate=learning_rate,
            random_state=42
        )
        model.fit(X_train, y_train)

        # Predictions
        preds = model.predict(X_test)
        r2 = r2_score(y_test, preds)
        # The `squared=False` shortcut was deprecated in scikit-learn 1.4 and
        # removed in 1.6; taking the root of the MSE works on every version.
        rmse = float(mean_squared_error(y_test, preds) ** 0.5)

        # Log parameters and metrics
        mlflow.log_params({
            "n_estimators": n_estimators,
            "max_depth": max_depth,
            "learning_rate": learning_rate,
        })
        mlflow.log_metrics({"r2": r2, "rmse": rmse})

        # Save model to file
        model_file = f"../../ml/models/xgb_model_{n_estimators}_{max_depth}_{learning_rate}.pkl"
        # joblib.dump does not create missing directories, so make sure the
        # target folder exists (e.g. on a fresh clone).
        Path(model_file).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(model, model_file)
        print(f"Saved model locally to {model_file}")

        # Log the path to MLflow as a param or tag
        mlflow.log_param("model_path", model_file)

        print(f"Logged Model - R2: {r2:.4f}, RMSE: {rmse:.2f}")
        return model

In [15]:
# Run 1: 100 trees, depth 5, learning rate 0.1.
print("Training and Logging Model 1...")
model_1 = train_and_log_model(n_estimators=100, max_depth=5, learning_rate=0.1)

Training and Logging Model 1...




Saved model locally to ../../ml/models/xgb_model_100_5_0.1.pkl
Logged Model - R2: 0.8510, RMSE: 33809.81
🏃 View run nervous-snail-856 at: https://dagshub.com/AntoineD01/House-prediction.mlflow/#/experiments/0/runs/f791707cbc304af9a0f2a1f037248084
🧪 View experiment at: https://dagshub.com/AntoineD01/House-prediction.mlflow/#/experiments/0


In [16]:
# Run 2: 200 trees, depth 8, learning rate 0.05.
print("Training and Logging Model 2...")
model_2 = train_and_log_model(n_estimators=200, max_depth=8, learning_rate=0.05)

Training and Logging Model 2...




Saved model locally to ../../ml/models/xgb_model_200_8_0.05.pkl
Logged Model - R2: 0.8375, RMSE: 35303.56
🏃 View run nervous-grub-926 at: https://dagshub.com/AntoineD01/House-prediction.mlflow/#/experiments/0/runs/0a0294c2c44c42f696507c639355ca31
🧪 View experiment at: https://dagshub.com/AntoineD01/House-prediction.mlflow/#/experiments/0


In [17]:
# Persist the ordered list of feature names the models were trained on.
# X is the feature frame (df without 'SalePrice'), so its columns are exactly
# what the fitted models expect.
feature_columns = list(X.columns)

# Write next to the pickled models using the same repo-relative location,
# rather than an absolute path that only exists on one machine.
feature_columns_path = Path('../../ml/models') / 'feature_columns.json'
feature_columns_path.parent.mkdir(parents=True, exist_ok=True)
with open(feature_columns_path, 'w', encoding='utf-8') as f:
    json.dump(feature_columns, f)

print("Saved new feature_columns.json!")


Saved new feature_columns.json!
