In [19]:
import pandas as pd
import numpy as np

# Load the training data, discard the "Id" column, and keep only the rows
# that have a "SalePrice" value.
train_df = (
    pd.read_csv("data/train.csv")
    .drop(columns=["Id"])
    .dropna(subset=["SalePrice"])
)

# Separate the target column from the remaining feature columns.
y = train_df["SalePrice"]
X = train_df.drop(columns=["SalePrice"])

#### Cleaning:

In [40]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Partition the feature columns by dtype: int64/float64 columns go to the
# numeric branch, object columns go to the categorical branch.
numerical_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)
categorical_cols = list(X.select_dtypes(include=["object"]).columns)

# Cleaning step for numeric columns: fill missing values with the column median.
num_pipe_clean = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
])

# Cleaning step for categorical columns: fill missing values with the most
# frequent value of the column.
cat_pipe_clean = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
])

#### Feature Engineering:

In [41]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric feature step: standardize each column with StandardScaler.
num_pipe_eng = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical feature step: one-hot encode; handle_unknown="ignore" keeps
# transform from failing on categories that were not seen during fit.
cat_pipe_eng = Pipeline(steps=[("onehot", OneHotEncoder(handle_unknown="ignore"))])

#### Preprocessing:

In [42]:
# Chain the cleaning and feature steps for each column family.
num_pipe = Pipeline(steps=[("clean", num_pipe_clean), ("eng", num_pipe_eng)])
cat_pipe = Pipeline(steps=[("clean", cat_pipe_clean), ("eng", cat_pipe_eng)])

# Route each column list to its own sub-pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipe, numerical_cols),
        ("cat", cat_pipe, categorical_cols),
    ]
)

#### Training:

In [43]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

# Full model: shared preprocessing followed by a Ridge regressor.
model_pipeline = Pipeline([
    ('preprocessing', preprocessor),
    ('model', Ridge())
])

# Regularization strengths explored by the grid search.
param_grid = {
    'model__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}

# Hold out 20% of the rows; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): alpha is selected by 5-fold CV on RMSE of the raw target,
# while the metric reported below is RMSE on the log1p scale. The selected
# alpha is therefore not necessarily the best one for the reported metric;
# consider fitting on a log-transformed target if that is the goal.
grid = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
val_preds = best_model.predict(X_test)

# Ridge is a linear model and its output is unbounded, so a prediction can
# be negative. log1p of a value <= -1 is NaN/-inf, which makes the metric
# call fail. Clip at 0 for the log-scale metric only; val_preds itself is
# left untouched.
val_preds_nonneg = np.clip(val_preds, 0, None)
val_rmse = root_mean_squared_error(np.log1p(y_test), np.log1p(val_preds_nonneg))
print("Validation RMSE (log scale):", val_rmse)

Validation RMSE (log scale): 0.16037999121906835


#### Logging:

In [44]:
import mlflow
import dagshub

# Initialise the DagsHub integration for this repository with MLflow enabled.
dagshub.init(repo_owner='gnada22', repo_name='ml_assignment_1', mlflow=True)

with mlflow.start_run():
    # Record which model family was trained and the alpha chosen by the search.
    mlflow.log_param("model_type", "Ridge")

    best_alpha = grid.best_params_['model__alpha']
    mlflow.log_param("best_alpha", best_alpha)

    # Hold-out RMSE computed on the log1p scale.
    mlflow.log_metric("val_rmse_log", val_rmse)

    # Store the best pipeline found by the grid search under "model".
    mlflow.sklearn.log_model(grid.best_estimator_, artifact_path="model")



🏃 View run illustrious-flea-191 at: https://dagshub.com/gnada22/ml_assignment_1.mlflow/#/experiments/0/runs/421f5f0abaae41f0b20fbae9ed2aa2a1
🧪 View experiment at: https://dagshub.com/gnada22/ml_assignment_1.mlflow/#/experiments/0
