In [70]:
import pandas as pd
import numpy as np

# Read the training CSV, discard the "Id" column, and keep only the rows
# that have a value in "SalePrice" (the column used as the target below).
train_df = (
    pd.read_csv("data/train.csv")
    .drop(columns=["Id"])
    .dropna(subset=["SalePrice"])
)

# Separate the target series from the remaining feature columns.
y = train_df["SalePrice"]
X = train_df.drop(columns=["SalePrice"])

#### Cleaning:

In [71]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

class DropHighNaNColumns(BaseEstimator, TransformerMixin):
    """Transformer that drops columns with too many missing values.

    During ``fit`` the share of missing entries is computed per column;
    columns whose share is less than or equal to ``threshold`` are kept,
    all others are dropped in ``transform``.

    Parameters
    ----------
    threshold : float, default=0.3
        Maximum tolerated fraction of missing values for a column to be kept.

    Attributes
    ----------
    columns_to_keep_ : pandas.Index
        Labels of the columns retained. Only available after ``fit``.

    Notes
    -----
    Column selection relies on label-based indexing (``X[labels]``), so the
    input is expected to be a pandas DataFrame.
    """

    def __init__(self, threshold=0.3):
        # Only hyper-parameters are stored here. Fitted state (trailing
        # underscore) must not exist before ``fit``; otherwise scikit-learn's
        # ``check_is_fitted`` would treat an unfitted instance as fitted.
        self.threshold = threshold

    def fit(self, X, y=None):
        """Learn which columns of ``X`` to keep.

        Parameters
        ----------
        X : pandas.DataFrame
            Training data used to measure the per-column missing ratio.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        self : DropHighNaNColumns
            The fitted transformer.
        """
        nan_ratio = pd.isnull(X).mean()
        self.columns_to_keep_ = nan_ratio[nan_ratio <= self.threshold].index
        return self

    def transform(self, X):
        """Return ``X`` restricted to the columns selected during ``fit``.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If called before ``fit``.
        """
        # Local import keeps this cell self-contained.
        from sklearn.utils.validation import check_is_fitted

        # Fail with a clear NotFittedError instead of an opaque KeyError.
        check_is_fitted(self, "columns_to_keep_")
        return X[self.columns_to_keep_]

#### Feature Engineering:

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Numeric branch: fill gaps with the column median, then standardise.
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode; categories not seen during fit are ignored at transform time.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Columns are routed to a branch by dtype at fit time.
numeric_columns = make_column_selector(dtype_include=['int64', 'float64'])
categorical_columns = make_column_selector(dtype_include=['object'])

preprocessor = ColumnTransformer(transformers=[
    ('num', num_pipe, numeric_columns),
    ('cat', cat_pipe, categorical_columns),
])

#### Training:

In [78]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import Ridge
from sklearn.metrics import root_mean_squared_error

# End-to-end pipeline: drop sparse columns -> impute/scale/encode -> Ridge.
model_pipeline = Pipeline([
    ('clean', DropHighNaNColumns()),
    ('feature_engineering', preprocessor),
    ('model', Ridge())
])

# Hyper-parameters searched jointly: the missing-value threshold of the
# cleaning step and the regularisation strength of the Ridge model.
param_grid = {
    'clean__threshold': [0.1, 0.2, 0.25, 0.3, 0.5, 0.8],
    'model__alpha': [0.01, 0.1, 1.0, 10.0, 100.0]
}

# Hold out 20% of the data; the grid search only sees the training part.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# NOTE(review): the search is scored with RMSE on the raw target scale,
# while the metric reported below is RMSE on the log scale. The selected
# hyper-parameters are therefore not necessarily optimal for the reported
# metric -- confirm whether that is intended.
grid = GridSearchCV(model_pipeline, param_grid, cv=5, scoring='neg_root_mean_squared_error')
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
val_preds = best_model.predict(X_test)

# Ridge output is unconstrained, so a prediction can be negative. log1p of a
# value <= -1 is NaN/-inf, which makes the metric raise. Clip at 0 before the
# log transform; this is a no-op when every prediction is already positive.
val_rmse = root_mean_squared_error(
    np.log1p(y_test),
    np.log1p(np.clip(val_preds, 0, None)),
)
print("Validation RMSE (log scale):", val_rmse)

Validation RMSE (log scale): 0.1572517998903266


#### Logging:

In [79]:
import mlflow
import dagshub

# Point MLflow at the DagsHub-hosted tracking server for this repository.
dagshub.init(repo_owner='gnada22', repo_name='ml_assignment_1', mlflow=True)

# Winning hyper-parameter combination found by the grid search.
best_params = grid.best_params_

with mlflow.start_run():
    # Parameters describing the selected model.
    mlflow.log_param("model_type", "Ridge")
    mlflow.log_param("best_alpha", best_params['model__alpha'])
    mlflow.log_param("best_nan_threshold", best_params['clean__threshold'])

    # Hold-out score computed in the training cell.
    mlflow.log_metric("val_rmse_log", val_rmse)

    # Store the refitted best pipeline as a run artifact.
    mlflow.sklearn.log_model(grid.best_estimator_, artifact_path="model")



🏃 View run handsome-fowl-738 at: https://dagshub.com/gnada22/ml_assignment_1.mlflow/#/experiments/0/runs/1440420fb44f4f8aaa4fe6831509ba82
🧪 View experiment at: https://dagshub.com/gnada22/ml_assignment_1.mlflow/#/experiments/0
