# Component Modelling exercise

In this exercise, you will practice key Python concepts by completing a series of code snippets. Certain parts of the code are left incomplete, marked with `# TODO: COMPLETE`. Your task is to fill in these sections correctly to make the program work as intended.

Work carefully through each step, test your code as you go, and make sure you understand why your solution works. This hands-on practice will help solidify your understanding of Python syntax, logic, and functions.

## Libraries

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

## Null imputer component

In [None]:
class NullImputer:
    """
    Transformer that fills missing values column by column with a statistic
    learned during `fit`.

    Parameters
    ----------
    strategy : {"mean", "median", "most_frequent"}, default="mean"
        Statistic computed per column in `fit` and used as the fill value.
        "mean" and "median" rely on the pandas reductions of the same name.

    Attributes
    ----------
    fill_values : dict
        Maps column name -> fill value learned by the most recent `fit`.
        Columns without a usable statistic (for example, entirely null
        columns) are omitted, so `transform` leaves them untouched.

    Methods
    -------
    fit(X, y=None):
        Learns one fill value per column of `X`. Raises a TypeError if `X`
        is not a pandas DataFrame.

    transform(X):
        Returns a copy of `X` with nulls replaced by the learned values.
        Raises a ValueError if `fit` has not been called first.

    """

    def __init__(self, strategy="mean"):
        assert strategy in ["mean", "median", "most_frequent"], \
            "Strategy must be one of: 'mean', 'median', 'most_frequent'"
        self.strategy = strategy
        self.fill_values = {}
        # Tracked separately from `fill_values`: a fit can legitimately learn
        # nothing (e.g. every column is entirely null).
        self._is_fitted = False

    def _column_statistic(self, column):
        """Return the fill statistic for one column, or None if there is none."""
        if self.strategy == "mean":
            return column.mean()
        if self.strategy == "median":
            return column.median()
        # most_frequent: mode() may return several values; keep the first one.
        modes = column.mode().dropna()
        return modes.iloc[0] if not modes.empty else None

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("NullImputer expects a pandas DataFrame")

        # Build a fresh mapping so a refit never keeps columns from an older fit.
        learned = {}
        for col in X.columns:
            value = self._column_statistic(X[col])
            # A None/NaN fill value is useless and None makes fillna raise,
            # so such columns are simply not imputed.
            if value is not None and not pd.isna(value):
                learned[col] = value

        self.fill_values = learned
        self._is_fitted = True
        return self

    def transform(self, X):
        if not self._is_fitted:
            raise ValueError("You must call 'fit' before 'transform'")

        if not self.fill_values:
            # Nothing to impute; still return a new object like fillna does.
            return X.copy()
        return X.fillna(self.fill_values)

## Drop null component

In [None]:
class DropNull:
    """
    Transformer that removes columns with a proportion of missing values above a given threshold.

    Parameters
    ----------
    threshold : float, default=0.2
        Maximum allowed proportion of missing values in a column.
        Columns exceeding this proportion will be dropped during transformation.
        Must lie in the interval [0, 1]; otherwise a ValueError is raised.

    Attributes
    ----------
    col_drop : list or None
        List of column names to be removed after calling `fit`.
        It is None until `fit` has been called.

    Methods
    -------
    fit(X, y=None):
        Identifies columns that exceed the missing value threshold and stores them in `col_drop`.
        Raises a TypeError if `X` is not a pandas DataFrame.

    transform(X):
        Drops columns listed in `col_drop` from `X`.
        Raises a ValueError if `fit` has not been called first.

    """

    def __init__(self, threshold=0.2):
        if not 0 <= threshold <= 1:
            raise ValueError("threshold must be a proportion between 0 and 1")
        self.threshold = threshold
        # None means "not fitted yet"; an empty list means "nothing to drop".
        self.col_drop = None

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("DropNull expects a pandas DataFrame")

        # The mean of the boolean null mask is the null proportion per column.
        null_ratio = X.isna().mean()
        # Strictly greater: a column exactly at the threshold is kept.
        self.col_drop = null_ratio[null_ratio > self.threshold].index.tolist()
        return self

    def transform(self, X):
        if self.col_drop is None:
            raise ValueError("You must call 'fit' before 'transform'")

        return X.drop(columns=self.col_drop)

## Scaler component

In [None]:
class Scaler:
    """
    Transformer that rescales every column of a DataFrame as
    ``(X - center) / spread``, with both statistics learned in `fit`.

    Parameters
    ----------
    method : {"standard", "minmax", "robust"}, default="standard"
        - "standard": center = mean, spread = standard deviation.
        - "minmax":   center = minimum, spread = maximum - minimum.
        - "robust":   center = median, spread = interquartile range (Q3 - Q1).

    Attributes
    ----------
    params : dict
        Per-column statistics (pandas Series) learned by the last `fit`:
        "mean"/"std", "min"/"max"/"range" or "median"/"iqr" depending on
        `method`. Empty until `fit` has been called.

    Methods
    -------
    fit(X, y=None):
        Computes the statistics for the chosen method.
        Raises a TypeError if `X` is not a pandas DataFrame.

    transform(X):
        Returns the rescaled data.
        Raises a ValueError if `fit` has not been called first.

    """

    # method -> (key of the centering statistic, key of the spread statistic)
    _PARAM_KEYS = {
        "standard": ("mean", "std"),
        "minmax": ("min", "range"),
        "robust": ("median", "iqr"),
    }

    def __init__(self, method="standard"):
        assert method in ["standard", "minmax", "robust"], \
            "method must be one of: 'standard', 'minmax', 'robust'"
        self.method = method
        self.params = {}

    @staticmethod
    def _safe_spread(spread):
        """Return a copy of `spread` with zeros replaced by 1 (avoid zero division)."""
        spread = spread.copy()
        spread[spread == 0] = 1
        return spread

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            raise TypeError("Scaler expects a pandas DataFrame")

        # Build the statistics in a fresh dict so a refit starts clean.
        params = {}

        if self.method == "standard":
            params["mean"] = X.mean()
            params["std"] = self._safe_spread(X.std())

        elif self.method == "minmax":
            col_min = X.min()
            col_max = X.max()
            params["min"] = col_min
            params["max"] = col_max
            # Constant columns have a zero range; they are mapped to 0.
            params["range"] = self._safe_spread(col_max - col_min)

        elif self.method == "robust":
            q1 = X.quantile(0.25)
            q3 = X.quantile(0.75)
            params["median"] = X.median()
            params["iqr"] = self._safe_spread(q3 - q1)

        self.params = params
        return self

    def transform(self, X):
        if not self.params:
            raise ValueError("You must call 'fit' before 'transform'")

        center_key, spread_key = self._PARAM_KEYS[self.method]
        return (X - self.params[center_key]) / self.params[spread_key]


## Predictor model and Pipeline implementation

In [None]:
class EstimatorStep:
    """Thin wrapper exposing an estimator's `fit` / `predict` as a pipeline step."""

    def __init__(self, estimator):
        # The wrapped model stays reachable through `.estimator`.
        self.estimator = estimator

    def fit(self, X, y):
        """Fit the wrapped estimator on (X, y) and return this step."""
        model = self.estimator
        model.fit(X, y)
        return self

    # Note: there is no `transform` method; the pipeline calls `.predict` instead
    def predict(self, X):
        """Return whatever the wrapped estimator predicts for X."""
        predictions = self.estimator.predict(X)
        return predictions
    

In [None]:
class Pipeline:
    """
    Chain of named steps: preprocessing transformers followed by a final step.

    Parameters
    ----------
    steps : iterable of (name, step) pairs
        Every step must provide `fit`. Steps that also provide `transform`
        act as preprocessing; the final step is typically a model with
        `predict`.

    Attributes
    ----------
    steps : list
        The (name, step) pairs in execution order.
    named_steps : dict
        Maps step name -> step object.

    Methods
    -------
    fit(X, y=None):
        Fits each step in order, feeding transformed data to the next step.

    transform(X):
        Applies every step that has a `transform` method, including the
        final one when it is a transformer.

    predict(X):
        Applies the preprocessing steps, then calls `predict` on the final step.

    """

    def __init__(self, steps):
        # Copy into a list: an iterator would otherwise be exhausted by the
        # dict() below, and later edits to the caller's list would leave
        # `steps` and `named_steps` out of sync.
        self.steps = list(steps)
        self.named_steps = dict(self.steps)

    def fit(self, X, y=None):
        last_index = len(self.steps) - 1
        for i, (_, step) in enumerate(self.steps):
            step.fit(X, y)
            # Only intermediate transformers feed their output forward; the
            # final step (usually the model) is just fitted.
            if i != last_index and hasattr(step, 'transform'):
                X = step.transform(X)
        return self

    def _preprocess(self, X):
        """Run X through every transformer that precedes the final step."""
        for _, step in self.steps[:-1]:
            if hasattr(step, 'transform'):
                X = step.transform(X)
        return X

    def transform(self, X):
        X = self._preprocess(X)
        # A final step that is itself a transformer must be applied too;
        # a model without `transform` is skipped.
        if self.steps:
            final_step = self.steps[-1][1]
            if hasattr(final_step, 'transform'):
                X = final_step.transform(X)
        return X

    def predict(self, X):
        X_proc = self._preprocess(X)
        last_step = self.steps[-1][1]
        return last_step.predict(X_proc)

## Main code

In [None]:
# ---- Config: choose model ----
MODEL = "rf"  # "rf" (RandomForest) or "dt" (DecisionTree)

# Reject unknown identifiers first, then build the matching regressor.
if MODEL not in ("rf", "dt"):
    raise ValueError("MODEL should be 'rf' or 'dt'")

estimator = (
    RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
    if MODEL == "rf"
    else DecisionTreeRegressor(random_state=42)
)

# Seed NumPy's global random generator so later np.random draws are repeatable.
np.random.seed(42)

1. ----------- Load data -----------

In [None]:
# 1. ----------- Load data -----------
# Fetch the California housing dataset as pandas objects and work on the
# combined frame, which also holds the "MedHouseVal" column used later as target.
housing = fetch_california_housing(as_frame=True)
data = housing.frame

# 1.1. ----------- Making dataset realistic -----------
# Defects are injected on purpose so each preprocessing component has work to do.
n_rows = data.shape[0]

# Insert ~10% of NaNs in 'AveRooms' and 'HouseAge'
# Each column gets its own independent draw of row positions (no repeats).
# NOTE(review): the drawn positions are used as labels in .loc, which assumes
# the frame has the default 0..n_rows-1 integer index -- confirm.
for col in ["AveRooms", "HouseAge"]:
    missing_indices = np.random.choice(n_rows, size=int(0.1 * n_rows), replace=False)
    data.loc[missing_indices, col] = np.nan

# Constant column
# Every row holds the same value, so the column has zero spread.
data["constant_col"] = 42

# Almost all NaNs column
# Start fully null, then set only the rows labelled 10 through 20
# (label slicing with .loc includes both endpoints).
data["mostly_null"] = np.nan
data.loc[10:20, "mostly_null"] = 1

# Quick inspection: first rows, column dtypes and summary statistics.
print('\n', data.head(), '\n')
print(data.dtypes, '\n')
print(data.describe())

In [None]:
# 2. ----------- Split features and target -----------
# "MedHouseVal" is the value to predict; all remaining columns are features.
y = data["MedHouseVal"]
X = data.drop(columns="MedHouseVal")

In [None]:
# 3. ----------- Train/Test Split (before any preprocessing!) -----------
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

print("\nInstances of train:", len(X_train))
print("Instances of test:", len(X_test))

In [None]:
# 4. ----------- Pipeline -----------
# Steps run in the listed order; the last one wraps the chosen estimator.
pipeline = Pipeline(
    steps=[
        ("dropnull", DropNull(threshold=0.2)),
        ("imputer", NullImputer(strategy="median")),
        ("scaler", Scaler(method="standard")),
        ("model", EstimatorStep(estimator)),
    ]
)

In [None]:
# 5. ----------- pipeline fit (preproc + model) -----------
pipeline.fit(X_train, y_train)

# Check drop null: report which columns the DropNull step decided to remove
dropnull_component = pipeline.named_steps["dropnull"]
if hasattr(dropnull_component, "col_drop"):
    print("\nDropped columns by DropNull:", dropnull_component.col_drop)

# The fitted statistics of any step can be inspected the same way
scaler_component = pipeline.named_steps["scaler"]
print("\nCalculated mean in scaler fit:", scaler_component.params["mean"], sep="\n")
print("\nCalculated STD in scaler fit:", scaler_component.params["std"], sep="\n")

In [None]:
# 6. ----------- Predictions and metrics using pipeline -----------
# Score both splits so train and test errors can be compared side by side.
y_pred_train = pipeline.predict(X_train)
y_pred_test = pipeline.predict(X_test)

rmse_train = root_mean_squared_error(y_train, y_pred_train)
rmse_test = root_mean_squared_error(y_test, y_pred_test)

print("\n[Model trained: {}]".format(MODEL))
print("R2 Train: {:.3f} | RMSE Train: {:.3f}".format(r2_score(y_train, y_pred_train), rmse_train))
print("R2 Test : {:.3f} | RMSE Test : {:.3f}".format(r2_score(y_test, y_pred_test), rmse_test))