# **Experiment Notebook**



In [1]:
# Do not modify this code
!pip install -q utstd

from utstd.ipyrenders import *

In [2]:
# Do not modify this code
import warnings
warnings.simplefilter(action='ignore')

## 0. Import Packages

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import altair as alt

---
## A. Project Description


In [4]:
student_name = "Fang Yee Tan"
student_id = "25677648"

In [5]:
# Do not modify this code
print_tile(size="h1", key='student_name', value=student_name)

In [6]:
# Do not modify this code
print_tile(size="h1", key='student_id', value=student_id)

---
## C. Data Understanding

### C.1   Load Datasets


In [7]:
import os
from pathlib import Path

# Directory holding the pre-split CSV files. It defaults to the original
# ~/Desktop/36120 location; set the DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
file_path = Path(os.environ.get("DATA_DIR", Path.home() / "Desktop" / "36120"))

# Fail early with an actionable message instead of a bare read_csv error.
if not file_path.is_dir():
    raise FileNotFoundError(
        f"Data directory not found: {file_path}. "
        "Set the DATA_DIR environment variable to the folder containing the CSV files."
    )

# Feature matrices for the training, validation and test splits.
X_train = pd.read_csv(file_path / "X_train.csv")
X_val = pd.read_csv(file_path / "X_val.csv")
X_test = pd.read_csv(file_path / "X_test.csv")

# Targets: squeeze only the column axis so a single-column file becomes a
# Series, while a one-row file is not collapsed to a scalar.
y_train = pd.read_csv(file_path / "y_train.csv").squeeze("columns")
y_val = pd.read_csv(file_path / "y_val.csv").squeeze("columns")
y_test = pd.read_csv(file_path / "y_test.csv").squeeze("columns")

---
## I. Selection of Performance Metrics

> Provide some explanations on why you believe the performance metrics you chose are appropriate


In [8]:
from sklearn.metrics import mean_squared_error, r2_score

In [9]:
performance_metrics_explanations = """Root Mean Squared Error (RMSE) and R squared (R²) are selected as the primary performance metrics for this project. RMSE is chosen because it shares the same units as the target variable, precipitation in millimeters, making the magnitude of errors straightforward and easy to interpret. Furthermore, RMSE penalises larger errors more heavily, which is crucial for accurately reflecting the impact of significant deviations in precipitation forecasts. R squared is included as it quantifies the proportion of variance in the observed data explained by the model, providing valuable insight into the overall goodness of fit and facilitating a comprehensive evaluation of model performance."""

In [10]:
# Do not modify this code
print_tile(size="h3", key='performance_metrics_explanations', value=performance_metrics_explanations)

## J. Train Machine Learning Model

Null Hypothesis: There is no statistically significant difference in predictive performance between the regularised linear regression models (Ridge, Lasso and ElasticNet) and the baseline model.

Alternative Hypothesis: There is a statistically significant difference in predictive performance between the regularised linear regression models (Ridge, Lasso and ElasticNet) and the baseline model.

### J.1 Import Algorithm

> Provide some explanations on why you believe this algorithm is a good fit


In [11]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [12]:
algorithm_selection_explanations = """Ridge, Lasso and ElasticNet regression models are employed in this project as they are relatively simple yet effective approaches for addressing multicollinearity and high dimensionality. Ridge regression utilises L2 regularisation to shrink coefficient values without eliminating predictors, thereby reducing overfitting and improving model stability when input features are highly correlated. Lasso regression incorporates L1 regularisation, which not only shrinks coefficients but also performs variable selection by setting some coefficients exactly to zero, enhancing interpretability and reducing the influence of less relevant variables. ElasticNet combines both L1 and L2 regularisation, making it particularly effective in scenarios involving numerous and correlated predictors, as it leverages the strengths of both Ridge and Lasso. Collectively, these models offer a strong foundation for developing precipitation prediction models that are interpretable, generalisable, and robust."""

In [13]:
# Do not modify this code
print_tile(size="h3", key='algorithm_selection_explanations', value=algorithm_selection_explanations)

### J.2 Set Hyperparameters

> Provide some explanations on why you believe these hyperparameters are the right ones to tune


In [14]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, mean_squared_error

In [23]:
# Time series cross-validation with 5 splits.
# NOTE(review): this assumes the rows of X_train are in chronological order — confirm.
tscv = TimeSeriesSplit(n_splits=5)

# Candidate estimators and the grid searched for each one.
# Grid keys carry the `model__` prefix because every estimator is later
# wrapped in a Pipeline whose single step is named 'model'.
models = {}

# Ridge: L2 penalty strength only.
models['Ridge'] = dict(
    model=Ridge(),
    params={'model__alpha': [0.01, 0.1, 1, 10, 100, 1000]},
)

# Lasso: L1 penalty strength only; max_iter is raised to 100000.
models['Lasso'] = dict(
    model=Lasso(max_iter=100000),
    params={'model__alpha': [0.001, 0.01, 0.1, 1, 10, 100]},
)

# ElasticNet: penalty strength plus the L1/L2 mixing ratio.
models['ElasticNet'] = dict(
    model=ElasticNet(max_iter=100000),
    params={
        'model__alpha': [0.001, 0.01, 0.1, 1, 10],
        'model__l1_ratio': [0.1, 0.5, 0.9, 1],
    },
)

In [28]:
hyperparameters_selection_explanations = """The alpha parameter is tuned to control the strength of regularisation across all three models. Regularisation helps prevent overfitting by penalising large coefficient values, thereby improving the model’s ability to generalise to unseen data. In the case of ElasticNet, an additional hyperparameter, l1_ratio, determines the balance between L1 and L2 regularisation. When l1_ratio is close to 0, ElasticNet behaves more like Ridge regression by applying primarily L2 regularisation. Conversely, when l1_ratio approaches 1, it acts more like Lasso regression, placing greater emphasis on L1 regularisation. Tuning these parameters enables the model to optimise its regularisation strategy, ultimately improving predictive performance.
"""

In [29]:
# Do not modify this code
print_tile(size="h3", key='hyperparameters_selection_explanations', value=hyperparameters_selection_explanations)

### J.3 Fit Model

In [27]:
# Metrics recorded for every candidate during the search, given as
# scikit-learn scorer names.
scoring = {'rmse': 'neg_root_mean_squared_error', 'r2': 'r2'}

# Best refitted pipeline per algorithm, keyed by the model's display name.
best_models = {}

# NOTE(review): the pipeline below contains only the estimator. StandardScaler
# is imported in this notebook but not applied here — presumably the CSV
# features were scaled upstream; confirm before relying on the penalties.
for name, spec in models.items():
    print(f"\nGridSearchCV for {name}")

    # Single-step pipeline so the `model__*` grid keys resolve to the estimator.
    estimator = Pipeline(steps=[('model', spec['model'])])

    grid = GridSearchCV(
        estimator,
        spec['params'],
        scoring=scoring,
        refit='rmse',
        cv=tscv,
        n_jobs=-1,
        return_train_score=True,
    )
    grid.fit(X_train, y_train)

    # The refit metric is the negated RMSE scorer, so flip the sign of the best score.
    best_rmse = -grid.best_score_
    # Mean cross-validated R2 of the same winning candidate.
    best_r2 = grid.cv_results_['mean_test_r2'][grid.best_index_]

    print(
        f"Best params: {grid.best_params_}",
        f"Best RMSE: {best_rmse:.4f}",
        f"Best R2: {best_r2:.4f}",
        sep="\n",
    )

    best_models[name] = grid.best_estimator_


GridSearchCV for Ridge
Best params: {'model__alpha': 100}
Best RMSE: 14.3880
Best R2: 0.0936

GridSearchCV for Lasso
Best params: {'model__alpha': 0.01}
Best RMSE: 14.3940
Best R2: 0.0914

GridSearchCV for ElasticNet
Best params: {'model__alpha': 0.01, 'model__l1_ratio': 0.9}
Best RMSE: 14.3937
Best R2: 0.0915


### J.4 Model Technical Performance

> Provide some explanations on model performance


In [43]:
# Validation dataset
# Report RMSE and R2 of each tuned model on the validation split.
for name, model in best_models.items():
    print(f"\nPredicting with best {name} model:")

    # Predict on validation set
    y_val_pred = model.predict(X_val)

    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
    # square-root form runs on both old and new releases.
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
    val_r2 = r2_score(y_val, y_val_pred)

    print(f"{name} Validation RMSE: {val_rmse:.4f}")
    print(f"{name} Validation R2: {val_r2:.4f}")


Predicting with best Ridge model:
Ridge Validation RMSE: 11.0580
Ridge Validation R2: 0.0019

Predicting with best Lasso model:
Lasso Validation RMSE: 11.1443
Lasso Validation R2: -0.0137

Predicting with best ElasticNet model:
ElasticNet Validation RMSE: 11.1381
ElasticNet Validation R2: -0.0126


In [45]:
# Testing dataset
# Report RMSE and R2 of each tuned model on the held-out test split.
for name, model in best_models.items():
    print(f"\nPredicting with best {name} model:")

    # Predict on test set
    y_test_pred = model.predict(X_test)

    # RMSE is taken as sqrt(MSE) instead of passing `squared=False`: that
    # keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so the
    # square-root form runs on both old and new releases.
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)

    print(f"{name} Testing RMSE: {test_rmse:.4f}")
    print(f"{name} Testing R2: {test_r2:.4f}")


Predicting with best Ridge model:
Ridge Testing RMSE: 13.9550
Ridge Testing R2: 0.1032

Predicting with best Lasso model:
Lasso Testing RMSE: 14.0800
Lasso Testing R2: 0.0870

Predicting with best ElasticNet model:
ElasticNet Testing RMSE: 14.0684
ElasticNet Testing R2: 0.0885


In [49]:
# Narrative summary of the validation/testing metrics printed by the two
# evaluation cells above; the quoted figures must match those outputs.
model_performance_explanations = """From the performance metrics on both the validation and testing datasets, it is evident that the models struggle to effectively learn the underlying patterns. The predictive performance of Ridge, Lasso and ElasticNet regression models is poor, with R² scores of roughly 0.09 to 0.10 on the testing dataset and close to zero (slightly negative for Lasso and ElasticNet) on the validation dataset, indicating that each model explains at most about 10% of the variance in cumulative precipitation over the next three days. This limited performance suggests that these linear models are unable to capture the full complexity of precipitation patterns. For ElasticNet, the optimal l1_ratio of 0.9 reflects a strong preference for L1 regularisation, which encourages sparsity by shrinking some coefficients to zero. However, this also highlights that these models may be too simplistic for the dataset, struggling to capture nonlinear relationships and complex interactions likely present in precipitation data.
"""

In [50]:
# Do not modify this code
print_tile(size="h3", key='model_performance_explanations', value=model_performance_explanations)

### J.5 Business Impact from Current Model Performance

> Provide some analysis on the model impacts from the business point of view


In [38]:
business_impacts_explanations = """Since the three models explain only about 9% of the variance in precipitation and have an RMSE of approximately 14 millimeters, their predictive accuracy is limited. From a business perspective, such errors could have significant consequences. Inaccurate rainfall forecasts may lead to poor decision-making in industries that depend heavily on weather conditions. Overestimating rainfall might result in unnecessary resource allocation, leading to increased operational costs. Conversely, underestimating precipitation could leave businesses unprepared for heavy rain, causing operational disruptions, financial losses, and, more importantly, potential safety risks for personnel and the public.
"""

In [39]:
# Do not modify this code
print_tile(size="h3", key='business_impacts_explanations', value=business_impacts_explanations)

## H. Project Outcomes

In [34]:
experiment_outcome = """Hypothesis Rejected"""

In [35]:
# Do not modify this code
print_tile(size="h2", key='experiment_outcomes_explanations', value=experiment_outcome)

In [40]:
experiment_results_explanations = """The experiment results indicate that the current linear regression models, Ridge, Lasso and ElasticNet, are only capturing about 9% of the variance in precipitation, with relatively high RMSE values around 14 millimeters. This suggests that the relationship between the features and precipitation is likely nonlinear and more complex than what simple linear models can capture. As a result, it is clear that more advanced models such as Random Forest and Gradient Boosting Machines should be explored next, as they are better suited to model nonlinear interactions and can potentially improve predictive performance significantly.
"""

In [41]:
# Do not modify this code
print_tile(size="h2", key='experiment_results_explanations', value=experiment_results_explanations)