# **Experiment Notebook**



In [5]:
# Do not modify this code
!pip install -q utstd

from utstd.ipyrenders import *

In [6]:
# Do not modify this code
import warnings
warnings.simplefilter(action='ignore')

## 0. Import Packages

In [7]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import altair as alt

---
## A. Project Description


In [8]:
# Author details passed to the print_tile cells that follow.
student_name, student_id = "Fang Yee Tan", "25677648"

In [9]:
# Do not modify this code
print_tile(size="h1", key='student_name', value=student_name)

In [10]:
# Do not modify this code
print_tile(size="h1", key='student_id', value=student_id)

---
## C. Data Understanding

### C.1   Load Datasets


In [11]:
from pathlib import Path

# Folder that holds the pre-split CSV files for this experiment.
file_path = Path.home() / "Desktop" / "36120"


def _read_features(csv_name):
    """Read one CSV from ``file_path`` and return it as loaded by pandas."""
    return pd.read_csv(file_path / csv_name)


def _read_target(csv_name):
    """Read one target CSV and squeeze it (a single-column frame becomes a Series)."""
    return _read_features(csv_name).squeeze()


# Feature tables for the train / validation / test splits.
X_train = _read_features("X_train.csv")
X_val = _read_features("X_val.csv")
X_test = _read_features("X_test.csv")

# Matching targets for each split.
y_train = _read_target("y_train.csv")
y_val = _read_target("y_val.csv")
y_test = _read_target("y_test.csv")

---
## I. Selection of Performance Metrics

> Provide some explanations on why you believe the performance metrics you chose are appropriate


In [12]:
from sklearn.metrics import mean_squared_error, r2_score

In [13]:
# Justification for using RMSE and R2; passed to print_tile in the next cell.
performance_metrics_explanations = """Root Mean Squared Error (RMSE) and R squared (R²) are selected as the primary performance metrics for this project. RMSE is chosen because it shares the same units as the target variable, precipitation in millimeters, making the magnitude of errors straightforward and easy to interpret. Furthermore, RMSE penalises larger errors more heavily, which is crucial for accurately reflecting the impact of significant deviations in precipitation forecasts. R squared is included as it quantifies the proportion of variance in the observed data explained by the model, providing valuable insight into the overall goodness of fit and facilitating a comprehensive evaluation of model performance."""

In [14]:
# Do not modify this code
print_tile(size="h3", key='performance_metrics_explanations', value=performance_metrics_explanations)

## J. Train Machine Learning Model

Null Hypothesis: There is no statistically significant difference in predictive performance between XGBoost and the baseline model.

Alternative Hypothesis: There is a statistically significant difference in predictive performance between XGBoost and the baseline model.

### J.1 Import Algorithm

> Provide some explanations on why you believe this algorithm is a good fit


In [27]:
from xgboost import XGBRegressor

In [33]:
# XGBoost regressor with a fixed random_state; used as the GridSearchCV estimator in J.3.
xgb = XGBRegressor(random_state=12)

In [28]:
# Rationale for choosing XGBoost; passed to print_tile in the next cell.
algorithm_selection_explanations = """XGBoost is considered a suitable choice because it builds an ensemble of weak learners sequentially, where each new tree focuses on correcting the errors made by the previous ones. This boosting technique improves predictive accuracy across iterations, making it well-suited for complex regression tasks. In addition, XGBoost offers a wide range of hyperparameters that can be tuned to better capture nonlinear relationships in the data. It also provides feature importance scores, which can offer valuable insights into the most influential variables in the prediction process.
"""

In [29]:
# Do not modify this code
print_tile(size="h3", key='algorithm_selection_explanations', value=algorithm_selection_explanations)

### J.2 Set Hyperparameters

> Provide some explanations on why you believe these hyperparameter values are a good fit


In [101]:
# Search space handed to GridSearchCV in J.3.
# Every list holds a single value, so the grid contains exactly one candidate.
param_grid = dict(
    learning_rate=[0.005],
    n_estimators=[400],
    max_depth=[3],
    subsample=[0.6],
    colsample_bytree=[0.5],
    min_child_weight=[6],
)

In [102]:
# Explanation of each tuned hyperparameter; passed to print_tile in the next cell.
# Wording fix: the learning_rate sentence previously read "with lower values slow the
# learning process", which is ungrammatical; it is now split into its own clause.
hyperparameters_selection_explanations = """The n_estimators parameter determines how many trees are used in the ensemble, directly influencing the model’s ability to capture patterns in the data. The learning_rate controls how much each tree contributes to the final prediction; lower values slow the learning process but help prevent overfitting. The max_depth parameter sets a limit on how deep each tree can grow, helping to control model complexity and reduce the risk of learning from noise. Parameters such as subsample and colsample_bytree introduce randomness by restricting the fraction of training instances and features used for each tree, which enhances the model’s generalisation ability and mitigates overfitting. Lastly, min_child_weight specifies the minimum sum of instance weights required in a leaf node, acting as a form of regularisation that prevents the model from capturing overly specific patterns in the data.
"""

In [103]:
# Do not modify this code
print_tile(size="h3", key='hyperparameters_selection_explanations', value=hyperparameters_selection_explanations)

### J.3 Fit Model

In [104]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

In [105]:
# Cross-validation splitter: TimeSeriesSplit with five splits.
tscv = TimeSeriesSplit(n_splits=5)

# Both metrics are recorded for every candidate; sklearn's RMSE scorer is the negated variant.
scoring = dict(rmse='neg_root_mean_squared_error', r2='r2')

# Search over param_grid with the XGBoost estimator; the 'rmse' scorer decides
# which candidate is refitted as the final model.
grid_search = GridSearchCV(
    xgb,
    param_grid,
    scoring=scoring,
    refit='rmse',
    cv=tscv,
    n_jobs=-1,
)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

In [106]:
# Hyperparameter combination selected by the search.
print("Best parameters:", grid_search.best_params_)

# The refit scorer is neg_root_mean_squared_error, so negate best_score_ to report RMSE.
best_rmse = -grid_search.best_score_
# Mean cross-validated R2 of the same best candidate.
best_r2 = grid_search.cv_results_['mean_test_r2'][grid_search.best_index_]

print(f"Best RMSE: {best_rmse:.4f}")
print(f"Best R2: {best_r2:.4f}")

Best parameters: {'colsample_bytree': 0.5, 'learning_rate': 0.005, 'max_depth': 3, 'min_child_weight': 6, 'n_estimators': 400, 'subsample': 0.6}
Best RMSE: 14.4187
Best R2: 0.0916


### J.4 Model Technical Performance

> Provide some explanations on model performance


In [107]:
# Predict on validation dataset
# Predict on the validation dataset via the fitted grid search (refit='rmse' above).
y_val_pred = grid_search.predict(X_val)

In [108]:
# Validation metrics: RMSE is the square root of the mean squared error.
val_mse = mean_squared_error(y_val, y_val_pred)
rmse, r2 = np.sqrt(val_mse), r2_score(y_val, y_val_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")

RMSE: 10.98
R2: 0.02


In [109]:
# Predict on testing dataset
# Predict on the testing dataset with the same fitted grid search used for validation.
y_test_pred = grid_search.predict(X_test)

In [110]:
# Test metrics; note that rmse and r2 are rebound here, replacing the validation values.
test_mse = mean_squared_error(y_test, y_test_pred)
rmse, r2 = np.sqrt(test_mse), r2_score(y_test, y_test_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")

RMSE: 13.95
R2: 0.10


In [113]:
# Commentary on the validation and test metrics; passed to print_tile in the next cell.
model_performance_explanations = """Despite using a more advanced model like XGBoost, the results still indicate limited predictive power. The RMSE is 10.98 for the validation dataset and 13.95 for the testing dataset, while the R² scores are only 0.02 and 0.10, respectively. These metrics suggest that XGBoost, although more complex than previous models, still struggles to capture the underlying patterns in the data. 
"""

In [114]:
# Do not modify this code
print_tile(size="h3", key='model_performance_explanations', value=model_performance_explanations)

### J.5 Business Impact from Current Model Performance

> Provide some analysis on the model impacts from the business point of view


In [115]:
# Business-impact discussion of the test-set results; passed to print_tile in the next cell.
business_impacts_explanations = """Since the RMSE remains high and the R² score is still low at 0.1 on the testing dataset, the model is only able to capture around 10% of the variance in precipitation. From a business perspective, this level of accuracy is insufficient, especially for industries that are heavily weather-dependent. For example, predicting low rainfall when high rainfall actually occurs can result in under-preparation, leading to significant financial losses and serious safety risks for outdoor workers. Conversely, overestimating rainfall may lead to unnecessary precautionary measures, wasting valuable resources and increasing operational costs. Therefore, the current model's limited predictive power poses real challenges and potential consequences in real-world applications.
"""

In [116]:
# Do not modify this code
print_tile(size="h3", key='business_impacts_explanations', value=business_impacts_explanations)

## H. Project Outcomes

In [34]:
# NOTE(review): section J states both a null and an alternative hypothesis, so
# "Hypothesis Rejected" alone does not say which one is meant — confirm the intended outcome.
experiment_outcome = """Hypothesis Rejected"""

In [35]:
# Do not modify this code
print_tile(size="h2", key='experiment_outcomes_explanations', value=experiment_outcome)

In [40]:
# NOTE(review): this text discusses the linear models (Ridge, Lasso, ElasticNet) and
# proposes Gradient Boosting as a next step, but the model trained in this notebook is
# XGBoost (section J). It looks carried over from an earlier experiment — revise it so
# the conclusions describe the XGBoost results before submitting.
experiment_results_explanations = """The experiment results indicate that the current linear regression models, Ridge, Lasso and ElasticNet, are only capturing about 9% of the variance in precipitation, with relatively high RMSE values around 14 millimeters. This suggests that the relationship between the features and precipitation is likely nonlinear and more complex than what simple linear models can capture. As a result, it is clear that more advanced models such as Random Forest and Gradient Boosting Machines should be explored next, as they are better suited to model nonlinear interactions and can potentially improve predictive performance significantly.
"""

In [41]:
# Do not modify this code
print_tile(size="h2", key='experiment_results_explanations', value=experiment_results_explanations)