# **Experiment Notebook**



In [81]:
# Do not modify this code
!pip install -q utstd

from utstd.ipyrenders import *

In [82]:
# Do not modify this code
import warnings
warnings.simplefilter(action='ignore')

## 0. Import Packages

In [83]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import altair as alt

---
## A. Project Description


In [84]:
# Student identity; both values are rendered by the print_tile cells that follow.
student_name = "Fang Yee Tan"
student_id = "25677648"

In [85]:
# Do not modify this code
print_tile(size="h1", key='student_name', value=student_name)

In [86]:
# Do not modify this code
print_tile(size="h1", key='student_id', value=student_id)

---
## C. Data Understanding

### C.1   Load Datasets


In [87]:
import os
from pathlib import Path

# Directory holding the pre-split CSV files. The default is the original
# location under the user's Desktop; set ASSIGNMENT2_DATA_DIR to point the
# notebook at another folder without editing this cell.
file_path = Path(
    os.environ.get(
        "ASSIGNMENT2_DATA_DIR",
        Path.home() / "Desktop" / "36120" / "Assignment2",
    )
)

# Feature matrices stay as DataFrames.
X_train = pd.read_csv(file_path / "X_train.csv")
X_val = pd.read_csv(file_path / "X_val.csv")
X_test = pd.read_csv(file_path / "X_test.csv")

# Target files are expected to hold a single column. squeeze("columns")
# collapses that column to a Series and, unlike a bare squeeze(), never
# degrades to a scalar when a file happens to contain only one row.
y_train = pd.read_csv(file_path / "y_train.csv").squeeze("columns")
y_val = pd.read_csv(file_path / "y_val.csv").squeeze("columns")
y_test = pd.read_csv(file_path / "y_test.csv").squeeze("columns")

---
## I. Selection of Performance Metrics

> Provide some explanations on why you believe the performance metrics you chose are appropriate


In [88]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, classification_report

In [89]:
# Critical Success Index (CSI)
def critical_success_index(y_true, y_pred, pos_label=1):
    """Return the Critical Success Index (threat score) of a binary forecast.

    CSI = TP / (TP + FP + FN): the share of correctly predicted events among
    all cases that were either predicted or observed as an event. Correct
    negatives do not enter the score.

    The counts are taken directly from the label arrays rather than by
    unpacking a confusion matrix, so the function also works when only one
    class is present in the inputs (where a 2x2 matrix is not produced).

    Parameters
    ----------
    y_true : array-like
        Observed labels.
    y_pred : array-like
        Predicted labels, same length as ``y_true``.
    pos_label : scalar, default 1
        Label treated as the event (positive) class. The default matches
        0/1-encoded targets.

    Returns
    -------
    float
        CSI in [0, 1]. Returns 0.0 when there are no observed and no
        predicted events (TP + FP + FN == 0), where the ratio is undefined.

    Raises
    ------
    ValueError
        If ``y_true`` and ``y_pred`` do not have the same number of elements.
    """
    observed = np.asarray(y_true).ravel()
    forecast = np.asarray(y_pred).ravel()
    if observed.shape != forecast.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {observed.size} and {forecast.size}"
        )

    is_event = observed == pos_label
    said_event = forecast == pos_label

    tp = int(np.count_nonzero(is_event & said_event))    # hits
    fp = int(np.count_nonzero(~is_event & said_event))   # false alarms
    fn = int(np.count_nonzero(is_event & ~said_event))   # misses

    denominator = tp + fp + fn
    if denominator == 0:
        # No events observed or forecast: avoid 0/0 and report no skill.
        return 0.0
    return tp / denominator

In [90]:
performance_metrics_explanations = """Accuracy measures how well the model correctly predicts both rain and no-rain cases, providing an overall view of performance. However, due to the dataset’s imbalance, precision, recall, and F1-score offer more meaningful insights. In this project, the F1-score is the primary metric because it balances precision and recall, which is essential when both false positives and false negatives have significant impacts. The confusion matrix further breaks down prediction results into true positives, true negatives, false positives, and false negatives, giving a clearer picture of the model’s strengths and weaknesses. Additionally, the Critical Success Index (CSI) is introduced to assess forecast skill specifically in meteorological contexts. Together, these metrics provide a comprehensive evaluation of model effectiveness."""

In [91]:
# Do not modify this code
print_tile(size="h3", key='performance_metrics_explanations', value=performance_metrics_explanations)

## J. Train Machine Learning Model

Null hypothesis: The performance of the Random Forest model is not significantly better than that of the baseline Logistic Regression model.

Alternative hypothesis: The Random Forest model demonstrates statistically significant improvement in performance compared to the baseline Logistic Regression model.

### J.1 Import Algorithm

> Provide some explanations on why you believe this algorithm is a good fit


In [92]:
from sklearn.ensemble import RandomForestClassifier

In [93]:
algorithm_selection_explanations = """Random Forest is selected for this task because of its capability to model complex and non-linear relationships between the input features and the target variable, which logistic regression may not capture effectively. It functions by constructing an ensemble of decision trees, each trained on a random subset of the data, and then aggregates their predictions to improve overall accuracy and reduce model variance. This ensemble approach contributes to increased robustness and improved generalisation to unseen data, helping to minimise overfitting. In addition, Random Forest provides interpretable feature importance scores, which offer valuable insights into the relative contribution of each variable in predicting rainfall."""

In [94]:
# Do not modify this code
print_tile(size="h3", key='algorithm_selection_explanations', value=algorithm_selection_explanations)

### J.2 Set Hyperparameters

> Provide some explanations on why you believe these hyperparameter values are a good fit


In [95]:
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [96]:
# Hyperparameter grid handed to GridSearchCV. Every entry lists a single
# candidate value, so the search evaluates exactly one configuration.
param_grid = dict(
    class_weight=['balanced'],
    n_estimators=[200],
    max_depth=[2],
    min_samples_split=[2],
    min_samples_leaf=[4],
    max_features=['log2'],
)

In [97]:
# Five-split time-series cross-validation.
# NOTE(review): this presumes the rows of X_train are in chronological order — confirm.
tscv = TimeSeriesSplit(n_splits=5)

# Build the F1-scored search over param_grid on all available cores and fit
# it on the training data in one expression (fit() returns the search itself).
grid_search = GridSearchCV(
    RandomForestClassifier(random_state=34),
    param_grid,
    scoring='f1',
    cv=tscv,
    n_jobs=-1,
).fit(X_train, y_train)

# Report the selected configuration and its mean cross-validated F1.
print(
    f"\nBest parameters: {grid_search.best_params_}",
    f"Best cross-validation F1 score: {grid_search.best_score_:.4f}",
    sep="\n",
)


Best parameters: {'class_weight': 'balanced', 'max_depth': 2, 'max_features': 'log2', 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 200}
Best cross-validation F1 score: 0.4227


In [98]:
hyperparameters_selection_explanations = """Class weight is calibrated to address class imbalance by assigning greater importance to minority classes, thereby enhancing the model’s ability to detect infrequent events. The number of estimators defines the size of the ensemble, striking a balance between predictive performance and computational efficiency. Maximum depth constrains the complexity of individual trees, serving as a regularisation mechanism to mitigate overfitting. The minimum samples required for node splitting and leaf formation regulate tree growth, promoting model smoothness and improving generalisation to unseen data. Lastly, the maximum features parameter introduces randomness in feature selection at each split, reducing correlation between trees and increasing the robustness of the ensemble."""

In [99]:
# Do not modify this code
print_tile(size="h3", key='hyperparameters_selection_explanations', value=hyperparameters_selection_explanations)

### J.3 Fit Model

In [113]:
# Model used for all evaluation below. The search was created without overriding
# `refit`, so with scikit-learn's default this should be the best configuration
# refit on the whole of X_train — confirm against the installed version.
best_rf = grid_search.best_estimator_

### J.4 Model Technical Performance

> Provide some explanations on model performance


In [114]:
# Predictions on validation dataset
# (class labels from the tuned forest; X_val was not passed to grid_search.fit)
y_val_pred = best_rf.predict(X_val)

In [115]:
# Evaluate the validation predictions with every metric, in reporting order.
val_scorers = (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    critical_success_index,
)
accuracy, precision, recall, f1, csi_score = (
    scorer(y_val, y_val_pred) for scorer in val_scorers
)

# One "<label>: <value>" line per metric, rounded to two decimals.
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
    ("Critical Success Index (CSI)", csi_score),
):
    print(f"{label}: {value:.2f}")

# Confusion Matrix
print("\nConfusion Matrix:", confusion_matrix(y_val, y_val_pred), sep="\n")

Accuracy: 0.62
Precision: 0.44
Recall: 0.59
F1 Score: 0.51
Critical Success Index (CSI): 0.34

Confusion Matrix:
[[158  89]
 [ 48  70]]


In [116]:
# Predictions on testing dataset
# (same fitted model as for validation; nothing is refit here)
y_test_pred = best_rf.predict(X_test)

In [117]:
# Test-set scores keyed by the label used in the printed report.
test_scores = {
    "Accuracy": accuracy_score(y_test, y_test_pred),
    "Precision": precision_score(y_test, y_test_pred),
    "Recall": recall_score(y_test, y_test_pred),
    "F1 Score": f1_score(y_test, y_test_pred),
    "Critical Success Index (CSI)": critical_success_index(y_test, y_test_pred),
}

# Keep the individual metric variables bound (they now hold test-set values).
accuracy, precision, recall, f1, csi_score = test_scores.values()

print("\n".join(f"{name}: {score:.2f}" for name, score in test_scores.items()))

# Confusion Matrix
test_confusion = confusion_matrix(y_test, y_test_pred)
print("\nConfusion Matrix:")
print(test_confusion)

Accuracy: 0.55
Precision: 0.40
Recall: 0.50
F1 Score: 0.44
Critical Success Index (CSI): 0.29

Confusion Matrix:
[[134  95]
 [ 65  64]]


In [118]:
# Feature importance
# Raw importance array of the fitted forest, one value per input feature.
# Presumably ordered like X_train.columns (the next cell pairs them that way) — verify.
best_rf.feature_importances_

array([0.06187462, 0.03390073, 0.02357514, 0.02011551, 0.03554064,
       0.0188766 , 0.12897881, 0.10574731, 0.02543281, 0.0074486 ,
       0.03256362, 0.01478148, 0.12896021, 0.03237444, 0.07982254,
       0.02227146, 0.00848298, 0.03136408, 0.04824626, 0.02262904,
       0.00412612, 0.01185693, 0.00193208, 0.00590561, 0.        ,
       0.00062986, 0.        , 0.0038691 , 0.        , 0.        ,
       0.        , 0.00134871, 0.00086206, 0.        , 0.00312824,
       0.02936173, 0.01765249, 0.00469556, 0.00029035, 0.00030792,
       0.01550505, 0.01115932, 0.00042903, 0.00283627, 0.        ,
       0.00111666])

In [125]:
# Pair every training column name with the forest's importance for it.
importance_columns = dict(
    feature=X_train.columns,
    feature_importance=best_rf.feature_importances_,
)

# Two-column table (feature, feature_importance) in training-column order.
feat_imp_rf_df = pd.DataFrame(importance_columns)
feat_imp_rf_df

Unnamed: 0,feature,feature_importance
0,relative_humidity_2m_mean,0.061875
1,cloud_cover_mean,0.033901
2,cloud_cover_max,0.023575
3,cloud_cover_min,0.020116
4,wind_gusts_10m_mean,0.035541
5,wind_speed_10m_mean,0.018877
6,dew_point_2m_mean,0.128979
7,wet_bulb_temperature_2m_mean,0.105747
8,pressure_msl_mean,0.025433
9,vapour_pressure_deficit_max,0.007449


In [126]:
# Horizontal bar chart of feature importances, largest bar at the top.
# Encoding types are spelled out (:Q quantitative, :N nominal) and the chart
# and both axes carry titles so the figure can be read on its own.
alt.Chart(feat_imp_rf_df).mark_bar().encode(
    x=alt.X('feature_importance:Q', title='Feature importance'),
    y=alt.Y('feature:N', sort='-x', title='Feature'),
    tooltip=['feature', 'feature_importance'],
).properties(title='Random Forest feature importances')

In [137]:
model_performance_explanations = """The model appears to be overfitting, indicating poor generalisation to unseen data. Although it shows improved accuracy and precision compared to logistic regression, its recall and F1 score have dropped significantly (to 0.5 and 0.44, respectively). This suggests that while the model is better at correctly identifying non-rain events, it struggles to accurately predict rain events, leading to a performance imbalance. The decline in recall and F1 score highlights its limited effectiveness in detecting positive cases, which is critical in applications like weather prediction."""

In [138]:
# Do not modify this code
print_tile(size="h3", key='model_performance_explanations', value=model_performance_explanations)

### J.5 Business Impact from Current Model Performance

> Provide some analysis on the model impacts from the business point of view


In [135]:
business_impacts_explanations = """From a business perspective, accurately predicting rain events is more critical than predicting non-rainy days, as the consequences of missing a rain event can be significant, potentially leading to operational disruptions, financial losses or safety risks depending on the industry. The experimental results show that while the model improves in overall accuracy and precision compared to logistic regression, its recall and F1 score have dropped (to 0.5 and 0.44), indicating it fails to detect half of the actual rain events. This underperformance in recall is particularly concerning, as false negatives (predicting no rain when it actually rains) can have higher costs than false positives. In contrast, incorrectly predicting rain when it doesn't occur (false positives) may cause minor inconveniences or unnecessary precautions but are generally less damaging. Therefore, despite improvements in some metrics, the model's failure to reliably detect rain events undermines its value for the business objective."""

In [136]:
# Do not modify this code
print_tile(size="h3", key='business_impacts_explanations', value=business_impacts_explanations)

## H. Project Outcomes

In [139]:
experiment_outcome = """Hypothesis Partially Confirmed"""

In [140]:
# Do not modify this code
print_tile(size="h2", key='experiment_outcomes_explanations', value=experiment_outcome)

In [143]:
experiment_results_explanations = """The outcome of the experiment indicates that although the model showed improvements in certain performance metrics, particularly accuracy and precision, it did not perform better across all key areas, with notable declines in recall and F1 score. As a result, the initial hypothesis is only partially confirmed. The drop in recall is especially concerning from a business standpoint, as failing to correctly predict rain events could lead to significant operational disruptions and financial losses. Additionally, signs of overfitting suggest that the model may be learning from irrelevant or non-informative features. Feature importance analysis supports this finding, revealing several variables with minimal predictive value. These features will be removed in the next experiment to reduce model complexity and enhance generalisation. Removing irrelevant features is expected to improve the model's performance compared to the current version."""

In [144]:
# Do not modify this code
print_tile(size="h2", key='experiment_results_explanations', value=experiment_results_explanations)