# **Experiment Notebook**



In [11]:
# Do not modify this code
!pip install -q utstd

from utstd.ipyrenders import *

In [12]:
# Do not modify this code
import warnings
warnings.simplefilter(action='ignore')

## 0. Import Packages

In [13]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import altair as alt

---
## A. Project Description


In [14]:
# Student identity rendered by the print_tile cells that follow.
# The ID is stored as a string, not an integer.
student_name = "Fang Yee Tan"
student_id = "25677648"

In [15]:
# Do not modify this code
print_tile(size="h1", key='student_name', value=student_name)

In [16]:
# Do not modify this code
print_tile(size="h1", key='student_id', value=student_id)

---
## C. Data Understanding

### C.1   Load Datasets


In [17]:
import os
from pathlib import Path

# Folder holding the pre-split CSV files.
# Set the DATA_DIR_36120 environment variable to point somewhere else; when it
# is unset the original location (~/Desktop/36120) is used, so existing setups
# keep working unchanged.
file_path = Path(os.environ.get("DATA_DIR_36120", Path.home() / "Desktop" / "36120"))

# Fail early with a message that names the folder and how to override it,
# instead of a bare read error on the first CSV.
if not file_path.is_dir():
    raise FileNotFoundError(
        f"Data folder not found: {file_path}. "
        "Set the DATA_DIR_36120 environment variable to the folder containing "
        "X_train.csv, X_val.csv, X_test.csv, y_train.csv, y_val.csv and y_test.csv."
    )

# Feature matrices are kept as DataFrames.
X_train = pd.read_csv(file_path / "X_train.csv")
X_val = pd.read_csv(file_path / "X_val.csv")
X_test = pd.read_csv(file_path / "X_test.csv")

# Targets are reduced from a one-column DataFrame to a Series.
# squeeze("columns") only collapses the column axis, so a target file with a
# single row still yields a Series rather than a scalar.
y_train = pd.read_csv(file_path / "y_train.csv").squeeze("columns")
y_val = pd.read_csv(file_path / "y_val.csv").squeeze("columns")
y_test = pd.read_csv(file_path / "y_test.csv").squeeze("columns")

---
## I. Selection of Performance Metrics

> Provide some explanations on why you believe the performance metrics you chose are appropriate


In [18]:
from sklearn.metrics import mean_squared_error, r2_score

In [19]:
performance_metrics_explanations = """Root Mean Squared Error (RMSE) and R squared (R²) are selected as the primary performance metrics for this project. RMSE is chosen because it shares the same units as the target variable, precipitation in millimeters, making the magnitude of errors straightforward and easy to interpret. Furthermore, RMSE penalises larger errors more heavily, which is crucial for accurately reflecting the impact of significant deviations in precipitation forecasts. R squared is included as it quantifies the proportion of variance in the observed data explained by the model, providing valuable insight into the overall goodness of fit and facilitating a comprehensive evaluation of model performance."""

In [20]:
# Do not modify this code
print_tile(size="h3", key='performance_metrics_explanations', value=performance_metrics_explanations)

## J. Train Machine Learning Model

Null Hypothesis: There is no statistically significant difference in predictive performance between the Random Forest model and the baseline model.

Alternative Hypothesis: There is a statistically significant difference in predictive performance between the Random Forest model and the baseline model.

### J.1 Import Algorithm

> Provide some explanations on why you believe this algorithm is a good fit


In [40]:
from sklearn.ensemble import RandomForestRegressor

In [41]:
algorithm_selection_explanations = """Random Forest is selected for this task due to its ability to aggregate predictions from multiple decision trees, which helps reduce overfitting and enhances predictive accuracy. This ensemble approach increases model robustness, particularly in complex and noisy datasets. Moreover, Random Forest provides feature importance scores, enabling the identification of the most influential predictors and offering valuable insights into the underlying data relationships.
"""

In [42]:
# Do not modify this code
print_tile(size="h3", key='algorithm_selection_explanations', value=algorithm_selection_explanations)

### J.2 Set Hyperparameters

> Provide some explanations on why you believe these hyperparameters are a good fit


In [89]:
# Hyperparameter grid for the Random Forest search.
# Every option list holds a single value, so the grid contains exactly one
# candidate combination.
param_grid = dict(
    n_estimators=[100],
    max_depth=[5],
    min_samples_split=[12],
    min_samples_leaf=[3],
    max_features=['sqrt'],
)

In [90]:
hyperparameters_selection_explanations = """The parameter n estimators refers to the number of decision trees used in the random forest. Increasing this number can improve the stability and accuracy of the model but may also increase the computational time. The max depth parameter controls the maximum depth of each individual tree. While deeper trees can capture more complex relationships, they also increase the risk of overfitting. The min samples split defines the minimum number of samples required to split an internal node, and min samples leaf specifies the minimum number of samples that must be present in a leaf node. These parameters help to control the complexity of the trees and enhance the model’s generalisation capability. The max features parameter determines the number of features considered when selecting the best split at each node, which influences both model performance and diversity among trees.
"""

In [91]:
# Do not modify this code
print_tile(size="h3", key='hyperparameters_selection_explanations', value=hyperparameters_selection_explanations)

### J.3 Fit Model

In [92]:
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV

In [93]:
# Cross-validation splitter for time-ordered data, using five splits.
tscv = TimeSeriesSplit(n_splits=5)

# Metrics tracked during the search. The RMSE scorer is the negated variant,
# which is why the sign is flipped again when the score is printed.
scoring = dict(rmse='neg_root_mean_squared_error', r2='r2')

# Base estimator; the seed is fixed so repeated runs build the same forest.
rf_model = RandomForestRegressor(random_state=18)

# Search over param_grid on all available cores, refitting the final model on
# the candidate with the best RMSE score.
grid_search = GridSearchCV(
    rf_model,
    param_grid,
    scoring=scoring,
    refit='rmse',
    cv=tscv,
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(X_train, y_train)

# Estimator refit with the winning parameters; used by the cells that follow.
best_rf_model = grid_search.best_estimator_

# Summary of the winning candidate.
best_cv_rmse = -grid_search.best_score_
best_cv_r2 = grid_search.cv_results_['mean_test_r2'][grid_search.best_index_]
print("Best parameters:", grid_search.best_params_)
print("Best RMSE:", best_cv_rmse)
print("Best R2:", best_cv_r2)

Best parameters: {'max_depth': 5, 'max_features': 'sqrt', 'min_samples_leaf': 3, 'min_samples_split': 12, 'n_estimators': 100}
Best RMSE: 14.405774583135713
Best R2: 0.09306993560294277


### J.4 Model Technical Performance

> Provide some explanations on model performance


In [94]:
# Predict on validation dataset
# best_rf_model is the estimator refit by the grid search, which was fitted on
# X_train / y_train only.
y_val_pred = best_rf_model.predict(X_val)

In [95]:
# Validation-set scores for the Random Forest predictions.
# RMSE is the square root of the mean squared error.
val_mse = mean_squared_error(y_true=y_val, y_pred=y_val_pred)
rmse = np.sqrt(val_mse)
r2 = r2_score(y_true=y_val, y_pred=y_val_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")

RMSE: 11.26
R2: -0.03


In [96]:
# Predict on testing dataset
# Same refit estimator as used for the validation predictions.
y_test_pred = best_rf_model.predict(X_test)

In [98]:
# Test-set scores for the Random Forest predictions.
# NOTE: `rmse` and `r2` are rebound here, replacing the validation values.
test_mse = mean_squared_error(y_true=y_test, y_pred=y_test_pred)
rmse = np.sqrt(test_mse)
r2 = r2_score(y_true=y_test, y_pred=y_test_pred)

print(f"RMSE: {rmse:.2f}")
print(f"R2: {r2:.2f}")

RMSE: 13.92
R2: 0.11


In [101]:
# Feature importance
# Displays the raw importance array of the fitted forest, without feature names.
# The next cell pairs each value with the matching X_train column.
best_rf_model.feature_importances_

array([0.05281982, 0.11859538, 0.01184215, 0.07993196, 0.01043646,
       0.02698202, 0.03602833, 0.03012833, 0.03572213, 0.02611317,
       0.02218358, 0.0371348 , 0.1146004 , 0.02492807, 0.01273523,
       0.01253018, 0.01647334, 0.10005539, 0.03883187, 0.04761785,
       0.01608291, 0.03463446, 0.01673215, 0.01103577, 0.03047875,
       0.01355864, 0.02178685])

In [102]:
# Table pairing every training column with its Random Forest importance score.
feat_imp_rf_df = pd.DataFrame(
    dict(
        feature=X_train.columns,
        feature_importance=best_rf_model.feature_importances_,
    )
)
feat_imp_rf_df

Unnamed: 0,feature,feature_importance
0,relative_humidity_2m_mean,0.05282
1,cloud_cover_mean,0.118595
2,cloud_cover_max,0.011842
3,cloud_cover_min,0.079932
4,wind_gusts_10m_mean,0.010436
5,wind_speed_10m_mean,0.026982
6,wet_bulb_temperature_2m_mean,0.036028
7,pressure_msl_mean,0.030128
8,vapour_pressure_deficit_max,0.035722
9,shortwave_radiation_sum,0.026113


In [103]:
# Horizontal bar chart of the importance table: one bar per feature, ordered by
# descending importance, with the exact values available as a tooltip.
alt.Chart(feat_imp_rf_df).mark_bar().encode(
    x=alt.X("feature_importance"),
    y=alt.Y("feature", sort="-x"),
    tooltip=["feature", "feature_importance"],
)

In [None]:
from sklearn.feature_selection import SelectFromModel
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
import numpy as np

# XGBoost experiment on the features ranked highest by the Random Forest.
#
# Requires X_train, X_val, X_test, y_train, y_val, y_test and the fitted
# best_rf_model from the cells above.
#
# NOTE: this cell rebinds param_grid, tscv, scoring, grid_search, y_val_pred and
# y_test_pred, which until now held the Random Forest run. Re-run the Random
# Forest cells before reusing those names for that model.

# Number of top-ranked Random Forest features passed on to XGBoost.
TOP_N_FEATURES = 15

# Feature importances from Random Forest, read from the fitted model rather
# than from a pasted copy of an earlier output, so the ranking always matches
# the model that was actually trained.
importances = np.asarray(best_rf_model.feature_importances_)
top_indices = importances.argsort()[::-1][:TOP_N_FEATURES]  # Top features, most important first
X_train_selected = X_train.iloc[:, top_indices]
X_val_selected = X_val.iloc[:, top_indices]
X_test_selected = X_test.iloc[:, top_indices]

# Define the model
xgb_model = XGBRegressor(random_state=18, verbosity=0)

# Define hyperparameter grid
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.05, 0.1]
}

# Time series split
tscv = TimeSeriesSplit(n_splits=5)

# Scoring metrics (the RMSE scorer is the negated variant)
scoring = {
    'rmse': 'neg_root_mean_squared_error',
    'r2': 'r2'
}

# Grid search, refitting the final model on the candidate with the best RMSE score
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring=scoring,
    refit='rmse',
    cv=tscv,
    n_jobs=-1,
    verbose=1
)

# Fit model
grid_search.fit(X_train_selected, y_train)

# Predictions
y_val_pred = grid_search.best_estimator_.predict(X_val_selected)
y_test_pred = grid_search.best_estimator_.predict(X_test_selected)

# Evaluation
# RMSE is computed as sqrt(MSE), matching the Random Forest cells above.
# The `squared=False` argument of mean_squared_error is deprecated and has been
# removed from recent scikit-learn releases, where it raises a TypeError.
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
val_r2 = r2_score(y_val, y_val_pred)

test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)

print("Best parameters:", grid_search.best_params_)
print(f"Validation RMSE: {val_rmse:.4f}, R2: {val_r2:.4f}")
print(f"Test RMSE: {test_rmse:.4f}, R2: {test_r2:.4f}")


In [104]:
model_performance_explanations = """From the performance metrics on both the validation and testing datasets, it is clear that the models struggle to effectively capture the underlying patterns. The RMSE values for the validation and testing sets are 11.26 and 13.92, respectively, indicating that the model's predictions deviate considerably from the actual values. Furthermore, the R² scores of -0.03 (validation) and 0.11 (testing) suggest that the model explains very little of the variance in cumulative precipitation. These results indicate that the model is not effectively capturing the complex precipitation patterns.
"""

In [105]:
# Do not modify this code
print_tile(size="h3", key='model_performance_explanations', value=model_performance_explanations)

### J.5 Business Impact from Current Model Performance

> Provide some analysis on the model impacts from the business point of view


In [106]:
business_impacts_explanations = """Despite applying more advanced models, the testing RMSE remains relatively high at 13.92, and the R² score is still low at 0.11, indicating that the model is not effectively learning the underlying patterns in the data. From a business perspective, such poor predictive performance can lead to significant consequences. Inaccurate precipitation forecasts may result in operational inefficiencies, increased costs, or even safety risks, especially in industries that are highly weather-dependent. A high RMSE means the predicted rainfall could deviate substantially from actual values, potentially causing disruption, inconvenience, or poor decision-making that affects both resource planning and public safety.
"""

In [107]:
# Do not modify this code
print_tile(size="h3", key='business_impacts_explanations', value=business_impacts_explanations)

## H. Project Outcomes

In [108]:
experiment_outcome = """Hypothesis Rejected"""

In [109]:
# Do not modify this code
print_tile(size="h2", key='experiment_outcomes_explanations', value=experiment_outcome)

In [110]:
experiment_results_explanations = """Since the Random Forest model does not show significant improvement over the baseline or linear regression models, it suggests that the current approach may not be sufficient to capture the complexity of the data. Rather than relying solely on feature importance from Random Forest, the next step is to explore a more advanced model, XGBoost. Given its ability to model complex nonlinear relationships and handle feature interactions more effectively, XGBoost is expected to deliver better predictive performance and improve upon the current results.
"""

In [111]:
# Do not modify this code
print_tile(size="h2", key='experiment_results_explanations', value=experiment_results_explanations)