In [1]:
# Mount Google Drive at /content/drive so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# Read downsample_data.csv from the mounted Drive folder into a DataFrame.
downsample_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/downsample_data.csv')

In [7]:
# Build the feature matrix: drop both attenuation columns (FSO_Att, RFL_Att)
# together with the predictors that were excluded from the final model.
# NOTE(review): this comment previously said "top 14 features", which does not
# match the number of columns left after the drop — confirm against X.shape.
X = downsample_data.drop(columns=['FSO_Att','RFL_Att','WindSpeed','WindDirection',
                                  'WindSpeedMin','TemperatureDifference','WindSpeedMax',
                                  'Visibility','Frequency','VisibilityMax','Time',
                                  'VisibilityMin','SYNOPCode','RelativeHumidity',
                                  'ParticulateMin', 'ParticulateMax'])

# A CSV that was saved together with its index is read back with an
# 'Unnamed: N' column. If one is present it is just a row id, so remove it
# instead of training on it; when no such column exists this is a no-op.
X = X.loc[:, [not str(col).startswith('Unnamed') for col in X.columns]]

# NOTE(review): the target is RFL_Att although the variable names in this
# notebook carry an `_fso` suffix — confirm which link is being modelled.
y_fso = downsample_data['RFL_Att']

In [8]:
# Display the selected feature matrix as the cell output (long frames are shown truncated).
X

Unnamed: 0,AbsoluteHumidity,AbsoluteHumidityMax,AbsoluteHumidityMin,Distance,Particulate,RainIntensity,RainIntensityMax,RainIntensityMin,Temperature,TemperatureMax,TemperatureMin
0,20.244567,20.819340,18.475689,2115.667536,0.000000,0.000000,0.000000,0.000000,23.201133,25.021254,21.063413
1,20.164626,20.833908,19.261558,2119.138170,0.000000,0.000000,0.000000,0.000000,23.017296,25.182210,20.854708
2,17.515085,18.017632,16.110214,2962.180194,0.000000,0.000000,0.000000,0.000000,23.470992,24.086457,22.270371
3,17.796149,18.858701,17.232316,2964.409031,0.000000,0.000000,0.000000,0.000000,23.413134,24.634976,23.073526
4,17.943657,17.995399,16.521335,2963.216445,0.000000,0.000000,0.000000,0.000000,22.956074,23.263533,22.161422
...,...,...,...,...,...,...,...,...,...,...,...
1332,9.383492,10.103008,9.215229,2012.267998,14.072659,0.021041,0.023141,0.020234,16.422097,17.954418,14.862038
1333,19.386257,19.538264,18.324844,2961.452049,19.982826,0.000000,0.000000,0.000000,25.469319,25.702272,23.133961
1334,20.457950,21.183070,19.437991,2958.056188,0.000000,0.000000,0.000000,0.000000,23.206548,24.304511,23.110257
1335,5.717047,6.069735,5.180714,4822.782771,117.289839,0.065223,0.066140,0.060542,5.571695,5.623157,5.365653


In [9]:
# Hold out 25% of the rows for testing; the fixed seed keeps the partition repeatable.
X_train_fso, X_test_fso, y_train_fso, y_test_fso = train_test_split(
    X,
    y_fso,
    random_state=42,
    test_size=0.25,
)

# Base Extra-Trees regressor handed to the hyper-parameter search.
# Bootstrap sampling is switched on together with out-of-bag scoring.
et_fso = ExtraTreesRegressor(
    bootstrap=True,
    oob_score=True,
    random_state=42,
)

In [10]:
# Candidate values per hyper-parameter; the search covers their full
# Cartesian product (3 * 4 * 3 * 3 * 3 = 324 combinations).
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[10, 20, 30, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=[None, 'sqrt', 'log2'],
)

# Exhaustive 5-fold cross-validated search over the grid, run in parallel.
grid_search = GridSearchCV(
    et_fso,
    param_grid,
    cv=5,
    verbose=2,
    n_jobs=-1,
)
grid_search.fit(X_train_fso, y_train_fso)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [11]:
# Pull the winning configuration and its fitted estimator out of the search.
best_params = grid_search.best_params_
best_model = grid_search.best_estimator_

# Predict the target for the held-out test rows.
y_pred_fso = best_model.predict(X_test_fso)

print(f"Best Parameters: {best_params}")

# RMSE: square root of the mean squared error, rounded to two decimals.
rmse = round(
    np.sqrt(mean_squared_error(y_test_fso, y_pred_fso)),
    2,
)
print(f"Root Mean Squared Error (RMSE) on Test Data: {rmse}")

# R² on the same predictions, rounded the same way.
r2 = round(
    r2_score(y_test_fso, y_pred_fso),
    2,
)
print(f"R-squared (R²) on Test Data: {r2}")

Best Parameters: {'max_depth': None, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Root Mean Squared Error (RMSE) on Test Data: 0.74
R-squared (R²) on Test Data: 0.92
