In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import numpy as np
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

In [3]:
# Load dataset
downsample_data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Final project/downsample_df.csv')

In [4]:
# Separate the features and target variable
target = downsample_data['FSO_Att']

features = downsample_data[[
    'WindSpeedMax', 'TemperatureDifference', 'RelativeHumidity', 'SYNOPCode',
    'AbsoluteHumidityMax', 'ParticulateMin', 'AbsoluteHumidityMin', 'Time',
    'TemperatureMax', 'ParticulateMax', 'TemperatureMin', 'VisibilityMin',
    'AbsoluteHumidity', 'Particulate', 'VisibilityMax', 'Temperature',
    'Visibility', 'Distance'
]]

# Split the data into training and testing sets (70% train, 30% test)
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.3, random_state=42
)

# Initial ExtraTreesRegressor with OOB Score
etr_model = ExtraTreesRegressor(random_state=42, oob_score=True, bootstrap=True)

In [8]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

grid_search = GridSearchCV(estimator=etr_model, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


In [9]:
best_model = grid_search.best_estimator_

# Predictions
y_pred = best_model.predict(X_test)

best_params = grid_search.best_params_
print(f"Best Parameters: {best_params}")

# Calculate RMSE
rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)),2)
print(f"Root Mean Squared Error (RMSE) on Test Data: {rmse}")

# Calculate R² score
r2 = round(r2_score(y_test, y_pred),2)
print(f"R-squared (R²) on Test Data: {r2}")

Best Parameters: {'max_depth': 20, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
Root Mean Squared Error (RMSE) on Test Data: 1.13
R-squared (R²) on Test Data: 0.93
