In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Load the two cleaned CSVs: df1 holds the flood data and df2 the non-flood data
# (as indicated by the file names).
df1, df2 = (
    pd.read_csv(csv_path)
    for csv_path in ('data/cleaned_flood_data.csv', 'data/cleaned_non_flood_data.csv')
)

In [3]:
# Summarise the flood dataset: column names, non-null counts and dtypes.
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9308 entries, 0 to 9307
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   YEAR                          9308 non-null   int64  
 1   MONTH                         9308 non-null   int64  
 2   BEGIN_TIME                    9308 non-null   int64  
 3   BEGIN_LAT                     9308 non-null   float64
 4   BEGIN_LON                     9308 non-null   float64
 5   STATE                         9308 non-null   object 
 6   EVENT_TYPE                    9308 non-null   object 
 7   FLOOD_CAUSE                   8782 non-null   object 
 8   EVENT_NARRATIVE               9215 non-null   object 
 9   temperature_2m_mean           9308 non-null   float64
 10  wind_speed_10m_mean           9308 non-null   float64
 11  cloud_cover_mean              9308 non-null   float64
 12  relative_humidity_2m_mean     9308 non-null   float64
 13  dew

In [4]:
# Stack the flood (df1) and non-flood (df2) frames into a single dataset.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it each input
# keeps its own 0-based labels, so the combined frame carries duplicate index
# labels and label-based lookups such as df.loc[i] can return more than one row.
df = pd.concat([df1, df2], ignore_index=True)

In [5]:
# Check the combined frame after concatenation: row count, columns and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18616 entries, 0 to 9307
Data columns (total 31 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   YEAR                          18616 non-null  int64  
 1   MONTH                         18616 non-null  int64  
 2   BEGIN_TIME                    18616 non-null  int64  
 3   BEGIN_LAT                     18616 non-null  float64
 4   BEGIN_LON                     18616 non-null  float64
 5   STATE                         18616 non-null  object 
 6   EVENT_TYPE                    18616 non-null  object 
 7   FLOOD_CAUSE                   8782 non-null   object 
 8   EVENT_NARRATIVE               9215 non-null   object 
 9   temperature_2m_mean           18616 non-null  float64
 10  wind_speed_10m_mean           18616 non-null  float64
 11  cloud_cover_mean              18616 non-null  float64
 12  relative_humidity_2m_mean     18616 non-null  float64
 13  dew_poi

In [6]:
# Remove columns that are excluded from modelling: STATE, FLOOD_CAUSE and
# EVENT_NARRATIVE, plus precipitation_sum, which is dropped as redundant because
# it is roughly 97% correlated with rain_sum.
df = df.drop(
    columns=[
        'precipitation_sum',
        'STATE',
        'FLOOD_CAUSE',
        'EVENT_NARRATIVE',
    ]
)

In [7]:
# Confirm the dropped columns are gone and review the remaining columns and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18616 entries, 0 to 9307
Data columns (total 27 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   YEAR                          18616 non-null  int64  
 1   MONTH                         18616 non-null  int64  
 2   BEGIN_TIME                    18616 non-null  int64  
 3   BEGIN_LAT                     18616 non-null  float64
 4   BEGIN_LON                     18616 non-null  float64
 5   EVENT_TYPE                    18616 non-null  object 
 6   temperature_2m_mean           18616 non-null  float64
 7   wind_speed_10m_mean           18616 non-null  float64
 8   cloud_cover_mean              18616 non-null  float64
 9   relative_humidity_2m_mean     18616 non-null  float64
 10  dew_point_2m_mean             18616 non-null  float64
 11  rain_sum                      18616 non-null  float64
 12  pressure_msl_mean             18616 non-null  float64
 13  soil_mo

In [8]:
# The target is the EVENT_TYPE label; all remaining columns form the feature matrix.
y = df["EVENT_TYPE"]
X = df.drop("EVENT_TYPE", axis=1)

# Hold out 20% of the rows for testing. Stratifying on the label keeps the class
# proportions the same in both splits; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [9]:
# Hyperparameter search, kept for reference only (intentionally commented out).
# The recorded output below reports the best parameters it found; those values are
# hard-coded in the final model cell, so this cell does not need to be re-run.

# rf = RandomForestClassifier(random_state=42, n_jobs=-1)

# # parameter grid, for gridsearch cv to find hyperparameters that yield highest performance
# param_grid = {
#     "n_estimators": [100, 300, 500],
#     "max_depth": [None, 10, 20],
#     "min_samples_split": [2, 3, 5],
#     "min_samples_leaf": [1, 2]
# }

# grid = GridSearchCV(
#     estimator=rf,
#     param_grid=param_grid,
#     cv=3,               # 3-fold cross validation
#     scoring="accuracy", 
#     n_jobs=-1,
#     verbose=1
# )

# grid.fit(X_train, y_train)

# best_rf = grid.best_estimator_
# print("Best parameters:", grid.best_params_)

# y_pred = best_rf.predict(X_test)
# print("Accuracy:", accuracy_score(y_test, y_pred))
# print(classification_report(y_test, y_pred))


Fitting 3 folds for each of 54 candidates, totalling 162 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 500}
Accuracy: 0.8375402792696026
              precision    recall  f1-score   support

 Flash Flood       0.83      0.84      0.84      1600
       Flood       0.63      0.26      0.36       262
      Normal       0.85      0.92      0.88      1862

    accuracy                           0.84      3724
   macro avg       0.77      0.67      0.69      3724
weighted avg       0.83      0.84      0.83      3724



In [10]:
# Final model: a random forest configured with the best parameters reported by
# the grid search (500 trees, no depth limit, min_samples_split=2, min_samples_leaf=1).
rf = RandomForestClassifier(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=1,
    random_state=42,  # fixed seed so the fitted forest is repeatable
    n_jobs=-1,        # train the trees in parallel on all available cores
)

rf.fit(X_train, y_train)

# Evaluate on the held-out test split.
y_pred = rf.predict(X_test)

# Print overall accuracy first, then the per-class precision/recall/F1 table.
# The accuracy line was missing from this cell even though its recorded output
# starts with "Accuracy: ...", so a fresh run did not reproduce that output.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

Accuracy: 0.8386143931256713
              precision    recall  f1-score   support

 Flash Flood       0.83      0.85      0.84      1600
       Flood       0.63      0.24      0.35       262
      Normal       0.86      0.91      0.89      1862

    accuracy                           0.84      3724
   macro avg       0.77      0.67      0.69      3724
weighted avg       0.83      0.84      0.83      3724

