In [None]:
# Flood Prediction using Gaussian Naïve Bayes (GNB)


In [7]:
import numpy as np
import pandas as pd
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PowerTransformer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
warnings.filterwarnings('ignore')


In [12]:
df = pd.read_csv('dataset2.csv')
target_column = 'Flood Occurred' if 'Flood Occurred' in df.columns else df.columns[-1]
X = df.drop([target_column,'Historical Floods'], axis='columns')
y = df[target_column]
df.describe()
# print(y.value_counts(normalize=True))
# X.columns

Unnamed: 0,Rainfall (mm),Temperature (°C),Humidity (%),River Discharge (m³/s),Water Level (m),Elevation (m),Historical Floods,Flood Occurred
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,82.937247,30.11996,62.041264,1240.65813,4.965512,1144.825402,1.7428,0.3292
std,68.799231,6.469157,21.239291,1232.517583,1.789395,2130.272042,1.446882,0.46997
min,0.582906,13.597231,20.042424,11.955257,0.022045,-197.919316,0.0,0.0
25%,27.877637,25.629473,43.411869,367.586782,3.836338,150.933995,0.0,0.0
50%,61.195122,29.781332,61.616768,783.803912,5.00212,308.98755,1.0,0.0
75%,117.710799,34.714288,80.481428,1568.233824,6.059815,501.24501,3.0,1.0
max,299.412942,47.443344,99.981283,4995.221303,9.998535,8832.452524,4.0,1.0


In [13]:
# scaler = PowerTransformer()
# X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Target Distribution:", y_train.value_counts())
print(y_train.value_counts(normalize=True))


Target Distribution: Flood Occurred
0    2664
1    1336
Name: count, dtype: int64
Flood Occurred
0    0.666
1    0.334
Name: proportion, dtype: float64


In [14]:
# from imblearn.under_sampling import RandomUnderSampler

# undersample = RandomUnderSampler(random_state=42)
# X_train_resampled, y_train_resampled = undersample.fit_resample(X_train, y_train)

# print("Target Distribution:", y_train_resampled.value_counts())

In [15]:
from imblearn.over_sampling import SMOTE

smote = SMOTE(sampling_strategy=0.9, random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

print("Target Distribution:", y_train_resampled.value_counts())
print(y_train_resampled.value_counts(normalize=True))
# df2 = pd.concat([X_train_resampled, y_train_resampled], axis=1)
# df2.to_csv("dataset3.csv", index=False)

Target Distribution: Flood Occurred
0    2664
1    2397
Name: count, dtype: int64
Flood Occurred
0    0.526378
1    0.473622
Name: proportion, dtype: float64


In [16]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)
# Define parameter grid for var_smoothing
params = {'var_smoothing': np.logspace(-10, 0, 50)}

# Initialize Naive Bayes model
gnb = GaussianNB()
# Perform GridSearchCV to find best hyperparameters
grid_search = GridSearchCV(gnb, params, cv=10, scoring='accuracy', n_jobs=-1)
grid_search.fit(X_train_scaled, y_train)

# Best model with optimal hyperparameters
best_gnb = grid_search.best_estimator_
best_gnb.score(X_test_scaled,y_test)

0.707

In [17]:
# Make predictions on test set
y_pred = best_gnb.predict(X_test_scaled)

# Print evaluation metrics
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Test Accuracy: 0.707
Confusion Matrix:
 [[612  78]
 [215  95]]
Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.89      0.81       690
           1       0.55      0.31      0.39       310

    accuracy                           0.71      1000
   macro avg       0.64      0.60      0.60      1000
weighted avg       0.68      0.71      0.68      1000



In [21]:
def predict_flood(Rainfall, Temperature, Humidity, River_Discharge,Water_Level, Elevation):
    v = np.array([[Rainfall, Temperature, Humidity, River_Discharge,Water_Level, Elevation]])
    v_scaled = scaler.transform(v)
    prediction = best_gnb.predict(v_scaled)[0]
    probability = round(best_gnb.predict_proba(v_scaled)[0][1], 3)  # Probability of flood occurring (class 1)
    print('Flood predicted:'+str(prediction))
    print('Flood possibility: '+str(probability*100)+"%")

predict_flood(150, 26, 95, 14000, 8.2,70)
predict_flood(25, 30, 65, 3200,3.5,120)

Flood predicted:1
Flood possibility: 100.0%
Flood predicted:0
Flood possibility: 26.700000000000003%
