In [None]:
# TODO: placeholder cell (never executed, contains no code). It was added as a reminder to fix
# long routes failing due to API limits and timeouts, by caching weather data and optimizing the Dijkstra algorithm.


In [5]:
import kagglehub

# Fetch the India floods inventory through kagglehub; the call returns the
# local directory that holds the dataset files.
KAGGLE_DATASET = "aditya2803/india-floods-inventory"
path = kagglehub.dataset_download(KAGGLE_DATASET)
print("Path to dataset files:", path)

Path to dataset files: C:\Users\raiti\.cache\kagglehub\datasets\aditya2803\india-floods-inventory\versions\1


In [6]:
import os
# Show which files the downloaded dataset directory contains.
dataset_files = os.listdir(path)
print(dataset_files)

['India_Floods_Inventory.csv']


In [7]:
import pandas as pd

# Load the raw flood inventory into `df` and preview the first rows.
csv_path = f"{path}/India_Floods_Inventory.csv"
df = pd.read_csv(csv_path)
print(df.head())

                    UEI  Start Date    End Date Duration(Days)   Main Cause  \
0  UEI-IMD-FL-2015-0001  2015-06-20  2015-06-21              1  Heavy rains   
1  UEI-IMD-FL-2015-0002  2015-11-15  2015-11-23              8  Heavy rains   
2  UEI-IMD-FL-2015-0003  2015-12-22  2015-12-22              0  Heavy rains   
3  UEI-IMD-FL-2015-0004  2015-10-06  2015-10-06              0  Heavy rains   
4  UEI-IMD-FL-2015-0005  2015-02-19  2015-02-19              0  Heavy rains   

  Location                                          Districts  \
0      NaN  East Godavari, Srikakulam, Visakhapatnam and W...   
1      NaN  Anantapur, Chittoor, East Godavari, Krishna, N...   
2      NaN                                     Vishakhapatnam   
3      NaN                         Parts of Arunachal Pradesh   
4      NaN                                     Parts of Assam   

               State  Latitude  Longitude  Severity  Area Affected  \
0    ANDHRA  PRADESH       NaN        NaN       NaN            N

In [8]:
# Restrict the raw frame to the columns the ML pipeline uses, then preview the
# result and report how many values are missing in each column.
ml_columns = [
    "Latitude",
    "Longitude",
    "Start Date",
    "Main Cause",
    "Area Affected",
    "Severity",
    "State",
]
df_ml = df[ml_columns]

print(df_ml.head())
print(df_ml.isna().sum())


   Latitude  Longitude  Start Date   Main Cause  Area Affected  Severity  \
0       NaN        NaN  2015-06-20  Heavy rains            NaN       NaN   
1       NaN        NaN  2015-11-15  Heavy rains            NaN       NaN   
2       NaN        NaN  2015-12-22  Heavy rains            NaN       NaN   
3       NaN        NaN  2015-10-06  Heavy rains            NaN       NaN   
4       NaN        NaN  2015-02-19  Heavy rains            NaN       NaN   

               State  
0    ANDHRA  PRADESH  
1    ANDHRA  PRADESH  
2    ANDHRA  PRADESH  
3  ARUNACHAL PRADESH  
4              ASSAM  
Latitude         702
Longitude        702
Start Date         0
Main Cause       122
Area Affected    765
Severity         765
State            563
dtype: int64


In [9]:
# Keep only rows that have coordinates and a severity value.
# .copy() makes df_ml an independent frame: it was derived from `df` by
# selection, and later cells assign new columns into it, which can otherwise
# trigger pandas' SettingWithCopyWarning.
df_ml = df_ml.dropna(subset=["Latitude", "Longitude", "Severity"]).copy()
print("Rows after drop:", len(df_ml))


Rows after drop: 262


In [10]:
def make_risk(row, area_threshold=200000, high_severity=3, medium_severity=2):
    """Map one flood record to a categorical risk label.

    Parameters
    ----------
    row : mapping-like
        Must provide ``row["Severity"]`` and ``row["Area Affected"]``
        (e.g. a DataFrame row passed by ``df.apply(..., axis=1)``).
    area_threshold : number, default 200000
        Records whose affected area is strictly greater than this are "HIGH"
        regardless of severity.
    high_severity : number, default 3
        Severity at or above this value is "HIGH".
    medium_severity : number, default 2
        Severity at or above this value (and below ``high_severity``) is
        "MEDIUM".

    Returns
    -------
    str
        "HIGH", "MEDIUM" or "LOW".

    Notes
    -----
    A missing (NaN) value fails every comparison, so a NaN area never
    triggers "HIGH" and a NaN severity falls through to "LOW".
    """
    severity = row["Severity"]

    if severity >= high_severity or row["Area Affected"] > area_threshold:
        return "HIGH"
    # ">=" rather than "==": a fractional severity between the medium and high
    # cut-offs must not be rated lower than the medium cut-off itself.
    if severity >= medium_severity:
        return "MEDIUM"
    return "LOW"

# Label every row with make_risk, then show how many rows fall in each class.
risk_by_row = df_ml.apply(make_risk, axis=1)
df_ml["Risk_Level"] = risk_by_row
risk_counts = df_ml["Risk_Level"].value_counts()
print(risk_counts)



Risk_Level
LOW       181
HIGH       60
MEDIUM     21
Name: count, dtype: int64


In [11]:
# Fill remaining gaps: the numeric area gets the column median, the two text
# columns get the placeholder category "Unknown".
area_median = df_ml["Area Affected"].median()
df_ml["Area Affected"] = df_ml["Area Affected"].fillna(area_median)

for text_col in ("Main Cause", "State"):
    df_ml[text_col] = df_ml[text_col].fillna("Unknown")


In [12]:
# Sanity check after imputation: preview the frame and the per-column count of
# missing values.
preview = df_ml.head()
missing_per_column = df_ml.isna().sum()
print(preview)
print(missing_per_column)


     Latitude  Longitude  Start Date        Main Cause  Area Affected  \
464   14.5398    75.0937  1985-06-23        Heavy rain      254234.60   
465   32.8353    76.9103  1985-07-18        Heavy rain      117441.17   
466   26.4816    82.8434  1985-09-13        Heavy rain       89994.67   
467   21.1495    86.7154  1985-10-18  Tropical cyclone       46758.64   
468   25.6339    84.0726  1986-06-15    Monsoonal rain      507167.44   

     Severity    State Risk_Level  
464       1.0  Unknown       HIGH  
465       1.0  Unknown        LOW  
466       2.0  Unknown     MEDIUM  
467       2.0  Unknown     MEDIUM  
468       1.0  Unknown       HIGH  
Latitude         0
Longitude        0
Start Date       0
Main Cause       0
Area Affected    0
Severity         0
State            0
Risk_Level       0
dtype: int64


In [13]:
from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, kept as separate objects so each fitted
# mapping stays available after this cell.
le_cause, le_state, le_risk = LabelEncoder(), LabelEncoder(), LabelEncoder()

# (encoder, source column, encoded column), fitted and applied in this order.
encoding_plan = (
    (le_cause, "Main Cause", "Main Cause Enc"),
    (le_state, "State", "State Enc"),
    (le_risk, "Risk_Level", "Risk Enc"),
)
for encoder, source_col, encoded_col in encoding_plan:
    df_ml[encoded_col] = encoder.fit_transform(df_ml[source_col])

# Show a few original values next to their integer codes.
for source_col, encoded_col in (("Main Cause", "Main Cause Enc"), ("Risk_Level", "Risk Enc")):
    print(df_ml[[source_col, encoded_col]].head())


           Main Cause  Main Cause Enc
464        Heavy rain               3
465        Heavy rain               3
466        Heavy rain               3
467  Tropical cyclone               7
468    Monsoonal rain               4
    Risk_Level  Risk Enc
464       HIGH         0
465        LOW         1
466     MEDIUM         2
467     MEDIUM         2
468       HIGH         0


In [14]:
# Parse "Start Date" on the raw frame and derive the calendar month from it.
# errors="coerce" turns unparseable dates into NaT instead of raising.
parsed_start = pd.to_datetime(df["Start Date"], errors="coerce")
df["Start Date"] = parsed_start
df["Month"] = parsed_start.dt.month

# Copy Month over to df_ml by index label, so only the rows that survived the
# earlier filtering are picked up.
df_ml["Month"] = df["Month"].loc[df_ml.index]

print(df_ml[["Month"]].head())


     Month
464    6.0
465    7.0
466    9.0
467   10.0
468    6.0


In [15]:
# Feature matrix shared by the classifier and the regressor.
# NOTE(review): "Area Affected" is also an input of make_risk, which produces
# the classification target, so it may leak the label. Confirm this is intended.
feature_columns = [
    "Latitude",
    "Longitude",
    "Month",
    "Main Cause Enc",
    "Area Affected",
    "State Enc",
]
X = df_ml[feature_columns]

# Targets: encoded risk class for classification, raw severity for regression.
y_class = df_ml["Risk Enc"]
y_reg = df_ml["Severity"]

print(X.head())


     Latitude  Longitude  Month  Main Cause Enc  Area Affected  State Enc
464   14.5398    75.0937    6.0               3      254234.60          0
465   32.8353    76.9103    7.0               3      117441.17          0
466   26.4816    82.8434    9.0               3       89994.67          0
467   21.1495    86.7154   10.0               7       46758.64          0
468   25.6339    84.0726    6.0               4      507167.44          0


In [16]:
from sklearn.model_selection import train_test_split

# Split the features and BOTH targets in a single call. train_test_split
# applies the same shuffled indices to every array it is given, so the
# classification and regression targets are aligned with X_train / X_test by
# construction, instead of relying on two separate calls sharing a seed.
(
    X_train,
    X_test,
    y_class_train,
    y_class_test,
    y_reg_train,
    y_reg_test,
) = train_test_split(X, y_class, y_reg, test_size=0.2, random_state=42)


In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Train the risk classifier; class_weight="balanced" is set on the forest and
# the seed is fixed so repeated runs give the same model.
clf = RandomForestClassifier(
    n_estimators=200, max_depth=12, class_weight="balanced", random_state=42
).fit(X_train, y_class_train)

# Evaluate on the held-out rows.
y_class_pred = clf.predict(X_test)
class_accuracy = accuracy_score(y_class_test, y_class_pred)

print("Classification Accuracy:", class_accuracy)
print(classification_report(y_class_test, y_class_pred))
print(confusion_matrix(y_class_test, y_class_pred))


Classification Accuracy: 0.8867924528301887
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        14
           1       0.89      0.94      0.92        35
           2       0.00      0.00      0.00         4

    accuracy                           0.89        53
   macro avg       0.63      0.65      0.64        53
weighted avg       0.85      0.89      0.87        53

[[14  0  0]
 [ 0 33  2]
 [ 0  4  0]]


In [18]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Train the severity regressor on the same training rows as the classifier.
reg = RandomForestRegressor(n_estimators=100, max_depth=12, random_state=42).fit(
    X_train, y_reg_train
)

# Score the held-out predictions with mean absolute error and R^2.
y_reg_pred = reg.predict(X_test)
severity_mae = mean_absolute_error(y_reg_test, y_reg_pred)
severity_r2 = r2_score(y_reg_test, y_reg_pred)

print("MAE:", severity_mae)
print("R2 Score:", severity_r2)


MAE: 0.3167517399346283
R2 Score: -0.18082096470381015


In [19]:
import joblib

# Persist the trained models and the encoder that maps risk codes to labels.
joblib.dump(clf, "flood_risk_classifier.pkl")
joblib.dump(reg, "flood_severity_regressor.pkl")
joblib.dump(le_risk, "risk_encoder.pkl")

# Persist the feature encoders as well: both models were trained on
# "Main Cause Enc" and "State Enc", so new inputs can only be encoded the same
# way if the fitted le_cause / le_state objects are available at inference time.
joblib.dump(le_cause, "cause_encoder.pkl")
joblib.dump(le_state, "state_encoder.pkl")

print("Models saved!")


Models saved!
