In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the data
file_path = 'C:\Users\shrey\OneDrive\Desktop\floodguard\final.csv'  
final = pd.read_csv(file_path)

In [3]:
# Check missing values
NAs = pd.concat([final.isnull().sum()], axis=1, keys=["Final"])
print("Missing values:")
print(NAs[NAs.sum(axis=1) > 0])
final.head(5)

Missing values:
      Final
tavg    211
prcp    552
snow   5753
wdir  11713
wspd    372
pres   1093


Unnamed: 0,state,category,date,disaster,disaster_info,designated_area,severity,tavg,tmin,tmax,prcp,snow,wdir,wspd,pres
0,TX,FM,2000-01-03,Fire,Saddleback Fire,Wise (County),0.0,15.5,9.4,21.1,0.0,0.0,294.0,17.3,1012.8
1,TX,FM,2000-01-04,Fire,Purgatory Fire,Comal (County),0.0,6.1,-0.6,12.2,0.0,0.0,,13.7,1030.2
2,TX,FM,2000-01-05,Fire,Bob's Trail Fire,Bastrop (County),0.0,6.1,-3.4,15.0,0.0,0.0,,9.0,1025.3
3,KY,DR,2000-01-10,Tornado,"Tornadoes, Severe Storms, Torrential Rains, An...",Ballard (County),0.5,8.9,4.4,13.3,0.0,0.0,198.0,25.6,1002.9
4,KY,DR,2000-01-10,Tornado,"Tornadoes, Severe Storms, Torrential Rains, An...",Breckinridge (County),0.5,8.9,4.4,13.3,0.0,0.0,198.0,25.6,1002.9


In [4]:
# Fill missing values for numerical columns
numerical_cols = ['tavg', 'prcp', 'snow', 'wdir', 'wspd', 'pres']
for col in numerical_cols:
    final[col].fillna(final[col].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  final[col].fillna(final[col].mean(), inplace=True)


In [5]:

# Create dummy variables
final = pd.get_dummies(final)

In [6]:
# Create severity bins
bins = [final['severity'].min(), 2, 4, final['severity'].max()]
bins.sort()
labels = ['Low', 'Medium', 'High']

In [7]:
# Convert severity to categories
final['severity'] = pd.cut(final['severity'], bins=bins, labels=labels, right=False)


In [8]:
# Split features and target
X = final.drop(['severity'], axis=1)
y = final['severity']

In [9]:
# Split train test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Convert categorical labels to numerical
y_train_num = pd.Categorical(y_train).codes
y_test_num = pd.Categorical(y_test).codes

In [13]:
# Initialize and train XGBoost
xgb_model = xgb.XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
    objective='multi:softprob',
    num_class=len(labels),
      eval_metric='mlogloss' 
)

In [16]:
xgb_model.fit(
    X_train, 
    y_train_num,
    eval_set=[(X_test, y_test_num)],
    
    verbose=True
)

[0]	validation_0-mlogloss:0.95582
[1]	validation_0-mlogloss:0.83765
[2]	validation_0-mlogloss:0.73818
[3]	validation_0-mlogloss:0.65332
[4]	validation_0-mlogloss:0.58026
[5]	validation_0-mlogloss:0.51669
[6]	validation_0-mlogloss:0.46145
[7]	validation_0-mlogloss:0.41292
[8]	validation_0-mlogloss:0.37009
[9]	validation_0-mlogloss:0.33251
[10]	validation_0-mlogloss:0.29914
[11]	validation_0-mlogloss:0.26944
[12]	validation_0-mlogloss:0.24311
[13]	validation_0-mlogloss:0.21948
[14]	validation_0-mlogloss:0.19848
[15]	validation_0-mlogloss:0.17978
[16]	validation_0-mlogloss:0.16283
[17]	validation_0-mlogloss:0.14784
[18]	validation_0-mlogloss:0.13420
[19]	validation_0-mlogloss:0.12211
[20]	validation_0-mlogloss:0.11122
[21]	validation_0-mlogloss:0.10147
[22]	validation_0-mlogloss:0.09249
[23]	validation_0-mlogloss:0.08467
[24]	validation_0-mlogloss:0.07742
[25]	validation_0-mlogloss:0.07087
[26]	validation_0-mlogloss:0.06513
[27]	validation_0-mlogloss:0.05975
[28]	validation_0-mlogloss:0.0

In [17]:
# Make predictions
y_pred_proba = xgb_model.predict_proba(X_test)
y_pred_num = np.argmax(y_pred_proba, axis=1)
y_pred = pd.Categorical.from_codes(y_pred_num, categories=labels)

In [18]:

# Print metrics
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 99.94%

Classification Report:
              precision    recall  f1-score   support

         Low       1.00      1.00      1.00      3988
      Medium       1.00      1.00      1.00      2934

    accuracy                           1.00      6922
   macro avg       1.00      1.00      1.00      6922
weighted avg       1.00      1.00      1.00      6922



In [19]:
# Feature importance
feature_importance = pd.DataFrame({
    'feature': X_train.columns,
    'importance': xgb_model.feature_importances_
}).sort_values('importance', ascending=False)


In [20]:
print(feature_importance)

                               feature  importance
1954             disaster_Severe Storm    0.526100
1953         disaster_Severe Ice Storm    0.262600
16                            state_FL    0.037105
3124                disaster_info_Snow    0.017432
1636                   date_2019-05-28    0.013314
...                                ...         ...
1821                   date_2021-08-17    0.000000
1820                   date_2021-08-16    0.000000
1819                   date_2021-08-14    0.000000
1818                   date_2021-08-13    0.000000
5403  designated_area_Ziebach (County)    0.000000

[5404 rows x 2 columns]
