In [ ]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Install a blanket 'ignore' filter so no warnings are shown from here on.
# NOTE(review): this also hides deprecation/convergence warnings from the
# libraries imported below — consider narrowing by category if that matters.
warnings.filterwarnings('ignore')

# Machine Learning imports
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB

# Metrics
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, roc_curve, auc,
    roc_auc_score, precision_recall_curve
)

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Model persistence
import joblib
import os

# Configure plotting defaults: seaborn 'whitegrid' theme and a 12x6
# default figure size for every matplotlib figure created afterwards.
sns.set_style(style='whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

# Reaching this line means every import above succeeded.
print("✅ All libraries imported successfully!")

## 1. Data Loading and Initial Exploration

Load the FEMA disaster declarations dataset and perform an initial inspection.

In [None]:
# Read the disaster declarations CSV into `df`.
# The path is relative to the notebook's working directory.
data_path = '../data/disaster_declarations.csv'
df = pd.read_csv(data_path)

# Report the size of the loaded frame: the (rows, columns) tuple first,
# then each dimension on its own line (row count with thousands separators).
print(f"Dataset Shape: {df.shape}")
print(f"\nRows: {len(df):,}")
print(f"Columns: {len(df.columns)}")
print("\n" + "="*50)
print("Dataset loaded successfully!")

In [None]:
# Preview the top of the dataset.
print("First 5 rows of the dataset:")
# Left as the cell's final expression so the notebook renders the frame.
df.head(n=5)

In [None]:
# Show the frame's structure via DataFrame.info().
print("Dataset Information:", "="*70, sep="\n")
df.info()

# Closing rule, then a tally of how many columns share each dtype.
print("\n" + "="*70, "\nData Types Distribution:", sep="\n")
print(df.dtypes.value_counts())

In [None]:
# Check for missing values
# Builds `missing_df`, a per-column table of null counts and the share of
# rows they represent, restricted to columns that actually have nulls.
print("Missing Values Analysis:")
print("="*70)

# Scan the frame for nulls once and derive the percentage from that count,
# instead of running a second full isnull() pass over the data.
missing = df.isnull().sum()
missing_pct = (missing / len(df)) * 100

missing_df = pd.DataFrame({
    'Column': missing.index,
    'Missing_Count': missing.values,
    'Percentage': missing_pct.values,
})

# Keep only columns with at least one null, worst offenders first.
has_missing = missing_df['Missing_Count'] > 0
missing_df = missing_df[has_missing].sort_values('Missing_Count', ascending=False)

if not missing_df.empty:
    print(missing_df.to_string(index=False))
else:
    print("✅ No missing values found in the dataset!")

In [None]:
# Descriptive statistics from DataFrame.describe().
print("Statistical Summary of Numerical Features:", "="*70, sep="\n")
# Transposed so each described column becomes a row; left as the cell's
# final expression so the notebook renders the resulting table.
df.describe().transpose()