# Exploratory Data Analysis - Sri Lanka Flood Risk Dataset

This notebook performs a comprehensive EDA on the flood risk dataset to understand feature distributions, correlations, and class balance before model training.

In [1]:
import pandas as pd
import numpy as np
import matplotlib
matplotlib.use('Agg')
import matplotlib.pyplot as plt
import seaborn as sns
import sys, os

# Add project root to path
sys.path.insert(0, os.path.abspath('..'))

from src.config import DATA_PATH, CATEGORICAL_FEATURES, NUMERICAL_FEATURES, TARGET

sns.set_theme(style='whitegrid', palette='muted')
plt.rcParams['figure.dpi'] = 120

df = pd.read_csv(DATA_PATH)
print(f'Dataset shape: {df.shape}')
df.head()

Dataset shape: (9986, 11)


Unnamed: 0,district,division,climate_zone,year,month,rainfall_mm,river_level_m,soil_saturation_percent,drainage_quality,district_flood_prone,flood_occurred
0,Matara,Matara-DS6,Wet,2019,4,281.1,3.21,61.3,Moderate,1,1
1,Colombo,Colombo-DS3,Wet,2019,12,342.6,3.5,56.6,Poor,1,0
2,Kalutara,Kalutara-DS5,Wet,2009,5,254.6,3.36,54.2,Moderate,1,1
3,Nuwara Eliya,Nuwara Eliya-DS4,Wet,2022,2,240.7,2.57,48.5,Good,0,0
4,Gampaha,Gampaha-DS5,Wet,2019,9,321.3,3.33,63.7,Moderate,1,0


## 1. Data Overview

In [2]:
print('--- Data Types ---')
print(df.dtypes)
print(f'\n--- Missing Values ---')
print(df.isnull().sum())
print(f'\nTotal missing: {df.isnull().sum().sum()}')

--- Data Types ---
district                    object
division                    object
climate_zone                object
year                         int64
month                        int64
rainfall_mm                float64
river_level_m              float64
soil_saturation_percent    float64
drainage_quality            object
district_flood_prone         int64
flood_occurred               int64
dtype: object

--- Missing Values ---
district                   0
division                   0
climate_zone               0
year                       0
month                      0
rainfall_mm                0
river_level_m              0
soil_saturation_percent    0
drainage_quality           0
district_flood_prone       0
flood_occurred             0
dtype: int64

Total missing: 0


In [3]:
df.describe()

Unnamed: 0,year,month,rainfall_mm,river_level_m,soil_saturation_percent,district_flood_prone,flood_occurred
count,9986.0,9986.0,9986.0,9986.0,9986.0,9986.0,9986.0
mean,2016.519928,6.520529,179.253285,2.484633,43.795203,0.442319,0.219808
std,5.171457,3.454233,115.273745,0.672955,12.453076,0.496687,0.414137
min,2008.0,1.0,20.5,1.14,8.0,0.0,0.0
25%,2012.0,4.0,94.025,1.95,34.6,0.0,0.0
50%,2017.0,7.0,149.7,2.32,41.7,0.0,0.0
75%,2021.0,10.0,232.575,2.99,51.8,1.0,0.0
max,2025.0,12.0,950.0,5.86,100.0,1.0,1.0


## 2. Target Variable - Class Distribution

In [4]:
fig, axes = plt.subplots(1, 2, figsize=(10, 4))

# Bar chart
counts = df[TARGET].value_counts()
colors = ['#059669', '#dc2626']
axes[0].bar(['No Flood (0)', 'Flood (1)'], counts.values, color=colors)
axes[0].set_ylabel('Count')
axes[0].set_title('Class Distribution')
for i, v in enumerate(counts.values):
    axes[0].text(i, v + 50, str(v), ha='center', fontweight='bold')

# Pie chart
axes[1].pie(counts.values, labels=['No Flood', 'Flood'], colors=colors,
            autopct='%1.1f%%', startangle=90, textprops={'fontsize': 12})
axes[1].set_title('Class Proportion')

fig.suptitle(f'Target Distribution (n={len(df)})', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.savefig('../outputs/eda_class_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

  plt.show()


## 3. Numerical Feature Distributions

In [5]:
num_cols = ['rainfall_mm', 'river_level_m', 'soil_saturation_percent']
fig, axes = plt.subplots(1, 3, figsize=(14, 4))

for i, col in enumerate(num_cols):
    axes[i].hist(df[col], bins=40, color='#3b82f6', edgecolor='white', alpha=0.8)
    axes[i].axvline(df[col].mean(), color='#dc2626', linestyle='--', label=f'Mean: {df[col].mean():.1f}')
    axes[i].axvline(df[col].median(), color='#059669', linestyle='--', label=f'Median: {df[col].median():.1f}')
    axes[i].set_title(col.replace('_', ' ').title())
    axes[i].set_xlabel(col)
    axes[i].legend(fontsize=8)

fig.suptitle('Distribution of Key Numerical Features', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.savefig('../outputs/eda_numerical_distributions.png', dpi=150, bbox_inches='tight')
plt.show()

  plt.show()


## 4. Feature Distributions by Flood Class

In [6]:
fig, axes = plt.subplots(1, 3, figsize=(14, 5))

for i, col in enumerate(num_cols):
    no_flood = df[df[TARGET] == 0][col]
    flood = df[df[TARGET] == 1][col]
    bp = axes[i].boxplot([no_flood, flood], labels=['No Flood', 'Flood'],
                         patch_artist=True, widths=0.6)
    bp['boxes'][0].set_facecolor('#059669')
    bp['boxes'][1].set_facecolor('#dc2626')
    for box in bp['boxes']:
        box.set_alpha(0.7)
    axes[i].set_title(col.replace('_', ' ').title())
    axes[i].set_ylabel(col)

fig.suptitle('Feature Distributions by Flood Outcome', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.savefig('../outputs/eda_boxplots_by_class.png', dpi=150, bbox_inches='tight')
plt.show()

  bp = axes[i].boxplot([no_flood, flood], labels=['No Flood', 'Flood'],
  bp = axes[i].boxplot([no_flood, flood], labels=['No Flood', 'Flood'],
  bp = axes[i].boxplot([no_flood, flood], labels=['No Flood', 'Flood'],


  plt.show()


## 5. Correlation Heatmap

In [7]:
corr_cols = NUMERICAL_FEATURES + [TARGET]
corr_matrix = df[corr_cols].corr()

fig, ax = plt.subplots(figsize=(8, 6))
mask = np.triu(np.ones_like(corr_matrix, dtype=bool))
sns.heatmap(corr_matrix, mask=mask, annot=True, fmt='.2f', cmap='RdBu_r',
            center=0, square=True, linewidths=0.5, ax=ax,
            cbar_kws={'shrink': 0.8})
ax.set_title('Correlation Heatmap (Numerical Features + Target)', fontsize=13, fontweight='bold')
fig.tight_layout()
plt.savefig('../outputs/eda_correlation_heatmap.png', dpi=150, bbox_inches='tight')
plt.show()

  plt.show()


## 6. Categorical Feature Analysis

In [8]:
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Climate zone flood rates
cz_rates = df.groupby('climate_zone')[TARGET].mean().sort_values(ascending=False)
axes[0].bar(cz_rates.index, cz_rates.values, color=['#dc2626', '#d97706', '#059669'][:len(cz_rates)])
axes[0].set_title('Flood Rate by Climate Zone')
axes[0].set_ylabel('Flood Rate')
for i, v in enumerate(cz_rates.values):
    axes[0].text(i, v + 0.01, f'{v:.1%}', ha='center', fontweight='bold', fontsize=10)

# Drainage quality flood rates
dq_rates = df.groupby('drainage_quality')[TARGET].mean().sort_values(ascending=False)
axes[1].bar(dq_rates.index, dq_rates.values, color=['#dc2626', '#d97706', '#059669'][:len(dq_rates)])
axes[1].set_title('Flood Rate by Drainage Quality')
axes[1].set_ylabel('Flood Rate')
for i, v in enumerate(dq_rates.values):
    axes[1].text(i, v + 0.01, f'{v:.1%}', ha='center', fontweight='bold', fontsize=10)

fig.suptitle('Flood Rate by Categorical Features', fontsize=14, fontweight='bold')
fig.tight_layout()
plt.savefig('../outputs/eda_categorical_analysis.png', dpi=150, bbox_inches='tight')
plt.show()

  plt.show()


## 7. District-Level Flood Rates

In [9]:
district_rates = df.groupby('district').agg(
    flood_rate=(TARGET, 'mean'),
    count=(TARGET, 'count')
).sort_values('flood_rate', ascending=True)

fig, ax = plt.subplots(figsize=(10, max(6, len(district_rates) * 0.35)))
colors = ['#dc2626' if r > 0.3 else '#d97706' if r > 0.15 else '#059669'
          for r in district_rates['flood_rate']]
ax.barh(district_rates.index, district_rates['flood_rate'], color=colors)
ax.set_xlabel('Flood Rate')
ax.set_title('Flood Rate by District', fontsize=14, fontweight='bold')
ax.axvline(x=df[TARGET].mean(), color='#64748b', linestyle='--',
           label=f'Overall mean: {df[TARGET].mean():.1%}')
ax.legend()
fig.tight_layout()
plt.savefig('../outputs/eda_district_flood_rates.png', dpi=150, bbox_inches='tight')
plt.show()

  plt.show()


## 8. Monthly Flood Patterns

In [10]:
month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

monthly = df.groupby('month').agg(
    flood_rate=(TARGET, 'mean'),
    avg_rainfall=('rainfall_mm', 'mean')
).reindex(range(1, 13))

fig, ax1 = plt.subplots(figsize=(10, 5))
ax2 = ax1.twinx()

ax1.bar(range(1, 13), monthly['avg_rainfall'], color='#93c5fd', alpha=0.7, label='Avg Rainfall (mm)')
ax2.plot(range(1, 13), monthly['flood_rate'], 'o-', color='#dc2626', linewidth=2, label='Flood Rate')

ax1.set_xticks(range(1, 13))
ax1.set_xticklabels(month_names)
ax1.set_xlabel('Month')
ax1.set_ylabel('Average Rainfall (mm)', color='#3b82f6')
ax2.set_ylabel('Flood Rate', color='#dc2626')
ax1.set_title('Monthly Rainfall and Flood Rate Patterns', fontsize=14, fontweight='bold')

lines1, labels1 = ax1.get_legend_handles_labels()
lines2, labels2 = ax2.get_legend_handles_labels()
ax1.legend(lines1 + lines2, labels1 + labels2, loc='upper left')

fig.tight_layout()
plt.savefig('../outputs/eda_monthly_patterns.png', dpi=150, bbox_inches='tight')
plt.show()

  plt.show()


## 9. Key Observations

### Dataset Summary
- **Size**: ~10,000 records with 10 features and 1 binary target
- **Class imbalance**: ~78% no-flood vs ~22% flood - addressed with `class_weight='balanced'` during training
- **No missing values** - clean dataset requiring no imputation

### Key Findings
1. **Rainfall, river level, and soil saturation** show clear separation between flood and no-flood cases
2. **Climate zone** strongly influences flood risk - wet zones have higher flood rates
3. **Poor drainage** is associated with higher flood rates
4. **Seasonal patterns** exist with certain months showing elevated risk
5. **District-level variation** suggests geographic factors play a significant role