# Flood Event Data Processing

This notebook processes storm event CSV files to extract and filter flooding events.


In [1]:
import os
import pandas as pd
from pathlib import Path
from typing import List


## Loading initial data

### Loading data


In [None]:
def get_csv_files(data_dir: str) -> List[str]:
    """Return the sorted paths of the ``*.csv`` files directly inside a directory.

    Args:
        data_dir: Directory to search. Sub-directories are not searched.

    Returns:
        Sorted list of matching file paths as strings; empty when the
        directory holds no CSV files.

    Raises:
        FileNotFoundError: If ``data_dir`` does not exist.
    """
    root = Path(data_dir)
    if not root.exists():
        raise FileNotFoundError(f"Directory {data_dir} does not exist")

    return sorted(str(csv_path) for csv_path in root.glob("*.csv"))


In [None]:
def parse_csv_file(file_path: str) -> pd.DataFrame:
    """Read one CSV file into a DataFrame, never raising on failure.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        The parsed DataFrame. If reading fails for any reason the error is
        printed and an empty DataFrame is returned instead, so callers can
        skip the file and carry on.
    """
    try:
        return pd.read_csv(file_path, low_memory=False)
    except Exception as err:
        print(f"Error parsing {file_path}: {err}")

    # Only reached when the read failed.
    return pd.DataFrame()


### Filtering


In [None]:
def filter_flooding_events(df: pd.DataFrame, state: str = None) -> pd.DataFrame:
    """Keep only flood-type events that have usable starting coordinates.

    Args:
        df: Storm-event records. Must contain ``EVENT_TYPE``; ``STATE``,
            ``BEGIN_LAT`` and ``BEGIN_LON`` are used when present.
        state: Optional state name, compared case-insensitively against
            ``STATE``. ``None`` keeps every state. If ``STATE`` is missing a
            warning is printed and no state filtering is applied.

    Returns:
        A new DataFrame holding the rows whose ``EVENT_TYPE`` contains one of
        the flood keywords (and whose ``STATE`` matches, when requested).
        When both coordinate columns exist they are converted to numeric and
        rows with a missing, non-numeric or zero latitude/longitude are
        dropped. An empty, column-less DataFrame is returned when ``df`` is
        empty or has no ``EVENT_TYPE`` column.
    """
    if df.empty or 'EVENT_TYPE' not in df.columns:
        return pd.DataFrame()

    flood_keywords = ['flood', 'flash flood', 'coastal flood', 'lakeshore flood',
                      'river flood', 'urban flood', 'small stream flood']

    # Cast to text first: a column that was read as numeric (for example one
    # that is entirely empty) has no `.str` accessor and would raise.
    event_types = df['EVENT_TYPE'].astype(str)
    keep = event_types.str.contains('|'.join(flood_keywords), case=False, na=False)

    if state is not None:
        if 'STATE' not in df.columns:
            print("Warning: STATE column not found, cannot filter by state")
        else:
            keep = keep & (df['STATE'].astype(str).str.upper() == state.upper())

    flood_df = df[keep].copy()

    # Without both coordinate columns there is nothing more to validate.
    if 'BEGIN_LAT' not in flood_df.columns or 'BEGIN_LON' not in flood_df.columns:
        return flood_df

    # Unparseable coordinates become NaN so they are dropped with the missing ones.
    flood_df['BEGIN_LAT'] = pd.to_numeric(flood_df['BEGIN_LAT'], errors='coerce')
    flood_df['BEGIN_LON'] = pd.to_numeric(flood_df['BEGIN_LON'], errors='coerce')

    # A zero latitude or longitude is treated as an unset placeholder.
    has_coordinates = (
        flood_df['BEGIN_LAT'].notna() & flood_df['BEGIN_LON'].notna()
        & (flood_df['BEGIN_LAT'] != 0) & (flood_df['BEGIN_LON'] != 0)
    )
    return flood_df[has_coordinates].copy()


In [None]:
def split_yearmonth(df: pd.DataFrame) -> pd.DataFrame:
    """Derive integer ``YEAR`` and ``MONTH`` columns from ``BEGIN_YEARMONTH``.

    The value is read as text: its first four characters become ``YEAR`` and
    the following two become ``MONTH``.

    Args:
        df: Frame that may contain a ``BEGIN_YEARMONTH`` column.

    Returns:
        A copy of ``df`` with ``YEAR`` and ``MONTH`` added. The input object
        itself is returned untouched when it is empty or has no
        ``BEGIN_YEARMONTH`` column.
    """
    if df.empty or 'BEGIN_YEARMONTH' not in df.columns:
        return df

    yearmonth_text = df['BEGIN_YEARMONTH'].astype(str)
    return df.assign(
        YEAR=yearmonth_text.str[:4].astype(int),
        MONTH=yearmonth_text.str[4:6].astype(int),
    )


In [None]:
def select_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Reduce a frame to the fixed set of columns used downstream.

    Args:
        df: Frame to trim.

    Returns:
        A copy holding whichever of the wanted columns exist, in the order
        listed below. A warning naming the absent columns is printed. An
        empty, column-less DataFrame is returned when ``df`` is empty or
        contains none of the wanted columns.
    """
    if df.empty:
        return pd.DataFrame()

    wanted = (
        'YEAR', 'MONTH', 'BEGIN_DAY', 'BEGIN_TIME',
        'BEGIN_LAT', 'BEGIN_LON',
        'STATE', 'EVENT_TYPE', 'FLOOD_CAUSE', 'EVENT_NARRATIVE',
    )

    # Split the wanted names into those the frame has and those it lacks.
    present, absent = [], []
    for name in wanted:
        (present if name in df.columns else absent).append(name)

    if absent:
        print(f"Warning: The following columns were not found: {absent}")

    if not present:
        print("Warning: None of the requested columns were found in the DataFrame")
        return pd.DataFrame()

    return df[present].copy()


### Main processing


In [None]:
def process_all_csv_files(raw_data_dir: str, state: str = None) -> pd.DataFrame:
    """Run the whole extraction pipeline over every CSV file in a directory.

    Each file is parsed and filtered to flood events; the per-file results
    are concatenated, the year/month columns are derived and the frame is
    trimmed to the selected columns. Progress is printed along the way.

    Args:
        raw_data_dir: Directory holding the CSV files.
        state: Optional state name forwarded to the flood filter.

    Returns:
        The combined, trimmed flood events, or an empty DataFrame when there
        are no CSV files or none of them yields a flood event.
    """
    csv_files = get_csv_files(raw_data_dir)
    if not csv_files:
        print(f"No CSV files found in {raw_data_dir}")
        return pd.DataFrame()

    print(f"Found {len(csv_files)} CSV file(s) to process")
    if state:
        print(f"Filtering for state: {state.upper()}")

    per_file_events = []
    for csv_file in csv_files:
        print(f"Processing {os.path.basename(csv_file)}...")

        parsed = parse_csv_file(csv_file)
        if parsed.empty:
            print("  Failed to parse or file is empty")
            continue

        flooding = filter_flooding_events(parsed, state=state)
        if flooding.empty:
            print("  No flooding events found")
            continue

        print(f"  Found {len(flooding)} flooding event(s)")
        per_file_events.append(flooding)

    if not per_file_events:
        print("\nNo flooding events found in any files")
        return pd.DataFrame()

    combined_df = pd.concat(per_file_events, ignore_index=True)
    print(f"\nTotal flooding events found: {len(combined_df)}")

    return select_columns(split_yearmonth(combined_df))


In [None]:
# Notebook configuration: input folder and optional state restriction.
raw_data_dir = "raw_data"
state_filter = "TEXAS"  # set to None to keep events from every state

print(f"Data directory: {raw_data_dir}")
print(f"State filter: {state_filter or 'None (all states)'}")


Data directory: raw_data
State filter: TEXAS


In [None]:
# Run the full pipeline over every CSV in raw_data_dir, keeping only flood
# events for the configured state (all states when state_filter is None).
flooding_events_df = process_all_csv_files(str(raw_data_dir), state=state_filter)


Found 25 CSV file(s) to process
Filtering for state: TEXAS
Processing StormEvents_details-ftp_v1.0_d2001_c20250520.csv...
  Found 44 flooding event(s)
Processing StormEvents_details-ftp_v1.0_d2002_c20250520.csv...
  No flooding events found
Processing StormEvents_details-ftp_v1.0_d2003_c20250520.csv...
  Found 45 flooding event(s)
Processing StormEvents_details-ftp_v1.0_d2004_c20250520.csv...
  Found 32 flooding event(s)
Processing StormEvents_details-ftp_v1.0_d2005_c20250520.csv...
  Found 374 flooding event(s)
Processing StormEvents_details-ftp_v1.0_d2006_c20250520.csv...
  Found 136 flooding event(s)
Processing StormEvents_details-ftp_v1.0_d2007_c20250520.csv...
  Found 1345 flooding event(s)
Processing StormEvents_details-ftp_v1.0_d2008_c20250520.csv...
  Found 243 flooding event(s)
Processing StormEvents_details-ftp_v1.0_d2009_c20250520.csv...
  Found 533 flooding event(s)
Processing StormEvents_details-ftp_v1.0_d2010_c20250520.csv...
  Found 416 flooding event(s)
Processing Storm

In [12]:
# Summarise the processed flood events: row count, columns, preview and dtypes.
if flooding_events_df.empty:
    print("No flooding events found.")
else:
    print(f"\nTotal rows: {len(flooding_events_df)}")
    print(f"\nColumns: {list(flooding_events_df.columns)}")

    print("\nFirst few rows:")
    display(flooding_events_df.head(10))

    print("\nDataFrame info:")
    flooding_events_df.info()



Total rows: 9340

Columns: ['YEAR', 'MONTH', 'BEGIN_DAY', 'BEGIN_TIME', 'BEGIN_LAT', 'BEGIN_LON', 'STATE', 'EVENT_TYPE', 'FLOOD_CAUSE', 'EVENT_NARRATIVE']

First few rows:


Unnamed: 0,YEAR,MONTH,BEGIN_DAY,BEGIN_TIME,BEGIN_LAT,BEGIN_LON,STATE,EVENT_TYPE,FLOOD_CAUSE,EVENT_NARRATIVE
0,2001,3,11,140,32.53333,-96.66667,TEXAS,Flash Flood,,Roads were closed near Sanger due to high water.
1,2001,5,4,15,33.73333,-102.78333,TEXAS,Flash Flood,,I-27 was flooded and subsequently closed due t...
2,2001,5,6,125,31.46667,-97.71667,TEXAS,Flash Flood,,Water was over Northwest Highway at Hillcrest ...
3,2001,5,6,128,33.08333,-97.13333,TEXAS,Flash Flood,,Water was over Highway 75 at Parkview.
4,2001,5,6,1700,33.36667,-97.68333,TEXAS,Flash Flood,,Flooding resulted in high water rescues and ma...
5,2001,5,6,45,33.75,-96.55,TEXAS,Flash Flood,,A bridge on County Road 720 was submerged near...
6,2001,5,5,2330,32.05,-97.2,TEXAS,Flash Flood,,A bridge on County Road 720 was submerged near...
7,2001,5,6,0,32.03333,-97.13333,TEXAS,Flash Flood,,Street flooding was reported north of Farm Mar...
8,2001,6,7,2000,29.36667,-95.08333,TEXAS,Flash Flood,,Flooding from the remnants of T.S. Allison. D...
9,2001,6,8,430,30.5,-95.33333,TEXAS,Flash Flood,,Flooding from the remnants of T.S. Allison. D...



DataFrame info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9340 entries, 0 to 9339
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   YEAR             9340 non-null   int64  
 1   MONTH            9340 non-null   int64  
 2   BEGIN_DAY        9340 non-null   int64  
 3   BEGIN_TIME       9340 non-null   int64  
 4   BEGIN_LAT        9340 non-null   float64
 5   BEGIN_LON        9340 non-null   float64
 6   STATE            9340 non-null   object 
 7   EVENT_TYPE       9340 non-null   object 
 8   FLOOD_CAUSE      8814 non-null   object 
 9   EVENT_NARRATIVE  9247 non-null   object 
dtypes: float64(2), int64(4), object(4)
memory usage: 729.8+ KB


## Augment Data

In [17]:
!pip3 install openmeteo-requests
!pip3 install requests-cache retry-requests



### Augmenting flooding data using the [Open Meteo API](https://open-meteo.com/en/docs/climate-api?utm_source=chatgpt.com&daily=temperature_2m_mean,wind_speed_10m_mean,cloud_cover_mean,relative_humidity_2m_mean,dew_point_2m_mean,precipitation_sum,rain_sum,snowfall_sum,pressure_msl_mean,soil_moisture_0_to_10cm_mean&start_date=2020-01-01&end_date=2020-01-01&models=EC_Earth3P_HR#settings)

Open-Meteo is an open-source API: you enter a latitude, longitude, and date, and it returns data on the weather conditions at that location on that date.

In [None]:
import openmeteo_requests
import requests_cache
from retry_requests import retry

# Daily variables requested from the API. The response is decoded by position
# (`daily.Variables(i)`), so this order must match the order of the request.
_DAILY_VARIABLES = (
    "temperature_2m_mean",
    "wind_speed_10m_mean",
    "cloud_cover_mean",
    "relative_humidity_2m_mean",
    "dew_point_2m_mean",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "pressure_msl_mean",
    "soil_moisture_0_to_10cm_mean",
)

# API client, created on first use and then shared by every later call.
_openmeteo_client = None


def _get_openmeteo_client():
    """Build the cached, retrying Open-Meteo client once and reuse it afterwards."""
    global _openmeteo_client
    if _openmeteo_client is None:
        cache_session = requests_cache.CachedSession('.cache', expire_after = 3600)
        retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
        _openmeteo_client = openmeteo_requests.Client(session = retry_session)
    return _openmeteo_client


def get_weather_data(lat: float, lon: float, date: str) -> pd.DataFrame:
    """Fetch daily values for one location and one date from the Open-Meteo Climate API.

    Args:
        lat: Latitude of the location.
        lon: Longitude of the location.
        date: Day to query, used as both ``start_date`` and ``end_date``
            (callers in this notebook pass ``YYYY-MM-DD``).

    Returns:
        A DataFrame with a ``date`` column, one column per entry of
        ``_DAILY_VARIABLES`` and an ``elevation`` column. Since a single day
        is requested, one row is expected.

    Raises:
        Whatever the underlying client raises when the request fails after
        its retries; nothing is caught here.

    NOTE(review): this endpoint is queried with the ``EC_Earth3P_HR`` model,
    which looks like climate-model output rather than observed weather;
    confirm that is what the analysis intends.
    """
    url = "https://climate-api.open-meteo.com/v1/climate"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": date,
        "end_date": date,
        "models": "EC_Earth3P_HR",
        "daily": list(_DAILY_VARIABLES),
    }
    responses = _get_openmeteo_client().weather_api(url, params=params)
    response = responses[0]

    daily = response.Daily()

    daily_data = {"date": date}
    for position, variable in enumerate(_DAILY_VARIABLES):
        daily_data[variable] = daily.Variables(position).ValuesAsNumpy()
    daily_data["elevation"] = response.Elevation()

    return pd.DataFrame(data = daily_data)

In [None]:
# For every flooding event, augment the row with the weather data
# Add the weather data to the flooding data
# Save the augmented data to a new csv file

import time
from datetime import datetime

def augment_flooding_data_with_weather(flooding_df: pd.DataFrame) -> pd.DataFrame:
    """
    Augment flooding events DataFrame with weather data from Open Meteo API.

    Each event's date is built from its YEAR, MONTH and BEGIN_DAY columns and
    looked up at its BEGIN_LAT / BEGIN_LON position. Every input row produces
    exactly one output row: when the lookup fails the error is printed and
    the event is kept without weather columns.

    Args:
        flooding_df: DataFrame with flooding events. Any index is accepted;
            index labels are only used in error messages.

    Returns:
        DataFrame with weather data added to each flooding event
    """
    augmented_rows = []
    total_rows = len(flooding_df)

    print(f"Augmenting {total_rows} flooding events with weather data...")
    print("This may take a while due to API rate limits...\n")

    # Count progress by position, not by index label, so the counter is right
    # for filtered or otherwise non-consecutive indexes as well.
    for position, (idx, row) in enumerate(flooding_df.iterrows(), start=1):
        event = row.to_dict()

        # Only the lookup is guarded, so a failure anywhere else can never
        # cause the same event to be appended twice.
        try:
            # Format date as YYYY-MM-DD
            date_str = f"{int(row['YEAR'])}-{int(row['MONTH']):02d}-{int(row['BEGIN_DAY']):02d}"

            weather_df = get_weather_data(
                lat=float(row['BEGIN_LAT']),
                lon=float(row['BEGIN_LON']),
                date=date_str
            )

            if not weather_df.empty:
                # A single day is requested, so only the first row is used.
                weather_row = weather_df.iloc[0]
                # 'date' is skipped: the event already carries its own date columns.
                event.update({
                    col: weather_row[col]
                    for col in weather_df.columns
                    if col != 'date'
                })
        except Exception as e:
            print(f"Error processing row {idx}: {e}")
        else:
            # Small delay to respect API rate limits
            time.sleep(0.2)

        augmented_rows.append(event)

        # Progress indicator
        if position % 50 == 0:
            print(f"Processed {position}/{total_rows} events...")

    # Create DataFrame from augmented rows
    augmented_df = pd.DataFrame(augmented_rows)

    print(f"\nAugmentation complete! Processed {len(augmented_df)} events.")
    return augmented_df

# Look up weather data for every flood event (one API request per row).
augmented_flooding_df = augment_flooding_data_with_weather(flooding_events_df)

# Report what the augmentation added.
added_columns = set(augmented_flooding_df.columns).difference(flooding_events_df.columns)
print(f"\nAugmented DataFrame shape: {augmented_flooding_df.shape}")
print(f"\nNew columns added: {added_columns}")
print("\nFirst few rows of augmented data:")
display(augmented_flooding_df.head())

# Persist the result, without the index column.
output_path = "flooding_events_augmented.csv"
augmented_flooding_df.to_csv(output_path, index=False)
print(f"\nAugmented data saved to: {output_path}")


Augmenting 9340 flooding events with weather data...
This may take a while due to API rate limits...

Processed 50/9340 events...


KeyboardInterrupt: 