In [11]:
# Project: Storm Events Analysis (NOAA)
# Notebook: 00_data_download.ipynb
# Goal: Determine data shapes and column IDs, plan for combining data, implement data compiling.
# Author: Brice Nelson
# Date: 2025-09-05

from pathlib import Path
import re
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns

# --- Display & Repro ---
plt.rcParams["figure.dpi"] = 130
pd.set_option("display.max_rows", 25)
pd.set_option("display.max_columns", 120)
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# --- Paths ---
PROJECT_ROOT = Path.cwd().parent if Path.cwd().name == "notebooks" else Path.cwd()
DATA_RAW = PROJECT_ROOT / "data" / "raw"
DATA_INTERIM = PROJECT_ROOT / "data" / "interim"
DATA_PROCESSED = PROJECT_ROOT / "data" / "processed"
REPORT_FIGS = PROJECT_ROOT / "reports" / "figures"
REPORT_TABLES = PROJECT_ROOT / "reports" / "tables"

for p in [
    DATA_INTERIM / "details",
    DATA_INTERIM / "locations",
    DATA_INTERIM / "fatalities",
    REPORT_FIGS,
    REPORT_TABLES,
    DATA_INTERIM / "_logs",
]:
    p.mkdir(parents=True, exist_ok=True)

# --- Notebook knobs (tweak here) ---
MIN_YEAR = 1996                    # only process files from this year onward
INCLUDE_YEARS = None               # e.g., set to range(1996, 2021) or a set like {2018, 2019, 2020}; None = auto
ACCEPT_EXTENSIONS = ("*.csv", "*.csv.gz")
DRY_RUN = False                    # True = preview without writing to data/interim
SAVE_INTERIM_AS = "csv"            # "csv" (default) or "parquet" (later, if desired)

# --- Filename/type detection (no functions here) ---
# Handles names like: StormEvents_details-ftp_v1.0_d2020_c20210108.csv(.gz)
YEAR_DIR_RE = re.compile(r"^(?:19|20)\d{2}$")  # folder named 1996, 2020, etc.
YEAR_IN_NAME_D_PATTERN = re.compile(r"d((?:19|20)\d{2})", re.I)  # ..._d2020_...
YEAR_IN_NAME_ANY_PATTERN = re.compile(r"(?:19|20)\d{2}")

TYPE_PATTERNS = {
    "details":    re.compile(r"(?:^|[_\W])detail[s]?(?:[_\W]|$)", re.I),
    "locations":  re.compile(r"(?:^|[_\W])loc(?:ation|ations)?(?:[_\W]|$)", re.I),
    "fatalities": re.compile(r"(?:^|[_\W])fatal(?:ity|ities)?(?:[_\W]|$)", re.I),
}

# If your files use different tokens, add them here (e.g., {"details": r"\bDET\b"})
EXTRA_TYPE_TOKENS = {
    # "details": re.compile(r"\bDET\b", re.I),
    # "locations": re.compile(r"\bLOC\b", re.I),
    # "fatalities": re.compile(r"\bFAT\b", re.I),
}

# --- Column normalization hints (used in later cells) ---
LOWERCASE_COLS = True
EVENT_ID_CANDIDATES = (
    "event_id", "event id", "eventid", "EVENT_ID", "Event_ID", "Event Id"
)

# Pandas read options (you can tweak for speed/memory)
READ_KWARGS = dict(low_memory=False)  # compression will be inferred for .csv.gz

# Output mapping (used by later cells)
OUTPUT_SUBDIRS = {
    "details": DATA_INTERIM / "details",
    "locations": DATA_INTERIM / "locations",
    "fatalities": DATA_INTERIM / "fatalities",
}

# Small metadata dict for logging
RUN_META = {
    "min_year": MIN_YEAR,
    "include_years": (list(INCLUDE_YEARS) if INCLUDE_YEARS is not None else None),
    "accept_extensions": ACCEPT_EXTENSIONS,
    "dry_run": DRY_RUN,
    "save_interim_as": SAVE_INTERIM_AS,
    "project_root": str(PROJECT_ROOT),
}
RUN_META


{'min_year': 1996,
 'include_years': None,
 'accept_extensions': ('*.csv', '*.csv.gz'),
 'dry_run': False,
 'save_interim_as': 'csv',
 'project_root': '/home/bnelson_regex/projects/machine_learning_projects/weather_storm_events_predict'}

## Determine shape of files and information in files for planning of data handling

In [2]:
storm_fatalities_2020 = pd.read_csv("../data/raw/archive/StormEvents_fatalities-ftp_v1.0_d2020_c20201216.csv")
print(f'Shape: {storm_fatalities_2020.shape}')
print(f'info: {storm_fatalities_2020.info()}')

Shape: (471, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   FAT_YEARMONTH      471 non-null    int64  
 1   FAT_DAY            471 non-null    int64  
 2   FAT_TIME           471 non-null    int64  
 3   FATALITY_ID        471 non-null    int64  
 4   EVENT_ID           471 non-null    int64  
 5   FATALITY_TYPE      471 non-null    object 
 6   FATALITY_DATE      471 non-null    object 
 7   FATALITY_AGE       400 non-null    float64
 8   FATALITY_SEX       438 non-null    object 
 9   FATALITY_LOCATION  471 non-null    object 
 10  EVENT_YEARMONTH    471 non-null    int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 40.6+ KB
info: None


In [3]:
storm_details_2020 = pd.read_csv("../data/raw/archive/StormEvents_details-ftp_v1.0_d2020_c20201216.csv")
print(f'Shape: {storm_details_2020.shape}')
print(f'info: {storm_details_2020.info()}')

Shape: (50317, 51)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50317 entries, 0 to 50316
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   BEGIN_YEARMONTH     50317 non-null  int64  
 1   BEGIN_DAY           50317 non-null  int64  
 2   BEGIN_TIME          50317 non-null  int64  
 3   END_YEARMONTH       50317 non-null  int64  
 4   END_DAY             50317 non-null  int64  
 5   END_TIME            50317 non-null  int64  
 6   EPISODE_ID          50317 non-null  int64  
 7   EVENT_ID            50317 non-null  int64  
 8   STATE               50317 non-null  object 
 9   STATE_FIPS          50317 non-null  int64  
 10  YEAR                50317 non-null  int64  
 11  MONTH_NAME          50317 non-null  object 
 12  EVENT_TYPE          50317 non-null  object 
 13  CZ_TYPE             50317 non-null  object 
 14  CZ_FIPS             50317 non-null  int64  
 15  CZ_NAME             50317 non-null

In [4]:
storm_locations_2020 = pd.read_csv("../data/raw/archive/StormEvents_locations-ftp_v1.0_d2020_c20201216.csv")
print(f'Shape: {storm_locations_2020.shape}')
print(f'info: {storm_locations_2020.info()}')

Shape: (48968, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48968 entries, 0 to 48967
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   YEARMONTH       48968 non-null  int64  
 1   EPISODE_ID      48968 non-null  int64  
 2   EVENT_ID        48968 non-null  int64  
 3   LOCATION_INDEX  48968 non-null  int64  
 4   RANGE           48968 non-null  float64
 5   AZIMUTH         48968 non-null  object 
 6   LOCATION        48968 non-null  object 
 7   LATITUDE        48968 non-null  float64
 8   LONGITUDE       48968 non-null  float64
 9   LAT2            48968 non-null  int64  
 10  LON2            48968 non-null  int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 4.1+ MB
info: None


In [12]:
def infer_year_from_path(fp: Path) -> int | None:
    # Prefer parent folder like data/raw/2020/...
    parent = fp.parent.name
    if YEAR_DIR_RE.fullmatch(parent):
        return int(parent)
    # Fallbacks: NOAA pattern like ..._d2020_..., then any 4-digit year
    m = YEAR_IN_NAME_D_PATTERN.search(fp.name)
    if m:
        return int(m.group(1))
    m = YEAR_IN_NAME_ANY_PATTERN.search(fp.name)
    return int(m.group(0)) if m else None

def infer_type_from_name(name: str) -> str | None:
    n = name.lower()
    for t, pat in TYPE_PATTERNS.items():
        if pat.search(n):
            return t
    for t, pat in EXTRA_TYPE_TOKENS.items():
        if pat.search(n):
            return t
    # simple substring fallbacks
    if "detail" in n: return "details"
    if "fatal" in n:  return "fatalities"
    if "location" in n or "locat" in n: return "locations"
    return None

def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df.columns = [c.strip().lower() for c in df.columns]
    # normalize event_id (covers 'event id', 'eventid', etc.)
    if "event_id" not in df.columns:
        for c in df.columns:
            if re.sub(r"[\s_]+", "", c) == "eventid":
                df.rename(columns={c: "event_id"}, inplace=True)
                break
    if "event_id" in df.columns:
        df["event_id"] = pd.to_numeric(df["event_id"], errors="coerce").astype("Int64")
    return df

# ---- Main ingest: walk all CSV/CSV.GZ under data/raw/<year>/* ----
index_rows = []
min_year = globals().get("MIN_YEAR", 1996)
include_years = globals().get("INCLUDE_YEARS", None)
accept_exts = globals().get("ACCEPT_EXTENSIONS", ("*.csv", "*.csv.gz"))
dry_run = bool(globals().get("DRY_RUN", False))
save_as = str(globals().get("SAVE_INTERIM_AS", "csv")).lower()
read_kwargs = globals().get("READ_KWARGS", dict(low_memory=False))

# Collect files by extension(s)
files = []
for pat in accept_exts:
    files.extend(DATA_RAW.rglob(pat))
files = sorted(files)

for fp in files:
    year = infer_year_from_path(fp)
    ftype = infer_type_from_name(fp.name)

    # Year/type filters
    if year is None or year < min_year or ftype is None:
        print(f"[skip] {fp}")
        continue
    if include_years is not None and year not in include_years:
        print(f"[skip: not in INCLUDE_YEARS] {fp}")
        continue

    try:
        df = pd.read_csv(fp, **read_kwargs)  # compression inferred for .gz
    except Exception as e:
        print(f"[ERROR] reading {fp}: {e}")
        continue

    df = standardize_columns(df)
    df["year"] = year
    df["source_filename"] = fp.name

    out_dir = OUTPUT_SUBDIRS.get(ftype, DATA_INTERIM / ftype)
    out_dir.mkdir(parents=True, exist_ok=True)

    if dry_run:
        print(f"[dry-run] Would save ({len(df)} rows) -> {out_dir / f'{ftype}_{year}.{save_as}'}")
    else:
        if save_as == "parquet":
            try:
                df.to_parquet(out_dir / f"{ftype}_{year}.parquet", index=False)
            except Exception as e:
                print(f"[WARN] Parquet failed ({e}); falling back to CSV for {fp.name}")
                df.to_csv(out_dir / f"{ftype}_{year}.csv", index=False)
                save_ext = "csv"
            else:
                save_ext = "parquet"
        else:
            df.to_csv(out_dir / f"{ftype}_{year}.csv", index=False)
            save_ext = "csv"

        index_rows.append({
            "year": year,
            "type": ftype,
            "rows": int(len(df)),
            "saved_as": save_ext,
            "saved_path": str(out_dir / f"{ftype}_{year}.{save_ext}")
        })

# Quick summary + persistent log
summary = pd.DataFrame(index_rows).sort_values(["type", "year"]) if index_rows else pd.DataFrame()
display(summary)
if not summary.empty:
    log_dir = DATA_INTERIM / "_logs"
    log_dir.mkdir(exist_ok=True)
    summary.to_csv(log_dir / "ingest_summary.csv", index=False)

[skip] /home/bnelson_regex/projects/machine_learning_projects/weather_storm_events_predict/data/raw/archive/StormEvents_details-ftp_v1.0_d1950_c20170120.csv
[skip] /home/bnelson_regex/projects/machine_learning_projects/weather_storm_events_predict/data/raw/archive/StormEvents_details-ftp_v1.0_d1951_c20160223.csv
[skip] /home/bnelson_regex/projects/machine_learning_projects/weather_storm_events_predict/data/raw/archive/StormEvents_details-ftp_v1.0_d1952_c20170619.csv
[skip] /home/bnelson_regex/projects/machine_learning_projects/weather_storm_events_predict/data/raw/archive/StormEvents_details-ftp_v1.0_d1953_c20160223.csv
[skip] /home/bnelson_regex/projects/machine_learning_projects/weather_storm_events_predict/data/raw/archive/StormEvents_details-ftp_v1.0_d1954_c20160223.csv
[skip] /home/bnelson_regex/projects/machine_learning_projects/weather_storm_events_predict/data/raw/archive/StormEvents_details-ftp_v1.0_d1955_c20160223.csv
[skip] /home/bnelson_regex/projects/machine_learning_proje

Unnamed: 0,year,type,rows,saved_as,saved_path
0,1996,details,48561,csv,/home/bnelson_regex/projects/machine_learning_...
139,1996,details,48561,csv,/home/bnelson_regex/projects/machine_learning_...
3,1997,details,41991,csv,/home/bnelson_regex/projects/machine_learning_...
140,1997,details,41991,csv,/home/bnelson_regex/projects/machine_learning_...
6,1998,details,50973,csv,/home/bnelson_regex/projects/machine_learning_...
...,...,...,...,...,...
136,2018,locations,47529,csv,/home/bnelson_regex/projects/machine_learning_...
71,2019,locations,52677,csv,/home/bnelson_regex/projects/machine_learning_...
137,2019,locations,52677,csv,/home/bnelson_regex/projects/machine_learning_...
74,2020,locations,48968,csv,/home/bnelson_regex/projects/machine_learning_...
