In [1]:
# Project: Storm Events Analysis (NOAA)
# Notebook: 00_data_download.ipynb
# Goal: Determine data shapes and column IDs to plan downstream data handling
# Author: Brice Nelson
# Date: 2025-09-05

from pathlib import Path
import numpy as np, pandas as pd, matplotlib.pyplot as plt, seaborn as sns
# Reproducibility: seed NumPy's global RNG once, up front.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# Figure resolution used for plots.
plt.rcParams["figure.dpi"] = 130

# Paths: when the working directory is a folder named "notebooks", the project
# root is one level up; otherwise the working directory itself is the root.
PROJECT_ROOT = Path.cwd()
if PROJECT_ROOT.name == "notebooks":
    PROJECT_ROOT = PROJECT_ROOT.parent
DATA_RAW = PROJECT_ROOT.joinpath("data", "raw")
DATA_INTERIM = PROJECT_ROOT.joinpath("data", "interim")
DATA_PROCESSED = PROJECT_ROOT.joinpath("data", "processed")
REPORT_FIGS = PROJECT_ROOT.joinpath("reports", "figures")
# Only the figures folder is created here; the data folders are just named.
REPORT_FIGS.mkdir(parents=True, exist_ok=True)

## Inspect the shape and column info of each file to plan data handling

In [2]:
# Load the 2020 Storm Events fatalities CSV and report its dimensions.
storm_fatalities_2020 = pd.read_csv("../data/raw/archive/StormEvents_fatalities-ftp_v1.0_d2020_c20201216.csv")
print(f'Shape: {storm_fatalities_2020.shape}')
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; interpolating it into an f-string printed a stray "info: None".
storm_fatalities_2020.info()

Shape: (471, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 471 entries, 0 to 470
Data columns (total 11 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   FAT_YEARMONTH      471 non-null    int64  
 1   FAT_DAY            471 non-null    int64  
 2   FAT_TIME           471 non-null    int64  
 3   FATALITY_ID        471 non-null    int64  
 4   EVENT_ID           471 non-null    int64  
 5   FATALITY_TYPE      471 non-null    object 
 6   FATALITY_DATE      471 non-null    object 
 7   FATALITY_AGE       400 non-null    float64
 8   FATALITY_SEX       438 non-null    object 
 9   FATALITY_LOCATION  471 non-null    object 
 10  EVENT_YEARMONTH    471 non-null    int64  
dtypes: float64(1), int64(6), object(4)
memory usage: 40.6+ KB
info: None


In [3]:
# Load the 2020 Storm Events details CSV and report its dimensions.
storm_details_2020 = pd.read_csv("../data/raw/archive/StormEvents_details-ftp_v1.0_d2020_c20201216.csv")
print(f'Shape: {storm_details_2020.shape}')
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; interpolating it into an f-string printed a stray "info: None".
storm_details_2020.info()

Shape: (50317, 51)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50317 entries, 0 to 50316
Data columns (total 51 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   BEGIN_YEARMONTH     50317 non-null  int64  
 1   BEGIN_DAY           50317 non-null  int64  
 2   BEGIN_TIME          50317 non-null  int64  
 3   END_YEARMONTH       50317 non-null  int64  
 4   END_DAY             50317 non-null  int64  
 5   END_TIME            50317 non-null  int64  
 6   EPISODE_ID          50317 non-null  int64  
 7   EVENT_ID            50317 non-null  int64  
 8   STATE               50317 non-null  object 
 9   STATE_FIPS          50317 non-null  int64  
 10  YEAR                50317 non-null  int64  
 11  MONTH_NAME          50317 non-null  object 
 12  EVENT_TYPE          50317 non-null  object 
 13  CZ_TYPE             50317 non-null  object 
 14  CZ_FIPS             50317 non-null  int64  
 15  CZ_NAME             50317 non-null

In [4]:
# Load the 2020 Storm Events locations CSV and report its dimensions.
storm_locations_2020 = pd.read_csv("../data/raw/archive/StormEvents_locations-ftp_v1.0_d2020_c20201216.csv")
print(f'Shape: {storm_locations_2020.shape}')
# DataFrame.info() writes its summary to stdout and returns None, so it is
# called directly; interpolating it into an f-string printed a stray "info: None".
storm_locations_2020.info()

Shape: (48968, 11)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48968 entries, 0 to 48967
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   YEARMONTH       48968 non-null  int64  
 1   EPISODE_ID      48968 non-null  int64  
 2   EVENT_ID        48968 non-null  int64  
 3   LOCATION_INDEX  48968 non-null  int64  
 4   RANGE           48968 non-null  float64
 5   AZIMUTH         48968 non-null  object 
 6   LOCATION        48968 non-null  object 
 7   LATITUDE        48968 non-null  float64
 8   LONGITUDE       48968 non-null  float64
 9   LAT2            48968 non-null  int64  
 10  LON2            48968 non-null  int64  
dtypes: float64(3), int64(6), object(2)
memory usage: 4.1+ MB
info: None
