In [20]:
# libraries
import polars as pl
import urllib.request
import os
from pathlib import Path
from datetime import datetime, timedelta

In [21]:
# current working directory
notebook_dir = Path.cwd()

# go up one level (from notebooks/ → project root), then to data/external
file_path = notebook_dir.parent / "data" / "external" / "FPA_FOD_Plus.csv"

print("File path:", file_path)

df = pl.scan_csv(
    file_path,
    infer_schema_length=10000,  # <- increase this for better type inference
    ignore_errors=True
)
df_full = df.collect()

print(df_full.shape)


File path: /Users/nedim/Desktop/MY_REPOS/mids-w207-section1-team1-finalproject/data/external/FPA_FOD_Plus.csv
(2302521, 308)


**Basic Info about the dataset**

In [None]:
# shape of the dataset
print("Rows:", df_full.height, "Columns:", df_full.width)

Rows: 2302521 Columns: 308


In [None]:
# preview data
print(df.head(5))

naive plan: (run LazyFrame.explain(optimized=True) to see the optimized plan)

SLICE[offset: 0, len: 5]
  Csv SCAN [/Users/nedim/Desktop/MY_REPOS/mids-w207-section1-team1-finalproject/data/external/FPA_FOD_Plus.csv]
  PROJECT */308 COLUMNS


In [24]:
missing_summary = (
    df_full.select([
        pl.col(col).is_null().sum().alias(col)
        for col in df_full.columns
    ])
)

print("Missing values per column:")
print(missing_summary.transpose(include_header=True))

Missing values per column:
shape: (308, 2)
┌───────────────────────┬──────────┐
│ column                ┆ column_0 │
│ ---                   ┆ ---      │
│ str                   ┆ u32      │
╞═══════════════════════╪══════════╡
│ FOD_ID                ┆ 0        │
│ FPA_ID                ┆ 0        │
│ SOURCE_SYSTEM_TYPE    ┆ 0        │
│ SOURCE_SYSTEM         ┆ 0        │
│ NWCG_REPORTING_AGENCY ┆ 0        │
│ …                     ┆ …        │
│ CheatGrass            ┆ 1921616  │
│ ExoticAnnualGrass     ┆ 1921616  │
│ Medusahead            ┆ 1921616  │
│ PoaSecunda            ┆ 1921616  │
│ geometry              ┆ 1730504  │
└───────────────────────┴──────────┘


In [25]:
print("Column dtypes:")
print(df_full.schema)

Column dtypes:
Schema([('FOD_ID', Int64), ('FPA_ID', String), ('SOURCE_SYSTEM_TYPE', String), ('SOURCE_SYSTEM', String), ('NWCG_REPORTING_AGENCY', String), ('NWCG_REPORTING_UNIT_ID', String), ('NWCG_REPORTING_UNIT_NAME', String), ('SOURCE_REPORTING_UNIT', String), ('SOURCE_REPORTING_UNIT_NAME', String), ('LOCAL_FIRE_REPORT_ID', Float64), ('LOCAL_INCIDENT_ID', String), ('FIRE_CODE', String), ('FIRE_NAME', String), ('ICS_209_PLUS_INCIDENT_JOIN_ID', String), ('ICS_209_PLUS_COMPLEX_JOIN_ID', String), ('MTBS_ID', String), ('MTBS_FIRE_NAME', String), ('COMPLEX_NAME', String), ('FIRE_YEAR', Int64), ('DISCOVERY_DATE', String), ('DISCOVERY_DOY', Int64), ('DISCOVERY_TIME', Float64), ('NWCG_CAUSE_CLASSIFICATION', String), ('NWCG_GENERAL_CAUSE', String), ('NWCG_CAUSE_AGE_CATEGORY', String), ('CONT_DATE', String), ('CONT_DOY', Float64), ('CONT_TIME', Float64), ('FIRE_SIZE', Float64), ('FIRE_SIZE_CLASS', String), ('LATITUDE', Float64), ('LONGITUDE', Float64), ('OWNER_DESCR', String), ('STATE', Strin

In [26]:
print("Summary stats:")
print(df_full.describe())

Summary stats:
shape: (9, 309)
┌───────────┬───────────┬───────────┬───────────┬───┬───────────┬───────────┬───────────┬──────────┐
│ statistic ┆ FOD_ID    ┆ FPA_ID    ┆ SOURCE_SY ┆ … ┆ ExoticAnn ┆ Medusahea ┆ PoaSecund ┆ geometry │
│ ---       ┆ ---       ┆ ---       ┆ STEM_TYPE ┆   ┆ ualGrass  ┆ d         ┆ a         ┆ ---      │
│ str       ┆ f64       ┆ str       ┆ ---       ┆   ┆ ---       ┆ ---       ┆ ---       ┆ str      │
│           ┆           ┆           ┆ str       ┆   ┆ str       ┆ str       ┆ str       ┆          │
╞═══════════╪═══════════╪═══════════╪═══════════╪═══╪═══════════╪═══════════╪═══════════╪══════════╡
│ count     ┆ 2.302521e ┆ 2302521   ┆ 2302521   ┆ … ┆ 380905    ┆ 380905    ┆ 380905    ┆ 572017   │
│           ┆ 6         ┆           ┆           ┆   ┆           ┆           ┆           ┆          │
│ null_coun ┆ 0.0       ┆ 0         ┆ 0         ┆ … ┆ 1921616   ┆ 1921616   ┆ 1921616   ┆ 1730504  │
│ t         ┆           ┆           ┆           ┆   ┆       

In [27]:
print(df_full.select(pl.col("STATE").unique().sort()))

shape: (52, 1)
┌───────┐
│ STATE │
│ ---   │
│ str   │
╞═══════╡
│ AK    │
│ AL    │
│ AR    │
│ AZ    │
│ CA    │
│ …     │
│ VT    │
│ WA    │
│ WI    │
│ WV    │
│ WY    │
└───────┘


In [28]:
# checking for any dupe fires
dupes = df_full.select(pl.col("FOD_ID")).is_duplicated().sum()
print("Duplicate FOD_IDs:", dupes)

Duplicate FOD_IDs: 128


In [29]:
df_full.columns

['FOD_ID',
 'FPA_ID',
 'SOURCE_SYSTEM_TYPE',
 'SOURCE_SYSTEM',
 'NWCG_REPORTING_AGENCY',
 'NWCG_REPORTING_UNIT_ID',
 'NWCG_REPORTING_UNIT_NAME',
 'SOURCE_REPORTING_UNIT',
 'SOURCE_REPORTING_UNIT_NAME',
 'LOCAL_FIRE_REPORT_ID',
 'LOCAL_INCIDENT_ID',
 'FIRE_CODE',
 'FIRE_NAME',
 'ICS_209_PLUS_INCIDENT_JOIN_ID',
 'ICS_209_PLUS_COMPLEX_JOIN_ID',
 'MTBS_ID',
 'MTBS_FIRE_NAME',
 'COMPLEX_NAME',
 'FIRE_YEAR',
 'DISCOVERY_DATE',
 'DISCOVERY_DOY',
 'DISCOVERY_TIME',
 'NWCG_CAUSE_CLASSIFICATION',
 'NWCG_GENERAL_CAUSE',
 'NWCG_CAUSE_AGE_CATEGORY',
 'CONT_DATE',
 'CONT_DOY',
 'CONT_TIME',
 'FIRE_SIZE',
 'FIRE_SIZE_CLASS',
 'LATITUDE',
 'LONGITUDE',
 'OWNER_DESCR',
 'STATE',
 'COUNTY',
 'FIPS_CODE',
 'FIPS_NAME',
 'LatLong_State',
 'LatLong_County',
 'NPL',
 'Mang_Type',
 'Mang_Name',
 'Des_Tp',
 'GAP_Sts',
 'GAP_Prity',
 'EVH',
 'EVT',
 'EVH_1km',
 'EVT_1km',
 'EVC',
 'EVC_1km',
 'NAME',
 'MOD_NDVI_12m',
 'MOD_EVI_12m',
 'Land_Cover',
 'Land_Cover_1km',
 'rpms',
 'rpms_1km',
 'Population',
 'Popo_

In [30]:
date_range = df_full.select(
    pl.col("DISCOVERY_DATE").min().alias("DISCOVERY_DATE_min"),
    pl.col("DISCOVERY_DATE").max().alias("DISCOVERY_DATE_max")
)

print(date_range)


shape: (1, 2)
┌────────────────────┬────────────────────┐
│ DISCOVERY_DATE_min ┆ DISCOVERY_DATE_max │
│ ---                ┆ ---                │
│ str                ┆ str                │
╞════════════════════╪════════════════════╡
│ 1992-01-01         ┆ 2020-12-31         │
└────────────────────┴────────────────────┘


In [None]:
# convert to datetime format for future analysis
# only using df from this point on
df = df_full.with_columns(
    pl.col("DISCOVERY_DATE").str.strptime(pl.Date, "%Y-%m-%d").alias("DISCOVERY_DATE")
)

print(df.select("DISCOVERY_DATE").head())

shape: (5, 1)
┌────────────────┐
│ DISCOVERY_DATE │
│ ---            │
│ date           │
╞════════════════╡
│ 2007-01-01     │
│ 2007-01-01     │
│ 2007-01-01     │
│ 2007-01-01     │
│ 2007-01-01     │
└────────────────┘


In [37]:
# count how many missing values in DISCOVERY_DATE
missing_count = df.select(
    pl.col("DISCOVERY_DATE").is_null().sum().alias("missing_dates")
)

print(missing_count)

shape: (1, 1)
┌───────────────┐
│ missing_dates │
│ ---           │
│ u32           │
╞═══════════════╡
│ 0             │
└───────────────┘
