In [6]:
import pandas as pd

# File paths for input CSV files and output CSV.
# UNSTRUCTURED_CSV and STRUCTURED_CSV are the two inputs read with
# pd.read_csv; MERGED_CSV is the path the merged DataFrame is written to.
# All three are relative paths, so they resolve against the current
# working directory.
UNSTRUCTURED_CSV = "cleaned_unstructured_data.csv"
STRUCTURED_CSV = "structured.csv"
MERGED_CSV = "new_merged_wildfire_data_new.csv"

# Step 1: Load the unstructured data
# Columns required from the unstructured file; anything else in the CSV is
# skipped at parse time so unused columns never occupy memory.
unstructured_cols = ["latitude", "longitude", "acq_date", "frp", "bright_ti4", "confidence", "daynight"]
try:
    unstructured_df = pd.read_csv(
        UNSTRUCTURED_CSV,
        # A callable (rather than a list) quietly ignores absent columns, so the
        # explicit missing-column report below still runs instead of a ValueError.
        usecols=lambda name: name in unstructured_cols,
        low_memory=False,
    )
except FileNotFoundError:
    print(f"Error: File '{UNSTRUCTURED_CSV}' not found. Please check the file path.")
    # exit() is an interactive helper: it reports success (status 0) in a script
    # and does not halt the running cell in a Jupyter kernel. SystemExit stops
    # execution here with a failing status in both environments.
    raise SystemExit(1) from None

# Report every absent column at once rather than failing on the first one.
missing_cols = [col for col in unstructured_cols if col not in unstructured_df.columns]
if missing_cols:
    print(f"Error: Missing required columns in unstructured data: {missing_cols}")
    raise SystemExit(1)
# Re-select to put the columns in the documented order.
unstructured_df = unstructured_df[unstructured_cols]
print("Unstructured data selected columns:", unstructured_df.columns.tolist())
print(f"Unstructured data - Rows: {len(unstructured_df)}")

# Round latitude and longitude to 2 decimal places to allow for approximate matches
unstructured_df["latitude"] = unstructured_df["latitude"].round(2)
unstructured_df["longitude"] = unstructured_df["longitude"].round(2)

# Convert acq_date to datetime and extract year_month
try:
    unstructured_df["acq_date"] = pd.to_datetime(unstructured_df["acq_date"])
except (ValueError, TypeError, OverflowError) as e:
    # Only parsing failures are treated as a date-format problem; unrelated
    # errors (e.g. MemoryError) propagate with their own traceback.
    print(f"Error converting 'acq_date' to datetime: {e}")
    print("Please check the date format in 'acq_date'. Expected format: YYYY-MM-DD")
    raise SystemExit(1) from None
# year_month is a "YYYY-MM" string used later as the temporal join key.
unstructured_df["year_month"] = unstructured_df["acq_date"].dt.to_period("M").astype(str)

# Debug: Check the ranges of latitude, longitude, and year_month
print("\nUnstructured data - Latitude range:", unstructured_df["latitude"].min(), "to", unstructured_df["latitude"].max())
print("Unstructured data - Longitude range:", unstructured_df["longitude"].min(), "to", unstructured_df["longitude"].max())
print("Unstructured data - Year-Month range:", unstructured_df["year_month"].min(), "to", unstructured_df["year_month"].max())
print("Unstructured data - Sample latitude/longitude pairs (first 5):")
print(unstructured_df[["latitude", "longitude"]].head())

# Step 2: Load the structured data
# Columns required from the structured file; other columns are skipped at
# parse time.
structured_cols = ["latitude", "longitude", "disc_clean_date", "stat_cause_descr", "state"]
try:
    structured_df = pd.read_csv(
        STRUCTURED_CSV,
        # A callable quietly ignores absent columns, so the explicit
        # missing-column report below still runs instead of a ValueError.
        usecols=lambda name: name in structured_cols,
    )
except FileNotFoundError:
    print(f"Error: File '{STRUCTURED_CSV}' not found. Please check the file path.")
    # exit() is an interactive helper: it reports success (status 0) in a script
    # and does not halt the running cell in a Jupyter kernel. SystemExit stops
    # execution here with a failing status in both environments.
    raise SystemExit(1) from None

# Report every absent column at once rather than failing on the first one.
missing_cols = [col for col in structured_cols if col not in structured_df.columns]
if missing_cols:
    print(f"Error: Missing required columns in structured data: {missing_cols}")
    raise SystemExit(1)
# Re-select to put the columns in the documented order.
structured_df = structured_df[structured_cols]
print("\nStructured data selected columns:", structured_df.columns.tolist())
print(f"Structured data - Rows: {len(structured_df)}")

# Round latitude and longitude to 2 decimal places
structured_df["latitude"] = structured_df["latitude"].round(2)
structured_df["longitude"] = structured_df["longitude"].round(2)

# Convert disc_clean_date to datetime and extract year_month
try:
    structured_df["disc_clean_date"] = pd.to_datetime(structured_df["disc_clean_date"])
except (ValueError, TypeError, OverflowError) as e:
    # Only parsing failures are treated as a date-format problem; unrelated
    # errors propagate with their own traceback.
    print(f"Error converting 'disc_clean_date' to datetime: {e}")
    print("Please check the date format in 'disc_clean_date'. Expected format: YYYY-MM-DD")
    raise SystemExit(1) from None
# year_month is a "YYYY-MM" string used later as the temporal join key.
structured_df["year_month"] = structured_df["disc_clean_date"].dt.to_period("M").astype(str)

# Debug: Check the ranges of latitude, longitude, and year_month
print("\nStructured data - Latitude range:", structured_df["latitude"].min(), "to", structured_df["latitude"].max())
print("Structured data - Longitude range:", structured_df["longitude"].min(), "to", structured_df["longitude"].max())
print("Structured data - Year-Month range:", structured_df["year_month"].min(), "to", structured_df["year_month"].max())
print("Structured data - Sample latitude/longitude pairs (first 5):")
print(structured_df[["latitude", "longitude"]].head())

# Step 3: Merge the datasets based on latitude, longitude, and same month/year
merge_keys = ["latitude", "longitude", "year_month"]
structured_subset = structured_df[merge_keys + ["stat_cause_descr", "state"]]

# pandas joins null keys to each other, and a missing date ends up as the text
# "NaT" once year_month is cast to str. Either would pair incomplete rows from
# both datasets as if they were a real location/month match. For an inner join
# it is enough to remove such rows from one side, so only the structured side
# is filtered.
has_valid_keys = (
    structured_subset[merge_keys].notna().all(axis=1)
    & structured_subset["year_month"].ne("NaT")
)
dropped_rows = int((~has_valid_keys).sum())
if dropped_rows:
    print(f"Structured data - Skipping {dropped_rows} rows with missing coordinates or date before merging")
    structured_subset = structured_subset[has_valid_keys]

merged_df = pd.merge(
    unstructured_df,
    structured_subset,
    on=merge_keys,
    how="inner",
)

# Drop the temporary year_month column
merged_df = merged_df.drop(columns=["year_month"])
print(f"\nMerged data - Rows: {len(merged_df)}")

# If the merged DataFrame is empty, provide additional debugging
if merged_df.empty:
    print("\nNo matches found. Possible reasons:")
    print("- Latitude/Longitude values may not match exactly between the datasets even after rounding to 2 decimal places.")
    print("- Dates may not align in the same month and year.")
    print("- Check the sample latitude/longitude pairs and year_month ranges above to identify discrepancies.")
    print("- You may need to adjust the rounding of latitude/longitude (e.g., round to 1 decimal place) or verify the date formats.")
    print("- If coordinates are still not matching, you may need to verify if the datasets cover the same geographic area.")

# Step 4: Persist the merged result and summarise what it contains.
# The weather/occurrence fields named in the closing note do not appear in
# either input, so they cannot be part of the output written here.
final_columns = merged_df.columns.tolist()
merged_df.to_csv(MERGED_CSV, index=False)
print(f"Merged dataset saved to '{MERGED_CSV}'")
print("Final merged columns:", final_columns)

# Emit the closing notes in one call; sep="\n" yields the same three lines.
closing_notes = (
    "\nNote: The following desired columns are missing because they are not in the input datasets:",
    "- temp_mean_0, prcp_sum_0, wspd_mean_0, temp_mean_10, prcp_sum_10, wspd_mean_10, temp_mean_180, prcp_sum_180, wspd_mean_180, fire_occurrence",
    "To include these columns, please provide the source dataset (e.g., merged_wildfire_data.csv) and merge it with the result.",
)
print(*closing_notes, sep="\n")

Unstructured data selected columns: ['latitude', 'longitude', 'acq_date', 'frp', 'bright_ti4', 'confidence', 'daynight']
Unstructured data - Rows: 7884686

Unstructured data - Latitude range: 18.92 to 71.33
Unstructured data - Longitude range: -178.81 to 179.59
Unstructured data - Year-Month range: 2000-11 to 2022-03
Unstructured data - Sample latitude/longitude pairs (first 5):
   latitude  longitude
0     38.54     -78.30
1     38.56     -78.31
2     38.55     -78.31
3     38.56     -78.32
4     31.34     -89.91

Structured data selected columns: ['latitude', 'longitude', 'disc_clean_date', 'stat_cause_descr', 'state']
Structured data - Rows: 55367

Structured data - Latitude range: 17.96 to 69.85
Structured data - Longitude range: -165.94 to -65.29
Structured data - Year-Month range: 1992-01 to 2015-12
Structured data - Sample latitude/longitude pairs (first 5):
   latitude  longitude
0     18.11     -66.75
1     35.04     -87.61
2     34.95     -88.72
3     39.64    -119.31
4     3