##### gridMET data, where most of the predictors are present. More information about the data is in Data composition.

In [None]:
## gridMET datasets combined after pulling them from GEE (Google Earth Engine). More information about the datasets is in Data composition.

import pandas as pd
from pathlib import Path

# ==== paths (edit these) ====
folder = Path('/Users/davidjerome/Desktop/Dissertation/XGBoost/Data collection/GRIDmeta data')
gm_path   = folder / "gridMET_daily_2024_ALL.csv"
ndvi_path = folder / "NDVI_S2_10m_exactday_AVAILABLE_2024_ALL.csv"
out_path  = folder / "gridMET_plus_NDVI_exact_2024_ALL.csv"

# Columns that identify one point on one day in both tables.
key_cols = ["date", "lat", "lon"]

# ---- read both as strings for exact matching on keys ----
# (this preserves the exact text for date/lat/lon; no float rounding)
key_dtypes = {k: str for k in key_cols}
gm   = pd.read_csv(gm_path, dtype=key_dtypes)
ndvi = pd.read_csv(ndvi_path, dtype=key_dtypes)

# strip accidental whitespace on keys
for df in (gm, ndvi):
    for k in key_cols:
        df[k] = df[k].astype(str).str.strip()

# keep only the NDVI column from the NDVI table for the join
ndvi_small = ndvi[key_cols + ["ndvi"]]

# left join: keep ALL gridMET rows; ndvi will be NaN when there is no exact match
merged = gm.merge(ndvi_small, on=key_cols, how="left")

# A left join repeats a gridMET row once per matching NDVI row, so duplicate
# (date, lat, lon) keys in the NDVI table silently inflate the output. Report it.
if len(merged) != len(gm):
    print(f"WARNING: merge produced {len(merged):,} rows from {len(gm):,} gridMET rows "
          f"(duplicate date/lat/lon keys in the NDVI table)")

# save
merged.to_csv(out_path, index=False)

# quick stats
matched = merged["ndvi"].notna().sum()
total   = len(merged)
# Guard the percentage so an empty input table does not raise ZeroDivisionError.
share   = matched / total if total else 0.0
print(f"Saved → {out_path}")
print(f"Rows total: {total:,}")
print(f"Rows with NDVI match: {matched:,} ({share:.1%})")
print(f"Rows without NDVI (left blank): {total-matched:,}")

In [None]:
# NDVI combined after pulling it from GEE (Google Earth Engine). More information about the datasets is in Data composition.

import pandas as pd
from pathlib import Path
import re

# Directory that holds the monthly NDVI exports
folder = Path('/Users/davidjerome/Desktop/Dissertation/XGBoost/Data collection/ndvi_final')   # <- change to your folder

# Gather every CSV whose name starts with the 2024 export prefix, e.g.
# NDVI_S2_10m_exactday_AVAILABLE_2024_1.csv ... _12.csv.
# The trailing "*" is a plain wildcard; it is not restricted to month numbers.
files = [*folder.glob("NDVI_S2_10m_exactday_AVAILABLE_2024_*.csv")]

# sort by the month number at the end of the filename
def month_key(p):
    """Return the integer that sits between the last "_" and ".csv" in ``p.name``.

    Used as a sort key so that e.g. ``_2.csv`` orders before ``_10.csv``.
    Names that do not end in ``_<digits>.csv`` map to 0.
    """
    found = re.search(r"_(\d+)\.csv$", p.name)
    if found is None:
        return 0
    return int(found.group(1))
# Keep only genuine monthly files (name ends in "_<digits>.csv") before sorting.
# The glob pattern used to build `files` also matches the combined
# "..._2024_ALL.csv" that this cell writes into the same folder, so on a re-run
# the previous output would be appended to itself and every row duplicated.
files = sorted(
    (f for f in files if re.search(r"_(\d+)\.csv$", f.name)),
    key=month_key,
)
if not files:
    # Fail with a clear message instead of an IndexError on files[0] below.
    raise FileNotFoundError(f"No monthly NDVI CSVs ending in '_<month>.csv' found in {folder}")

# keep column order from the first file; just append rows
cols = pd.read_csv(files[0], nrows=0).columns
dfs = [pd.read_csv(f, usecols=cols) for f in files]
out = pd.concat(dfs, ignore_index=True, sort=False)

out_path = folder / "NDVI_S2_10m_exactday_AVAILABLE_2024_ALL.csv"
out.to_csv(out_path, index=False)
print("Combined:", len(files), "files →", out_path)

##### NDVI missing values were temporally interpolated

In [None]:
import pandas as pd
from pathlib import Path

p = Path('/Users/davidjerome/Desktop/Dissertation/XGBoost/Data collection/ndvi_final/ndvi_gaps_fill')
df = pd.read_csv(p.joinpath("gridMET_plus_NDVI_exact_2024_ALL.csv"))

# Turn the date text into timestamps; anything unparseable becomes NaT
df["date"] = pd.to_datetime(df["date"], errors="coerce")

# Order rows so each location's records are contiguous and chronological
df = df.sort_values(by=["id","lat","lon","date"])

# Blank out every non-positive NDVI value (zeros AND negatives become NaN);
# zeros are usually cloud/no data
df["ndvi_clean"] = df["ndvi"].mask(df["ndvi"] <= 0)

# Interpolate NDVI per location
def interp_group(g: pd.DataFrame, max_gap: int = 30) -> pd.DataFrame:
    """Linearly interpolate ``ndvi_clean`` for one location into ``ndvi_interp``.

    Only gaps that are bounded by valid observations on both sides and that
    span at most ``max_gap`` consecutive missing rows are filled. Longer gaps
    are left entirely NaN, and leading/trailing NaN runs are never filled
    (no extrapolation).

    Previously ``interpolate(limit=30)`` filled the first 30 rows of a longer
    gap and forward-filled up to 30 trailing rows with the last observation.

    Args:
        g: Rows of a single location, already sorted by ``"date"``, with
            ``"date"`` and ``"ndvi_clean"`` columns.
        max_gap: Largest run of consecutive missing rows to fill. Rows equal
            days only when the series has one row per day.

    Returns:
        A copy of ``g`` with ``"date"`` as the first column and an added
        ``"ndvi_interp"`` column.
    """
    g = g.set_index("date").copy()
    clean = g["ndvi_clean"]

    # Measure each run of consecutive missing values positionally, so duplicate
    # or NaT dates in the index cannot disturb the bookkeeping.
    missing = clean.isna().reset_index(drop=True)
    run_id = missing.astype(int).diff().ne(0).cumsum()
    run_len = run_id.map(run_id.value_counts())
    fillable = (~missing | (run_len <= max_gap)).to_numpy()

    # Interpolate every interior gap, then blank the gaps that are too long.
    filled = clean.interpolate(method="linear", limit_area="inside")
    g["ndvi_interp"] = filled.where(fillable)
    return g.reset_index()

# Run the per-location interpolation over every (id, lat, lon) series
location_groups = df.groupby(["id","lat","lon"], group_keys=False)
df = location_groups.apply(interp_group)

# 1/0 flag for rows whose raw "ndvi" value is NaN
# (derived from "ndvi", not from "ndvi_clean")
df["ndvi_missing"] = df["ndvi"].isnull().astype(int)

# Write the result next to the input file
out = p.joinpath("gridMET_plus_NDVI_interp_2024.csv")
df.to_csv(out, index=False)
print("Saved:", out, "rows:", len(df))

In [None]:
## Missing values from the previous dataset are interpolated here and shown as a graph in the next block of code

import pandas as pd

# -------- settings (no paths) --------
INFILE  = "gridMET_plus_NDVI_exact_2024_ALL.csv"
OUTFILE = "gridMET_plus_NDVI_interp_2024_ALL.csv"
TREAT_ZERO_AS_MISSING = True   # keep as True for clouds/non-veg zeros
MAX_GAP_STEPS = 30             # linear interpolate across ≤ this many consecutive NaNs
# ------------------------------------

# 1) Load every column as text so date/lat/lon keep the exact formatting in the file
df = pd.read_csv(INFILE, dtype=str)
orig_index, orig_cols = df.index, list(df.columns)

# 2) Computation-only helper columns; the original columns are never overwritten.
#    Date text: trimmed, keeping only the first whitespace-separated token.
dtxt = df["date"].astype(str).str.strip().str.split().str[0]

#    Parse ISO dates first, then retry whatever is still unparsed with each
#    slashed format in turn (month-first is tried before day-first).
ddt = pd.to_datetime(dtxt, errors="coerce", format="%Y-%m-%d")
for fallback_format in ("%m/%d/%Y", "%d/%m/%Y"):
    m = ddt.isna()
    ddt.loc[m] = pd.to_datetime(dtxt[m], errors="coerce", format=fallback_format)
df["_date_dt"] = ddt

#    Numeric NDVI used for interpolation; non-numeric text becomes NaN,
#    and exact zeros do too when TREAT_ZERO_AS_MISSING is on.
ndvi_num = pd.to_numeric(df["ndvi"], errors="coerce")
if TREAT_ZERO_AS_MISSING:
    ndvi_num = ndvi_num.mask(ndvi_num == 0)
df["_ndvi_num"] = ndvi_num

# 3) Linear TEMPORAL interpolation per (id,lat,lon) WITHOUT reindexing or touching date/lat/lon
#    Fix for "duplicate labels": aggregate to unique dates first, then map back
def interp_group(g: pd.DataFrame) -> pd.Series:
    """Return time-interpolated NDVI for every row of one location group.

    Rows sharing a date are first averaged into a single daily value so the
    date index is unique; the daily series is interpolated with
    ``method="time"`` (at most ``MAX_GAP_STEPS`` consecutive NaNs filled per
    gap) and then looked up again by each row's ``_date_dt``. The returned
    Series keeps the index and row order of ``g``.
    """
    daily_mean = g["_ndvi_num"].groupby(g["_date_dt"]).mean().sort_index()
    daily_filled = daily_mean.interpolate(method="time", limit=MAX_GAP_STEPS)
    return g["_date_dt"].map(daily_filled)

# Interpolate each (id, lat, lon) series on its own and stitch the pieces back
# together; assignment then aligns them to the rows by index label.
# An explicit loop is used instead of groupby(...).apply(interp_group) because
# apply() returns a wide DataFrame (one row per group) rather than a long Series
# when there is only a single group, which breaks the column assignment.
pieces = [interp_group(g) for _, g in df.groupby(["id","lat","lon"])]
df["ndvi_interp"] = (
    pd.concat(pieces) if pieces else pd.Series(index=df.index, dtype="float64")
)

# 4) Optional flag: original NDVI missing (kept for modeling/QA)
df["ndvi_missing_flag"] = df["ndvi"].isna().astype("Int64")

# 5) Restore original row order and save with NEW columns appended
df = df.loc[orig_index]
out_cols = orig_cols + [c for c in ["ndvi_interp","ndvi_missing_flag"] if c not in orig_cols]
df[out_cols].to_csv(OUTFILE, index=False)

In [None]:
## This code draws a line graph over the same period that contrasts the
## original data (with its missing points) against the interpolated values that fill those gaps.

import matplotlib.pyplot as plt
import pandas as pd

# Re-parse the date text and keep the calendar month of every row
df["_date_dt"] = pd.to_datetime(df["date"], errors="coerce")
month_of_row = df["_date_dt"].dt.month

# Numeric views of the raw and interpolated NDVI columns
ndvi_raw = pd.to_numeric(df["ndvi"], errors="coerce")
ndvi_interp = pd.to_numeric(df["ndvi_interp"], errors="coerce")

# Mean NDVI per calendar month for each series
monthly_raw = ndvi_raw.groupby(month_of_row).mean()
monthly_interp = ndvi_interp.groupby(month_of_row).mean()

# One figure, two lines: dashed for the original, solid for the interpolated
plt.figure(figsize=(10,5))
for monthly, dash, legend_name, opacity in (
    (monthly_raw, "--", "Original NDVI", 0.7),
    (monthly_interp, "-", "Interpolated NDVI", 0.9),
):
    plt.plot(monthly.index, monthly.values, marker="o", linestyle=dash,
             label=legend_name, alpha=opacity)

month_names = ["Jan","Feb","Mar","Apr","May","Jun",
               "Jul","Aug","Sep","Oct","Nov","Dec"]
plt.xticks(range(1,13), month_names)
plt.xlabel("Month")
plt.ylabel("Average NDVI")
plt.title("Monthly Average NDVI (Original vs Interpolated)")
plt.legend()
plt.grid(True)
plt.show()

### Topography and human-proximity variables were pulled directly from GEE (Google Earth Engine). More information about the data is in Data composition. Let's look at the fire data.

##### Fire data were pulled directly from NASA FIRMS. More information about this dataset is in Data composition.

In [None]:
import pandas as pd, geopandas as gpd
from shapely.geometry import Point
from datetime import timedelta
from sklearn.neighbors import KDTree
import numpy as np

# --- Inputs and matching tolerances for the FIRMS fire labelling below ---
# Point-by-day table that is read into `final`.
FINAL = "final_dataset_with_labels.csv"  # after you've already merged MODIS labels (id,lat,lon,date,...,fire_modis)
# FIRMS detection CSVs; every file listed here is loaded and concatenated.
FIRMS = [
    "fire_archive_J1V-C2_661605.csv",   # add all you downloaded
    "fire_archive_SV-C2_661607.csv",
    "fire_nrt_J2V-C2_661606.csv"
]
BUFFER_M   = 1500         # buffer distance applied to each point after projecting to EPSG:3857; start generous, tighten later
DAY_TOL    = 1            # a detection counts when its date is within ±DAY_TOL days of the row date (UTC/local safety)
CONF_NUM   = 80           # minimum numeric `confidence` kept. NOTE(review): confirm how the filter in the loading loop treats None before relying on it

def norm_date(s):
    """Parse date-like values into ``datetime.date`` objects.

    Parsing is attempted in up to three passes, each applied only to the values
    the previous pass could not read: default parsing, day-first parsing, then
    default parsing again on the remainder (format detection depends on the
    subset being parsed, so the last pass can still recover values).

    The deprecated ``infer_datetime_format`` argument is no longer passed to
    ``pd.to_datetime``; it has no effect on results in pandas 2.x and only
    triggers a warning there.

    Args:
        s: A pandas Series (or any list-like, which is wrapped in a Series)
            of date strings.

    Returns:
        An object-dtype Series aligned with ``s`` holding ``datetime.date``
        values, with ``NaT`` where no pass could parse the value.
    """
    s = pd.Series(s)
    d = pd.to_datetime(s, errors="coerce")
    for extra in ({"dayfirst": True}, {}):
        pending = d.isna()
        if not pending.any():
            break
        d.loc[pending] = pd.to_datetime(s[pending], errors="coerce", **extra)
    return d.dt.date

# 1) Load final (points × days) with MODIS labels already inside
final = pd.read_csv(FINAL)
final["date"] = norm_date(final["date"])

# De-duplicate FIRST and build the geometry from the de-duplicated rows.
# Building it from `final` itself produced more points than rows whenever
# `final` contained a repeated (id, lat, lon, date), which fails with a
# length mismatch.
pts = final[["id","lat","lon","date"]].drop_duplicates()
gpts = gpd.GeoDataFrame(
    pts,
    geometry=[Point(xy) for xy in zip(pts["lon"], pts["lat"])],
    crs="EPSG:4326"
)

# Project to EPSG:3857 so the buffer radius is expressed in projected units.
# NOTE(review): Web Mercator stretches distances away from the equator, so
# BUFFER_M is larger than the same number of ground meters at higher latitudes.
gpts_m = gpts.to_crs(3857)
gpts_m["geometry"] = gpts_m.buffer(BUFFER_M)

# 2) Load/union FIRMS and standardize
fires = []
for f in FIRMS:
    df = pd.read_csv(f)
    # Required fields
    missing = [c for c in ("latitude","longitude","acq_date") if c not in df.columns]
    if missing: raise ValueError(f"{f} missing {missing}")
    df["date"] = norm_date(df["acq_date"])
    # A detection without a usable date can never fall inside a date window,
    # and NaT cannot be compared with datetime.date values later on.
    df = df.dropna(subset=["date"])
    # Confidence filter (numeric or text)
    if "confidence" in df.columns:
        try:
            conf_num = df["confidence"].astype(float)
        except (ValueError, TypeError):
            # Text confidence classes: keep nominal and high. Both the spelled-out
            # words and the single-letter codes ("n", "h") are accepted.
            conf_txt = df["confidence"].astype(str).str.strip().str.lower()
            df = df[conf_txt.isin({"n","h","nominal","high"})]
        else:
            # Numeric confidence: CONF_NUM=None now really skips the filter
            # (previously the comparison with None raised, and the bare except
            # sent numeric rows through the text filter, dropping all of them).
            if CONF_NUM is not None:
                df = df[conf_num >= CONF_NUM]
    fires.append(df[["latitude","longitude","date"]])

fires = pd.concat(fires, ignore_index=True)
gfires = gpd.GeoDataFrame(
    fires,
    geometry=[Point(xy) for xy in zip(fires["longitude"], fires["latitude"])],
    crs="EPSG:4326"
).to_crs(3857)

# 3) Label by date (±DAY_TOL) + buffer
labels = []
tol = timedelta(days=DAY_TOL)   # loop-invariant window half-width

# Detections with a missing date cannot be matched to any day, and comparing
# NaT against datetime.date values raises, so leave them out up front.
dated_fires = gfires[gfires["date"].notna()]

# Rows of the point table with a missing date are skipped for the same reason;
# they simply receive no FIRMS label from this step.
for d in sorted(gpts_m["date"].dropna().unique()):
    mask = (dated_fires["date"] >= (d - tol)) & (dated_fires["date"] <= (d + tol))
    fires_d = dated_fires.loc[mask]
    pbuf_d  = gpts_m[gpts_m["date"] == d][["id","date","geometry"]]

    if fires_d.empty:
        # No detection inside the date window: every point of this day is 0.
        out = pbuf_d[["id","date"]].copy()
        out["fire_firms"] = 0
        out["min_dist_m"] = np.nan
        labels.append(out)
        continue

    # Spatial join to mark hits: a point is a hit when its buffer intersects
    # at least one detection from the date window.
    joined = gpd.sjoin(pbuf_d, fires_d[["geometry"]], how="left", predicate="intersects")
    hit_ids = set(joined.dropna(subset=["index_right"])["id"].values)

    out = pbuf_d[["id","date"]].copy()
    out["fire_firms"] = out["id"].isin(hit_ids).astype(int)

    # Also compute nearest distance (diagnostic), measured from each buffer's
    # centroid to the closest detection in the window.
    tree = KDTree(np.vstack([fires_d.geometry.x, fires_d.geometry.y]).T)
    pts_xy = np.vstack([pbuf_d.geometry.centroid.x, pbuf_d.geometry.centroid.y]).T
    dist, _ = tree.query(pts_xy, k=1)
    out["min_dist_m"] = dist[:,0]  # projected EPSG:3857 units
    labels.append(out)

# pd.concat([]) raises "No objects to concatenate"; fall back to an empty table
# with the expected columns when no date produced any labels.
labels = (
    pd.concat(labels, ignore_index=True)
    if labels
    else pd.DataFrame(columns=["id","date","fire_firms","min_dist_m"])
)

# 4) Merge back and combine with MODIS  (SAFE VERSION)

# Fall back to an empty label table when none is available
# (name never defined, None, or zero rows).
no_labels = 'labels' not in locals() or labels is None or labels.empty
if no_labels:
    import pandas as pd
    labels = pd.DataFrame(columns=["id","date","fire_firms"])

# The merge below needs a "fire_firms" column on the label table
labels_have_flag = "fire_firms" in labels.columns
if not labels_have_flag:
    labels["fire_firms"] = 0

# Attach the FIRMS flag to every (id, date) row of the final table
firms_flags = labels[["id","date","fire_firms"]]
final = pd.merge(final, firms_flags, how="left", on=["id","date"])

# Rows without a matching label count as "no fire"
if "fire_firms" in final.columns:
    final["fire_firms"] = final["fire_firms"].fillna(0).astype(int)
else:
    final["fire_firms"] = 0

# Default the MODIS flag to zeros when the column is absent
if "fire_modis" not in final.columns:
    final["fire_modis"] = 0

# Combined target: 1 when either source reports a fire, else 0
either_source = final["fire_modis"].eq(1) | final["fire_firms"].eq(1)
final["fire_label"] = either_source.astype(int)

In [None]:
## After running this code I found that the fire data (the target for the
## classification task) is hugely imbalanced, so it definitely needs an
## imbalance-handling method

# Shape, columns and first rows of the FIRMS label table
print("labels shape:", labels.shape, "columns:", list(labels.columns))
print("labels sample:\n", labels.head())

# Columns of the merged table, then the type and distribution of the FIRMS flag
print("final columns after merge:", list(final.columns))
firms_type_name = type(final.get("fire_firms", 0)).__name__
if "fire_firms" in final.columns:
    firms_counts = final["fire_firms"].value_counts(dropna=False)
else:
    firms_counts = "no column"
print("fire_firms value counts:", firms_type_name,
      "\n", firms_counts)

### Final merging of all the different datasets into one final dataset to feed into a model and answer our research questions.

##### Info: lat and lon are kept as the primary key because they identify each spatial point in our study area (some core geospatial handling is needed in the code)

In [None]:

import pandas as pd

# ========= EDIT THESE =========
# Base table; it is read with read_txt() and must contain 'lat' and 'lon' columns.
BASE_CSV = "/Users/davidjerome/Desktop/Dissertation/Machine_learning/Data collection/FINAL_PDFS_DATA/Terrains_Cat_added_to_final.csv"  # your final base file (will NOT be altered)

# Each entry: (csv_path, {"source_col_in_that_csv": "new_col_name_in_output", ...})
# Entries are merged onto the base one after another, in the order listed.
# Note: the first path is relative, so it is resolved against the current working directory.
ADDONS = [
    ("distance_to_urban.csv", {
        "dist_to_urban_m": "dist_to_urban_m",
    }),
    ("/Users/davidjerome/Desktop/Dissertation/Machine_learning/Data collection/FINAL_PDFS_DATA/Distance_to_roads_TableToExcel.csv", {
        "distance_to_roads_m": "distance_to_roads_m",
    }),
    ("/Users/davidjerome/Desktop/Dissertation/Machine_learning/Data collection/FINAL_PDFS_DATA/points_human_proximity_vars_2024_per_point.csv", {
        "dist_to_powerplant_m": "dist_to_powerplant_m",
        "urban_flag": "urban_flag",
        "viirs_avg_rad_2024":"viirs_avg_rad_2024",
        "worldpop_100m":"worldpop_100m",
    }),
]

# If your add-on files use different headers for lat/lon, add them here (case-insensitive)
# Candidates are tried in list order; the first one that matches a column wins.
LAT_CANDIDATES = ["lat", "latitude"]
LON_CANDIDATES = ["lon", "long", "longitude"]
# ==============================


def read_txt(path):
    """Load a CSV with every cell kept as the literal text found in the file.

    No type inference and no missing-value detection are applied: numbers and
    dates stay strings, and empty cells come back as empty strings.
    """
    text_only = {"dtype": str, "keep_default_na": False, "na_filter": False}
    return pd.read_csv(path, **text_only)


def find_col(df, candidates):
    """Return the real column name in ``df`` that matches the first hit in ``candidates``.

    Names are compared after lower-casing and removing spaces and underscores.
    Candidates are tried in order; ``None`` is returned when none matches.
    When several columns collapse to the same normalized name, the right-most
    column is the one returned.
    """
    def squash(name):
        return name.lower().replace(" ", "").replace("_", "")

    actual_by_key = {squash(col): col for col in df.columns}
    hits = (actual_by_key[squash(c)] for c in candidates if squash(c) in actual_by_key)
    return next(hits, None)


def lookup_from_addon(path, mapping):
    """Build a per-point lookup table from one add-on CSV.

    path: CSV path, read as text via read_txt()
    mapping: dict {source_col -> out_col}

    Returns a dataframe with one row per distinct (lat, lon) pair and columns
    lat, lon, <out cols...>. For each value column the first non-empty entry
    of the pair is kept.

    Raises ValueError when the lat/lon columns or any source column cannot be
    found in the file.
    """
    table = read_txt(path)

    lat_col = find_col(table, LAT_CANDIDATES)
    lon_col = find_col(table, LON_CANDIDATES)
    if not (lat_col and lon_col):
        raise ValueError(f"{path}: couldn't find lat/lon columns. Saw: {list(table.columns)}")

    absent = [src for src in mapping if src not in table.columns]
    if absent:
        raise ValueError(f"{path}: missing source columns {absent}. Saw: {list(table.columns)}")

    # Empty strings become missing so that .first() skips them when collapsing
    # duplicate coordinates down to a single row.
    wanted = [lat_col, lon_col, *mapping]
    per_point = (
        table[wanted]
        .copy()
        .replace({"": None})
        .groupby([lat_col, lon_col], as_index=False)
        .first()
    )

    # Coordinates take the base file's names; value columns take the requested names.
    new_names = {lat_col: "lat", lon_col: "lon", **mapping}
    return per_point.rename(columns=new_names)


# -------- main --------
base = read_txt(BASE_CSV)
if not {"lat", "lon"}.issubset(base.columns):
    raise ValueError(f"Base must contain 'lat' and 'lon'. Saw: {list(base.columns)}")

base_cols = base.columns.tolist()       # original column order of the base file
added_cols_in_order = []                # output names of add-on columns, in request order

# Left-join each add-on lookup onto the base by exact lat/lon text
for path, mapping in ADDONS:
    look = lookup_from_addon(path, mapping)
    base = pd.merge(base, look, how="left", on=["lat", "lon"])
    added_cols_in_order += list(mapping.values())

# Output layout: every base column first, then the add-on columns that exist after merging
present_added = [c for c in added_cols_in_order if c in base.columns]
final_cols = base_cols + present_added
base = base[final_cols]

base.to_csv("final_dataset.csv", index=False)
print("✓ final_dataset.csv written (base unchanged; requested columns added).")