In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv("Unified_Climate_Dataset_2010-2024_ML_READY_5.csv")
print(df["year"].unique())   # should show 2010–2024


[2010 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023
 2024]


In [4]:
# Step 3. Define an Extension Function

# We’ll use a lightweight AR(2) + linear trend hybrid.
# It automatically falls back to a linear trend if AR(2) fails.


def extend_feature(df, feature, id_cols=["pixel_id", "latitude", "longitude"],
                   year_col="year", end_year=2030):
    """Extend each feature to 2030 using AR(2) or linear trend with fallback."""
    extended_rows = []

    for _, g in df.groupby(id_cols):
        g = g.sort_values(year_col)
        years = g[year_col].values
        values = g[feature].interpolate(limit_direction="both").values  # fill NaN

        if len(values) < 3:
            continue  # skip short histories

        # AR(2) coefficients (safe version)
        y1, y2, y3 = values[-1], values[-2], values[-3]
        denominator = (y2 - y3)
        if abs(denominator) < 1e-9:
            a, b = 1, 0
        else:
            a = (y1 - y2) / denominator
            b = 1 - a


        # # AR(2) coefficients (with fallback)
        # y1, y2, y3 = values[-1], values[-2], values[-3]
        # try:
        #     a = (y1 - y2) / (y2 - y3)
        #     b = 1 - a
        # except ZeroDivisionError:
        #     a, b = 1, 0

        # Forecast yearly up to 2030
        future_years = range(int(years.max()) + 1, end_year + 1)
        y_prev2, y_prev1 = values[-2], values[-1]
        y_future = []

        for _ in future_years:
            y_next = a * y_prev1 + b * y_prev2
            if feature in ["precipitation", "soil_moisture", "ch4_emissions"]:
                y_next = max(y_next, 0)  # avoid negatives
            y_future.append(y_next)
            y_prev2, y_prev1 = y_prev1, y_next

        df_future = pd.DataFrame({
            **{col: g[col].iloc[0] for col in id_cols},
            year_col: list(future_years),
            feature: y_future
        })
        extended_rows.append(df_future)

    if not extended_rows:
        print(f"⚠️ No new rows generated for {feature}. Check missing data or ID grouping.")
        return df

    df_extended = pd.concat([df, pd.concat(extended_rows)], ignore_index=True)
    return df_extended


In [5]:
# Step 4. Apply the Function to Each Dynamic Variable

dynamic_vars = [
    "temperature", "precipitation", "soil_moisture",
    "permafrost_fraction", "ch4_emissions"
]

for var in dynamic_vars:
    print(f"Extending {var}...")
    df = extend_feature(df, var, end_year=2030)


Extending temperature...
Extending precipitation...
Extending soil_moisture...
Extending permafrost_fraction...
Extending ch4_emissions...


In [6]:
# Step 5. Replicate Static Features
# These features remain constant per pixel:

static_vars = ["elevation", "land_cover_class", "is_wetland_like"]

for var in static_vars:
    df[var] = df.groupby("pixel_id")[var].transform("first")


In [7]:
df.to_csv("Unified_Climate_Dataset_2010-2030_ML_READY.csv", index=False)
print("✅ Extended dataset saved!")
print(df["year"].unique())  # should show 2010–2030


✅ Extended dataset saved!
[2010. 2011. 2012. 2013. 2014. 2015. 2016. 2017. 2018. 2019. 2020. 2021.
 2022. 2023. 2024. 2025. 2026. 2027. 2028. 2029. 2030.]


In [10]:
import pandas as pd
import numpy as np

# ✅ Step 1: Load your clean base (up to 2024)
df = pd.read_csv("Unified_Climate_Dataset_2010-2024_ML_READY_5.csv")

# ✅ Step 2: Define variable groups
id_cols   = ["pixel_id", "latitude", "longitude"]
year_col  = "year"
dynamic   = ["temperature", "precipitation", "soil_moisture",
             "permafrost_fraction", "ch4_emissions"]
static    = ["elevation", "land_cover_class", "is_wetland_like"]

start_y, end_y = 2025, 2030
future_years = list(range(start_y, end_y + 1))

# ✅ Step 3: Extension function (safe AR(2) with fallback)
def extend_feature_from_base(base_df, feature):
    rows = []
    for _, g in base_df[[*id_cols, year_col, feature]].groupby(id_cols):
        g = g.sort_values(year_col)
        vals = g[feature].interpolate(limit_direction="both").values
        if len(vals) < 3:
            continue

        y1, y2, y3 = vals[-1], vals[-2], vals[-3]
        denom = (y2 - y3)
        if abs(denom) < 1e-9:
            a, b = 1.0, 0.0
        else:
            a = (y1 - y2) / denom
            b = 1.0 - a

        yp2, yp1 = y2, y1
        fut_vals = []
        for _ in future_years:
            y_next = a*yp1 + b*yp2
            if feature in ["precipitation", "soil_moisture", "ch4_emissions"]:
                y_next = max(y_next, 0.0)
            fut_vals.append(y_next)
            yp2, yp1 = yp1, y_next

        rows.append(pd.DataFrame({
            **{c: g[c].iloc[0] for c in id_cols},
            year_col: future_years,
            feature: fut_vals
        }))
    return pd.concat(rows, ignore_index=True) if rows else pd.DataFrame(columns=[*id_cols, year_col, feature])

# ✅ Step 4: Create a future grid and extend each dynamic feature
base = df[df[year_col] <= 2024].copy()
keys = base[id_cols].drop_duplicates().copy()
keys["_k"] = 1
years_df = pd.DataFrame({year_col: future_years})
years_df["_k"] = 1
future = keys.merge(years_df, on="_k").drop(columns="_k")

for feat in dynamic:
    fut_feat = extend_feature_from_base(base, feat)
    future = future.merge(fut_feat, on=[*id_cols, year_col], how="left")

for s in static:
    map_s = base.groupby("pixel_id")[s].first()
    future[s] = future["pixel_id"].map(map_s)

future["ch4_concentration"] = np.nan
future["land_cover_name"] = base.groupby("pixel_id")["land_cover_name"].first().reindex(future["pixel_id"]).values

# ✅ Step 5: Combine base + future
df_2030 = pd.concat([base, future], ignore_index=True)
df_2030.to_csv("Unified_Climate_Dataset_2010-2030_FIXED.csv", index=False)
print("✅ Saved: Unified_Climate_Dataset_2010-2030_FIXED.csv")


✅ Saved: Unified_Climate_Dataset_2010-2030_FIXED.csv


In [11]:
import logging, time
logging.basicConfig(level=logging.INFO, format="%(asctime)s | %(levelname)s | %(message)s")

for feat in dynamic:
    t0 = time.perf_counter()
    logging.info(f"Extending {feat} …")
    fut_feat = extend_feature_from_base(base, feat)
    if fut_feat.empty:
        logging.warning(f"{feat}: skipped (no rows).")
        continue
    future = future.merge(fut_feat, on=[*id_cols, year_col], how="left")
    logging.info(f"{feat}: done in {time.perf_counter()-t0:.1f}s")


2025-10-21 17:32:44,289 | INFO | Extending temperature …
2025-10-21 17:33:24,550 | INFO | temperature: done in 40.3s
2025-10-21 17:33:24,550 | INFO | Extending precipitation …
2025-10-21 17:34:04,068 | INFO | precipitation: done in 39.5s
2025-10-21 17:34:04,069 | INFO | Extending soil_moisture …
2025-10-21 17:34:44,054 | INFO | soil_moisture: done in 40.0s
2025-10-21 17:34:44,055 | INFO | Extending permafrost_fraction …
2025-10-21 17:35:24,072 | INFO | permafrost_fraction: done in 40.0s
2025-10-21 17:35:24,073 | INFO | Extending ch4_emissions …
2025-10-21 17:36:04,218 | INFO | ch4_emissions: done in 40.1s
