In [1]:
# GREENPULSE — Preprocessing
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import KNNImputer
from sklearn.preprocessing import OrdinalEncoder

In [3]:
# Load the Two Datasets
# Both CSV files are read with pandas defaults; the frames keep the names
# df1 / df2 because the following cells refer to them.
df1, df2 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "subnational_1_tree_cover_loss.csv",
        "subnational_2_tree_cover_loss.csv",
    )
)

# Quick sanity report: row/column counts of each frame.
print(" Data loaded successfully.")
print("Sheet 1 shape:", df1.shape)
print("Sheet 2 shape:", df2.shape)

 Data loaded successfully.
Sheet 1 shape: (288, 30)
Sheet 2 shape: (5328, 31)


In [4]:
# Standardize Column Names
# Headers are trimmed, lower-cased and have spaces replaced by underscores
# so that both frames follow one naming convention before being stacked.
df1.columns, df2.columns = (
    frame.columns.str.strip().str.lower().str.replace(" ", "_")
    for frame in (df1, df2)
)

# Ensure consistent columns for merging: keep only the headers that appear
# in both frames (in df1's order); columns unique to one frame are dropped.
# NOTE(review): if the second file is a finer-grained breakdown of the first,
# stacking both may double-count totals — confirm this is intended.
common_cols = df1.columns[df1.columns.isin(df2.columns)].tolist()
combined = pd.concat(
    [df1.loc[:, common_cols], df2.loc[:, common_cols]],
    ignore_index=True,
)

print(" Combined dataset shape:", combined.shape)

 Combined dataset shape: (5616, 30)


In [5]:
# Handle Missing Values Safely
# Columns are split by dtype: numeric ones are KNN-imputed, every other
# column gets the sentinel label "Unknown" in place of missing values.
num_cols = combined.select_dtypes(include=np.number).columns
cat_cols = combined.select_dtypes(exclude=np.number).columns

# KNN imputation only on numeric columns.
# KNNImputer discards columns that are entirely NaN, which would make the
# returned array narrower than the target slice and break the assignment.
# Only columns holding at least one observed value are therefore passed in;
# this also covers the case where there are no numeric columns at all.
# Numeric columns that are entirely NaN are left untouched.
# NOTE(review): neighbours are searched on the raw, unscaled values, so
# large-magnitude columns dominate the distance — confirm this is intended.
imputer = KNNImputer(n_neighbors=5)
has_observed_values = combined[num_cols].notna().any(axis=0).to_numpy(dtype=bool)
imputable_cols = num_cols[has_observed_values]
if len(imputable_cols) > 0:
    combined[imputable_cols] = imputer.fit_transform(combined[imputable_cols])

# Fill categorical NaNs with "Unknown"
combined[cat_cols] = combined[cat_cols].fillna("Unknown")

print(" Missing values handled successfully.")

 Missing values handled successfully.


In [6]:
# Encode Categorical Columns
# Only columns that are still non-numeric are encoded. After a first run the
# columns listed in cat_cols hold float codes; pushing those through
# astype(str) again would order the codes as text ("10.0" < "2.0") and
# silently remap them, so re-running this cell must be a no-op. The guard
# also avoids fitting the encoder on zero columns, which raises an error.
pending_cat_cols = combined[cat_cols].select_dtypes(exclude=np.number).columns

if len(pending_cat_cols) > 0:
    # handle_unknown / unknown_value only take effect in later transform()
    # calls on labels that were not seen during this fit.
    encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
    combined[pending_cat_cols] = encoder.fit_transform(combined[pending_cat_cols].astype(str))

print(" Ordinal encoding done")

 Ordinal encoding done


In [7]:
# Domain-Specific Feature Engineering
# Example features — adapt as per available columns
# Each derived column is created only when all of its input columns exist.


def categorize_gdi(x):
    """Map a Green Deficit Index (GDI) value to its category label.

    Parameters
    ----------
    x : float
        A single GDI value.

    Returns
    -------
    str
        "Unknown" when x is missing (NaN/None),
        "Excellent (Net Gain)" when x <= -5,
        "Acceptable" when -5 < x <= 0,
        "Concerning" when 0 < x <= 10,
        "High-Risk" otherwise.
    """
    # Every comparison with NaN is False, so without this guard a missing
    # value would fall through to the final branch and be labelled "High-Risk".
    if pd.isna(x):
        return "Unknown"
    if x <= -5:
        return "Excellent (Net Gain)"
    if x <= 0:
        return "Acceptable"
    if x <= 10:
        return "Concerning"
    return "High-Risk"


# Difference between the 2010 and 2000 extent columns.
if "extent_2000_ha" in combined.columns and "extent_2010_ha" in combined.columns:
    combined["extent_change_2000_2010"] = combined["extent_2010_ha"] - combined["extent_2000_ha"]

# Row-wise sum over every column whose name contains "tc_loss_ha_".
# The isinstance check skips non-string headers, which cannot be searched.
loss_cols = [col for col in combined.columns if isinstance(col, str) and "tc_loss_ha_" in col]
if loss_cols:
    combined["total_tree_loss_ha_2001_2023"] = combined[loss_cols].sum(axis=1)

# The ratio needs the total-loss column as well as the gain column; checking
# both avoids a KeyError when no loss columns were found above.
# The +1 in the denominator presumably guards against division by zero.
if {"gain_2000-2020_ha", "total_tree_loss_ha_2001_2023"}.issubset(combined.columns):
    combined["gain_loss_ratio"] = combined["gain_2000-2020_ha"] / (combined["total_tree_loss_ha_2001_2023"] + 1)

# Green Deficit Index (GDI): (total loss - gain) relative to the 2000 extent.
# NOTE(review): GDI is a plain ratio (not multiplied by 100) while the
# category cut-offs are -5, 0 and 10 — confirm the intended scale.
if {"total_tree_loss_ha_2001_2023", "gain_2000-2020_ha", "extent_2000_ha"}.issubset(combined.columns):
    combined["GDI"] = (combined["total_tree_loss_ha_2001_2023"] - combined["gain_2000-2020_ha"]) / (combined["extent_2000_ha"] + 1)
    combined["GDI_Category"] = combined["GDI"].apply(categorize_gdi)

print(" Feature engineering completed.")

 Feature engineering completed.


In [8]:
# Save Clean File
# Write the engineered table to a CSV in the working directory; the row
# index is not written out.
combined.to_csv(path_or_buf="feature_engineered_greenpulse.csv", index=False)
print(" Saved cleaned dataset as 'feature_engineered_greenpulse.csv'")


 Saved cleaned dataset as 'feature_engineered_greenpulse.csv'
