In [1]:
import os
import json
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.model_selection import GroupKFold, GroupShuffleSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor

# --- Proje yolları ---
BASE_DIR = Path("..")                      # notebooks klasöründen bir üst
DATA_DIR = BASE_DIR / "data"
PROCESSED_DIR = DATA_DIR / "processed"
RAW_DIR = DATA_DIR / "raw"

BASE_DIR, PROCESSED_DIR


(WindowsPath('..'), WindowsPath('../data/processed'))

In [2]:
candidate_files = [
    PROCESSED_DIR / "districts_features_v3_predictions.csv",
    PROCESSED_DIR / "districts_features_v2_model.csv",
    PROCESSED_DIR / "districts_features_v1.csv",
]

features_path = None
for p in candidate_files:
    if p.exists():
        features_path = p
        break

if features_path is None:
    raise FileNotFoundError("Processed klasöründe features CSV bulamadım: " + str(candidate_files))

print("Kullanılan features dosyası:", features_path)

df = pd.read_csv(features_path)
print("Satır:", len(df), "Kolon:", df.shape[1])
df.head()


Kullanılan features dosyası: ..\data\processed\districts_features_v3_predictions.csv
Satır: 929 Kolon: 13


Unnamed: 0,province_name,district_name,lat,lon,avg_temp,avg_rain,treecover_pct,potential_treecover_pct,missing_treecover_pct,model_potential_treecover_pct,model_missing_treecover_pct,pred_treecover_pct,treecover_gap_pct
0,Adana,Aladağ,37.666642,35.387781,16.739615,0.0,54.58,10,0.0,44.543306,0.0,45.610237,0.0
1,Adana,Ceyhan,37.011888,35.768198,19.804077,0.0,0.05,10,9.95,27.848537,27.798537,37.492598,37.442598
2,Adana,Feke,37.871495,35.821754,10.716615,0.0,15.07,10,0.0,30.32221,15.25221,27.646413,12.576413
3,Adana,Karaisali,37.259147,35.142888,12.239538,0.0,8.33,10,1.67,16.481983,8.151983,18.05402,9.72402
4,Adana,Karataş,36.675979,35.229132,21.450077,0.0,0.0,5,5.0,3.265905,3.265905,3.663069,3.663069


In [3]:
def normalize_text(s: str) -> str:
    if pd.isna(s):
        return ""
    s = str(s).strip().lower()
    # çok temel normalize: ardışık boşlukları tek boşluk yap
    s = " ".join(s.split())
    return s

df["prov_norm"] = df["province_name"].apply(normalize_text)
df["dist_norm"] = df["district_name"].apply(normalize_text)

df[["province_name","district_name","prov_norm","dist_norm"]].head()


Unnamed: 0,province_name,district_name,prov_norm,dist_norm
0,Adana,Aladağ,adana,aladağ
1,Adana,Ceyhan,adana,ceyhan
2,Adana,Feke,adana,feke
3,Adana,Karaisali,adana,karaisali
4,Adana,Karataş,adana,karataş


In [7]:
from pathlib import Path

RAW_DIR = (Path("..") / "data" / "raw").resolve()
print("RAW_DIR:", RAW_DIR)

# raw içinde TUR_2.shp arayalım
matches = list(RAW_DIR.rglob("*TUR_2.shp"))

print("Bulunan shp sayısı:", len(matches))
for m in matches[:20]:
    print(m)


RAW_DIR: C:\Kodlar\greensense-ai\data\raw
Bulunan shp sayısı: 1
C:\Kodlar\greensense-ai\data\raw\admin_boundaries\gadm41_TUR_shp\gadm41_TUR_2.shp


In [9]:
from pathlib import Path
import geopandas as gpd

districts_shp = Path(r"C:\Kodlar\greensense-ai\data\raw\admin_boundaries\gadm41_TUR_shp\gadm41_TUR_2.shp")

print("SHP exists:", districts_shp.exists(), districts_shp)

gdf = gpd.read_file(districts_shp)
gdf[["NAME_1", "NAME_2"]].head()


SHP exists: True C:\Kodlar\greensense-ai\data\raw\admin_boundaries\gadm41_TUR_shp\gadm41_TUR_2.shp


Unnamed: 0,NAME_1,NAME_2
0,Adana,Aladağ
1,Adana,Ceyhan
2,Adana,Feke
3,Adana,İmamoğlu
4,Adana,Karaisali


In [10]:
# equal-area projeksiyon (alan doğru hesap için)
gdf_area = gdf.to_crs(epsg=6933)  # World Equidistant Cylindrical - equal-area

# m^2 -> hektar
gdf_area["area_ha"] = gdf_area.geometry.area / 10_000

# normalleştirilmiş join kolonları
gdf_area["prov_norm"] = gdf_area["NAME_1"].apply(normalize_text)
gdf_area["dist_norm"] = gdf_area["NAME_2"].apply(normalize_text)

gdf_small = gdf_area[["prov_norm","dist_norm","area_ha"]].copy()
gdf_small.head()


Unnamed: 0,prov_norm,dist_norm,area_ha
0,adana,aladağ,195789.563207
1,adana,ceyhan,150936.179743
2,adana,feke,150492.049089
3,adana,i̇mamoğlu,30007.531484
4,adana,karaisali,173589.725808


In [11]:
df = df.merge(gdf_small, on=["prov_norm","dist_norm"], how="left")

missing_area = df["area_ha"].isna().mean()
print("area_ha boş oranı:", missing_area)

# Çok az boş olmalı. Eğer yüksekse isim eşleşmesi bozuk demektir.
df[["province_name","district_name","area_ha"]].head()


area_ha boş oranı: 0.0


Unnamed: 0,province_name,district_name,area_ha
0,Adana,Aladağ,195789.563207
1,Adana,Ceyhan,150936.179743
2,Adana,Feke,150492.049089
3,Adana,Karaisali,173589.725808
4,Adana,Karataş,45217.646956


In [12]:
if "missing_treecover_pct" in df.columns:
    well = df[df["missing_treecover_pct"] <= 5].copy()
else:
    # fallback: mevcut treecover'ı yüksek olanlar "iyi ormanlı" kabul
    well = df[df["treecover_pct"] >= 70].copy()

print("Eğitim (iyi ormanlı) satır:", len(well))
well[["province_name","district_name","treecover_pct"]].head()


Eğitim (iyi ormanlı) satır: 272


Unnamed: 0,province_name,district_name,treecover_pct
0,Adana,Aladağ,54.58
2,Adana,Feke,15.07
3,Adana,Karaisali,8.33
4,Adana,Karataş,0.0
7,Adana,Saimbeyli,43.16


In [13]:
# Modelde KULLANILMAYAN kolonlar:
ID_COLS = ["province_name","district_name","prov_norm","dist_norm","lat","lon"]

# Zorunlu hedef
TARGET = "treecover_pct"

# Aday feature kolonları: sayısal olanlardan seç
numeric_cols = [c for c in df.columns if pd.api.types.is_numeric_dtype(df[c])]

# Hedef ve istemediğimizleri çıkar
feature_cols = [c for c in numeric_cols if c not in [TARGET] and c not in ["area_ha"]]

print("Kullanılan feature sayısı:", len(feature_cols))
feature_cols[:30], "..."


Kullanılan feature sayısı: 10


(['lat',
  'lon',
  'avg_temp',
  'avg_rain',
  'potential_treecover_pct',
  'missing_treecover_pct',
  'model_potential_treecover_pct',
  'model_missing_treecover_pct',
  'pred_treecover_pct',
  'treecover_gap_pct'],
 '...')

In [14]:
X = well[feature_cols].copy()
y = well[TARGET].values
groups = well["prov_norm"].values

preprocess = Pipeline([
    ("imp", SimpleImputer(strategy="median"))
])

models = {
    "RF": RandomForestRegressor(
        n_estimators=600,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=1
    ),
    "ExtraTrees": ExtraTreesRegressor(
        n_estimators=800,
        random_state=42,
        n_jobs=-1,
        max_depth=None,
        min_samples_leaf=1
    ),
    "HistGB": HistGradientBoostingRegressor(
        random_state=42,
        max_depth=8,
        learning_rate=0.06,
        max_iter=800
    )
}

def cv_score(model, X, y, groups, n_splits=5):
    gkf = GroupKFold(n_splits=n_splits)
    maes, rmses, r2s = [], [], []
    for tr, te in gkf.split(X, y, groups=groups):
        pipe = Pipeline([("prep", preprocess), ("model", model)])
        pipe.fit(X.iloc[tr], y[tr])
        pred = pipe.predict(X.iloc[te])
        maes.append(mean_absolute_error(y[te], pred))
        rmses.append(mean_squared_error(y[te], pred) ** 0.5)
        r2s.append(r2_score(y[te], pred))
    return {
        "mae_mean": float(np.mean(maes)), "mae_std": float(np.std(maes)),
        "rmse_mean": float(np.mean(rmses)), "rmse_std": float(np.std(rmses)),
        "r2_mean": float(np.mean(r2s)), "r2_std": float(np.std(r2s)),
    }

cv_results = {}
for name, mdl in models.items():
    res = cv_score(mdl, X, y, groups, n_splits=5)
    cv_results[name] = res
    print(name, res)


RF {'mae_mean': 5.42381569809204, 'mae_std': 0.9818773833465549, 'rmse_mean': 7.948109860492953, 'rmse_std': 1.4303035325782345, 'r2_mean': 0.9186387454081965, 'r2_std': 0.027455884363402395}
ExtraTrees {'mae_mean': 4.8022585227272625, 'mae_std': 0.7489173018743759, 'rmse_mean': 7.515738486976161, 'rmse_std': 1.4210587784872997, 'r2_mean': 0.9269865040524208, 'r2_std': 0.02515970068511816}
HistGB {'mae_mean': 7.288502929456982, 'mae_std': 1.336900043710903, 'rmse_mean': 9.913381534360592, 'rmse_std': 1.7472603864513103, 'r2_mean': 0.8715330512177404, 'r2_std': 0.04851760574297347}


In [15]:
best_name = sorted(cv_results.keys(), key=lambda k: (cv_results[k]["rmse_mean"], -cv_results[k]["r2_mean"]))[0]
best_base_model = models[best_name]
print("CV'ye göre en iyi:", best_name, cv_results[best_name])


CV'ye göre en iyi: ExtraTrees {'mae_mean': 4.8022585227272625, 'mae_std': 0.7489173018743759, 'rmse_mean': 7.515738486976161, 'rmse_std': 1.4210587784872997, 'r2_mean': 0.9269865040524208, 'r2_std': 0.02515970068511816}


In [16]:
pipe = Pipeline([("prep", preprocess), ("model", best_base_model)])

param_dist = {}
if best_name in ["RF","ExtraTrees"]:
    param_dist = {
        "model__n_estimators": [400, 600, 900, 1200],
        "model__max_depth": [None, 10, 20, 30],
        "model__min_samples_leaf": [1, 2, 4, 8],
        "model__min_samples_split": [2, 5, 10],
        "model__max_features": ["sqrt", 0.7, 0.9]
    }
elif best_name == "HistGB":
    param_dist = {
        "model__max_depth": [4, 6, 8, 10],
        "model__learning_rate": [0.03, 0.05, 0.07, 0.1],
        "model__max_iter": [400, 600, 800, 1200],
        "model__min_samples_leaf": [10, 20, 40, 80]
    }

# Province bazlı split için GroupShuffleSplit ile "validation" yapacağız
gss = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, val_idx = next(gss.split(X, y, groups=groups))

X_train, y_train = X.iloc[train_idx], y[train_idx]
X_val, y_val = X.iloc[val_idx], y[val_idx]
groups_train = groups[train_idx]

search = RandomizedSearchCV(
    pipe,
    param_distributions=param_dist,
    n_iter=25,
    scoring="neg_root_mean_squared_error",
    cv=GroupKFold(n_splits=5),
    random_state=42,
    n_jobs=-1,
    verbose=1
)

search.fit(X_train, y_train, groups=groups_train)
print("Best params:", search.best_params_)
print("Best CV RMSE:", -search.best_score_)
best_pipe = search.best_estimator_


Fitting 5 folds for each of 25 candidates, totalling 125 fits
Best params: {'model__n_estimators': 1200, 'model__min_samples_split': 5, 'model__min_samples_leaf': 2, 'model__max_features': 0.9, 'model__max_depth': 10}
Best CV RMSE: 8.348754414001522


In [17]:
val_pred = best_pipe.predict(X_val)
mae = mean_absolute_error(y_val, val_pred)
rmse = mean_squared_error(y_val, val_pred) ** 0.5
r2 = r2_score(y_val, val_pred)

print("VAL MAE:", round(mae,3))
print("VAL RMSE:", round(rmse,3))
print("VAL R2:", round(r2,3))


VAL MAE: 3.45
VAL RMSE: 4.988
VAL R2: 0.969


In [18]:
X_all = df[feature_cols].copy()
df["model_potential_treecover_pct"] = best_pipe.predict(X_all)

# mantıksal sınır: 0-100
df["model_potential_treecover_pct"] = df["model_potential_treecover_pct"].clip(0, 100)

# gap: potansiyel - mevcut (negatifse 0)
df["treecover_gap_pct"] = (df["model_potential_treecover_pct"] - df["treecover_pct"]).clip(lower=0, upper=100)

df[["province_name","district_name","treecover_pct","model_potential_treecover_pct","treecover_gap_pct"]].head()


Unnamed: 0,province_name,district_name,treecover_pct,model_potential_treecover_pct,treecover_gap_pct
0,Adana,Aladağ,54.58,57.872458,3.292458
1,Adana,Ceyhan,0.05,7.38032,7.33032
2,Adana,Feke,15.07,15.349908,0.279908
3,Adana,Karaisali,8.33,8.446646,0.116646
4,Adana,Karataş,0.0,0.247892,0.247892


In [19]:
TREES_PER_HA = 500  # senaryo parametresi: 300/500/800 gibi

# trees_needed = gap% * alan(ha) * ağaç/ha
df["trees_needed"] = (df["treecover_gap_pct"] / 100.0) * df["area_ha"] * TREES_PER_HA
df["trees_needed"] = df["trees_needed"].round().astype("Int64")

df[["province_name","district_name","area_ha","treecover_gap_pct","trees_needed"]].head()


Unnamed: 0,province_name,district_name,area_ha,treecover_gap_pct,trees_needed
0,Adana,Aladağ,195789.563207,3.292458,3223145
1,Adana,Ceyhan,150936.179743,7.33032,5532052
2,Adana,Feke,150492.049089,0.279908,210620
3,Adana,Karaisali,173589.725808,0.116646,101243
4,Adana,Karataş,45217.646956,0.247892,56046


In [20]:
out = df[[
    "province_name","district_name",
    "prov_norm","dist_norm",
    "treecover_pct","model_potential_treecover_pct","treecover_gap_pct",
    "area_ha","trees_needed"
]].copy()

# CSV kaydet
csv_out_path = PROCESSED_DIR / "districts_trees_needed_final.csv"
out.to_csv(csv_out_path, index=False)

# lookup json
lookup = {}
for _, r in out.iterrows():
    key = f"{r['prov_norm']}|{r['dist_norm']}"
    lookup[key] = {
        "province_name": r["province_name"],
        "district_name": r["district_name"],
        "trees_needed": None if pd.isna(r["trees_needed"]) else int(r["trees_needed"]),
        "treecover_pct": float(r["treecover_pct"]),
        "potential_treecover_pct": float(r["model_potential_treecover_pct"]),
        "gap_pct": float(r["treecover_gap_pct"]),
        "area_ha": float(r["area_ha"]) if not pd.isna(r["area_ha"]) else None,
        "trees_per_ha": TREES_PER_HA
    }

json_out_path = PROCESSED_DIR / "districts_trees_needed_lookup.json"
with open(json_out_path, "w", encoding="utf-8") as f:
    json.dump(lookup, f, ensure_ascii=False, indent=2)

csv_out_path, json_out_path, len(lookup)


(WindowsPath('../data/processed/districts_trees_needed_final.csv'),
 WindowsPath('../data/processed/districts_trees_needed_lookup.json'),
 929)

In [21]:
import joblib

MODELS_DIR = BASE_DIR / "models"
MODELS_DIR.mkdir(exist_ok=True)

model_path = MODELS_DIR / "best_treecover_potential_model.joblib"
joblib.dump({
    "model": best_pipe,
    "feature_cols": feature_cols,
    "train_filter_size": len(well),
    "trees_per_ha": TREES_PER_HA
}, model_path)

model_path


WindowsPath('../models/best_treecover_potential_model.joblib')

In [22]:
import numpy as np
import pandas as pd
import json
from pathlib import Path

BASE_DIR = Path("..")
PROCESSED_DIR = BASE_DIR / "data" / "processed"

TREES_PER_HA = 500
FEASIBLE_FRACTION = 0.15
YEARS_TARGET = 10
ANNUAL_CAP = 200_000

def normalize_text(s: str) -> str:
    if s is None or (isinstance(s, float) and np.isnan(s)):
        return ""
    s = str(s).strip().lower()
    s = " ".join(s.split())
    return s

# df hazır olmalı. df yoksa doğru dosyayı oku:
# df = pd.read_csv(PROCESSED_DIR / "districts_trees_needed_final.csv")

df2 = df.copy()

df2["treecover_pct_out"] = df2["treecover_pct"].mask(df2["treecover_pct"] == 0, np.nan)
df2["has_treecover_data"] = df2["treecover_pct_out"].notna()

lookup = {}

for _, r in df2.iterrows():
    prov = normalize_text(r["province_name"])
    dist = normalize_text(r["district_name"])
    key = f"{prov}|{dist}"

    treecover = r["treecover_pct_out"]
    has_data = bool(r["has_treecover_data"])

    potential = float(r["model_potential_treecover_pct"])
    area_ha = float(r["area_ha"])

    item = {
        "province_name": r["province_name"],
        "district_name": r["district_name"],
        "treecover_pct": None if pd.isna(treecover) else float(treecover),
        "has_treecover_data": has_data,
        "potential_treecover_pct": potential,
        "area_ha": area_ha,
        "trees_per_ha": TREES_PER_HA,
        "years_target": YEARS_TARGET,
        "feasible_fraction": FEASIBLE_FRACTION,
        "annual_cap": ANNUAL_CAP,
    }

    if has_data:
        gap_pct = max(0.0, min(100.0, potential - float(treecover)))
        trees_theoretical = int(round((gap_pct / 100.0) * area_ha * TREES_PER_HA))
        trees_feasible = int(round(trees_theoretical * FEASIBLE_FRACTION))
        annual = int(round(trees_feasible / YEARS_TARGET))
        annual_capped = int(min(annual, ANNUAL_CAP))

        item.update({
            "gap_pct": gap_pct,
            "trees_needed_theoretical": trees_theoretical,
            "trees_needed_feasible": trees_feasible,
            "annual_trees_needed": annual,
            "annual_trees_needed_capped": annual_capped,
        })
    else:
        item.update({
            "gap_pct": None,
            "trees_needed_theoretical": None,
            "trees_needed_feasible": None,
            "annual_trees_needed": None,
            "annual_trees_needed_capped": None,
        })

    lookup[key] = item

out_path = PROCESSED_DIR / "districts_trees_needed_lookup.json"
with open(out_path, "w", encoding="utf-8") as f:
    json.dump(lookup, f, ensure_ascii=False, indent=2)

print("OK:", out_path, "kayıt:", len(lookup))


OK: ..\data\processed\districts_trees_needed_lookup.json kayıt: 929
