In [1]:
import pandas as pd
import numpy as np

from pathlib import Path

from sklearn.model_selection import GroupKFold, RandomizedSearchCV
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import HistGradientBoostingRegressor


In [2]:
# Project root: one level above the notebooks folder this notebook runs from.
BASE_DIR = Path("..")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")

# District feature table that already carries earlier prediction columns.
features_path = PROCESSED_DIR.joinpath("districts_features_v3_predictions.csv")
features_path


WindowsPath('../data/processed/districts_features_v3_predictions.csv')

In [3]:
# Load the district feature table, then display its shape and column index.
df = pd.read_csv(features_path)
df.shape, df.columns


((929, 13),
 Index(['province_name', 'district_name', 'lat', 'lon', 'avg_temp', 'avg_rain',
        'treecover_pct', 'potential_treecover_pct', 'missing_treecover_pct',
        'model_potential_treecover_pct', 'model_missing_treecover_pct',
        'pred_treecover_pct', 'treecover_gap_pct'],
       dtype='object'))

In [5]:
import geopandas as gpd


In [9]:
import os
from pathlib import Path

# Print the working directory via os and via pathlib to confirm where relative paths resolve.
print("CWD:", os.getcwd())
print("CWD path:", Path().resolve())


CWD: C:\Kodlar\greensense-ai\notebooks
CWD path: C:\Kodlar\greensense-ai\notebooks


In [10]:
from pathlib import Path

# Absolute project root: the parent of the notebooks folder.
BASE_DIR = Path("..").resolve()
districts_shp = BASE_DIR.joinpath(
    "data", "raw", "admin_boundaries", "gadm41_TUR_shp", "gadm41_TUR_2.shp"
)

# Report the resolved location and whether the shapefile is actually there.
print("Path:", districts_shp)
print("Exists:", districts_shp.exists())


Path: C:\Kodlar\greensense-ai\data\raw\admin_boundaries\gadm41_TUR_shp\gadm41_TUR_2.shp
Exists: True


In [11]:
import geopandas as gpd

# Read the shapefile with the pyogrio engine first; on any failure, print the
# error and retry the same file with the fiona engine.
try:
    gdf = gpd.read_file(districts_shp, engine="pyogrio")
except Exception as e1:
    print("pyogrio failed:", e1)
    gdf = gpd.read_file(districts_shp, engine="fiona")

# Display the first rows, the coordinate reference system and the column index.
gdf.head(), gdf.crs, gdf.columns


(       GID_2 GID_0 COUNTRY    GID_1 NAME_1 NL_NAME_1     NAME_2 VARNAME_2  \
 0  TUR.1.1_1   TUR  Turkey  TUR.1_1  Adana        NA     Aladağ        NA   
 1  TUR.1.2_1   TUR  Turkey  TUR.1_1  Adana        NA     Ceyhan        NA   
 2  TUR.1.3_1   TUR  Turkey  TUR.1_1  Adana        NA       Feke        NA   
 3  TUR.1.4_1   TUR  Turkey  TUR.1_1  Adana        NA   İmamoğlu        NA   
 4  TUR.1.5_1   TUR  Turkey  TUR.1_1  Adana        NA  Karaisali        NA   
 
   NL_NAME_2    TYPE_2 ENGTYPE_2 CC_2    HASC_2  \
 0        NA  District  District   NA  TR.AA.AL   
 1        NA  District  District   NA  TR.AA.CE   
 2        NA  District  District   NA  TR.AA.FE   
 3        NA  District  District   NA  TR.AA.IM   
 4        NA  District  District   NA  TR.AA.KS   
 
                                             geometry  
 0  POLYGON ((35.58969 37.94585, 35.58018 37.9321,...  
 1  POLYGON ((35.93405 36.87986, 35.93375 36.87986...  
 2  POLYGON ((35.64175 37.70995, 35.63527 37.71608... 

In [12]:
# Columns whose name contains "NAME" (case-insensitive) are the naming candidates.
name_cols = list(filter(lambda col: "NAME" in col.upper(), gdf.columns))
name_cols


['NAME_1', 'NL_NAME_1', 'NAME_2', 'VARNAME_2', 'NL_NAME_2']

In [13]:
# Reproject to the equal-area CRS EPSG:6933 and derive each polygon's area,
# dividing by 10,000 to express it in hectares.
gdf_area = gdf.to_crs(epsg=6933).assign(
    area_ha=lambda frame: frame.geometry.area / 10_000
)

gdf_area[["NAME_1", "NAME_2", "area_ha"]].head()


Unnamed: 0,NAME_1,NAME_2,area_ha
0,Adana,Aladağ,195789.563207
1,Adana,Ceyhan,150936.179743
2,Adana,Feke,150492.049089
3,Adana,İmamoğlu,30007.531484
4,Adana,Karaisali,173589.725808


In [17]:
# 1) Rename the shapefile's name columns to the project's standard column names.
gdf_area = gdf_area.rename(
    columns=dict(NAME_1="province_name", NAME_2="district_name")
)

# 2) Keep only the columns that are needed downstream.
gdf_small = gdf_area[["province_name", "district_name", "area_ha"]].copy()

gdf_small.head()


Unnamed: 0,province_name,district_name,area_ha
0,Adana,Aladağ,195789.563207
1,Adana,Ceyhan,150936.179743
2,Adana,Feke,150492.049089
3,Adana,İmamoğlu,30007.531484
4,Adana,Karaisali,173589.725808


In [18]:
import pandas as pd
from pathlib import Path

BASE_DIR = Path("..")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")

# Re-read the prediction table from disk so this cell starts from a clean frame.
pred_path = PROCESSED_DIR.joinpath("districts_features_v3_predictions.csv")
df = pd.read_csv(pred_path)

df.head()


Unnamed: 0,province_name,district_name,lat,lon,avg_temp,avg_rain,treecover_pct,potential_treecover_pct,missing_treecover_pct,model_potential_treecover_pct,model_missing_treecover_pct,pred_treecover_pct,treecover_gap_pct
0,Adana,Aladağ,37.666642,35.387781,16.739615,0.0,54.58,10,0.0,44.543306,0.0,45.610237,0.0
1,Adana,Ceyhan,37.011888,35.768198,19.804077,0.0,0.05,10,9.95,27.848537,27.798537,37.492598,37.442598
2,Adana,Feke,37.871495,35.821754,10.716615,0.0,15.07,10,0.0,30.32221,15.25221,27.646413,12.576413
3,Adana,Karaisali,37.259147,35.142888,12.239538,0.0,8.33,10,1.67,16.481983,8.151983,18.05402,9.72402
4,Adana,Karataş,36.675979,35.229132,21.450077,0.0,0.0,5,5.0,3.265905,3.265905,3.663069,3.663069


In [19]:
import re

def normalize_tr(s):
    """Build a lower-case ASCII join key from a Turkish place name.

    Parameters
    ----------
    s : object
        A single name; missing values (anything ``pd.isna`` accepts) are allowed.

    Returns
    -------
    str
        The trimmed, lower-cased name with Turkish letters folded to ASCII and
        runs of whitespace collapsed to one space. Missing input gives "".
    """
    if pd.isna(s):
        return ""
    text = str(s).strip()
    # Fold the dotted capital before lower-casing: str.lower() maps "İ" to
    # "i" + U+0307 (combining dot above), so replacing "İ" after lower() never
    # matches and leaves an invisible combining mark in the key.
    text = text.replace("İ", "i").lower()
    # Drop combining dots left over from input that was already decomposed.
    text = text.replace("\u0307", "")
    text = text.replace("ı", "i")
    text = text.replace("ğ", "g").replace("ü", "u").replace("ş", "s").replace("ö", "o").replace("ç", "c")
    text = re.sub(r"\s+", " ", text)   # collapse runs of whitespace into one space
    return text

# Add the same normalized province/district keys to the CSV table first,
# then to the shapefile table, so the two can be joined on them.
for frame in (df, gdf_small):
    frame["prov_key"] = frame["province_name"].map(normalize_tr)
    frame["dist_key"] = frame["district_name"].map(normalize_tr)


In [20]:
# Left-join the district areas onto the CSV rows via the normalized keys.
df2 = pd.merge(
    df,
    gdf_small[["prov_key", "dist_key", "area_ha"]],
    how="left",
    on=["prov_key", "dist_key"],
)

df2[["province_name", "district_name", "area_ha"]].head()


Unnamed: 0,province_name,district_name,area_ha
0,Adana,Aladağ,195789.563207
1,Adana,Ceyhan,150936.179743
2,Adana,Feke,150492.049089
3,Adana,Karaisali,173589.725808
4,Adana,Karataş,45217.646956


In [21]:
# Share and count of rows that received no area from the join.
missing_rate = df2["area_ha"].isna().mean()
missing_count = df2["area_ha"].isna().sum()

missing_rate, missing_count


(0.0, 0)

In [22]:
import pandas as pd
import numpy as np
from pathlib import Path

BASE_DIR = Path("..")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")

# Prediction file: use the most recent table as the modelling input.
data_path = PROCESSED_DIR.joinpath("districts_features_v3_predictions.csv")
df = pd.read_csv(data_path)

df.shape, df.columns


((929, 13),
 Index(['province_name', 'district_name', 'lat', 'lon', 'avg_temp', 'avg_rain',
        'treecover_pct', 'potential_treecover_pct', 'missing_treecover_pct',
        'model_potential_treecover_pct', 'model_missing_treecover_pct',
        'pred_treecover_pct', 'treecover_gap_pct'],
       dtype='object'))

In [23]:
target = "treecover_pct"

# Candidate model features; any that the loaded table lacks are dropped
# automatically by the membership filter.
feature_cols = [
    col
    for col in (
        "avg_temp",
        "avg_rain",
        "temp_std_10yr",
        "precip_std_10yr",
        "temp_min_10yr",
        "temp_max_10yr",
    )
    if col in df.columns
]
feature_cols


['avg_temp', 'avg_rain']

In [24]:
# Keep only rows where every selected feature and the target are present.
df_ml = df.dropna(subset=[*feature_cols, target]).copy()

# Clamp the target to its physical range (0-100).
df_ml[target] = df_ml[target].clip(lower=0, upper=100)

df_ml.shape


(929, 13)

In [25]:
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

X, y = df_ml[feature_cols], df_ml[target]

# Hold out 20 % of the rows; cross-validation below only sees the other 80 %.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

pipe = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("model", RandomForestRegressor(
        n_estimators=600,
        min_samples_leaf=2,
        max_depth=None,
        n_jobs=-1,
        random_state=42,
    )),
])

# Cross-validate for a more reliable estimate than a single split.
cv = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = dict(
    mae="neg_mean_absolute_error",
    rmse="neg_root_mean_squared_error",
    r2="r2",
)

cv_res = cross_validate(
    pipe, X_train, y_train, scoring=scoring, cv=cv, return_train_score=True
)

def summarize_neg(scores):
    """Return ``(mean, std)`` of the sign-flipped scores.

    Used for the ``neg_*`` scorers, whose values are negated errors.
    """
    errors = -scores
    mean_error = errors.mean()
    spread = errors.std()
    return mean_error, spread

# Error metrics come from neg_* scorers, so they go through summarize_neg.
for label, key in (
    ("CV Train MAE:", "train_mae"),
    ("CV Val   MAE:", "test_mae"),
    ("CV Train RMSE:", "train_rmse"),
    ("CV Val   RMSE:", "test_rmse"),
):
    print(label, summarize_neg(cv_res[key]))

# R2 is not negated, so report its mean and std directly.
for label, key in (("CV Train R2:", "train_r2"), ("CV Val   R2:", "test_r2")):
    print(label, (cv_res[key].mean(), cv_res[key].std()))


CV Train MAE: (10.236720794065628, 0.16276096132864545)
CV Val   MAE: (14.45889624245263, 0.8020543866749623)
CV Train RMSE: (16.519001395359265, 0.23657333644295825)
CV Val   RMSE: (22.850418250884722, 0.7232226659123262)
CV Train R2: (0.5672156620248022, 0.011910970245913097)
CV Val   R2: (0.1648136964144332, 0.050707632948256)


In [26]:
# Fit on the whole training split, then score once on the held-out split.
pred = pipe.fit(X_train, y_train).predict(X_test)

mae, r2 = mean_absolute_error(y_test, pred), r2_score(y_test, pred)
rmse = mean_squared_error(y_test, pred) ** 0.5  # square root of the MSE

mae, rmse, r2


(13.092960488438264, 21.937813388140047, 0.34996522511848316)

In [27]:
# Predict for every row (the pipeline's imputer fills missing feature values)
# and clamp the result to 0-100 in the same step.
df["pred_treecover_pct"] = pipe.predict(df[feature_cols]).clip(0, 100)

out_path = PROCESSED_DIR.joinpath("districts_features_v4_pred_treecover.csv")
df.to_csv(out_path, index=False)

out_path


WindowsPath('../data/processed/districts_features_v4_pred_treecover.csv')

In [28]:
import pandas as pd
import numpy as np
from pathlib import Path

BASE_DIR = Path("..")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")

df = pd.read_csv(PROCESSED_DIR.joinpath("districts_features_v4_pred_treecover.csv"))

target = "treecover_pct"

# Numeric candidates; only those present in the table are kept.
num_features = [
    col
    for col in (
        "avg_temp", "avg_rain",
        "temp_std_10yr", "precip_std_10yr",
        "temp_min_10yr", "temp_max_10yr",
        "lat", "lon",
    )
    if col in df.columns
]

# The province name is the single categorical feature, when available.
cat_features = ["province_name"] if "province_name" in df.columns else []

# Rows without a target cannot be trained on; clamp the rest to 0-100.
df_ml = df.dropna(subset=[target]).copy()
df_ml[target] = df_ml[target].clip(lower=0, upper=100)

X, y = df_ml[num_features + cat_features], df_ml[target]

X.shape, y.shape, num_features, cat_features


((929, 5), (929,), ['avg_temp', 'avg_rain', 'lat', 'lon'], ['province_name'])

In [35]:
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import HistGradientBoostingRegressor

from sklearn.ensemble import ExtraTreesRegressor, RandomForestRegressor

# Dense one-hot encoding; handle_unknown="ignore" keeps provinces that were
# not seen during fit from raising at predict time.
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Median-impute the numeric columns; mode-impute then one-hot the categorical ones.
preprocess = ColumnTransformer(
    transformers=[
        ("num", SimpleImputer(strategy="median"), num_features),
        ("cat", Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", ohe)
        ]), cat_features)
    ],
    remainder="drop"
)

# Define every candidate here so the cell runs on a fresh kernel. Previously
# only models["HistGB"] was assigned, which raises NameError unless `models`
# is left over from another cell.
# NOTE(review): the RF / ExtraTrees settings mirror the later comparison cell;
# the settings behind the scores printed below are not in this notebook — confirm.
models = {
    "RF": RandomForestRegressor(n_estimators=600, random_state=42, n_jobs=-1),
    "ExtraTrees": ExtraTreesRegressor(n_estimators=800, random_state=42, n_jobs=-1),
    "HistGB": HistGradientBoostingRegressor(
        random_state=42,
        max_depth=6,
        learning_rate=0.06,
        max_iter=800
    ),
}

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = {"mae":"neg_mean_absolute_error","rmse":"neg_root_mean_squared_error","r2":"r2"}

def summarize(name, res):
    """Print one line with the mean validation MAE, RMSE and R2 of a CV result.

    ``res`` is indexed by "test_mae", "test_rmse" and "test_r2"; the two error
    entries are negated before averaging.
    """
    mae_mean, rmse_mean = ((-res[key]).mean() for key in ("test_mae", "test_rmse"))
    r2_mean = res["test_r2"].mean()
    print(f"{name:10s} | Val MAE: {mae_mean:.2f} | Val RMSE: {rmse_mean:.2f} | Val R2: {r2_mean:.3f}")

# Cross-validate each candidate behind the shared preprocessing and print its summary line.
results = {}
for name in models:
    pipe = Pipeline(steps=[("prep", preprocess), ("model", models[name])])
    res = cross_validate(pipe, X_train, y_train, scoring=scoring, cv=cv)
    summarize(name, res)
    results[name] = res


RF         | Val MAE: 11.75 | Val RMSE: 19.98 | Val R2: 0.364
ExtraTrees | Val MAE: 11.89 | Val RMSE: 20.67 | Val R2: 0.320
HistGB     | Val MAE: 13.92 | Val RMSE: 21.45 | Val R2: 0.257


In [36]:
from sklearn.ensemble import HistGradientBoostingRegressor

# Register the histogram gradient-boosting candidate under the "HistGB" key.
# NOTE(review): the previous cell already registers a HistGB model with these
# same settings — confirm whether this cell can be removed.
models["HistGB"] = HistGradientBoostingRegressor(
    random_state=42,
    max_depth=6,
    learning_rate=0.06,
    max_iter=800
)


In [37]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
import joblib
from pathlib import Path

# Final model: a random forest behind the shared preprocessing, fitted on all
# of X / y (not just the training split).
best_pipe = Pipeline(steps=[
    ("prep", preprocess),
    ("model", RandomForestRegressor(
        n_estimators=1200,
        min_samples_leaf=2,
        n_jobs=-1,
        random_state=42,
    )),
])
best_pipe.fit(X, y)

BASE_DIR = Path("..")
MODELS_DIR = BASE_DIR.joinpath("models")
MODELS_DIR.mkdir(exist_ok=True)

# Persist the fitted pipeline and report where it was written.
joblib.dump(best_pipe, MODELS_DIR.joinpath("treecover_model.joblib"))
print("saved:", MODELS_DIR.joinpath("treecover_model.joblib"))


saved: ..\models\treecover_model.joblib


In [38]:
import pandas as pd
from pathlib import Path
import joblib

BASE_DIR = Path("..")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")

df_all = pd.read_csv(PROCESSED_DIR.joinpath("districts_features_v4_pred_treecover.csv"))

# Load the persisted pipeline from disk instead of reusing the in-memory object.
model = joblib.load(BASE_DIR.joinpath("models", "treecover_model.joblib"))

# Predict for every row and clamp to 0-100 in one step.
df_all["pred_treecover_pct"] = model.predict(df_all[num_features + cat_features]).clip(0, 100)

df_all[["province_name", "district_name", "treecover_pct", "pred_treecover_pct"]].head()


Unnamed: 0,province_name,district_name,treecover_pct,pred_treecover_pct
0,Adana,Aladağ,54.58,23.306582
1,Adana,Ceyhan,0.05,1.23131
2,Adana,Feke,15.07,24.76177
3,Adana,Karaisali,8.33,6.188967
4,Adana,Karataş,0.0,1.974127


In [40]:
import pandas as pd
import geopandas as gpd
from pathlib import Path

BASE_DIR = Path("..")
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")
RAW_DIR = BASE_DIR.joinpath("data", "raw")

# Start again from the saved v4 table.
csv_path = PROCESSED_DIR.joinpath("districts_features_v4_pred_treecover.csv")
df = pd.read_csv(csv_path)
df.head(), df.shape


(  province_name district_name        lat        lon   avg_temp  avg_rain  \
 0         Adana        Aladağ  37.666642  35.387781  16.739615       0.0   
 1         Adana        Ceyhan  37.011888  35.768198  19.804077       0.0   
 2         Adana          Feke  37.871495  35.821754  10.716615       0.0   
 3         Adana     Karaisali  37.259147  35.142888  12.239538       0.0   
 4         Adana       Karataş  36.675979  35.229132  21.450077       0.0   
 
    treecover_pct  potential_treecover_pct  missing_treecover_pct  \
 0          54.58                       10                   0.00   
 1           0.05                       10                   9.95   
 2          15.07                       10                   0.00   
 3           8.33                       10                   1.67   
 4           0.00                        5                   5.00   
 
    model_potential_treecover_pct  model_missing_treecover_pct  \
 0                      44.543306                     

In [41]:
# Path of the boundary shapefile under the raw-data folder; display whether it exists.
shp_path = RAW_DIR / "admin_boundaries" / "gadm41_TUR_shp" / "gadm41_TUR_2.shp"
shp_path.exists(), shp_path


(True,
 WindowsPath('../data/raw/admin_boundaries/gadm41_TUR_shp/gadm41_TUR_2.shp'))

In [42]:
# Read the boundary shapefile with geopandas' default engine and list its columns.
gdf = gpd.read_file(shp_path)
gdf.columns


Index(['GID_2', 'GID_0', 'COUNTRY', 'GID_1', 'NAME_1', 'NL_NAME_1', 'NAME_2',
       'VARNAME_2', 'NL_NAME_2', 'TYPE_2', 'ENGTYPE_2', 'CC_2', 'HASC_2',
       'geometry'],
      dtype='object')

In [43]:
# Equal-area projection (EPSG:6933) so that polygon areas are meaningful.
gdf_area = gdf.to_crs(epsg=6933)
# m2 -> ha: one hectare is 10,000 square metres.
gdf_area = gdf_area.assign(area_ha=gdf_area.geometry.area.div(10_000))

gdf_area[["NAME_1", "NAME_2", "area_ha"]].head()


Unnamed: 0,NAME_1,NAME_2,area_ha
0,Adana,Aladağ,195789.563207
1,Adana,Ceyhan,150936.179743
2,Adana,Feke,150492.049089
3,Adana,İmamoğlu,30007.531484
4,Adana,Karaisali,173589.725808


In [44]:
def norm(s):
    """Return a string Series usable as a join key.

    Values are cast to ``str``, stripped, have the Turkish letters İ ı Ş ş Ğ ğ
    Ü ü Ö ö Ç ç replaced by I i S s G g U u O o C c, and have each run of
    whitespace collapsed to a single space. Letter case is otherwise kept.
    """
    ascii_map = str.maketrans({
        "İ": "I", "ı": "i",
        "Ş": "S", "ş": "s",
        "Ğ": "G", "ğ": "g",
        "Ü": "U", "ü": "u",
        "Ö": "O", "ö": "o",
        "Ç": "C", "ç": "c",
    })
    cleaned = s.astype(str).str.strip().str.translate(ascii_map)
    return cleaned.str.replace(r"\s+", " ", regex=True)

# Normalized join keys on the CSV side ...
df["prov_norm"] = norm(df["province_name"])
df["dist_norm"] = norm(df["district_name"])

# ... and on the shapefile side.
gdf_area["prov_norm"] = norm(gdf_area["NAME_1"])
gdf_area["dist_norm"] = norm(gdf_area["NAME_2"])

gdf_small = gdf_area[["prov_norm","dist_norm","area_ha"]].copy()

# Drop any area_ha left by a previous run of this cell; otherwise the merge
# would create area_ha_x / area_ha_y and the lookups below would raise KeyError.
n_rows_before = len(df)
df = df.drop(columns=["area_ha"], errors="ignore").merge(
    gdf_small, on=["prov_norm","dist_norm"], how="left"
)

# A left join repeats a district once per matching shapefile row; fail loudly
# rather than let duplicated rows inflate later per-district totals.
assert len(df) == n_rows_before, "area join duplicated rows: shapefile keys are not unique"

df["area_ha"].isna().mean(), df["area_ha"].min(), df["area_ha"].max()


(0.0, 588.676854927398, 444064.0650180786)

In [45]:
# Display the column index after the area join.
df.columns


Index(['province_name', 'district_name', 'lat', 'lon', 'avg_temp', 'avg_rain',
       'treecover_pct', 'potential_treecover_pct', 'missing_treecover_pct',
       'model_potential_treecover_pct', 'model_missing_treecover_pct',
       'pred_treecover_pct', 'treecover_gap_pct', 'prov_norm', 'dist_norm',
       'area_ha'],
      dtype='object')

In [46]:
# Training subset: rows whose missing_treecover_pct is at most 5.
# NOTE(review): the describe() output of this cell shows min treecover_pct of 0.0,
# so this filter also admits districts with no tree cover at all — confirm that
# this matches the "already forested" intent.
train_df = df[df["missing_treecover_pct"] <= 5].copy()  # treated as already forested
len(train_df), train_df["treecover_pct"].describe()


(272,
 count    272.000000
 mean      44.402684
 std       29.249909
 min        0.000000
 25%       18.497500
 50%       38.500000
 75%       71.380000
 max       98.720000
 Name: treecover_pct, dtype: float64)

In [47]:
from sklearn.model_selection import train_test_split, KFold, cross_validate
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor

# Use whichever of these candidate columns exist in the table.
candidate_cols = [
    "avg_temp",
    "avg_rain",
    "avg_temp_10yr",
    "avg_precip_10yr",
    "temp_std_10yr",
    "precip_std_10yr",
    "temp_min_10yr",
    "temp_max_10yr",
]
feature_cols = [name for name in candidate_cols if name in df.columns]
feature_cols


['avg_temp', 'avg_rain']

In [48]:
X, y = train_df[feature_cols], train_df["treecover_pct"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Candidate regressors for the potential-tree-cover model.
models = dict(
    RF=RandomForestRegressor(n_estimators=600, random_state=42, n_jobs=-1),
    ExtraTrees=ExtraTreesRegressor(n_estimators=800, random_state=42, n_jobs=-1),
    HistGB=HistGradientBoostingRegressor(
        random_state=42, max_depth=8, learning_rate=0.06, max_iter=1200
    ),
)

cv = KFold(n_splits=5, shuffle=True, random_state=42)
scoring = dict(
    mae="neg_mean_absolute_error",
    rmse="neg_root_mean_squared_error",
    r2="r2",
)

def summarize(name, res):
    """Print the mean validation MAE, RMSE and R2 found in ``res`` on one line.

    The "test_mae" and "test_rmse" entries are negated before averaging;
    "test_r2" is averaged as is.
    """
    means = {
        "mae": (-res["test_mae"]).mean(),
        "rmse": (-res["test_rmse"]).mean(),
        "r2": res["test_r2"].mean(),
    }
    print(f"{name:10s} | Val MAE: {means['mae']:.2f} | Val RMSE: {means['rmse']:.2f} | Val R2: {means['r2']:.3f}")

results = {}

# Cross-validate each candidate behind a median imputer and print its summary line.
for name, mdl in models.items():
    pipe = Pipeline(steps=[("imp", SimpleImputer(strategy="median")), ("model", mdl)])
    results[name] = res = cross_validate(
        pipe, X_train, y_train, scoring=scoring, cv=cv
    )
    summarize(name, res)


RF         | Val MAE: 25.52 | Val RMSE: 31.35 | Val R2: -0.239
ExtraTrees | Val MAE: 26.88 | Val RMSE: 33.07 | Val R2: -0.380
HistGB     | Val MAE: 25.86 | Val RMSE: 30.88 | Val R2: -0.197


In [49]:
# Fit the HistGB candidate on the training split (X_train / y_train) only.
best_pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("model", models["HistGB"]),
])
best_pipe.fit(X_train, y_train)

# Predicted potential tree cover for every district, clamped to 0-100.
df["model_potential_treecover_pct"] = best_pipe.predict(df[feature_cols]).clip(0, 100)

# Gap = potential minus current cover, floored at zero.
df["treecover_gap_pct"] = (
    df["model_potential_treecover_pct"].sub(df["treecover_pct"]).clip(lower=0)
)
df[[
    "province_name",
    "district_name",
    "treecover_pct",
    "model_potential_treecover_pct",
    "treecover_gap_pct",
]].head()


Unnamed: 0,province_name,district_name,treecover_pct,model_potential_treecover_pct,treecover_gap_pct
0,Adana,Aladağ,54.58,41.043211,0.0
1,Adana,Ceyhan,0.05,38.026676,37.976676
2,Adana,Feke,15.07,21.565607,6.495607
3,Adana,Karaisali,8.33,24.969186,16.639186
4,Adana,Karataş,0.0,38.026676,38.026676


In [50]:
TREES_PER_HA = 500

# Gap fraction x district area (ha) x trees per hectare, rounded to whole
# trees and stored as nullable integers.
df["trees_needed"] = (
    df["treecover_gap_pct"]
    .div(100.0)
    .mul(df["area_ha"])
    .mul(TREES_PER_HA)
    .round()
    .astype("Int64")
)

df[["province_name", "district_name", "area_ha", "treecover_gap_pct", "trees_needed"]].head()


Unnamed: 0,province_name,district_name,area_ha,treecover_gap_pct,trees_needed
0,Adana,Aladağ,195789.563207,0.0,0
1,Adana,Ceyhan,150936.179743,37.976676,28660272
2,Adana,Feke,150492.049089,6.495607,4887686
3,Adana,Karaisali,173589.725808,16.639186,14441959
4,Adana,Karataş,45217.646956,38.026676,8597384


In [51]:
# Columns to export, in output order.
out = df.loc[:, [
    "province_name",
    "district_name",
    "trees_needed",
    "treecover_pct",
    "model_potential_treecover_pct",
    "treecover_gap_pct",
]].copy()
out_path = PROCESSED_DIR.joinpath("districts_trees_needed.csv")
out.to_csv(out_path, index=False)
out_path


WindowsPath('../data/processed/districts_trees_needed.csv')

In [52]:
# 1) Required-column check
need_cols = [
    "province_name",
    "district_name",
    "area_ha",
    "treecover_pct",
    "model_potential_treecover_pct",
    "treecover_gap_pct",
]
missing = [col for col in need_cols if col not in df.columns]
print("Eksik kolonlar:", missing)

# 2) area_ha checks
print("area_ha NaN oranı:", df["area_ha"].isna().mean())
print("area_ha min / max:", df["area_ha"].min(), df["area_ha"].max())

# 3) gap checks (values should lie between 0 and 100)
print("gap min / max:", df["treecover_gap_pct"].min(), df["treecover_gap_pct"].max())
print("gap negatif var mı?:", df["treecover_gap_pct"].lt(0).any())
print("gap > 100 var mı?:", df["treecover_gap_pct"].gt(100).any())


Eksik kolonlar: []
area_ha NaN oranı: 0.0
area_ha min / max: 588.676854927398 444064.0650180786
gap min / max: 0.0 75.75612984670919
gap negatif var mı?: False
gap > 100 var mı?: False


In [53]:
TREES_PER_HA = 500  # scenario parameter (can be changed later)

# Safety: clamp the potential and current cover to 0-100, then rebuild the gap.
for pct_col in ("model_potential_treecover_pct", "treecover_pct"):
    df[pct_col] = df[pct_col].clip(0, 100)
df["treecover_gap_pct"] = (
    df["model_potential_treecover_pct"]
    .sub(df["treecover_pct"])
    .clip(lower=0, upper=100)
)

# Tree requirement: gap fraction x area (ha) x trees per hectare, as whole trees.
df["trees_needed"] = (
    df["treecover_gap_pct"]
    .div(100.0)
    .mul(df["area_ha"])
    .mul(TREES_PER_HA)
    .round()
    .astype("int64")
)

df[[
    "province_name",
    "district_name",
    "area_ha",
    "treecover_pct",
    "model_potential_treecover_pct",
    "treecover_gap_pct",
    "trees_needed",
]].head()


Unnamed: 0,province_name,district_name,area_ha,treecover_pct,model_potential_treecover_pct,treecover_gap_pct,trees_needed
0,Adana,Aladağ,195789.563207,54.58,41.043211,0.0,0
1,Adana,Ceyhan,150936.179743,0.05,38.026676,37.976676,28660272
2,Adana,Feke,150492.049089,15.07,21.565607,6.495607,4887686
3,Adana,Karaisali,173589.725808,8.33,24.969186,16.639186,14441959
4,Adana,Karataş,45217.646956,0.0,38.026676,38.026676,8597384


In [54]:
# Export frame: identifiers first, then the tree requirement and the cover figures.
out = df.loc[
    :,
    [
        "province_name",
        "district_name",
        "trees_needed",
        "treecover_pct",
        "model_potential_treecover_pct",
        "treecover_gap_pct",
    ],
].copy()

out_path = PROCESSED_DIR.joinpath("districts_trees_needed.csv")
out.to_csv(out_path, index=False)
out_path


WindowsPath('../data/processed/districts_trees_needed.csv')

In [55]:
import json

# One record per district, ordered by province and then district name.
json_data = out.sort_values(["province_name", "district_name"]).to_dict(orient="records")

json_path = PROCESSED_DIR / "districts_trees_needed.json"
# ensure_ascii=False writes the Turkish letters as-is; the file is UTF-8.
json_path.write_text(
    json.dumps(json_data, ensure_ascii=False, indent=2),
    encoding="utf-8",
)

json_path


WindowsPath('../data/processed/districts_trees_needed.json')

In [56]:
# Print all column names as a plain list.
print(df.columns.tolist())


['province_name', 'district_name', 'lat', 'lon', 'avg_temp', 'avg_rain', 'treecover_pct', 'potential_treecover_pct', 'missing_treecover_pct', 'model_potential_treecover_pct', 'model_missing_treecover_pct', 'pred_treecover_pct', 'treecover_gap_pct', 'prov_norm', 'dist_norm', 'area_ha', 'trees_needed']


In [57]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import HistGradientBoostingRegressor

# 1) Target
y = df["treecover_pct"].astype(float)

# 2) Feature selection: coordinates, the two climate averages and district area
feature_cols = ["lat", "lon", "avg_temp", "avg_rain", "area_ha"]
X = df.loc[:, feature_cols].copy()

# 3) Train/test split (20 % held out)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

X_train.shape, X_test.shape


((743, 5), (186, 5))

In [58]:
# Baseline: median imputation followed by HistGradientBoosting with default settings.
baseline = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("model", HistGradientBoostingRegressor(random_state=42)),
])

pred = baseline.fit(X_train, y_train).predict(X_test)

mae, r2 = mean_absolute_error(y_test, pred), r2_score(y_test, pred)
rmse = mean_squared_error(y_test, pred) ** 0.5  # square root of the MSE

for label, value in (
    ("BASELINE  MAE:", mae),
    ("BASELINE RMSE:", rmse),
    ("BASELINE   R2:", r2),
):
    print(label, round(value, 3))


BASELINE  MAE: 13.389
BASELINE RMSE: 21.512
BASELINE   R2: 0.375


In [59]:
from scipy.stats import randint, uniform

pipe = Pipeline(steps=[
    ("imp", SimpleImputer(strategy="median")),
    ("model", HistGradientBoostingRegressor(random_state=42)),
])

# Search space for the "model" step of the pipeline.
param_dist = dict(
    model__max_depth=randint(2, 12),
    model__learning_rate=uniform(0.01, 0.2),      # 0.01 - 0.21
    model__max_iter=randint(200, 1500),
    model__min_samples_leaf=randint(10, 100),
    model__l2_regularization=uniform(0.0, 1.0),
)

cv = KFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=60,                 # 60 trials (a reasonable budget)
    scoring="neg_mean_absolute_error",
    cv=cv,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
search.fit(X_train, y_train)

# best_score_ is the negated MAE, so flip the sign for reporting.
print("En iyi MAE (CV):", -search.best_score_)
print("En iyi parametreler:", search.best_params_)
best_model = search.best_estimator_


Fitting 5 folds for each of 60 candidates, totalling 300 fits
En iyi MAE (CV): 13.179589645087328
En iyi parametreler: {'model__l2_regularization': 0.6635017691080558, 'model__learning_rate': 0.011012316769243738, 'model__max_depth': 3, 'model__max_iter': 1021, 'model__min_samples_leaf': 44}


In [60]:
# Score the tuned model once on the held-out split.
pred_best = best_model.predict(X_test)

mae, r2 = mean_absolute_error(y_test, pred_best), r2_score(y_test, pred_best)
rmse = mean_squared_error(y_test, pred_best) ** 0.5  # square root of the MSE

for label, value in (("BEST  MAE:", mae), ("BEST RMSE:", rmse), ("BEST   R2:", r2)):
    print(label, round(value, 3))


BEST  MAE: 13.773
BEST RMSE: 21.374
BEST   R2: 0.383


In [61]:
# Predict with the tuned model for every row and clamp to 0-100; shown next to the earlier prediction column.
df["pred_treecover_pct_v2"] = best_model.predict(df[feature_cols]).clip(0, 100)
df[["province_name","district_name","treecover_pct","pred_treecover_pct","pred_treecover_pct_v2"]].head()


Unnamed: 0,province_name,district_name,treecover_pct,pred_treecover_pct,pred_treecover_pct_v2
0,Adana,Aladağ,54.58,15.439182,5.23722
1,Adana,Ceyhan,0.05,12.524649,10.968685
2,Adana,Feke,15.07,15.140036,1.902585
3,Adana,Karaisali,8.33,9.762991,1.201734
4,Adana,Karataş,0.0,10.088933,12.639793


In [62]:
# The v2 "potential" is the clamped v2 prediction.
# NOTE(review): best_model was fitted with treecover_pct itself as the target,
# so this value is the model's estimate of current cover — confirm that it is
# the intended definition of potential.
df["model_potential_treecover_pct_v2"] = df["pred_treecover_pct_v2"].clip(lower=0, upper=100)

# Gap = potential minus current cover, kept within 0-100.
df["treecover_gap_pct_v2"] = (
    df["model_potential_treecover_pct_v2"]
    .sub(df["treecover_pct"])
    .clip(lower=0, upper=100)
)

df[[
    "province_name",
    "district_name",
    "treecover_pct",
    "model_potential_treecover_pct_v2",
    "treecover_gap_pct_v2",
]].head()


Unnamed: 0,province_name,district_name,treecover_pct,model_potential_treecover_pct_v2,treecover_gap_pct_v2
0,Adana,Aladağ,54.58,5.23722,0.0
1,Adana,Ceyhan,0.05,10.968685,10.918685
2,Adana,Feke,15.07,1.902585,0.0
3,Adana,Karaisali,8.33,1.201734,0.0
4,Adana,Karataş,0.0,12.639793,12.639793


In [63]:
TREES_PER_HA = 500  # scenario: number of trees per hectare

# v2 gap fraction x area (ha) x trees per hectare, rounded to whole trees.
df["trees_needed_v2"] = (
    df["treecover_gap_pct_v2"]
    .div(100.0)
    .mul(df["area_ha"])
    .mul(TREES_PER_HA)
    .round()
    .astype("int64")
)

df[["province_name", "district_name", "area_ha", "treecover_gap_pct_v2", "trees_needed_v2"]].head()


Unnamed: 0,province_name,district_name,area_ha,treecover_gap_pct_v2,trees_needed_v2
0,Adana,Aladağ,195789.563207,0.0,0
1,Adana,Ceyhan,150936.179743,10.918685,8240123
2,Adana,Feke,150492.049089,0.0,0
3,Adana,Karaisali,173589.725808,0.0,0
4,Adana,Karataş,45217.646956,12.639793,2857708


In [64]:
import json
from pathlib import Path

# If PROCESSED_DIR is not already set for your project layout:
# PROCESSED_DIR = Path("data/processed")

# Select the v2 columns and give them simpler names for the frontend.
out = df[[
    "province_name",
    "district_name",
    "trees_needed_v2",
    "treecover_pct",
    "model_potential_treecover_pct_v2",
    "treecover_gap_pct_v2",
]].rename(columns={
    "trees_needed_v2": "trees_needed",
    "model_potential_treecover_pct_v2": "potential_treecover_pct_model",
    "treecover_gap_pct_v2": "treecover_gap_pct",
})

# CSV
csv_path = PROCESSED_DIR.joinpath("districts_trees_needed_v2.csv")
out.to_csv(csv_path, index=False)

# JSON: one record per district, ordered by province and then district name
json_path = PROCESSED_DIR.joinpath("districts_trees_needed_v2.json")
json_path.write_text(
    json.dumps(
        out.sort_values(["province_name", "district_name"]).to_dict("records"),
        ensure_ascii=False,
        indent=2,
    ),
    encoding="utf-8",
)

csv_path, json_path


(WindowsPath('../data/processed/districts_trees_needed_v2.csv'),
 WindowsPath('../data/processed/districts_trees_needed_v2.json'))