In [None]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score


In [None]:
# Project root: one level above the notebooks folder this notebook runs from.
BASE_DIR = Path("..")

# Folder that holds the processed data sets.
PROCESSED_DIR = BASE_DIR.joinpath("data", "processed")

# Input table with the per-district features (version 1).
features_path = PROCESSED_DIR.joinpath("districts_features_v1.csv")

# Display the resolved path as the cell output.
features_path


In [None]:
# Read the district feature table (v1) from the processed-data folder.
df = pd.read_csv(features_path)

# Report how many rows (districts) were loaded, then preview the first records.
print("Toplam ilçe sayısı:", df.shape[0])
df.head()


In [None]:
# List every column of the loaded table to see which fields are available.
df.columns

In [None]:
# Districts whose missing_treecover_pct is at most 5 are treated as already
# well forested. .copy() detaches the subset so it is an independent frame.
well_forested = df.loc[df["missing_treecover_pct"].le(5)].copy()

print("İyi ormanlı ilçe sayısı:", well_forested.shape[0])

# Preview the identifying columns next to the two tree-cover measures.
well_forested.loc[
    :, ["province_name", "district_name", "treecover_pct", "missing_treecover_pct"]
].head()


In [None]:
# Climate inputs used to predict tree cover, and the column being predicted.
feature_cols = ["avg_temp", "avg_rain"]
target_col = "treecover_pct"

# A supervised model cannot learn from rows without a label, and scikit-learn
# rejects NaN targets at fit time. treecover_pct can be NaN for some districts
# (a later cell fills those gaps with 0), so drop unlabeled rows up front.
# When no label is missing this selects exactly the same rows as before.
train_rows = well_forested.dropna(subset=[target_col])

# Plain NumPy arrays: X has one column per entry of feature_cols, y is the target.
X = train_rows[feature_cols].values
y = train_rows[target_col].values


In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Show the shapes of the training and test feature matrices.
(X_train.shape, X_test.shape)


In [None]:
# Random forest with 200 unrestricted-depth trees, a fixed seed for
# reproducibility, and n_jobs=-1 to train on all available cores.
rf = RandomForestRegressor(n_estimators=200, max_depth=None, random_state=42, n_jobs=-1)

# Fit on the training partition; fit() returns the estimator, which the cell displays.
rf.fit(X_train, y_train)


In [None]:
# The forest was already built and fitted in the previous cell. Constructing and
# fitting it a second time with the same hyper-parameters, seed and data only
# repeats the work, so this cell inspects the fitted model instead: the relative
# importance of each input feature, largest first.
pd.Series(rf.feature_importances_, index=feature_cols).sort_values(ascending=False)


In [None]:
# Predict on the held-out partition.
y_pred = rf.predict(X_test)

# Coefficient of determination on the test set.
r2 = r2_score(y_test, y_pred)

# Older scikit-learn releases have no 'squared' argument, so compute the MSE
# first and take its square root by hand to get the RMSE.
mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print("Test RMSE:", round(rmse, 2))
print("Test R² :", round(r2, 3))


In [None]:
# Feature matrix for every district, not only the training subset.
X_all = df[feature_cols].values

# Predict tree cover for all districts and clip the result to the range
# 0..90, which the notebook treats as the physical limits.
model_potential = np.clip(rf.predict(X_all), 0, 90)

# Store the clipped predictions as a new column on the main table.
df["model_potential_treecover_pct"] = model_potential

# Preview observed versus model-predicted cover.
df.loc[
    :, ["province_name", "district_name", "treecover_pct", "model_potential_treecover_pct"]
].head()


In [None]:
# Treat a missing (NaN) observed tree cover as 0.
df["treecover_pct_filled"] = df["treecover_pct"].fillna(0)

# Gap between the model's predicted cover and the observed cover, floored at 0
# so districts that exceed the prediction do not get a negative gap.
df["model_missing_treecover_pct"] = (
    df["model_potential_treecover_pct"] - df["treecover_pct_filled"]
).clip(lower=0)

# Preview the existing and the model-based columns together.
df.loc[
    :,
    [
        "province_name", "district_name",
        "treecover_pct",
        "potential_treecover_pct",
        "model_potential_treecover_pct",
        "missing_treecover_pct",
        "model_missing_treecover_pct",
    ],
].head()


In [None]:
# Average the district-level measures within each province.
province_compare = df.groupby("province_name")[[
    "treecover_pct_filled",
    "potential_treecover_pct",
    "missing_treecover_pct",
    "model_potential_treecover_pct",
    "model_missing_treecover_pct",
]].mean()

# Rank provinces by the model-based missing cover, largest gap first.
province_compare = province_compare.sort_values(
    by="model_missing_treecover_pct", ascending=False
)

# Show the 20 provinces with the largest average gap.
province_compare.head(20)


In [None]:
# Destination file for the v2 table, which includes the model-based columns.
output_path_v2 = PROCESSED_DIR.joinpath("districts_features_v2_model.csv")

# Columns to export, in output order: identifiers, coordinates, climate
# features, then the existing and model-based tree-cover measures.
save_cols = [
    "province_name", "district_name",
    "lat", "lon",
    "avg_temp", "avg_rain",
    "treecover_pct",
    "potential_treecover_pct",
    "missing_treecover_pct",
    "model_potential_treecover_pct",
    "model_missing_treecover_pct",
]

# index=False keeps the DataFrame's row index out of the file.
df.loc[:, save_cols].to_csv(output_path_v2, index=False)

# Echo the number of rows and the path that was written.
(len(df), output_path_v2)
