In [1]:
import os
import json
import numpy as np
import pandas as pd

from pathlib import Path

from sklearn.model_selection import GroupKFold, GroupShuffleSplit, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, HistGradientBoostingRegressor

# --- Project paths ---
# All locations are relative to the working directory; ".." is used because
# the notebook is run from the notebooks folder, one level below the project.
BASE_DIR = Path("..")
DATA_DIR = BASE_DIR.joinpath("data")
PROCESSED_DIR = DATA_DIR.joinpath("processed")
RAW_DIR = DATA_DIR.joinpath("raw")

# Echo the resolved locations as the cell output.
(BASE_DIR, PROCESSED_DIR)


(WindowsPath('..'), WindowsPath('../data/processed'))

In [2]:
# Feature tables listed in priority order: the first one found on disk is used.
candidate_files = [
    PROCESSED_DIR / file_name
    for file_name in (
        "districts_features_v3_predictions.csv",
        "districts_features_v2_model.csv",
        "districts_features_v1.csv",
    )
]

# First existing candidate, or None when none of them is present.
features_path = next((path for path in candidate_files if path.exists()), None)

if features_path is None:
    raise FileNotFoundError("Processed klasöründe features CSV bulamadım: " + str(candidate_files))

print("Kullanılan features dosyası:", features_path)

# Load the selected table and report its dimensions before previewing it.
df = pd.read_csv(features_path)
n_rows, n_cols = df.shape
print("Satır:", n_rows, "Kolon:", n_cols)
df.head()


Kullanılan features dosyası: ..\data\processed\districts_features_v3_predictions.csv
Satır: 929 Kolon: 13


Unnamed: 0,province_name,district_name,lat,lon,avg_temp,avg_rain,treecover_pct,potential_treecover_pct,missing_treecover_pct,model_potential_treecover_pct,model_missing_treecover_pct,pred_treecover_pct,treecover_gap_pct
0,Adana,Aladağ,37.666642,35.387781,16.739615,0.0,54.58,10,0.0,44.543306,0.0,45.610237,0.0
1,Adana,Ceyhan,37.011888,35.768198,19.804077,0.0,0.05,10,9.95,27.848537,27.798537,37.492598,37.442598
2,Adana,Feke,37.871495,35.821754,10.716615,0.0,15.07,10,0.0,30.32221,15.25221,27.646413,12.576413
3,Adana,Karaisali,37.259147,35.142888,12.239538,0.0,8.33,10,1.67,16.481983,8.151983,18.05402,9.72402
4,Adana,Karataş,36.675979,35.229132,21.450077,0.0,0.0,5,5.0,3.265905,3.265905,3.663069,3.663069


In [3]:
def normalize_text(s: str) -> str:
    """Return a lowercase, whitespace-collapsed form of ``s`` for name matching.

    Missing values (anything ``pd.isna`` reports as missing, e.g. ``None`` or
    ``NaN``) become the empty string. Any other value is converted with
    ``str()``, stripped, lowercased, and every run of whitespace is collapsed
    to a single space. Diacritics such as ğ and ş are kept as they are.

    ``str.lower()`` turns the Turkish capital dotted I (U+0130) into a plain
    "i" followed by a combining dot above (U+0307). That invisible extra code
    point would make "İstanbul" normalize to something that does not compare
    equal to "istanbul", so the combining dot is removed after lowercasing.
    """
    if pd.isna(s):
        return ""
    lowered = str(s).strip().lower()
    # Drop the combining dot left behind by lowercasing U+0130.
    lowered = lowered.replace("i\u0307", "i")
    # Very basic normalization: collapse consecutive whitespace to one space.
    return " ".join(lowered.split())

# Add a normalized copy of each name column next to the original one.
norm_columns = {"province_name": "prov_norm", "district_name": "dist_norm"}
for source_col, norm_col in norm_columns.items():
    df[norm_col] = df[source_col].apply(normalize_text)

# Preview original and normalized names side by side.
preview_cols = ["province_name", "district_name", "prov_norm", "dist_norm"]
df[preview_cols].head()


Unnamed: 0,province_name,district_name,prov_norm,dist_norm
0,Adana,Aladağ,adana,aladağ
1,Adana,Ceyhan,adana,ceyhan
2,Adana,Feke,adana,feke
3,Adana,Karaisali,adana,karaisali
4,Adana,Karataş,adana,karataş


In [4]:
import geopandas as gpd

# Location of the GADM 4.1 level-2 shapefile inside the raw data folder.
districts_shp = RAW_DIR.joinpath("admin_boundaries_gadm41_TUR_shp", "gadm41_TUR_2.shp")
shp_found = districts_shp.exists()
print("SHP exists:", shp_found, districts_shp)

# Fail early with a clear message instead of letting the reader error out.
if not shp_found:
    raise FileNotFoundError("Shapefile bulunamadı: " + str(districts_shp))

gdf = gpd.read_file(districts_shp)

# GADM columns are usually NAME_1 (province) and NAME_2 (district).
name_columns = [col for col in gdf.columns if "NAME" in col]
print("SHP kolon örneği:", name_columns[:10])
gdf[["NAME_1", "NAME_2"]].head()


SHP exists: False ..\data\raw\admin_boundaries_gadm41_TUR_shp\gadm41_TUR_2.shp


FileNotFoundError: Shapefile bulunamadı: ..\data\raw\admin_boundaries_gadm41_TUR_shp\gadm41_TUR_2.shp