In [5]:
# train_high_accuracy_top3.py
import os, re, warnings, joblib
import numpy as np
import pandas as pd
from rapidfuzz import process, fuzz

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import top_k_accuracy_score

from catboost import CatBoostClassifier, Pool
warnings.filterwarnings("ignore")

# ===================== CONFIG =====================
DATA_PATH = "data/AP_data.csv"
DIST_PATH = "data/AP_district_level_master.csv"

# To push accuracy higher, you can filter to top-N crops (set True & choose N)
USE_TOP_N_CLASSES = True
TOP_N = 12            # try 10–12 for highest Top-3 on focused classes

# If not filtering, mapping tail to "Other" usually hurts; keep False for now
KEEP_OTHER_CLASS = False

TEST_SIZE = 0.25
RANDOM_STATE = 42

# Use Bernoulli bootstrap to allow subsample < 1.0
CB_PARAMS = dict(
    iterations=1200,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    l2_leaf_reg=3,
    bootstrap_type="Bernoulli",  # allows subsample
    subsample=0.8,
    random_seed=RANDOM_STATE,
    auto_class_weights="Balanced",
    verbose=False
)

MODEL_OUT = "serving_catboost_topN.pkl"
REPORT_OUT = "training_report_topN.txt"

# ===================== LOAD =====================
df = pd.read_csv(DATA_PATH)
dist_master = pd.read_csv(DIST_PATH)

df = df.rename(columns={"Extent\n(AC)": "Farm_Acres", "Crop before": "Crop_Sown"})
df["District"] = df["District"].replace({
    "Anantapur": "Ananthapur",
    "S.P.S.Nellore": "Nellore",
    "S.P.S. Nellore": "Nellore",
    "Kadapa YSR": "Kadapa"
})

# Merge district rainfall normals (if available)
if {"District","Kharif_rain","Rabi_rain","Zaid_rain"}.issubset(dist_master.columns):
    rain_df = dist_master[["District","Kharif_rain","Rabi_rain","Zaid_rain"]].drop_duplicates()
    df = df.merge(rain_df, on="District", how="left")

# Drop non-predictive / ID-like columns
drop_cols = [
    "Sl no","Date","Farmer No","Macro/ Micro nutrient","Farmer Name",
    "Fathers Name","Time","Recommended Sowing Time","Survey No.",
    "Latitude","Longitude","Farm_Acres"
]
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

# Numeric coercion for known messy numeric columns
for col in ["OC","Avail-S","Avail-B"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# ===================== STANDARDIZE SOIL =====================
MASTER_SOIL = [
    "Black","Red","Sandy","Loam","Clay","Brown","Yellow","White",
    "Laterite","Saline","Alkaline","Alluvial","Gravel/Stony","Mixed","Other"
]

def _clean_text(s: str) -> str:
    s = str(s).lower().strip()
    s = re.sub(r"soil", "", s)
    s = re.sub(r"[^a-z\s\+\-]", "", s)
    return s.strip()

def standardize_soil(raw: str) -> str:
    if not isinstance(raw, str) or not raw.strip():
        return "Unknown"
    txt = _clean_text(raw)
    match, score, _ = process.extractOne(txt, MASTER_SOIL, scorer=fuzz.WRatio)
    return match if score >= 80 else "Unknown"

soil_col = "Soil type" if "Soil type" in df.columns else None
if not soil_col:
    for c in df.columns:
        if c.lower().strip() == "soil type":
            soil_col = c; break
if soil_col:
    df["Soil_Type_Standard"] = df[soil_col].apply(standardize_soil)
else:
    df["Soil_Type_Standard"] = "Unknown"

# ===================== STANDARDIZE CROP =====================
def standardize_crop(raw: str) -> str:
    if not isinstance(raw, str) or not raw.strip():
        return "Other"
    txt = raw.lower()
    aliases = {
        "paddy":"Rice","vari":"Rice","rice":"Rice",
        "maize":"Maize","sweetcorn":"Maize",
        "ground nut":"Groundnut","groundnut":"Groundnut","g.nut":"Groundnut",
        "cotton":"Cotton","castor":"Castor","sesamum":"Sesame","sesame":"Sesame",
        "sunflower":"Sunflower","soyabean":"Soyabean","soybean":"Soyabean",
        "chilli":"Chilli","chillies":"Chilli","mirchi":"Chilli",
        "tomato":"Tomato","brinjal":"Brinjal","okra":"Okra","benda":"Okra",
        "ragi":"Ragi","sorghum":"Sorghum","jowar":"Sorghum","jonna":"Sorghum",
        "bajra":"Pearl Millet","korra":"Foxtail Millet"
    }
    for k,v in aliases.items():
        if k in txt: return v
    return raw.title()

df["Crop_Sown_Standard"] = df["Crop_Sown"].apply(standardize_crop)

# ===================== FEATURE SET =====================
num_cols = [c for c in [
    "pH","EC","OC","Avail-P","Exch-K","Avail-Ca","Avail-Mg","Avail-S",
    "Avail-Zn","Avail-B","Avail-Fe","Avail-Cu","Avail-Mn",
    "Kharif_rain","Rabi_rain","Zaid_rain"
] if c in df.columns]

cat_cols = [c for c in ["District","Soil_Type_Standard"] if c in df.columns]
target = "Crop_Sown_Standard"

# Impute numerics (median)
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

# ===================== INTERACTIONS (safe) =====================
def safe_mul(a, b):
    return (pd.to_numeric(a, errors="coerce").fillna(0) *
            pd.to_numeric(b, errors="coerce").fillna(0))

def add_interactions(ddf: pd.DataFrame) -> pd.DataFrame:
    out = ddf.copy()
    if set(["pH","EC"]).issubset(out.columns):
        out["pH_x_EC"] = safe_mul(out["pH"], out["EC"])
    if set(["OC","Avail-P"]).issubset(out.columns):
        out["OC_x_AvailP"] = safe_mul(out["OC"], out["Avail-P"])
    if set(["OC","Exch-K"]).issubset(out.columns):
        out["OC_x_ExchK"] = safe_mul(out["OC"], out["Exch-K"])
    if set(["pH","Avail-Ca"]).issubset(out.columns):
        out["pH_x_AvailCa"] = safe_mul(out["pH"], out["Avail-Ca"])
    if set(["EC","Avail-Fe"]).issubset(out.columns):
        out["EC_x_AvailFe"] = safe_mul(out["EC"], out["Avail-Fe"])
    return out

df = add_interactions(df)
extra_nums = ["pH_x_EC","OC_x_AvailP","OC_x_ExchK","pH_x_AvailCa","EC_x_AvailFe"]
num_cols = [c for c in num_cols + extra_nums if c in df.columns]

# ===================== CLASS SPACE (Top-N option) =====================
if USE_TOP_N_CLASSES:
    top_classes = df[target].value_counts().head(TOP_N).index
    df = df[df[target].isin(top_classes)].copy()
else:
    if not KEEP_OTHER_CLASS:
        vc = df[target].value_counts()
        keep = vc[vc >= 10].index
        df = df[df[target].isin(keep)].copy()

# ===================== CLEAN TYPES / NAs =====================
df = df[df[target].notna()].copy()

# Categorical columns → string, fill NaN with "Unknown"
for c in cat_cols:
    if c not in df.columns:
        df[c] = "Unknown"
    df[c] = df[c].astype("string").fillna("Unknown")

# Numeric columns → float, fill NaN with 0 (already median-imputed, just in case)
for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(0.0).astype(float)

# ===================== BUILD X/y =====================
X = df[num_cols + cat_cols].copy()
y = df[target].copy()

# Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# CatBoost categorical feature indices (by position in X)
cat_feature_indices = [X.columns.get_loc(c) for c in cat_cols]

# Pools
train_pool = Pool(X_train, y_train, cat_features=cat_feature_indices)
test_pool  = Pool(X_test,  y_test,  cat_features=cat_feature_indices)

# ===================== TRAIN CATBOOST =====================
cb = CatBoostClassifier(**CB_PARAMS)
cb.fit(train_pool, eval_set=test_pool, verbose=False)

# ===================== EVALUATION (Top-3 ONLY) =====================
# Probabilities & classes
probs = cb.predict_proba(X_test)          # shape: [n_samples, n_classes]
classes = cb.classes_                     # array of class labels (strings)

# For Top-3 accuracy, align probs to integer labels
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit(y)                                 # fit on all training labels
y_true_int = le.transform(y_test)
# Align columns: CatBoost class order → LabelEncoder class order
order = [np.where(classes == c)[0][0] for c in le.classes_]
probs_aligned = probs[:, order]

# Top-3 accuracy
acc_top3 = top_k_accuracy_score(y_true_int, probs_aligned, k=3)

# ===================== SAVE MODEL & REPORT =====================
joblib.dump(
    {"model": cb, "feature_names": list(X.columns), "cat_idx": cat_feature_indices},
    MODEL_OUT
)

with open(REPORT_OUT, "w", encoding="utf-8") as f:
    f.write(f"Top-3 Accuracy: {acc_top3:.4f}\n")
    f.write(f"Classes: {list(le.classes_)}\n")
    f.write(f"Features: {list(X.columns)}\n")
    f.write(f"Categorical features: {cat_cols}\n")
    f.write(f"Params: {CB_PARAMS}\n")

print("Top-3 Accuracy:", round(acc_top3, 4))
print("Saved:", MODEL_OUT, "and", REPORT_OUT)


Top-3 Accuracy: 0.9572
Saved: serving_catboost_topN.pkl and training_report_topN.txt
