# In [None]:
# train_high_accuracy_top3_voting.py
import os, re, warnings, joblib
import numpy as np
import pandas as pd
from rapidfuzz import process, fuzz
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.metrics import top_k_accuracy_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
warnings.filterwarnings("ignore")

# ===================== CONFIG =====================
# Paths of the two input CSV files read at load time.
DATA_PATH = "data/AP_data.csv"
DIST_PATH = "data/AP_district_level_master.csv"

# When enabled, only rows whose target is among the TOP_N most frequent
# classes are kept for training and evaluation.
USE_TOP_N_CLASSES = True
TOP_N = 12

# Hold-out fraction for the test split, and the seed reused by the split
# and by the seeded models.
TEST_SIZE = 0.25
RANDOM_STATE = 42

# Keyword arguments handed to the CatBoost wrapper's constructor.
CB_PARAMS = {
    "iterations": 1200,
    "depth": 8,
    "learning_rate": 0.05,
    "loss_function": "MultiClass",
    "eval_metric": "Accuracy",
    "l2_leaf_reg": 3,
    "bootstrap_type": "Bernoulli",
    "subsample": 0.8,
    "random_seed": RANDOM_STATE,
    "auto_class_weights": "Balanced",
    "verbose": False,
}

# Output file names. MODEL_OUT_CAT and REPORT_OUT are not referenced in the
# visible part of this script.
MODEL_OUT_CAT = "serving_catboost_topN.pkl"
MODEL_OUT_VOTE = "soft_voting_model.pkl"
REPORT_OUT = "training_report_topN_voting.txt"
COMPARE_CSV = "model_comparison_top3_voting.csv"

# ===================== LOAD & CLEAN =====================
df = pd.read_csv(DATA_PATH)
dist_master = pd.read_csv(DIST_PATH)

df = df.rename(columns={"Extent\n(AC)": "Farm_Acres", "Crop before": "Crop_Sown"})

# Alternative district spellings mapped to the single spelling used for joining.
_DISTRICT_ALIASES = {
    "Anantapur": "Ananthapur",
    "S.P.S.Nellore": "Nellore",
    "S.P.S. Nellore": "Nellore",
    "Kadapa YSR": "Kadapa",
}
df["District"] = df["District"].replace(_DISTRICT_ALIASES)

# Attach district-level rainfall, but only when the master table has all of it.
_RAIN_COLS = ["Kharif_rain", "Rabi_rain", "Zaid_rain"]
if {"District", *_RAIN_COLS}.issubset(dist_master.columns):
    rain_df = dist_master[["District", *_RAIN_COLS]].copy()
    # Normalise the join key on this side too; otherwise a district spelled
    # differently in the master table would silently get NaN rainfall.
    rain_df["District"] = rain_df["District"].replace(_DISTRICT_ALIASES)
    # Keep exactly one row per district (the first one encountered). Without
    # this, districts with several distinct rainfall rows would duplicate
    # every matching sample row in the left join below.
    rain_df = rain_df.drop_duplicates(subset="District")
    df = df.merge(rain_df, on="District", how="left", validate="many_to_one")

# Identifier / free-text / location columns that are not used as features.
drop_cols = [
    "Sl no", "Date", "Farmer No", "Macro/ Micro nutrient", "Farmer Name",
    "Fathers Name", "Time", "Recommended Sowing Time", "Survey No.",
    "Latitude", "Longitude", "Farm_Acres",
]
df = df.drop(columns=drop_cols, errors="ignore")

# Coerce these measurement columns to numbers; unparseable entries become NaN.
for col in ["OC", "Avail-S", "Avail-B"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# Canonical soil labels that raw soil descriptions are fuzzy-matched against.
MASTER_SOIL = [
    "Black", "Red", "Sandy", "Loam", "Clay",
    "Brown", "Yellow", "White", "Laterite", "Saline",
    "Alkaline", "Alluvial", "Gravel/Stony", "Mixed", "Other",
]

def _clean_text(s): return re.sub(r"[^a-z\s\+\-]", "", str(s).lower().replace("soil", "").strip())
def standardize_soil(raw):
    """Map a raw soil description to one of the MASTER_SOIL labels.

    Non-string or blank input yields "Unknown". Otherwise the cleaned text is
    fuzzy-matched against MASTER_SOIL with the WRatio scorer, and the best
    candidate is accepted only when its score is at least 80.
    """
    is_blank = not isinstance(raw, str) or not raw.strip()
    if is_blank:
        return "Unknown"

    query = _clean_text(raw)
    label, score, _index = process.extractOne(query, MASTER_SOIL, scorer=fuzz.WRatio)
    if score < 80:
        return "Unknown"
    return label
# Substring -> canonical crop name. Entries are tried in insertion order and
# the first alias found inside the lower-cased input wins.
_CROP_ALIASES = {
    "paddy": "Rice", "vari": "Rice", "rice": "Rice",
    "maize": "Maize", "sweetcorn": "Maize",
    "ground nut": "Groundnut", "groundnut": "Groundnut", "g.nut": "Groundnut",
    "cotton": "Cotton", "castor": "Castor", "sesamum": "Sesame", "sesame": "Sesame",
    "sunflower": "Sunflower", "soyabean": "Soyabean", "soybean": "Soyabean",
    "chilli": "Chilli", "chillies": "Chilli", "mirchi": "Chilli",
    "tomato": "Tomato", "brinjal": "Brinjal", "okra": "Okra", "benda": "Okra",
    "ragi": "Ragi", "sorghum": "Sorghum", "jowar": "Sorghum", "jonna": "Sorghum",
    "bajra": "Pearl Millet", "korra": "Foxtail Millet",
}


def standardize_crop(raw):
    """Map a raw crop name to a canonical label.

    Non-string or blank input yields "Other". If any known alias occurs as a
    substring of the lower-cased input, its canonical name is returned.
    Otherwise the input is returned in title case with surrounding whitespace
    removed and internal whitespace runs collapsed, so that spacing variants
    of the same name do not become distinct target classes.
    """
    if not isinstance(raw, str) or not raw.strip():
        return "Other"
    txt = raw.lower()
    for alias, canonical in _CROP_ALIASES.items():
        if alias in txt:
            return canonical
    return " ".join(raw.split()).title()

# Derive the standardised soil feature and the standardised crop target.
df["Soil_Type_Standard"] = df["Soil type"].apply(standardize_soil)
df["Crop_Sown_Standard"] = df["Crop_Sown"].apply(standardize_crop)

# Candidate feature columns; only those actually present in the data are used.
num_cols = [c for c in [
    "pH", "EC", "OC", "Avail-P", "Exch-K", "Avail-Ca", "Avail-Mg", "Avail-S",
    "Avail-Zn", "Avail-B", "Avail-Fe", "Avail-Cu", "Avail-Mn",
    "Kharif_rain", "Rabi_rain", "Zaid_rain"
] if c in df.columns]
cat_cols = [c for c in ["District", "Soil_Type_Standard"] if c in df.columns]
target = "Crop_Sown_Standard"

# Median-impute every numeric feature. The median is computed on the
# *coerced* values: taking it on the raw column breaks when the column is
# still object dtype because it contains non-numeric strings.
# NOTE(review): the median is taken over the whole dataset, i.e. before the
# train/test split performed later in this script.
for c in num_cols:
    numeric = pd.to_numeric(df[c], errors="coerce")
    df[c] = numeric.fillna(numeric.median())

def safe_mul(a, b):
    """Return the element-wise product of two pandas Series.

    Each operand is coerced to numeric first; entries that are missing or
    cannot be parsed as numbers are treated as 0.
    """
    left = pd.to_numeric(a, errors="coerce").fillna(0)
    right = pd.to_numeric(b, errors="coerce").fillna(0)
    return left * right
# Interaction features, added only when both source columns exist.
if {"pH", "EC"}.issubset(df.columns):
    df["pH_x_EC"] = safe_mul(df["pH"], df["EC"])
if {"OC", "Avail-P"}.issubset(df.columns):
    df["OC_x_AvailP"] = safe_mul(df["OC"], df["Avail-P"])
num_cols.extend(name for name in ("pH_x_EC", "OC_x_AvailP") if name in df.columns)

# Optionally restrict the data to the most frequent target classes.
if USE_TOP_N_CLASSES:
    class_counts = df[target].value_counts()
    top_classes = class_counts.head(TOP_N).index
    keep_rows = df[target].isin(top_classes)
    df = df[keep_rows].copy()

# Final dtype normalisation: categoricals as strings with an explicit
# "Unknown" level, numerics as floats with remaining gaps set to 0.
for cat_name in cat_cols:
    df[cat_name] = df[cat_name].astype("string").fillna("Unknown")
for num_name in num_cols:
    df[num_name] = df[num_name].fillna(0).astype(float)

# Feature matrix / target vector and a stratified hold-out split.
X = df[num_cols + cat_cols].copy()
y = df[target].copy()
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=RANDOM_STATE,
)

# Integer-encode the test labels for the top-k accuracy metric.
le = LabelEncoder()
le.fit(y)
y_true_int = le.transform(y_test)

# ===================== CLONE-SAFE CATBOOST WRAPPER =====================
class CatBoostAutoCat(CatBoostClassifier):
    """CatBoostClassifier that supplies categorical column names to ``fit`` itself.

    ``cat_feature_names`` is stored on the instance and exposed through
    ``get_params`` / ``set_params`` so the value travels with the estimator's
    parameters. When ``fit`` receives a pandas DataFrame, the stored names
    that are present in its columns are passed on as ``cat_features``.
    """

    def __init__(self, cat_feature_names=None, **kwargs):
        # Stored by reference (no copy) and then everything else goes to CatBoost.
        self.cat_feature_names = cat_feature_names
        super().__init__(**kwargs)

    def get_params(self, deep=True):
        """Return the parent's parameters plus ``cat_feature_names``."""
        params = super().get_params(deep)
        # getattr: tolerate instances created without running __init__.
        # NOTE(review): confirm whether CatBoost's pickling round-trips extra
        # instance attributes such as this one.
        params["cat_feature_names"] = getattr(self, "cat_feature_names", None)
        return params

    def set_params(self, **params):
        """Set ``cat_feature_names`` locally and forward the rest to the parent."""
        if "cat_feature_names" in params:
            self.cat_feature_names = params.pop("cat_feature_names")
        return super().set_params(**params)

    def fit(self, X, y=None, **fit_params):
        """Fit the model, filling in ``cat_features`` for DataFrame input.

        An explicit ``cat_features`` passed by the caller takes precedence;
        previously that combination raised a duplicate-keyword TypeError.
        """
        names = getattr(self, "cat_feature_names", None)
        if isinstance(X, pd.DataFrame) and names:
            present = [c for c in names if c in X.columns]
            fit_params.setdefault("cat_features", present)
        return super().fit(X, y, **fit_params)

# ===================== BASE MODELS =====================
def _make_prep():
    """Build a fresh, unfitted preprocessor: scale numerics, one-hot encode categoricals."""
    return ColumnTransformer([
        ("num", StandardScaler(), num_cols),
        ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
    ])


# Kept under its original name for any code that still refers to `prep`.
prep = _make_prep()

# Each pipeline owns its own preprocessor instance. Sharing one instance
# would mean that fitting any pipeline overwrites the fitted state used by
# the other two.
rf  = Pipeline([("prep", _make_prep()), ("clf", RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE))])
gb  = Pipeline([("prep", _make_prep()), ("clf", GradientBoostingClassifier(random_state=RANDOM_STATE))])
knn = Pipeline([("prep", _make_prep()), ("clf", KNeighborsClassifier(n_neighbors=15, weights="distance"))])
# CatBoost takes the raw frame; the wrapper passes the categorical column names to fit.
cb  = CatBoostAutoCat(cat_feature_names=cat_cols, **CB_PARAMS)

# ===================== INDIVIDUAL MODEL EVALUATION =====================
print("🔹 Training and evaluating individual models...")

models = {
    "CatBoost": cb,
    "RandomForest": rf,
    "GradientBoosting": gb,
    "KNN": knn
}

# Integer ids of every class known to the label encoder. Passing them to the
# metric explicitly keeps it working even if the test split happens to lack
# a class; otherwise the class set is inferred from y_true and must match the
# number of probability columns.
label_ids = np.arange(len(le.classes_))

# One (name, top-1, top-3, top-10) tuple per model.
results = []

for name, model in models.items():
    print(f"\n▶ Training {name}...")
    model.fit(X_train, y_train)

    probs = model.predict_proba(X_test)
    # Reorder probability columns from the model's class order to the
    # label encoder's class order, which y_true_int is expressed in.
    order = [np.where(model.classes_ == c)[0][0] for c in le.classes_]
    probs_aligned = probs[:, order]

    acc1  = top_k_accuracy_score(y_true_int, probs_aligned, k=1, labels=label_ids)
    acc3  = top_k_accuracy_score(y_true_int, probs_aligned, k=3, labels=label_ids)
    acc10 = top_k_accuracy_score(
        y_true_int, probs_aligned, k=min(10, probs_aligned.shape[1]), labels=label_ids
    )

    print(f"{name} — Top-1: {acc1:.4f} | Top-3: {acc3:.4f} | Top-10: {acc10:.4f}")
    results.append((name, acc1, acc3, acc10))

# ===================== SOFT VOTING ENSEMBLE =====================
print("\n🔹 Training Soft VotingClassifier (CatBoost + RF + GB + KNN)…")

# Soft voting over the four base models defined earlier in the script.
voter = VotingClassifier(
    estimators=[
        ("CatBoost", cb),
        ("RandomForest", rf),
        ("GradientBoosting", gb),
        ("KNN", knn)
    ],
    voting="soft"
)
voter.fit(X_train, y_train)

# ===================== EVALUATION =====================
probs_vote = voter.predict_proba(X_test)
# Reorder probability columns into the label encoder's class order.
order = [np.where(voter.classes_ == c)[0][0] for c in le.classes_]
probs_aligned = probs_vote[:, order]

# Explicit class ids keep the metric working even if the test split lacks a
# class; otherwise the class set is inferred from y_true and must match the
# number of probability columns.
label_ids = np.arange(len(le.classes_))

acc_top1  = top_k_accuracy_score(y_true_int, probs_aligned, k=1, labels=label_ids)
acc_top3  = top_k_accuracy_score(y_true_int, probs_aligned, k=3, labels=label_ids)
acc_top10 = top_k_accuracy_score(
    y_true_int, probs_aligned, k=min(10, probs_aligned.shape[1]), labels=label_ids
)

print(f"\n✅ SoftVoting — Top-1: {acc_top1:.4f} | Top-3: {acc_top3:.4f} | Top-10: {acc_top10:.4f}")

# Save results to CSV for easy comparison
compare_df = pd.DataFrame(results, columns=["Model", "Top-1", "Top-3", "Top-10"])
compare_df.loc[len(compare_df)] = ["SoftVoting", acc_top1, acc_top3, acc_top10]
compare_df.to_csv(COMPARE_CSV, index=False)

print(f"\n💾 Accuracy comparison saved to {COMPARE_CSV}")




# Bundle the fitted ensemble with the column metadata and label encoder, then
# persist the bundle as one file.
# NOTE(review): pickles reference classes by import path, so whatever loads
# this file must be able to import/define CatBoostAutoCat — confirm the
# serving code does.
artifact = {
    "model": voter,
    "feature_names": X.columns.tolist(),
    "cat_cols": cat_cols,
    "num_cols": num_cols,
    "label_encoder": le,
}
joblib.dump(artifact, MODEL_OUT_VOTE)
print("💾 Saved model:", MODEL_OUT_VOTE)





# Cell output:
# 🔹 Training and evaluating individual models...
#
# ▶ Training CatBoost...
