In [11]:
# train_high_accuracy_top3_voting.py
import os, re, warnings, joblib
import numpy as np
import pandas as pd
from rapidfuzz import process, fuzz

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import top_k_accuracy_score, accuracy_score, classification_report

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.base import BaseEstimator, ClassifierMixin

from catboost import CatBoostClassifier, Pool
warnings.filterwarnings("ignore")

# ===================== CONFIG =====================
DATA_PATH = "data/AP_data.csv"
DIST_PATH = "data/AP_district_level_master.csv"
USE_TOP_N_CLASSES = True
TOP_N = 12
TEST_SIZE = 0.25
RANDOM_STATE = 42

CB_PARAMS = dict(
    iterations=1200,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    l2_leaf_reg=3,
    bootstrap_type="Bernoulli",
    subsample=0.8,
    random_seed=RANDOM_STATE,
    auto_class_weights="Balanced",
    verbose=False
)

MODEL_OUT_CAT = "serving_catboost_topN.pkl"
MODEL_OUT_VOTE = "soft_voting_model.pkl"
REPORT_OUT = "training_report_topN_voting.txt"
COMPARE_CSV = "model_comparison_top3_voting.csv"

# ===================== LOAD & CLEAN =====================
df = pd.read_csv(DATA_PATH)
dist_master = pd.read_csv(DIST_PATH)

df = df.rename(columns={"Extent\n(AC)": "Farm_Acres", "Crop before": "Crop_Sown"})
df["District"] = df["District"].replace({
    "Anantapur": "Ananthapur",
    "S.P.S.Nellore": "Nellore",
    "S.P.S. Nellore": "Nellore",
    "Kadapa YSR": "Kadapa"
})

if {"District","Kharif_rain","Rabi_rain","Zaid_rain"}.issubset(dist_master.columns):
    rain_df = dist_master[["District","Kharif_rain","Rabi_rain","Zaid_rain"]].drop_duplicates()
    df = df.merge(rain_df, on="District", how="left")

drop_cols = [
    "Sl no","Date","Farmer No","Macro/ Micro nutrient","Farmer Name",
    "Fathers Name","Time","Recommended Sowing Time","Survey No.",
    "Latitude","Longitude","Farm_Acres"
]
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

for col in ["OC","Avail-S","Avail-B"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# ===================== STANDARDIZATION =====================
MASTER_SOIL = ["Black","Red","Sandy","Loam","Clay","Brown","Yellow","White","Laterite","Saline","Alkaline","Alluvial","Gravel/Stony","Mixed","Other"]

def _clean_text(s: str) -> str:
    s = str(s).lower().strip()
    s = re.sub(r"soil", "", s)
    s = re.sub(r"[^a-z\s\+\-]", "", s)
    return s.strip()

def standardize_soil(raw: str) -> str:
    if not isinstance(raw, str) or not raw.strip():
        return "Unknown"
    txt = _clean_text(raw)
    match, score, _ = process.extractOne(txt, MASTER_SOIL, scorer=fuzz.WRatio)
    return match if score >= 80 else "Unknown"

def standardize_crop(raw: str) -> str:
    if not isinstance(raw, str) or not raw.strip():
        return "Other"
    txt = raw.lower()
    aliases = {
        "paddy":"Rice","vari":"Rice","rice":"Rice",
        "maize":"Maize","sweetcorn":"Maize",
        "ground nut":"Groundnut","groundnut":"Groundnut","g.nut":"Groundnut",
        "cotton":"Cotton","castor":"Castor","sesamum":"Sesame","sesame":"Sesame",
        "sunflower":"Sunflower","soyabean":"Soyabean","soybean":"Soyabean",
        "chilli":"Chilli","chillies":"Chilli","mirchi":"Chilli",
        "tomato":"Tomato","brinjal":"Brinjal","okra":"Okra","benda":"Okra",
        "ragi":"Ragi","sorghum":"Sorghum","jowar":"Sorghum","jonna":"Sorghum",
        "bajra":"Pearl Millet","korra":"Foxtail Millet"
    }
    for k,v in aliases.items():
        if k in txt: return v
    return raw.title()

df["Soil_Type_Standard"] = df["Soil type"].apply(standardize_soil)
df["Crop_Sown_Standard"] = df["Crop_Sown"].apply(standardize_crop)

# ===================== FEATURES =====================
num_cols = [c for c in [
    "pH","EC","OC","Avail-P","Exch-K","Avail-Ca","Avail-Mg","Avail-S",
    "Avail-Zn","Avail-B","Avail-Fe","Avail-Cu","Avail-Mn",
    "Kharif_rain","Rabi_rain","Zaid_rain"
] if c in df.columns]
cat_cols = [c for c in ["District","Soil_Type_Standard"] if c in df.columns]
target = "Crop_Sown_Standard"

for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

# Interactions
def safe_mul(a,b): return pd.to_numeric(a, errors="coerce").fillna(0)*pd.to_numeric(b, errors="coerce").fillna(0)
def add_interactions(d):
    if "pH" in d and "EC" in d: d["pH_x_EC"] = safe_mul(d["pH"], d["EC"])
    if "OC" in d and "Avail-P" in d: d["OC_x_AvailP"] = safe_mul(d["OC"], d["Avail-P"])
    return d
df = add_interactions(df)
extra_nums = [c for c in ["pH_x_EC","OC_x_AvailP"] if c in df.columns]
num_cols = num_cols + extra_nums

if USE_TOP_N_CLASSES:
    top_classes = df[target].value_counts().head(TOP_N).index
    df = df[df[target].isin(top_classes)].copy()

# Clean types
for c in cat_cols: df[c] = df[c].astype("string").fillna("Unknown")
for c in num_cols: df[c] = df[c].fillna(0).astype(float)

X = df[num_cols + cat_cols].copy()
y = df[target].copy()

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, stratify=y, random_state=RANDOM_STATE)
le = LabelEncoder().fit(y)
y_true_int = le.transform(y_test)

# ===================== BASELINES / INDIVIDUAL MODELS =====================
# CatBoost (solo) for comparison
cat_idx = [X.columns.get_loc(c) for c in cat_cols]
cb = CatBoostClassifier(**CB_PARAMS)
cb.fit(Pool(X_train, y_train, cat_features=cat_idx))
probs_cb = cb.predict_proba(X_test)
classes_cb = cb.classes_
order_cb = [np.where(classes_cb == c)[0][0] for c in le.classes_]
probs_cb = probs_cb[:, order_cb]
acc_cb = top_k_accuracy_score(y_true_int, probs_cb, k=3)

# sklearn baselines
prep = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
])
models = {
    "RandomForest": RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE),
    "GradientBoosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "KNN": KNeighborsClassifier(n_neighbors=15, weights="distance")
}
results, probs_dict = [("CatBoost", acc_cb)], {"CatBoost": probs_cb}
for name, clf in models.items():
    pipe = Pipeline([("prep", prep), ("clf", clf)])
    pipe.fit(X_train, y_train)
    probs = pipe.predict_proba(X_test)
    order = [np.where(pipe.named_steps["clf"].classes_ == c)[0][0] for c in le.classes_]
    probs = probs[:, order]
    acc = top_k_accuracy_score(y_true_int, probs, k=3)
    results.append((name, acc))
    probs_dict[name] = probs

# =====================  CATBOOST WRAPPER =====================
# --- replace your previous CatBoostAutoCat with THIS version ---
from catboost import CatBoostClassifier
import pandas as pd
from sklearn.base import clone

class CatBoostAutoCat(CatBoostClassifier):
    """
    CatBoost that survives sklearn cloning inside VotingClassifier and
    always treats the given column NAMES as categorical when X is a DataFrame.
    """
    def __init__(self, cat_feature_names=None, **kwargs):
        # store before super() so get_params sees it
        self.cat_feature_names = list(cat_feature_names) if cat_feature_names else []
        super().__init__(**kwargs)

    # make sure sklearn's clone() keeps our custom param
    def get_params(self, deep=True):
        params = super().get_params(deep=deep)
        params["cat_feature_names"] = self.cat_feature_names
        return params

    def set_params(self, **params):
        if "cat_feature_names" in params:
            self.cat_feature_names = list(params.pop("cat_feature_names"))
        return super().set_params(**params)

    def fit(self, X, y=None, **fit_params):
        cf = None
        if isinstance(X, pd.DataFrame) and self.cat_feature_names:
            # CatBoost accepts column NAMES for pandas.DataFrame
            cf = [c for c in self.cat_feature_names if c in X.columns]
        return super().fit(X, y, cat_features=cf, **fit_params)

from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.neighbors import KNeighborsClassifier

# preprocessing for sklearn models
prep = ColumnTransformer([
    ("num", StandardScaler(), num_cols),
    # if your sklearn version < 1.2, use `sparse=False` instead of `sparse_output=False`
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), cat_cols)
])

rf  = Pipeline([("prep", prep), ("clf", RandomForestClassifier(n_estimators=500, random_state=RANDOM_STATE))])
gb  = Pipeline([("prep", prep), ("clf", GradientBoostingClassifier(random_state=RANDOM_STATE))])
knn = Pipeline([("prep", prep), ("clf", KNeighborsClassifier(n_neighbors=15, weights="distance"))])

# our CatBoost that auto-handles categorical cols
cb  = CatBoostAutoCat(cat_feature_names=cat_cols, **CB_PARAMS)

# SOFT VOTING ENSEMBLE (CatBoost + RF + GB + KNN)
voter = VotingClassifier(
    estimators=[
        ("CatBoost", cb),
        ("RandomForest", rf),
        ("GradientBoosting", gb),
        ("KNN", knn),
    ],
    voting="soft"
)

print("🔹 Training Soft VotingClassifier (CatBoost + RF + GB + KNN)…")
voter.fit(X_train, y_train)





🔹 Training Soft VotingClassifier (CatBoost + RF + GB + KNN)…


RuntimeError: Cannot clone object <__main__.CatBoostAutoCat object at 0x000001CC9AC2AF90>, as the constructor either does not set or modifies parameter cat_feature_names

In [12]:


# Evaluate Top-1 / Top-3 / Top-10
from sklearn.metrics import top_k_accuracy_score
import joblib
import numpy as np
import pandas as pd

# Top-1 / Top-3 / Top-10
y_pred_vote = voter.predict(X_test)
probs_vote  = voter.predict_proba(X_test)

le = LabelEncoder().fit(y)                 # already in your script; reuse if present
y_true_int = le.transform(y_test)          # integer labels aligned to le.classes_

# Ensure columns align to le.classes_ order
order = [np.where(voter.classes_ == c)[0][0] for c in le.classes_]
probs_aligned = probs_vote[:, order]

acc_top1  = top_k_accuracy_score(y_true_int, probs_aligned, k=1)
acc_top3  = top_k_accuracy_score(y_true_int, probs_aligned, k=3)
acc_top10 = top_k_accuracy_score(y_true_int, probs_aligned, k=min(10, probs_aligned.shape[1]))

print(f"SoftVoting — Top-1: {acc_top1:.4f} | Top-3: {acc_top3:.4f} | Top-10: {acc_top10:.4f}")

# Save the ensemble for serving
joblib.dump({
    "model": voter,
    "feature_names": X.columns.tolist(),
    "cat_cols": cat_cols,
    "num_cols": num_cols,
    "label_encoder": le
}, "serving_voter_topN.pkl")

print("✅ Saved soft-voting model → serving_voter_topN.pkl")

AttributeError: 'VotingClassifier' object has no attribute 'estimators_'

In [None]:

pd.DataFrame(results, columns=["Model", "Top3_Accuracy"]).to_csv(COMPARE_CSV, index=False)

with open(REPORT_OUT, "w") as f:
    f.write("=== Top-3 Accuracy (k=3) ===\n")
    for m,a in results:
        f.write(f"{m}: {a:.4f}\n")
    f.write("\n=== Soft Voting (extra metrics) ===\n")
    f.write(f"Top-1:  {acc_top1_vote:.4f}\n")
    f.write(f"Top-3:  {acc_top3_vote:.4f}\n")
    f.write(f"Top-10: {acc_top10_vote:.4f}\n")
    f.write("\nClasses:\n")
    f.write(", ".join(le.classes_))

# ===================== PRINT =====================
print("\n=== Top-3 Accuracy (k=3) ===")
for m,a in results:
    print(f"{m:<16} : {a:.4f}")

print("\n✅ Soft Voting Ensemble (CatBoost + RF + GB + KNN)")
print(f"Top-1  : {acc_top1_vote:.4f}")
print(f"Top-3  : {acc_top3_vote:.4f}")
print(f"Top-10 : {acc_top10_vote:.4f}")

print("\nSaved:")
print(" - Solo CatBoost :", MODEL_OUT_CAT)
print(" - Soft Voting   :", MODEL_OUT_VOTE)
print(" - Report        :", REPORT_OUT)
print(" - Compare CSV   :", COMPARE_CSV)