In [1]:
import re, warnings, joblib
import numpy as np
import pandas as pd
from rapidfuzz import process, fuzz

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

warnings.filterwarnings("ignore")

# ---------- Load data ----------
DATA_PATH = "data/AP_data.csv"                       # adjust path if needed
DIST_PATH = "data/AP_district_level_master.csv"

df = pd.read_csv(DATA_PATH)
dist_master = pd.read_csv(DIST_PATH)

df.rename(columns={"Extent\n(AC)":"Farm_Acres","Crop before":"Crop_Sown"}, inplace=True)
df["District"] = df["District"].replace({
    "Anantapur": "Ananthapur",
    "S.P.S.Nellore": "Nellore",
    "S.P.S. Nellore": "Nellore",
    "Kadapa YSR": "Kadapa"
})

rain_df = dist_master[["District","Kharif_rain","Rabi_rain","Zaid_rain"]].drop_duplicates()
df = df.merge(rain_df, on="District", how="left")

# Drop IDs / timestamps / free-text
drop_cols = ["Sl no","Date","Farmer No","Macro/ Micro nutrient","Farmer Name","Fathers Name","Time",
             "Recommended Sowing Time","Season","Farm_Acres","Survey No.","Latitude","Longitude"]
df = df.drop(columns=[c for c in drop_cols if c in df.columns], errors="ignore")

# Coerce numeric-ish text
for col in ["OC","Avail-S","Avail-B"]:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

# ---------- Soil normalization (light) ----------
MASTER_SOIL = ["Black","Red","Sandy","Loam","Clay","Brown","Yellow","White","Laterite",
               "Saline","Alkaline","Alluvial","Gravel/Stony","Mixed","Other"]

def clean_text(s: str) -> str:
    s = str(s).lower().strip()
    s = re.sub(r"soil", "", s)
    s = re.sub(r"[^a-z\s\+\-]", "", s)
    return s.strip()

def standardize_soil(raw: str) -> str:
    if not isinstance(raw, str) or not raw.strip(): return "Other"
    txt = clean_text(raw)
    match, score, _ = process.extractOne(txt, MASTER_SOIL, scorer=fuzz.WRatio)
    return match if score >= 80 else "Other"

if "Soil type" in df.columns:
    df["Soil_Type_Standard"] = df["Soil type"].apply(standardize_soil)
else:
    alt = [c for c in df.columns if c.lower().strip() == "soil type"]
    if alt:
        df["Soil_Type_Standard"] = df[alt[0]].apply(standardize_soil)

# ---------- Crop normalization (simple aliases) ----------
def standardize_crop(raw: str) -> str:
    if not isinstance(raw, str) or not raw.strip(): return "Other"
    txt = raw.lower()
    aliases = {
        "paddy":"Rice","vari":"Rice","rice":"Rice",
        "maize":"Maize","sweetcorn":"Maize",
        "ground nut":"Groundnut","groundnut":"Groundnut","g.nut":"Groundnut",
        "cotton":"Cotton","castor":"Castor","sesamum":"Sesame","sesame":"Sesame",
        "sunflower":"Sunflower","soyabean":"Soyabean","soybean":"Soyabean",
        "chilli":"Chilli","chillies":"Chilli","mirchi":"Chilli",
        "tomato":"Tomato","brinjal":"Brinjal","okra":"Okra","benda":"Okra",
        "ragi":"Ragi","sorghum":"Sorghum","jowar":"Sorghum","jonna":"Sorghum",
        "bajra":"Pearl Millet","korra":"Foxtail Millet"
    }
    for k,v in aliases.items():
        if k in txt: return v
    return raw.title()

df["Crop_Sown_Standard"] = df["Crop_Sown"].apply(standardize_crop)

# keep top 25 classes, rest -> "Other"
top_classes = df["Crop_Sown_Standard"].value_counts().head(25).index
df["Crop_Sown_Standard"] = np.where(df["Crop_Sown_Standard"].isin(top_classes),
                                    df["Crop_Sown_Standard"], "Other")

# ---------- Features ----------
num_cols = [c for c in [
    "pH","EC","OC","Avail-P","Exch-K","Avail-Ca","Avail-Mg","Avail-S",
    "Avail-Zn","Avail-B","Avail-Fe","Avail-Cu","Avail-Mn",
    "Kharif_rain","Rabi_rain","Zaid_rain"
] if c in df.columns]
cat_cols = [c for c in ["District","Soil_Type_Standard"] if c in df.columns]
target = "Crop_Sown_Standard"

for c in num_cols:
    df[c] = pd.to_numeric(df[c], errors="coerce").fillna(df[c].median())

df = df[df[target].notna()].copy()
X = df[num_cols + cat_cols]
y = df[target]

# ensure stratifyable (drop classes with <2 samples)
vc = y.value_counts()
keep = vc[vc >= 2].index
mask = y.isin(keep)
X, y = X[mask], y[mask]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y
)

# ---------- Pipelines (trees: no PCA) ----------
num_pipe = Pipeline([("scaler", StandardScaler())])
cat_enc = OneHotEncoder(handle_unknown="ignore")
pre = ColumnTransformer([("num", num_pipe, num_cols), ("cat", cat_enc, cat_cols)])

rf_pipe = Pipeline([("pre", pre), ("clf", RandomForestClassifier(random_state=42))])
gb_pipe = Pipeline([("pre", pre), ("clf", GradientBoostingClassifier(random_state=42))])

# ---------- Compact grids (fast) ----------
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

rf_grid = {
    "clf__n_estimators": [300, 500],
    "clf__max_depth": [None, 20],
    "clf__min_samples_split": [2, 5],
    "clf__min_samples_leaf": [1, 2]
}
gb_grid = {
    "clf__n_estimators": [150, 250],
    "clf__learning_rate": [0.05, 0.1],
    "clf__max_depth": [2, 3]
}

rf_search = GridSearchCV(rf_pipe, rf_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=0)
gb_search = GridSearchCV(gb_pipe, gb_grid, cv=cv, scoring="accuracy", n_jobs=-1, verbose=0)

rf_search.fit(X_train, y_train)
gb_search.fit(X_train, y_train)

def evaluate(name, est):
    y_pred = est.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"\n=== {name} ===")
    print("Test Accuracy:", round(acc,4))
    print(classification_report(y_test, y_pred, zero_division=0))
    return acc

rf_acc = evaluate("RandomForest (tuned)", rf_search.best_estimator_)
gb_acc = evaluate("GradientBoosting (tuned)", gb_search.best_estimator_)

# pick best
best_name, best_est, best_acc = (
    ("RandomForest (tuned)", rf_search.best_estimator_, rf_acc)
    if rf_acc >= gb_acc else
    ("GradientBoosting (tuned)", gb_search.best_estimator_, gb_acc)
)

print("\nWinner:", best_name, "→ Test Accuracy:", round(best_acc,4))
print("RF best params:", rf_search.best_params_)
print("GB best params:", gb_search.best_params_)

# save outputs
pd.DataFrame([{
    "rf_best_params": rf_search.best_params_,
    "rf_cv_best": rf_search.best_score_,
    "rf_test_accuracy": rf_acc,
    "gb_best_params": gb_search.best_params_,
    "gb_cv_best": gb_search.best_score_,
    "gb_test_accuracy": gb_acc,
    "winner": best_name,
    "winner_test_accuracy": best_acc
}]).to_csv("tuning_summary.csv", index=False)

joblib.dump(best_est, "serving_pipeline_tuned.pkl")
print("\nSaved: tuning_summary.csv, serving_pipeline_tuned.pkl")



=== RandomForest (tuned) ===
Test Accuracy: 0.6287
                precision    recall  f1-score   support

        Banana       0.00      0.00      0.00        10
Banana/Coconut       0.42      0.38      0.40        13
    Black Gram       0.62      0.43      0.51        35
    Cashew Nut       0.80      0.36      0.50        11
        Chilli       0.43      0.41      0.42        29
        Citrus       0.73      0.79      0.76        28
       Coconut       0.30      0.17      0.21        18
        Cotton       0.57      0.54      0.55       119
        Cowpea       0.57      0.92      0.71        52
     Groundnut       0.57      0.80      0.66       152
    Horse Gram       0.20      0.08      0.11        13
         Maize       0.53      0.61      0.57        92
         Mango       0.00      0.00      0.00         8
      Oil Palm       0.11      0.10      0.11        10
         Other       0.55      0.45      0.50       124
     Pigeonpea       0.40      0.12      0.19      