# Rice Classification — Engineered Notebook (v3, dtype‑safe)

**Fix for your error**: some environments load CSV numeric columns as `object`, so the numeric selector
picked **0 columns**. This version **coerces all feature columns to numeric** with `pd.to_numeric(errors="coerce")`
*before* building the pipeline. Everything else (CV, GridSearch, evaluation, persistence) is unchanged.


In [None]:
import warnings, sys
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score, GridSearchCV, learning_curve
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    classification_report, confusion_matrix, ConfusionMatrixDisplay,
    roc_auc_score, roc_curve, auc,
    precision_recall_curve, average_precision_score, f1_score
)
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC

import joblib, sklearn, matplotlib

# Notebook-wide setup: mute every warning, seed NumPy's global RNG, and log
# the interpreter and library versions used for this run.
warnings.filterwarnings("ignore")
np.random.seed(0)
print("Python:", sys.version.split(maxsplit=1)[0])
print(
    f"sklearn: {sklearn.__version__} | pandas: {pd.__version__} "
    f"| matplotlib: {matplotlib.__version__}"
)

In [None]:
def find_data_file():
    """Locate ``rice-final2.csv`` relative to the current working directory.

    The candidates are probed in order: ``../data``, ``./data``, then the
    current directory itself. Only regular files are accepted, so a
    directory that happens to carry the dataset's name is skipped instead of
    being handed to the CSV reader.

    Returns:
        Path: the first candidate (as a relative path) that is an existing
        file.

    Raises:
        FileNotFoundError: if none of the candidates is an existing file.
    """
    candidates = (
        Path("../data/rice-final2.csv"),
        Path("./data/rice-final2.csv"),
        Path("rice-final2.csv"),
    )
    for candidate in candidates:
        # is_file() rather than exists(): a same-named directory is not data.
        if candidate.is_file():
            return candidate
    raise FileNotFoundError("Could not find rice-final2.csv in ../data, ./data or current dir.")

# Resolve the dataset location, load it, and show a quick preview together
# with the dtypes exactly as pandas inferred them.
data_path = find_data_file()
print(f"Using dataset: {data_path.resolve()}")
df = pd.read_csv(data_path)
print(f"Shape: {df.shape}")
display(df.head(3))
print(f"\nRaw dtypes:\n {df.dtypes}")

In [None]:
# --- DTYPE FIX: coerce features to numeric ---
# Validation raises explicit exceptions instead of using `assert`, which is
# stripped when Python runs with -O.
if "class" not in df.columns:
    raise KeyError("Missing target column 'class'")

label_map = {"class1": 0, "class2": 1}
y_mapped = df["class"].map(label_map)
unmapped = y_mapped.isna()
if unmapped.any():
    # .map() leaves NaN for any label missing from label_map; casting that to
    # int would fail with an error that does not name the offending labels.
    bad_labels = sorted(df.loc[unmapped, "class"].astype(str).unique())
    raise ValueError(
        f"{int(unmapped.sum())} row(s) in 'class' have labels outside "
        f"{sorted(label_map)}: {bad_labels}"
    )
y = y_mapped.astype(int).values

# Every non-target column is treated as a feature and forced to a numeric
# dtype; cells that cannot be parsed become NaN.
features_raw = df.drop(columns=["class"])
X = features_raw.apply(pd.to_numeric, errors="coerce")

# Report NaNs created by the coercion itself (as opposed to values that were
# already missing) so unparseable data does not go unnoticed.
coerced_to_nan = X.isna().sum() - features_raw.isna().sum()
coerced_to_nan = coerced_to_nan[coerced_to_nan > 0]
if not coerced_to_nan.empty:
    print("\nNon-numeric values coerced to NaN, per column:\n", coerced_to_nan)

print("\nAfter coercion dtypes:\n", X.dtypes)
numeric_features = list(X.columns)
if not numeric_features:
    raise ValueError("No numeric features after coercion.")
print(f"Numeric features: {len(numeric_features)} ->", numeric_features)

# Train/test split: 80/20, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=0
)
len(X_train), len(X_test)

## Baselines — 10‑Fold Stratified CV (no tuning)

In [None]:
# Preprocessing applied to every feature column: mean-impute missing values,
# then min-max scale. Columns not listed in numeric_features are dropped.
preprocess = ColumnTransformer(
    transformers=[
        ("num", make_pipeline(SimpleImputer(strategy="mean"), MinMaxScaler()), numeric_features),
    ],
    remainder="drop",
)

# Untuned baseline classifiers, keyed by the label shown in the results table.
models = {
    "LogisticRegression": LogisticRegression(max_iter=1000, random_state=0),
    "GaussianNB": GaussianNB(),
    "DecisionTree(entropy)": DecisionTreeClassifier(criterion="entropy", random_state=0),
    "KNN(k=5,p=1)": KNeighborsClassifier(n_neighbors=5, p=1),
    "SVM(RBF)": SVC(kernel="rbf", probability=True, random_state=0),
    "RandomForest": RandomForestClassifier(random_state=0),
    "GradientBoosting": GradientBoostingClassifier(random_state=0),
    "AdaBoost": AdaBoostClassifier(random_state=0),
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)

# One (model, mean accuracy, std) row per classifier. Preprocessing sits inside
# the pipeline, so it is re-fit on each training fold.
rows = []
for name, clf in models.items():
    pipe = Pipeline([("pre", preprocess), ("clf", clf)])
    scores = cross_val_score(pipe, X_train, y_train, cv=cv, scoring="accuracy")
    rows.append((name, scores.mean(), scores.std()))

# Best model first; pandas is already imported at the top of the notebook, so
# no re-import is needed here.
cv_table = (
    pd.DataFrame(rows, columns=["Model", "CV_Acc_Mean", "CV_Acc_Std"])
    .sort_values("CV_Acc_Mean", ascending=False)
    .reset_index(drop=True)
)
cv_table

## Hyperparameter Tuning (SVM & RF)

In [None]:
# Two candidate pipelines share the `preprocess` step; each gets its own grid.
pipe_svm = Pipeline(
    [("pre", preprocess), ("clf", SVC(kernel="rbf", probability=True, random_state=0))]
)
param_svm = {
    "clf__C": [0.5, 1, 2, 5],
    "clf__gamma": ["scale", 0.5, 1, 2],
}

pipe_rf = Pipeline(
    [("pre", preprocess), ("clf", RandomForestClassifier(random_state=0))]
)
param_rf = {
    "clf__n_estimators": [50, 100, 200],
    "clf__max_leaf_nodes": [None, 12, 24],
    "clf__max_features": ["sqrt", "log2"],
    "clf__criterion": ["gini", "entropy"],
}

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
searches = {
    "SVM(RBF)": (pipe_svm, param_svm),
    "RandomForest": (pipe_rf, param_rf),
}

# Exhaustive grid search per candidate, scored by accuracy; refit=True leaves
# each search holding its best configuration trained on all of X_train.
best_models = {}
for name, (pipe, param) in searches.items():
    gs = GridSearchCV(pipe, param_grid=param, scoring="accuracy", cv=cv, refit=True)
    gs.fit(X_train, y_train)
    best_models[name] = gs
    print(f"[{name}] Best params:", gs.best_params_, "CV Best:", round(gs.best_score_, 4))

# The winner is the search with the highest cross-validated accuracy.
name_best, gs_best = max(best_models.items(), key=lambda item: item[1].best_score_)
print("\n=> Selected best by CV:", name_best)

# Hard predictions plus a continuous score for the ROC / PR metrics.
y_pred = gs_best.predict(X_test)
y_score = (
    gs_best.predict_proba(X_test)[:, 1]
    if hasattr(gs_best, "predict_proba")
    else gs_best.decision_function(X_test)
)

print("Test accuracy:", round((y_pred == y_test).mean(), 4))
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=4))

# Held-out metrics as plain floats.
test_metrics = dict(
    model=name_best,
    test_accuracy=float(np.mean(y_pred == y_test)),
    test_f1_macro=float(f1_score(y_test, y_pred, average="macro")),
    test_f1_weighted=float(f1_score(y_test, y_pred, average="weighted")),
    roc_auc=float(roc_auc_score(y_test, y_score)),
    pr_auc=float(average_precision_score(y_test, y_score)),
)
test_metrics

## Plots — Confusion Matrix, ROC, PR

In [None]:
# Confusion matrix. The axes are created explicitly and handed to the display:
# ConfusionMatrixDisplay.plot() opens a figure of its own when no `ax` is
# given, which would leave a separately created plt.figure() blank.
cm = confusion_matrix(y_test, y_pred)
fig, ax = plt.subplots()
ConfusionMatrixDisplay(confusion_matrix=cm).plot(values_format="d", ax=ax)
ax.set_title(f"Confusion Matrix — {name_best}")
plt.show()

# ROC curve with its AUC in the legend; the dashed diagonal is the chance line.
fpr, tpr, _ = roc_curve(y_test, y_score)
plt.figure()
plt.plot(fpr, tpr, label=f"AUC={auc(fpr, tpr):.3f}")
plt.plot([0, 1], [0, 1], "--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.title(f"ROC — {name_best}")
plt.legend()
plt.show()

# Precision-recall curve with average precision in the legend.
precision, recall, _ = precision_recall_curve(y_test, y_score)
plt.figure()
plt.plot(recall, precision, label=f"AP={average_precision_score(y_test, y_score):.3f}")
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.title(f"PR — {name_best}")
plt.legend()
plt.show()

## Save model & quick inference

In [None]:
# Artifacts go one level up when the notebook runs from a "notebooks" folder,
# otherwise into ./artifacts next to the working directory.
art_dir = Path("./artifacts")
if Path.cwd().name == "notebooks":
    art_dir = Path("../artifacts")
art_dir.mkdir(parents=True, exist_ok=True)

# File-name-friendly model label: spaces become underscores, parentheses go.
safe_name = name_best.translate(str.maketrans({" ": "_", "(": None, ")": None}))
model_path = art_dir / f"best_model_{safe_name}.joblib"
joblib.dump(gs_best.best_estimator_, model_path)
print("Saved:", model_path.resolve())

# Round-trip check: reload the saved pipeline and predict a few test rows.
loaded = joblib.load(model_path)
demo = pd.DataFrame(X_test, columns=X.columns).head(5)
print("Demo preds:", loaded.predict(demo).tolist())