<a href="https://colab.research.google.com/github/2303A51689/Python-for-DS-1689/blob/main/Untitled5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#!/usr/bin/env python3
"""
ml_full_india_pipeline.py
Full ML pipeline for the India pesticide toxicity risk dataset.

Usage:
  python ml_full_india_pipeline.py --data /mnt/data/india_pesticide_toxicity_risk.csv --outdir ./ml_results

Arguments:
  --data           Path to CSV dataset
  --outdir         Output folder (default ./ml_results)
  --max-rows       If set, subsample to this many rows (int)
  --rf-estimators  Number of trees for RandomForest/ExtraTrees (default 150)
  --skip-tsne      If provided, skip t-SNE (speeds things up)
"""
import argparse, os, math, warnings, joblib
from pathlib import Path
import numpy as np, pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression, Ridge, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, ExtraTreesRegressor, ExtraTreesClassifier, IsolationForest
from sklearn.neighbors import KNeighborsRegressor, KNeighborsClassifier
from sklearn.svm import SVR, SVC, OneClassSVM
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, silhouette_score
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.mixture import GaussianMixture
import matplotlib
matplotlib.use("Agg")
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

def _silhouette_or_nan(X_arr, labels):
    """Return the silhouette score, or NaN when fewer than two distinct labels exist."""
    if len(np.unique(labels)) > 1:
        return silhouette_score(X_arr, labels)
    return float("nan")


def main(args):
    """Run the supervised and unsupervised experiments and write results to disk.

    Parameters
    ----------
    args : argparse.Namespace
        Must expose ``data`` (CSV path), ``outdir`` (output folder),
        ``max_rows`` (optional row cap), ``rf_estimators`` (tree count for
        the RandomForest/ExtraTrees models) and ``skip_tsne`` (bool).

    Outputs (all written under ``args.outdir``)
    -------------------------------------------
    * ``metrics_regression.csv`` and ``reg_<model>.joblib`` when a regression
      target column is found.
    * ``metrics_classification.csv`` and ``clf_<model>.joblib`` when a
      classification target column with more than one class is found.
    * ``pca_2d.csv``, ``pca_explained_variance.csv``, optionally
      ``tsne_2d_subsample.csv``, ``metrics_clustering.csv`` and
      ``anomaly_rates.csv``.

    Individual model or clustering failures are reported and skipped so that
    one failing estimator does not abort the whole run.
    """
    # Local import keeps this block self-contained; scikit-learn is already a
    # dependency of the file.
    from sklearn.base import clone

    OUT = Path(args.outdir)
    OUT.mkdir(parents=True, exist_ok=True)

    # Load
    df = pd.read_csv(args.data)
    df.columns = [c.strip() for c in df.columns]
    if args.max_rows and len(df) > args.max_rows:
        df = df.sample(n=args.max_rows, random_state=42)

    # Targets detection: take the first candidate column that is present.
    reg_target = next(
        (c for c in ("use_quantity_kg_ai_modeled", "use_quantity_kg_ai") if c in df.columns),
        None,
    )
    clf_target = next(
        (c for c in ("is_hhp_flag", "hhp_flag") if c in df.columns),
        None,
    )

    # Features candidates
    num_candidates = ["year", "bee_ld50_mg_per_bee", "aetl_lethal_doses"]
    num_features = [c for c in num_candidates if c in df.columns]
    cat_candidates = ["admin1", "category", "active_ingredient", "country", "iso3"]
    cat_features = [c for c in cat_candidates if c in df.columns]

    if "year" not in df.columns:
        if "created_at_utc" in df.columns:
            created = pd.to_datetime(df["created_at_utc"], errors="coerce")
            df["year"] = created.dt.year.fillna(2005).astype(int)
        else:
            # Without a timestamp column there is nothing to parse; use the
            # same fallback year applied to unparsable timestamps.
            df["year"] = 2005
    if "year" not in num_features:
        num_features = ["year"] + num_features

    features = [f for f in (num_features + cat_features) if f in df.columns]
    X = df[features].copy()

    # Preprocessing
    num_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
    ])
    cat_pipe = Pipeline([
        ("imputer", SimpleImputer(strategy="constant", fill_value="__missing__")),
        ("ohe", OneHotEncoder(handle_unknown="ignore")),
    ])
    pre = ColumnTransformer([("num", num_pipe, num_features), ("cat", cat_pipe, cat_features)])

    # Regression models
    if reg_target:
        # NOTE(review): missing target values are treated as 0.0 rather than
        # dropped; confirm this is intended for the dataset.
        y = df[reg_target].astype(float).fillna(0.0)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
        models_reg = {
            "LinearRegression": LinearRegression(),
            "Ridge": Ridge(alpha=1.0),
            "Lasso": Lasso(alpha=0.001),
            "RandomForestRegressor": RandomForestRegressor(n_estimators=args.rf_estimators, random_state=42, n_jobs=-1),
            "ExtraTreesRegressor": ExtraTreesRegressor(n_estimators=args.rf_estimators, random_state=42, n_jobs=-1),
            "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
            "KNeighborsRegressor": KNeighborsRegressor(n_neighbors=7),
            "SVR": SVR(),
        }
        metrics_reg = []
        for name, model in models_reg.items():
            try:
                # Clone the preprocessor so every saved pipeline owns its own
                # fitted transformer instead of sharing one mutable instance.
                pipe = Pipeline([("pre", clone(pre)), ("model", model)])
                pipe.fit(X_train, y_train)
                pred = pipe.predict(X_test)
                rmse = math.sqrt(mean_squared_error(y_test, pred))
                mae = mean_absolute_error(y_test, pred)
                r2 = r2_score(y_test, pred)
                metrics_reg.append({"model": name, "rmse": rmse, "mae": mae, "r2": r2})
                joblib.dump(pipe, OUT / f"reg_{name}.joblib")
                print(f"[Regression] {name} -> RMSE {rmse:.3f} R2 {r2:.3f}")
            except Exception as e:
                print(f"[Regression] {name} failed: {e}")
        pd.DataFrame(metrics_reg).to_csv(OUT / "metrics_regression.csv", index=False)

    # Classification models
    if clf_target:
        # Fill missing labels BEFORE the integer cast: casting a column that
        # contains NaN to int raises.
        y = df[clf_target].fillna(0).astype(int)
        if len(np.unique(y)) > 1:
            # Stratification needs at least two members in every class.
            stratify = y if y.value_counts().min() >= 2 else None
            if stratify is None:
                print("Classification target has a class with a single member; splitting without stratification.")
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=42, stratify=stratify
            )
            models_clf = {
                "LogisticRegression": LogisticRegression(max_iter=1000),
                "RandomForestClassifier": RandomForestClassifier(n_estimators=args.rf_estimators, random_state=42, n_jobs=-1),
                "ExtraTreesClassifier": ExtraTreesClassifier(n_estimators=args.rf_estimators, random_state=42, n_jobs=-1),
                "GradientBoostingClassifier": GradientBoostingClassifier(random_state=42),
                "KNeighborsClassifier": KNeighborsClassifier(n_neighbors=7),
                "SVC": SVC(probability=True),
            }
            metrics_clf = []
            for name, model in models_clf.items():
                try:
                    pipe = Pipeline([("pre", clone(pre)), ("model", model)])
                    pipe.fit(X_train, y_train)
                    pred = pipe.predict(X_test)
                    proba = pipe.predict_proba(X_test)[:, 1] if hasattr(pipe.named_steps["model"], "predict_proba") else None
                    acc = accuracy_score(y_test, pred)
                    prec = precision_score(y_test, pred, zero_division=0)
                    rec = recall_score(y_test, pred, zero_division=0)
                    f1 = f1_score(y_test, pred, zero_division=0)
                    auc = roc_auc_score(y_test, proba) if proba is not None else float("nan")
                    metrics_clf.append({"model": name, "accuracy": acc, "precision": prec, "recall": rec, "f1": f1, "roc_auc": auc})
                    joblib.dump(pipe, OUT / f"clf_{name}.joblib")
                    print(f"[Classification] {name} -> acc {acc:.3f} f1 {f1:.3f} auc {auc if not np.isnan(auc) else 'NA'}")
                except Exception as e:
                    print(f"[Classification] {name} failed: {e}")
            pd.DataFrame(metrics_clf).to_csv(OUT / "metrics_classification.csv", index=False)
        else:
            print("Classification target has a single class; skipping classification.")

    # Unsupervised: PCA/t-SNE/clustering/anomaly detection
    X_trans = pre.fit_transform(X)
    # The one-hot step can yield a sparse matrix; the estimators below are fed
    # a dense array.
    X_arr = X_trans.toarray() if hasattr(X_trans, "toarray") else X_trans
    n_rows, n_cols = X_arr.shape

    # PCA 2D & explained variance
    pca2 = PCA(n_components=2, random_state=42).fit_transform(X_arr)
    pd.DataFrame(pca2, columns=["pc1", "pc2"]).to_csv(OUT / "pca_2d.csv", index=False)
    # PCA cannot extract more components than there are rows, so bound by both.
    pca10 = PCA(n_components=min(10, n_cols - 1, n_rows)).fit(X_arr)
    pd.DataFrame({
        "component": list(range(1, pca10.n_components_ + 1)),
        "explained_variance_ratio": pca10.explained_variance_ratio_,
    }).to_csv(OUT / "pca_explained_variance.csv", index=False)

    if not args.skip_tsne:
        idx = np.random.RandomState(42).choice(n_rows, size=min(5000, n_rows), replace=False)
        tsne_kwargs = dict(
            n_components=2,
            random_state=42,
            # t-SNE requires perplexity < number of samples.
            perplexity=min(30, max(1, len(idx) - 1)),
            learning_rate="auto",
            init="pca",
        )
        try:
            # Newer scikit-learn names the iteration cap `max_iter`.
            tsne = TSNE(max_iter=800, **tsne_kwargs)
        except TypeError:
            # Older scikit-learn only knows the previous name `n_iter`.
            tsne = TSNE(n_iter=800, **tsne_kwargs)
        X_tsne = tsne.fit_transform(X_arr[idx])
        pd.DataFrame(X_tsne, columns=["tsne1", "tsne2"]).to_csv(OUT / "tsne_2d_subsample.csv", index=False)

    # Clustering: KMeans, Agglomerative, DBSCAN, GMM
    clust_results = []
    for k in [3, 4, 6]:
        try:
            km = KMeans(n_clusters=k, random_state=42, n_init="auto").fit(X_arr)
            clust_results.append({"algorithm": "KMeans", "k": k, "silhouette": _silhouette_or_nan(X_arr, km.labels_)})
        except Exception as e:
            clust_results.append({"algorithm": "KMeans", "k": k, "error": str(e)})
    for k in [3, 4, 6]:
        try:
            ag = AgglomerativeClustering(n_clusters=k).fit(X_arr)
            clust_results.append({"algorithm": "Agglomerative", "k": k, "silhouette": _silhouette_or_nan(X_arr, ag.labels_)})
        except Exception as e:
            clust_results.append({"algorithm": "Agglomerative", "k": k, "error": str(e)})
    for eps in [0.5, 1.0]:
        try:
            labels = DBSCAN(eps=eps, min_samples=10).fit(X_arr).labels_
            # Label -1 marks noise points; score only the clustered points.
            clustered = labels != -1
            nclusters = len(np.unique(labels[clustered]))
            sil = _silhouette_or_nan(X_arr[clustered], labels[clustered])
            clust_results.append({"algorithm": "DBSCAN", "eps": eps, "nclusters": nclusters, "silhouette": sil})
        except Exception as e:
            clust_results.append({"algorithm": "DBSCAN", "eps": eps, "error": str(e)})
    for k in [3, 4, 6]:
        try:
            gm = GaussianMixture(n_components=k, random_state=42).fit(X_arr)
            labels = gm.predict(X_arr)
            clust_results.append({"algorithm": "GMM", "k": k, "silhouette": _silhouette_or_nan(X_arr, labels)})
        except Exception as e:
            clust_results.append({"algorithm": "GMM", "k": k, "error": str(e)})

    pd.DataFrame(clust_results).to_csv(OUT / "metrics_clustering.csv", index=False)

    # Anomaly detection: IsolationForest, OneClassSVM (sampled)
    iso = IsolationForest(n_estimators=200, contamination=0.01, random_state=42).fit(X_arr)
    iso_rate = float((iso.predict(X_arr) == -1).mean())

    # OneClassSVM is fitted and scored on the first 10000 rows at most.
    ocs_sample = X_arr[:min(10000, n_rows)]
    try:
        ocs = OneClassSVM(gamma="scale", nu=0.05).fit(ocs_sample)
        oc_rate = float((ocs.predict(ocs_sample) == -1).mean())
    except Exception as e:
        # Best-effort step: report the failure instead of hiding it.
        print(f"[Anomaly] OneClassSVM failed: {e}")
        oc_rate = None
    pd.DataFrame([
        {"algorithm": "IsolationForest", "anomaly_rate": iso_rate},
        {"algorithm": "OneClassSVM_sample", "anomaly_rate": oc_rate},
    ]).to_csv(OUT / "anomaly_rates.csv", index=False)

    print("Full pipeline finished. Check outputs in:", OUT.resolve())

if __name__ == "__main__":
    # Command-line entry point. The options mirror the module docstring.
    ap = argparse.ArgumentParser()
    # --data defaults to the path this notebook cell was originally run with,
    # so the cell still works without arguments while the flag stays usable
    # from a shell.
    ap.add_argument("--data", default="/content/india_pesticide_toxicity_risk.zip")
    ap.add_argument("--outdir", default="./ml_results")
    ap.add_argument("--max-rows", type=int, default=None)
    ap.add_argument("--rf-estimators", type=int, default=150)
    ap.add_argument("--skip-tsne", action="store_true")
    # parse_known_args() reads the real command line but tolerates arguments
    # it does not recognise (e.g. flags a notebook kernel launcher adds to
    # sys.argv) instead of exiting with a usage error.
    args, _unknown = ap.parse_known_args()
    main(args)

[Regression] LinearRegression -> RMSE 447.174 R2 0.993
[Regression] Ridge -> RMSE 448.985 R2 0.993
[Regression] Lasso -> RMSE 447.164 R2 0.993
[Regression] RandomForestRegressor -> RMSE 0.000 R2 1.000
[Regression] ExtraTreesRegressor -> RMSE 0.616 R2 1.000
[Regression] GradientBoostingRegressor -> RMSE 185.316 R2 0.999
[Regression] KNeighborsRegressor -> RMSE 543.954 R2 0.990
[Regression] SVR -> RMSE 4851.152 R2 0.210
[Classification] LogisticRegression -> acc 1.000 f1 1.000 auc 1.0
[Classification] RandomForestClassifier -> acc 1.000 f1 1.000 auc 1.0
[Classification] ExtraTreesClassifier -> acc 1.000 f1 1.000 auc 1.0
[Classification] GradientBoostingClassifier -> acc 1.000 f1 1.000 auc 1.0
[Classification] KNeighborsClassifier -> acc 1.000 f1 1.000 auc 1.0
[Classification] SVC -> acc 1.000 f1 1.000 auc 1.0
