# Run Kisaan ML without local GPU: Colab/Kaggle/Hugging Face options

This notebook lets you run the Kisaan topic/sub-topic classification workflow even without a local GPU. Fill in the config first, then choose one of three options: a CPU baseline, a Colab/Kaggle GPU, or the Hugging Face Inference API.
import os
import json
from pathlib import Path


In [None]:
## 1) Project Variables (fill before running)

# Configure your run here. For this Kisaan repo, defaults are set to text classification.
CONFIG = {
    "problem_type": "classification",  # classification|regression|seq2seq|vision
    "problem_statement_path": "docs/docs_problem_statement_Version2.md",
    # Local path or URL; for Colab/Kaggle, point to /content/drive/MyDrive/Kisaan/Datasets/... if using Drive
    "dataset_source": "Datasets/KCC_MarMay2025_combined.csv",
    # For this project, label is trained per-head. Here we drive a CPU baseline for topic only.
    "label_column": "topic",
    "text_column": "QueryText",
    "metrics": ["f1", "precision", "recall", "accuracy"],
    # Choose: cpu_baseline (sklearn), torch_cpu (slow), remote_api (HF Inference API)
    "model_choice": "cpu_baseline",
    # Remote inference (optional)
    "hf_model_id": "distilbert-base-uncased-finetuned-sst-2-english",
    "hf_token_env": "HF_TOKEN",  # set this env var if using HF API
    # Training/runtime knobs
    "batch_size": 2048,
    "max_epochs": 1,
    "internet_allowed": True,
}

# Basic validation. Explicit raises are used instead of bare asserts so the
# checks still run under `python -O` and a failure says which key to fix.
_VALID_PROBLEM_TYPES = {"classification", "regression", "seq2seq", "vision"}
if CONFIG["problem_type"] not in _VALID_PROBLEM_TYPES:
    raise ValueError(
        f"CONFIG['problem_type'] must be one of {sorted(_VALID_PROBLEM_TYPES)}, "
        f"got {CONFIG['problem_type']!r}"
    )
if not isinstance(CONFIG["dataset_source"], str) or not CONFIG["dataset_source"]:
    raise ValueError("CONFIG['dataset_source'] must be a non-empty path or URL string")
print("Config loaded:", json.dumps(CONFIG, indent=2))

In [None]:
## 2) Hardware and Runtime Detection
import os, platform, sys, shutil, subprocess

# psutil and torch are only needed for reporting in this cell, so a missing
# package downgrades the report instead of stopping the notebook.
try:
    import psutil
except ImportError:
    psutil = None
try:
    import torch
except ImportError:
    torch = None

print(f"Python: {sys.version.split()[0]} on {platform.platform()}")
print(f"Pip: {shutil.which('pip')}")
print(f"CWD: {os.getcwd()}")

# RAM and Disk
if psutil is not None:
    mem = psutil.virtual_memory()
    print(f"RAM: {mem.total/1e9:.2f} GB total, {mem.available/1e9:.2f} GB available")
else:
    print("psutil not installed; skipping RAM report.")
try:
    disk = shutil.disk_usage(os.getcwd())
    print(f"Disk: {disk.total/1e9:.2f} GB total, {disk.free/1e9:.2f} GB free")
except OSError as e:
    print("Disk usage not available:", e)

# GPU
if torch is None:
    print("torch not installed; assuming no GPU.")
cuda_ok = torch is not None and torch.cuda.is_available()
if cuda_ok:
    print("CUDA available. Devices:")
    for i in range(torch.cuda.device_count()):
        print(f" - {i}: {torch.cuda.get_device_name(i)}")
    try:
        out = subprocess.check_output(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader']).decode()
        print("nvidia-smi:\n", out)
    except (OSError, subprocess.CalledProcessError) as e:
        # Binary missing or returned non-zero; the torch device list above still stands.
        print("nvidia-smi not accessible:", e)
else:
    print("No GPU detected. Consider switching to Colab/Kaggle for training. This notebook includes CPU baselines and remote options.")

# Single flag describing whether CUDA was detected in this cell.
USE_GPU = bool(cuda_ok)
print("USE_GPU:", USE_GPU)

In [None]:
## 3) Runtime Helpers: Colab/Kaggle/Hugging Face Spaces Options
import os
import sys

# Environment flags: Colab is recognised by an importable `google.colab`
# package, Kaggle by the presence of the KAGGLE_KERNEL_RUN_TYPE variable.
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True
IN_KAGGLE = "KAGGLE_KERNEL_RUN_TYPE" in os.environ

print({"IN_COLAB": IN_COLAB, "IN_KAGGLE": IN_KAGGLE})

if not IN_COLAB:
    def mount_drive():
        """Stand-in used outside Colab: report that nothing is mounted."""
        print("Not in Colab; mount_drive() is a no-op.")
else:
    print("Colab detected. If needed, set Runtime -> Change runtime type -> GPU.")
    from google.colab import drive  # type: ignore

    def mount_drive():
        """Mount Google Drive at /content/drive and confirm on stdout."""
        drive.mount('/content/drive')
        print("Drive mounted at /content/drive")

# (Optional) Step-by-step text for publishing a Gradio demo as a Hugging Face Space
SPACE_HELP = """
To deploy a small Gradio demo on Hugging Face Spaces:
1) pip install gradio huggingface_hub
2) huggingface-cli login  # use your HF token
3) Create a repo: huggingface_hub.create_repo('your-username/kisaan-demo', repo_type='space', space_sdk='gradio')
4) Push app.py and requirements.txt to that repo.
"""
print("Spaces helper available. See SPACE_HELP variable for steps.")

In [None]:
## 4) Dependency Installation (auto-detect or explicit)
import os, sys, subprocess

def pip_install(pkgs):
    """Install packages into the running interpreter's environment with pip.

    Args:
        pkgs: A single requirement string or any iterable of requirement
            strings (e.g. ``['pandas>=2.0', 'tqdm']``).

    Returns:
        0 on success (the ``subprocess.check_call`` result), including when
        there is nothing to install.

    Raises:
        subprocess.CalledProcessError: If pip exits with a non-zero status.
    """
    # A bare string must stay one requirement, not be split into characters.
    requirements = [pkgs] if isinstance(pkgs, str) else list(pkgs)
    if not requirements:
        # pip exits with an error when given no requirement; treat as a no-op.
        print('Nothing to install.')
        return 0
    cmd = [sys.executable, '-m', 'pip', 'install', '--quiet', *requirements]
    print('Running:', ' '.join(cmd))
    return subprocess.check_call(cmd)

# Install the pinned stack only when running in Colab AND internet access is
# allowed via CONFIG["internet_allowed"] (treated as True when the key is absent).
if IN_COLAB and CONFIG.get("internet_allowed", True):
    _colab_requirements = [
        'transformers>=4.41.0',
        'datasets>=2.20.0',
        'accelerate>=0.31.0',
        'peft>=0.11.1',
        'evaluate>=0.4.2',
        'sentencepiece>=0.1.99',
        'scikit-learn>=1.3',
        'pandas>=2.0',
        'numpy>=1.24',
        'tqdm>=4.66',
        'pyarrow>=14.0',
        'matplotlib>=3.8',
        'seaborn>=0.13',
        'openpyxl>=3.1',
    ]
    pip_install(_colab_requirements)

# Confirm the core data libraries import and show their versions.
import pandas as pd
import numpy as np
print(f"pandas {pd.__version__} numpy {np.__version__}")

In [None]:
## 5) Problem Statement Loader and Validator
from pathlib import Path

# Locate the problem statement named in CONFIG; a missing file stops the run.
ps_path = Path(CONFIG["problem_statement_path"])
if not ps_path.exists():
    raise FileNotFoundError(f"Problem statement missing: {ps_path}")

md = ps_path.read_text(encoding='utf-8')
print(md.splitlines()[:12])  # preview of the first lines

# Naive hints: a task is flagged when any of its keywords occurs in the
# lower-cased document text.
_hint_keywords = {
    'classification': ('classif', 'label', 'topic'),
    'regression': ('regression',),
    'seq2seq': ('translation', 'summariz', 'generate'),
}
_md_lowered = md.lower()
hints = {
    task: any(keyword in _md_lowered for keyword in keywords)
    for task, keywords in _hint_keywords.items()
}
print('Task hints:', hints)

In [None]:
## 6) Dataset Ingestion (local or remote)
import pandas as pd
from pathlib import Path

# Load the dataset named in CONFIG["dataset_source"] into `df`.
# pandas.read_csv accepts http(s) URLs as well as local paths, so both kinds of
# source share one call: no temporary download is left on disk, and the same
# BOM-tolerant 'utf-8-sig' decoding applies to remote and local files.
data_path = CONFIG["dataset_source"]
df = pd.read_csv(data_path, encoding='utf-8-sig')

# The text column is mandatory; fail with an explicit error (not an assert,
# which would be skipped under `python -O`).
if CONFIG["text_column"] not in df.columns:
    raise ValueError(f"Missing {CONFIG['text_column']} in dataset")
print("Rows:", len(df), "Columns:", list(df.columns)[:20])
print(df[[CONFIG["text_column"]]].head())

# If label exists, inspect balance (topic only here).
# Missing labels are counted under "Other".
if CONFIG["label_column"] in df.columns:
    vc = df[CONFIG["label_column"]].fillna("Other").astype(str).value_counts().head(20)
    print("Top 20 labels:\n", vc)

In [None]:
## 7) Lightweight Baseline on CPU (scikit-learn)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, accuracy_score

# TF-IDF + logistic-regression baseline for the configured label column.
# Runs only for classification problems and only when the label column exists.
if CONFIG["problem_type"] == "classification":
    texts = df[CONFIG["text_column"]].fillna("").astype(str)
    labels = df.get(CONFIG["label_column"])  # may be missing
    if labels is None:
        print("Label column not found; skipping baseline.")
    else:
        # Rows without a label are grouped under "Other".
        y = labels.fillna("Other").astype(str)
        # A stratified split requires at least two samples of every class;
        # with singleton labels train_test_split raises, so fall back to a
        # plain random split in that case.
        can_stratify = y.value_counts().min() >= 2
        if not can_stratify:
            print("Some labels occur only once; using a non-stratified split.")
        X_train, X_test, y_train, y_test = train_test_split(
            texts, y, test_size=0.2, random_state=42,
            stratify=y if can_stratify else None,
        )
        pipe = Pipeline([
            ("tfidf", TfidfVectorizer(max_features=50000, ngram_range=(1,2))),
            ("clf", LogisticRegression(max_iter=200, n_jobs=-1))
        ])
        pipe.fit(X_train, y_train)
        preds = pipe.predict(X_test)
        print(classification_report(y_test, preds, zero_division=0))
        # float() keeps the values plain Python numbers for JSON serialisation.
        metrics = {
            "micro_f1": float(f1_score(y_test, preds, average="micro", zero_division=0)),
            "macro_f1": float(f1_score(y_test, preds, average="macro", zero_division=0)),
            "micro_p": float(precision_score(y_test, preds, average="micro", zero_division=0)),
            "micro_r": float(recall_score(y_test, preds, average="micro", zero_division=0)),
            "accuracy": float(accuracy_score(y_test, preds)),
        }
        print("Baseline metrics:", metrics)
        # Save baseline model and metrics under ./artifacts
        import joblib
        Path("artifacts").mkdir(exist_ok=True)
        joblib.dump(pipe, "artifacts/baseline_logreg_tfidf.joblib")
        # Context manager so the metrics file is flushed and closed.
        with open("artifacts/baseline_metrics.json", "w", encoding="utf-8") as f:
            json.dump(metrics, f)

In [None]:
## 8) Optional: PyTorch Training on CPU with Gradient Accumulation (slow)
# For this project, we recommend Colab GPU for transformer fine-tuning. This cell is a stub:
# it performs no training and only prints a pointer to the Colab training cell further down.
print("Skipping heavy CPU training; use Colab with the training cells below.")

In [None]:
## 9) Remote Inference via Hugging Face Inference API (generic example)
import os, time, requests

# Runs only when CONFIG["model_choice"] selects the remote API; otherwise the
# cell just reports that it was skipped.
if CONFIG["model_choice"] == "remote_api":
    HF_TOKEN = os.environ.get(CONFIG["hf_token_env"], None)
    # Explicit raise (not assert) so a missing token is caught even under -O
    # instead of sending "Bearer None".
    if not HF_TOKEN:
        raise RuntimeError(f"Set env var {CONFIG['hf_token_env']} to your HF token")
    headers = {"Authorization": f"Bearer {HF_TOKEN}"}
    api_url = f"https://api-inference.huggingface.co/models/{CONFIG['hf_model_id']}"

    def hf_query(payload, max_attempts=5):
        """POST `payload` to the configured model endpoint and return the JSON reply.

        Args:
            payload: JSON-serialisable request body, e.g. ``{"inputs": "text"}``.
            max_attempts: How many times to try while the API answers 503/504.

        Returns:
            The decoded JSON body of the first HTTP 200 response.

        Raises:
            RuntimeError: On any status other than 200/503/504, or when every
                attempt returned 503/504.
        """
        last_status = None
        for attempt in range(max_attempts):
            resp = requests.post(api_url, headers=headers, json=payload, timeout=60)
            if resp.status_code == 200:
                return resp.json()
            if resp.status_code not in (503, 504):
                raise RuntimeError(f"HF API error {resp.status_code}: {resp.text}")
            # 503/504: model loading / warmup. Back off exponentially, but do
            # not sleep after the final attempt.
            last_status = resp.status_code
            if attempt + 1 < max_attempts:
                time.sleep(2 ** attempt)
        # Previously the loop fell through and returned None here.
        raise RuntimeError(
            f"HF API unavailable after {max_attempts} attempts (last status {last_status})"
        )

    # Try the endpoint on the first three non-null texts of the dataset.
    sample_texts = df[CONFIG["text_column"]].dropna().astype(str).head(3).tolist()
    for t in sample_texts:
        out = hf_query({"inputs": t})
        print(t, "\n -> ", out, "\n")
else:
    print("Remote API not selected; skipping.")

In [None]:
## 10) Evaluation and Metrics (for baseline)
# Section 7 already printed the metrics. This cell shows the saved JSON again
# when the file exists and otherwise reports that no baseline metrics are available.
import json
from pathlib import Path

metrics_path = Path("artifacts") / "baseline_metrics.json"
print(metrics_path.read_text() if metrics_path.exists() else "No baseline metrics available.")

In [None]:
## 11) Save Artifacts and Reproducibility
import random, numpy as np, subprocess, sys

# torch is optional for the CPU baseline; seed it only when it is installed.
try:
    import torch
except ImportError:
    torch = None

# Fix the seeds of every RNG in use.
random.seed(42)
np.random.seed(42)
if torch is not None:
    # manual_seed covers the CPU generator, which was previously left unseeded.
    torch.manual_seed(42)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(42)

# Persist the run configuration next to the other artifacts.
Path("artifacts").mkdir(exist_ok=True)
with open("artifacts/config.json", "w", encoding="utf-8") as f:
    json.dump(CONFIG, f, indent=2)

# Freeze environment for reproducibility (best-effort): a failing pip call is
# reported but does not stop the notebook.
try:
    req_txt = subprocess.check_output([sys.executable, "-m", "pip", "freeze"]).decode()
    Path("artifacts/requirements-locked.txt").write_text(req_txt, encoding="utf-8")
except (OSError, subprocess.CalledProcessError) as e:
    print("Could not freeze environment:", e)

In [None]:
## 12) Unit Tests Runner (pytest)
# Minimal smoke tests inline (plain asserts; pytest itself is not invoked here).
# Each assert carries a message so the re-raised error names the failing check.
try:
    assert isinstance(CONFIG, dict), "CONFIG must be a dict"
    assert CONFIG["problem_type"] in {"classification","regression","seq2seq","vision"}, (
        f"unsupported problem_type {CONFIG['problem_type']!r}"
    )
    assert len(df) > 0, "dataset is empty"
    assert CONFIG["text_column"] in df.columns, (
        f"text column {CONFIG['text_column']!r} not found in dataset"
    )
except AssertionError as e:
    # Chain the original assertion so its traceback stays visible.
    raise AssertionError(f"Smoke test failed: {e}") from e
else:
    print("Smoke tests passed.")

In [None]:
## 13) Troubleshooting and Auto-fixes
# Common issues check: each entry pairs a failure condition with the message
# reported for it. Nothing is auto-fixed; the user is pointed at likely causes.
_issue_checks = (
    (CONFIG["text_column"] not in df.columns, "Missing text column"),
    (len(df) == 0, "Empty dataset"),
)
issues = [message for failed, message in _issue_checks if failed]

if not issues:
    print("No common issues detected.")
else:
    print("Detected issues:", issues)
    # No automatic repair is attempted; guide the user instead.
    print("Please check dataset_source path and column names. For Colab, ensure Drive is mounted and paths are correct.")

In [None]:
## Optional: Kisaan Colab Training using repo scripts (GPU recommended)
# Run this in Colab after placing the repository under /content/drive/MyDrive/Kisaan.
# The cell mounts Drive itself via mount_drive(), then trains the topic and
# sub-topic models with the repo scripts, scores the dataset, and previews the result.
# Outside Colab it only prints a skip message.
# NOTE: the `!python ...` lines are IPython shell escapes with {variable}
# interpolation, so this cell is only runnable inside a notebook.

if IN_COLAB:
    mount_drive()
    # All inputs and outputs live under the Drive copy of the project.
    PROJECT_DIR = "/content/drive/MyDrive/Kisaan"
    DATA_CSV = f"{PROJECT_DIR}/Datasets/KCC_MarMay2025_combined.csv"
    OUT_TOPIC = f"{PROJECT_DIR}/models/topic"
    OUT_SUB   = f"{PROJECT_DIR}/models/subtopic"

    # Create the model output directories up front (no error if they already exist).
    import os
    os.makedirs(OUT_TOPIC, exist_ok=True)
    os.makedirs(OUT_SUB, exist_ok=True)

    # Install training deps (if needed); skipped when CONFIG["internet_allowed"] is falsy.
    if CONFIG.get("internet_allowed", True):
        import sys, subprocess
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', '--quiet',
                               'transformers>=4.41.0','datasets>=2.20.0','accelerate>=0.31.0','peft>=0.11.1',
                               'evaluate>=0.4.2','sentencepiece>=0.1.99','scikit-learn>=1.3','pandas>=2.0','numpy>=1.24','tqdm>=4.66','pyarrow>=14.0'])

    # Run Topic training (label column: topic)
    # NOTE(review): the exit status of the `!python` commands is not checked in this
    # cell; a failed run would presumably only surface in a later step — confirm.
    !python {PROJECT_DIR}/src/train_topic_subtopic_peft.py --data_csv "{DATA_CSV}" --out_dir "{OUT_TOPIC}" --label_col topic --text_col QueryText --base_model xlm-roberta-base --epochs 4 --batch_size 16 --max_length 160 --lr 2e-5

    # Run Sub-topic training (same script and hyper-parameters, label column: sub_topic)
    !python {PROJECT_DIR}/src/train_topic_subtopic_peft.py --data_csv "{DATA_CSV}" --out_dir "{OUT_SUB}" --label_col sub_topic --text_col QueryText --base_model xlm-roberta-base --epochs 4 --batch_size 16 --max_length 160 --lr 2e-5

    # Inference (GPU/CPU auto): score the dataset with both trained models and write OUT_SCORED
    OUT_SCORED = f"{PROJECT_DIR}/Datasets/KCC_MarMay2025_scored.csv"
    !python {PROJECT_DIR}/src/predict_local.py --data_csv "{DATA_CSV}" --model_topic "{OUT_TOPIC}" --model_subtopic "{OUT_SUB}" --text_col QueryText --out_csv "{OUT_SCORED}" --device auto --batch_size 64

    # Preview the scored file: only columns whose names start with pred_,
    # prob_topic::, prob_sub:: or QueryText are shown.
    import pandas as pd
    df_scored = pd.read_csv(OUT_SCORED, encoding='utf-8-sig')
    print(df_scored.filter(regex='^(pred_|prob_topic::|prob_sub::|QueryText)').head())
else:
    print("Not in Colab; skipping GPU training cells.")