# Kisaan Colab Training (GPU)
Use this notebook on Google Colab to fine-tune the Topic and Sub-topic heads with PEFT/LoRA, then run inference on the combined CSV. Run cells top-to-bottom.

In [None]:
# 1. Configure project paths
# Every location used by later cells is derived from the repository checkout,
# so only PROJECT_DIR needs changing if the repo lives somewhere else.
from pathlib import Path
import os
import json

PROJECT_DIR = Path("/content/Kisaan")  # path after cloning the GitHub repo
DATASET_PATH = PROJECT_DIR.joinpath("Datasets", "KCC_MarMay2025_combined.csv")
MODELS_DIR = PROJECT_DIR.joinpath("models")
PROCESSED_DIR = PROJECT_DIR.joinpath("Datasets", "processed")

# Echo the two paths a reader is most likely to need to double-check.
print("Project directory:", PROJECT_DIR)
print("Dataset path:", DATASET_PATH)

In [None]:
# 2. (Optional) Mount Google Drive for long-term storage
# Colab is detected by attempting to import its helper package; any failure
# of that import is treated as "not running in Colab".
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True
print({"IN_COLAB": IN_COLAB})

if not IN_COLAB:
    print("Not running inside Colab; skipping Drive mount.")
else:
    from google.colab import drive  # type: ignore

    drive.mount("/content/drive", force_remount=False)
    # Note: DRIVE_PROJECT_DIR is only defined on this (Colab) branch.
    DRIVE_PROJECT_DIR = Path("/content/drive/MyDrive/Kisaan")
    print("Drive project dir:", DRIVE_PROJECT_DIR)

In [None]:
# 3. Clone the GitHub repository if not present
import subprocess
import sys
if not PROJECT_DIR.exists():
    subprocess.run(["git", "clone", "https://github.com/7009soham/Kisaan.git", str(PROJECT_DIR)], check=True)
else:
    print("Repository already present at", PROJECT_DIR)
%cd {PROJECT_DIR}
!git status -sb

In [None]:
# 4. Install dependencies (Python 3.12 compatible)
!pip install --quiet --no-cache-dir --upgrade \\
    torch \\
    transformers==4.46.1 \\
    datasets==3.0.1 \\
    accelerate==1.0.1 \\
    peft==0.13.2 \\
    sentencepiece==0.1.99 \\
    scikit-learn==1.5.2 \\
    pandas==2.2.3 \\
    numpy==2.1.1 \\
    tqdm==4.66.5 \\
    pyarrow==16.1.0 \\
    matplotlib==3.9.2 \\
    seaborn==0.13.2

In [None]:
# 4b. Verify library versions (ensure imports use new wheels)
# The printed versions should match the pins in the install cell; a mismatch
# means the kernel is still using previously loaded packages.
import numpy as np
import pandas as pd
import sklearn

print(
    'numpy', np.__version__,
)
print(
    'pandas', pd.__version__,
)
print(
    'sklearn', sklearn.__version__,
)

In [None]:
# 5. Validate dataset paths
# Fail fast with a readable message if the combined CSV is not where expected.
assert DATASET_PATH.exists(), f"Dataset not found: {DATASET_PATH}"

# Count physical lines minus the header. The file is opened in a `with` block
# so the handle is closed deterministically instead of being left open.
# This is a line count, so it only approximates the record count if any quoted
# field contains an embedded newline.
with open(DATASET_PATH, encoding="utf-8-sig") as dataset_file:
    dataset_row_count = sum(1 for _ in dataset_file) - 1
print("Dataset rows:", dataset_row_count)
print("Ready to preprocess labels.")

In [None]:
# 6. Preprocess dataset for stable stratified splits
import pandas as pd
from pathlib import Path
# Make sure the output folder for the per-label training CSVs exists.
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)
# utf-8-sig strips a leading byte-order mark from the file if one is present.
df_full = pd.read_csv(DATASET_PATH, encoding="utf-8-sig")
def ensure_column(df, label_col):
    """Guarantee that ``df`` has a fully populated string column ``label_col``.

    If the column is absent it is created: ``topic`` is seeded from
    ``QueryType`` when that column exists, anything else is filled with the
    constant ``"Other"``. Missing values are then replaced by ``"Other"`` and
    the column is cast to ``str``.

    The frame is modified in place and also returned for convenience.
    """
    if label_col not in df.columns:
        can_derive_from_query_type = label_col == "topic" and "QueryType" in df.columns
        df[label_col] = (
            df["QueryType"].fillna("Other") if can_derive_from_query_type else "Other"
        )
    filled = df[label_col].fillna("Other")
    df[label_col] = filled.astype(str)
    return df
def sanitize_label_column(series):
    """Normalise ``;``-separated labels and fold rare primary labels into "Other".

    Each value is split on ``;``, the pieces are stripped and empty pieces are
    dropped; a value with no pieces left counts as the single label "Other".
    The first piece is the primary label. Any row whose primary label occurs
    fewer than twice across the series is replaced by "Other"; every other row
    is re-joined with ``;`` from its cleaned pieces.

    Returns a new Series; the input is not modified.
    """
    def _tokens(raw):
        cleaned = []
        for piece in str(raw).split(";"):
            piece = piece.strip()
            if piece:
                cleaned.append(piece)
        return cleaned or ["Other"]

    primary = series.apply(lambda raw: _tokens(raw)[0])
    frequency = primary.value_counts()
    # Primary labels seen only once cannot be split across train and test.
    singletons = {label for label, seen in frequency.items() if seen < 2}

    def _collapse(raw):
        cleaned = _tokens(raw)
        if cleaned[0] in singletons:
            return "Other"
        return ";".join(cleaned)

    return series.apply(_collapse)
# Write one sanitized training CSV per label column and record where each one
# was saved so the training cells can look the path up by column name.
processed_paths = {}
for col in ("topic", "sub_topic"):
    out_path = PROCESSED_DIR / f"KCC_MarMay2025_{col}_train.csv"
    # Work on a copy so df_full stays untouched between iterations.
    df = ensure_column(df_full.copy(), col)
    df[col] = sanitize_label_column(df[col])
    df.to_csv(out_path, encoding="utf-8-sig", index=False)
    processed_paths[col] = out_path
    print(f"Prepared {col} dataset -> {out_path}")
processed_paths

In [None]:
# 7. Train Topic head with PEFT/LoRA
# Input is the sanitized "topic" CSV recorded in processed_paths by step 6.
topic_data = processed_paths["topic"]
# Passed to the script as --out_dir; presumably where it saves the trained
# model artifacts (confirm in src/train_topic_subtopic_peft.py).
topic_out = MODELS_DIR / "topic"
topic_out.mkdir(parents=True, exist_ok=True)
# The script path is repo-relative, so this depends on the working directory
# having been switched to PROJECT_DIR by the %cd in step 3.
# NOTE(review): a `!` command does not raise on a non-zero exit status, so a
# failed training run will not stop "Run All" -- check the cell output.
!python src/train_topic_subtopic_peft.py \
    --data_csv "{topic_data}" \
    --out_dir "{topic_out}" \
    --label_col topic \
    --text_col QueryText \
    --base_model xlm-roberta-base \
    --epochs 4 \
    --batch_size 16 \
    --max_length 160 \
    --lr 2e-5

In [None]:
# 8. Train Sub-topic head
# Same script and hyperparameters as the topic head; only the input CSV, the
# label column and the output directory differ.
sub_data = processed_paths["sub_topic"]
# Passed to the script as --out_dir; presumably where it saves the trained
# model artifacts (confirm in src/train_topic_subtopic_peft.py).
sub_out = MODELS_DIR / "subtopic"
sub_out.mkdir(parents=True, exist_ok=True)
# The script path is repo-relative, so this depends on the working directory
# having been switched to PROJECT_DIR by the %cd in step 3.
# NOTE(review): a `!` command does not raise on a non-zero exit status, so a
# failed training run will not stop "Run All" -- check the cell output.
!python src/train_topic_subtopic_peft.py \
    --data_csv "{sub_data}" \
    --out_dir "{sub_out}" \
    --label_col sub_topic \
    --text_col QueryText \
    --base_model xlm-roberta-base \
    --epochs 4 \
    --batch_size 16 \
    --max_length 160 \
    --lr 2e-5

In [None]:
# 9. Run inference on the full combined CSV
scored_csv = PROJECT_DIR / "Datasets" / "KCC_MarMay2025_scored.csv"
!python src/predict_local.py \
    --data_csv "{DATASET_PATH}" \
    --model_topic "{topic_out}" \
    --model_subtopic "{sub_out}" \
    --text_col QueryText \
    --out_csv "{scored_csv}" \
    --device auto \
    --batch_size 64
assert scored_csv.exists(), f"Scored file missing: {scored_csv}"

In [None]:
# 10. Preview predictions
import pandas as pd

df_scored = pd.read_csv(scored_csv, encoding="utf-8-sig")
# Per-class topic probability columns share the "prob_topic::" prefix.
cols = [name for name in df_scored.columns if name.startswith("prob_topic::")]
# Show the text, both predictions and at most five probability columns.
preview_columns = ["QueryText", "pred_topic", "pred_sub_topic"]
preview_columns.extend(cols[:5])
preview = df_scored[preview_columns].head()
print(preview)
print("Saved scored CSV ->", scored_csv)

## Next steps
- Download `models/topic` and `models/subtopic` for local CPU inference, and `Datasets/KCC_MarMay2025_scored.csv` to keep the predictions from this run.
- Update thresholds or taxonomy and re-run as needed.
- (Optional) Sync artifacts back to `/content/drive/MyDrive/Kisaan` for persistence.