In [None]:
# 0. Setup and Environment Repair (run first)
# - Pins NumPy/Pandas/PyArrow/Scikit-Learn to stable versions
# - Silently uninstalls conflicting preinstalls and reinstalls pinned versions
# - Creates project directories under /content/Kisaan
# - Loads the training CSV safely with UTF-8-SIG and prints shape/columns
# - Shows whether GPU is available (e.g., T4/L4)

import sys, subprocess, warnings, shutil
from pathlib import Path

print('\n=== Environment repair: pin scientific stack ===')

# Quiet pip wrapper: output is captured and only surfaced when pip fails

def _pip(args):
    """Run ``python -m pip <args>`` with its output captured.

    Args:
        args: iterable of pip command-line arguments,
            e.g. ``['install', 'pkg==1.0']``.

    Raises:
        RuntimeError: if pip exits with a non-zero status. The message holds
            the last (at most) 10 lines of pip's stderr.
    """
    command = [sys.executable, '-m', 'pip']
    command.extend(args)
    proc = subprocess.run(command, capture_output=True, text=True)
    if proc.returncode == 0:
        return
    # Keep only the tail of stderr so the error stays readable in a notebook
    stderr_lines = (proc.stderr or '').strip().splitlines()
    raise RuntimeError('pip failed:\n' + '\n'.join(stderr_lines[-10:]))

# Reinstall pinned versions to avoid binary/API mismatches
PINNED = {
    'numpy': '1.26.4',
    'pandas': '2.2.2',
    'pyarrow': '16.1.0',
    'scikit-learn': '1.5.2',
}
_pinned_specs = [f'{k}=={v}' for k, v in PINNED.items()]
try:
    # --force-reinstall already replaces whatever copy is installed, so no
    # separate `pip uninstall` is issued first: uninstalling up front would
    # leave the runtime without NumPy/Pandas if the install then failed
    # (e.g. no network), and the imports below would crash.
    _pip(['install', '--no-cache-dir', '--force-reinstall', '-q', *_pinned_specs])
    print('Pinned scientific stack installed: ' + ', '.join(_pinned_specs))
    # A module that was imported before the reinstall keeps running the old
    # code until the kernel restarts, so tell the user when that applies.
    _already_imported = sorted(m for m in ('numpy', 'pandas', 'pyarrow', 'sklearn') if m in sys.modules)
    if _already_imported:
        print('Note: already imported in this kernel: ' + ', '.join(_already_imported)
              + '. Restart the runtime and re-run this cell to use the pinned versions.')
except Exception as e:
    # Best effort: keep going with whatever versions are currently installed
    print('Note: pip change encountered a non-fatal issue, continuing. Details:', e)

# Verify imports and versions (aliases are underscore-prefixed to keep them cell-private)
import numpy as _np
import pandas as _pd
import pyarrow as _pa
import sklearn as _sk
_installed = (('numpy', _np), ('pandas', _pd), ('pyarrow', _pa), ('scikit-learn', _sk))
# One line, "name==version" entries separated by " | "
print(' | '.join(f"{_label}=={_module.__version__}" for _label, _module in _installed))

print('\n=== Create project directories ===')
# Layout: <project>/Datasets/{raw,processed} and <project>/models
PROJECT_DIR = Path('/content/Kisaan')
_datasets_root = PROJECT_DIR / 'Datasets'
RAW_DIR = _datasets_root / 'raw'
PROCESSED_DIR = _datasets_root / 'processed'
MODELS_DIR = PROJECT_DIR / 'models'
# Idempotent: existing directories are left untouched
for d in [PROJECT_DIR, RAW_DIR, PROCESSED_DIR, MODELS_DIR]:
    d.mkdir(parents=True, exist_ok=True)
for _label, _path in (('Project dir:', PROJECT_DIR),
                      ('Raw data dir:', RAW_DIR),
                      ('Processed dir:', PROCESSED_DIR)):
    print(_label, _path)

print('\n=== Runtime check (GPU/CPU) ===')
_gpu_msg = 'No GPU detected. Tip: In Colab, enable GPU via Runtime → Change runtime type.'


def gpu_name_from_smi(returncode, stdout):
    """Extract the first GPU name from an `nvidia-smi --query-gpu=name` run.

    Args:
        returncode: exit status of the nvidia-smi process.
        stdout: captured standard output (may be None or empty).

    Returns:
        The first non-blank output line, stripped, or '' when the command
        failed or printed nothing usable.
    """
    # A non-zero exit means the query failed; whatever was printed is then a
    # diagnostic, not a device name.
    if returncode != 0:
        return ''
    for line in (stdout or '').splitlines():
        line = line.strip()
        if line:
            return line
    return ''


try:
    import torch as _torch
    if _torch.cuda.is_available():
        try:
            _gpu_name = _torch.cuda.get_device_name(0)
        except Exception:
            # CUDA is usable but the name lookup failed; report generically
            _gpu_name = 'CUDA device'
        print('GPU available:', _gpu_name)
    else:
        print(_gpu_msg)
except Exception:
    # Fallback to nvidia-smi, if present
    _smi_name = ''
    if shutil.which('nvidia-smi'):
        try:
            out = subprocess.run(['nvidia-smi', '--query-gpu=name', '--format=csv,noheader'],
                                 stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
            _smi_name = gpu_name_from_smi(out.returncode, out.stdout)
        except OSError:
            # Binary found on PATH but could not be executed; treat as no GPU
            _smi_name = ''
    if _smi_name:
        print('GPU available:', _smi_name)
    else:
        print(_gpu_msg)

print('\n=== Load CSV safely (UTF-8-SIG) ===')
import pandas as pd
warnings.filterwarnings('ignore', category=pd.errors.DtypeWarning)
warnings.filterwarnings('ignore', category=UserWarning)
DATASET_PATH = RAW_DIR / 'KCC_MarMay2025_train_ready_min2.csv'
# df_full stays None when the CSV is absent; later cells check for that
df_full = None
if DATASET_PATH.exists():
    try:
        # `low_memory` is a C-parser option: pandas raises ValueError when it is
        # combined with engine='python', so the default (C) engine is used.
        # 'utf-8-sig' transparently drops a leading BOM if the file has one.
        df_full = pd.read_csv(DATASET_PATH, encoding='utf-8-sig', low_memory=False)
    except Exception as e:
        # Chain the original exception so the full parser traceback is kept
        raise RuntimeError(f'Failed to read CSV at {DATASET_PATH}: {e}') from e
    else:
        print(f'Loaded: {DATASET_PATH}')
        print(f'Shape: {df_full.shape[0]:,} rows × {df_full.shape[1]:,} cols')
        print('Columns:', list(df_full.columns))
else:
    print('CSV not found at:', DATASET_PATH)
    print('Please upload your file to this path, then re-run this cell.')

print('\nSetup complete. You can proceed to the next cells.')

# Kisaan Training (Clean Colab Notebook)
Use this notebook on Google Colab. It keeps dependencies minimal, pins a small set of package versions to avoid binary/API conflicts, and runs Topic/Sub-topic training end to end.

In [None]:
# 1. Configure project paths (uses globals from cell 0)
from pathlib import Path

# PROJECT_DIR, DATASET_PATH and MODELS_DIR are created by cell 0. Fail with an
# actionable message (instead of a bare NameError) when that cell has not been
# run in this kernel.
_missing = [_name for _name in ('PROJECT_DIR', 'DATASET_PATH', 'MODELS_DIR') if _name not in globals()]
if _missing:
    raise NameError('Run cell 0 first; undefined: ' + ', '.join(_missing))

print('Project dir:', PROJECT_DIR)
print('Dataset path:', DATASET_PATH)


Project dir: \content\Kisaan
Dataset path: \content\Kisaan\Datasets\KCC_MarMay2025_combined.csv


In [2]:
# 2. (Optional) Mount Google Drive
# Colab is detected by whether the google.colab package can be imported.
try:
    import google.colab  # type: ignore
except Exception:
    IN_COLAB = False
else:
    IN_COLAB = True
print({'IN_COLAB': IN_COLAB})
if not IN_COLAB:
    print('Not running inside Colab; skipping mount.')
else:
    from google.colab import drive  # type: ignore
    # force_remount=False: an already-mounted Drive is reused as is
    drive.mount('/content/drive', force_remount=False)
    # Only defined when running inside Colab
    DRIVE_PROJECT_DIR = Path('/content/drive/MyDrive/Kisaan')
    print('Drive project dir:', DRIVE_PROJECT_DIR)

{'IN_COLAB': False}
Not running inside Colab; skipping mount.


In [None]:
# 3. Ensure repository code is present (robust clone/merge)
import os, shutil, subprocess
from pathlib import Path

REPO_URL = 'https://github.com/7009soham/Kisaan.git'
TARGET_FILE = PROJECT_DIR / 'src' / 'train_topic_subtopic_peft.py'
TMP_CLONE = PROJECT_DIR / '_repo_clone'


def copy_into(src_dir: Path, dst_dir: Path, names: list[str]):
    """Merge selected entries of ``src_dir`` into ``dst_dir``.

    Args:
        src_dir: directory to copy from.
        dst_dir: directory to copy into; existing content is kept and files
            with the same relative path are overwritten.
        names: entry names (files or directories) relative to ``src_dir``.
            Names that do not exist in ``src_dir`` are skipped silently.
    """
    for name in names:
        s = src_dir / name
        if not s.exists():
            continue
        d = dst_dir / name
        if s.is_dir():
            shutil.copytree(s, d, dirs_exist_ok=True)
        else:
            d.parent.mkdir(parents=True, exist_ok=True)
            shutil.copy2(s, d)


needs_code = not TARGET_FILE.exists()
if needs_code:
    print('Project code missing; fetching from repo...')
    if TMP_CLONE.exists():
        shutil.rmtree(TMP_CLONE, ignore_errors=True)
    try:
        # Shallow clone: only the working tree is copied below (the clone's
        # .git directory is never merged), so history is not needed.
        subprocess.run(['git', 'clone', '--depth', '1', REPO_URL, str(TMP_CLONE)], check=True)

        # Merge selected paths into PROJECT_DIR without disturbing your Datasets/models
        copy_into(TMP_CLONE, PROJECT_DIR, [
            'src', 'docs', 'notebooks', 'requirements-colab.txt', 'requirements-local.txt', 'README.md'
        ])
    finally:
        # Always remove the temporary clone, even if the clone or copy failed
        shutil.rmtree(TMP_CLONE, ignore_errors=True)
    if not TARGET_FILE.exists():
        print('Warning: expected training script is still missing after fetch:', TARGET_FILE)
else:
    print('Repository code already present.')

# Show status; if this folder is a git repo, print status; otherwise just list tree
# NOTE: %cd changes the kernel's working directory to PROJECT_DIR. The later
# training/inference cells invoke `python src/...` with relative script paths,
# so they rely on this directory change having been executed.
%cd {PROJECT_DIR}
if (PROJECT_DIR / '.git').exists():
    # Short, branch-annotated status of the project checkout
    !git status -sb
else:
    import itertools
    print('Listing project tree (top-level):')
    # Only the first 20 entries (sorted by path) are shown to keep output short
    for p in itertools.islice(sorted(PROJECT_DIR.iterdir()), 0, 20):
        print(' -', p.name)


Repo already present.
c:\content\Kisaan
## main...origin/main
?? Datasets/processed/
## main...origin/main
?? Datasets/processed/


In [None]:
# 4. Install ML dependencies (quiet)
import sys, subprocess

def _pipq(args):
    """Invoke pip through the current interpreter, capturing all output.

    Args:
        args: iterable of arguments placed after ``python -m pip``.

    Raises:
        RuntimeError: when pip returns a non-zero exit code; carries the final
            10 (or fewer) lines of pip's stderr.
    """
    completed = subprocess.run(
        [sys.executable, '-m', 'pip'] + list(args),
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
    )
    if completed.returncode:
        err_text = completed.stderr if completed.stderr else ''
        last_lines = err_text.strip().splitlines()[-10:]
        raise RuntimeError('pip failed:\n' + '\n'.join(last_lines))

# Keep these versions compatible with our pinned scientific stack
_pkgs = [
    'transformers==4.44.2',
    'datasets==2.20.0',
    'accelerate==0.32.1',
    'peft==0.12.0',
    'sentencepiece',
]
_pipq(['install', '--quiet', '--no-cache-dir'] + _pkgs)

# Verify versions: import each library and report what actually got loaded
import transformers, datasets, accelerate, peft
print('Installed:',
      *(f"{_lib.__name__}=={_lib.__version__}" for _lib in (transformers, datasets, accelerate, peft)))


Note: you may need to restart the kernel to use updated packages.


In [None]:
# 4A. GPU tuning (optimized defaults; auto-detect A100)
# NOTE: the TF32 settings below apply to torch work done inside this kernel.
# The training cells launch a separate `python` process and only receive
# BATCH_SIZE / MAX_LENGTH through command-line flags.
import os, torch

_cuda_ok = torch.cuda.is_available()
print('CUDA available:', _cuda_ok)
GPU_NAME = torch.cuda.get_device_name(0) if _cuda_ok else 'CPU'
print('Detected device:', GPU_NAME)

# Enable TF32 for speed (only meaningful when a CUDA device is present)
if _cuda_ok:
    try:
        torch.set_float32_matmul_precision('high')
        import torch.backends.cuda as cuda_backend
        import torch.backends.cudnn as cudnn_backend
        cuda_backend.matmul.allow_tf32 = True
        cudnn_backend.allow_tf32 = True
        print('TF32 enabled for matmul and cuDNN')
    except Exception as _e:
        print('TF32 setup skipped:', _e)
else:
    print('TF32 setup skipped: no CUDA device')

# Prefer bf16 on Ampere+ (A100/H100/L4 etc.). Decide by compute capability
# (major version >= 8) rather than by matching device-name substrings, so
# Ampere-or-newer GPUs that are not in a hard-coded list are covered as well.
USE_BF16 = bool(_cuda_ok and torch.cuda.get_device_capability(0)[0] >= 8)

# Dynamic training hyperparameters based on GPU
BATCH_SIZE = 32 if 'A100' in GPU_NAME else (24 if 'L4' in GPU_NAME else 16)
MAX_LENGTH = 192 if 'A100' in GPU_NAME else 160

print({'USE_BF16': USE_BF16, 'BATCH_SIZE': BATCH_SIZE, 'MAX_LENGTH': MAX_LENGTH})


In [None]:
# 5. Verify dataset (robust finder with fallbacks and optional upload)
import os
import pandas as pd
from pathlib import Path

# File name under which an uploaded CSV is stored inside RAW_DIR
_UPLOAD_NAME = 'KCC_MarMay2025_train_ready_min2.csv'


def _load_and_report(path, prefix='Loaded'):
    """Read the dataset CSV at ``path``, print a short summary, return the frame.

    Args:
        path: location of the CSV file.
        prefix: label printed in front of the path (e.g. 'Loaded').

    Returns:
        The loaded pandas DataFrame.
    """
    # `low_memory` is a C-parser option: pandas raises ValueError when it is
    # combined with engine='python', so the default (C) engine is used.
    frame = pd.read_csv(path, encoding='utf-8-sig', low_memory=False)
    print(f'{prefix}: {path}')
    print(f'Shape: {frame.shape[0]:,} rows × {frame.shape[1]:,} cols')
    print('Columns:', list(frame.columns))
    return frame


def _find_dataset(candidates):
    """Return the first usable dataset path, or None when nothing is found.

    The explicit ``candidates`` are tried in order. If none exists, the
    Datasets folder is searched for ``KCC_MarMay2025*.csv``; files under
    Datasets/processed and the ``*_scored.csv`` inference output are skipped
    because they are produced by this notebook and are not raw input.
    Matches are sorted so the choice is deterministic.
    """
    for p in candidates:
        if p.exists():
            return p
    datasets_dir = PROJECT_DIR / 'Datasets'
    generated_dir = datasets_dir / 'processed'
    for p in sorted(datasets_dir.glob('**/KCC_MarMay2025*.csv')):
        if generated_dir in p.parents or p.name.endswith('_scored.csv'):
            continue
        return p
    return None


# If df_full already loaded in cell 0, just summarize it
if 'df_full' in globals() and isinstance(df_full, pd.DataFrame) and not df_full.empty:
    print(f'Using df_full already loaded: {df_full.shape[0]:,} rows × {df_full.shape[1]:,} cols')
    print('Columns:', list(df_full.columns))
else:
    candidates = [
        DATASET_PATH,
        PROJECT_DIR / 'Datasets' / 'KCC_MarMay2025_combined.csv',
        PROJECT_DIR / 'Datasets' / 'KCC_MarMay2025.csv',
    ]
    found_path = _find_dataset(candidates)
    if found_path:
        DATASET_PATH = Path(found_path)  # update global for downstream cells
        df_full = _load_and_report(DATASET_PATH)
    else:
        print('Dataset not found at expected paths.')
        print('Looked for:')
        for p in candidates:
            print(' -', p)
        # Only the Colab import is guarded: outside Colab there is no upload
        # widget. Errors from the upload or from parsing the CSV propagate
        # unchanged instead of being relabelled as "CSV missing".
        try:
            import google.colab  # type: ignore
            from google.colab import files  # type: ignore
        except Exception as e:
            raise FileNotFoundError('CSV missing. In Colab, upload the file via this cell; otherwise place it at the RAW_DIR path and re-run.') from e
        dst = RAW_DIR / _UPLOAD_NAME
        print('\nYou can upload the CSV now; it will be saved as:', dst)
        uploaded = files.upload()
        if not uploaded:
            raise FileNotFoundError('Upload canceled. Please upload the CSV and re-run this cell.')
        # Use the first uploaded file and move it to the canonical raw location
        name = next(iter(uploaded))
        src = Path(name)
        RAW_DIR.mkdir(parents=True, exist_ok=True)
        os.replace(src, dst)
        DATASET_PATH = dst
        df_full = _load_and_report(DATASET_PATH, prefix='Loaded after upload')

Dataset records: 42,086
Columns: ['StateName', 'DistrictName', 'BlockName', 'Season', 'Sector', 'Category', 'Crop', 'QueryType', 'QueryText', 'KccAns', 'CreatedOn', 'year', 'month']


In [None]:
# 6. Preprocess labels (ensure stratify works)
import pandas as pd
PROCESSED_DIR = PROJECT_DIR / 'Datasets' / 'processed'
PROCESSED_DIR.mkdir(parents=True, exist_ok=True)

# Reuse df_full if available; otherwise, read using DATASET_PATH (after cell 5 set it)
if 'df_full' not in globals() or not isinstance(df_full, pd.DataFrame) or df_full.empty:
    if not DATASET_PATH.exists():
        raise FileNotFoundError(f'Dataset not found at {DATASET_PATH}. Please run the previous cell to locate/upload it.')
    # `low_memory` is a C-parser option: pandas raises ValueError when it is
    # combined with engine='python', so the default (C) engine is used.
    df_full = pd.read_csv(DATASET_PATH, encoding='utf-8-sig', low_memory=False)


def ensure_labels(df, col):
    """Return column ``col`` of ``df`` as strings, creating it when absent.

    When ``col`` is missing it is added to ``df`` in place: 'topic' is seeded
    from 'QueryType' if that column exists; any other case is filled with the
    constant 'Other'. Missing values in the returned series become 'Other'.

    Args:
        df: DataFrame to read from (mutated when ``col`` has to be created).
        col: name of the label column.

    Returns:
        A string-typed Series with no missing values.
    """
    has_col = col in df.columns
    if not has_col:
        seed_from_query_type = col == 'topic' and 'QueryType' in df.columns
        df[col] = df['QueryType'].fillna('Other') if seed_from_query_type else 'Other'
    labels = df[col].fillna('Other')
    return labels.astype(str)


def stabilize(series):
    """Normalize ';'-separated labels and fold rare ones into 'Other'.

    Each value is split on ';', parts are stripped and empty parts dropped.
    A value whose first label occurs fewer than 2 times across ``series``
    (or that has no label at all) is replaced by 'Other'; every other value
    is re-joined with ';'.

    Args:
        series: Series of label strings.

    Returns:
        A Series of the same length with normalized labels.
    """
    def split_labels(raw):
        # Non-empty, whitespace-stripped labels in their original order
        return [token.strip() for token in str(raw).split(';') if token.strip()]

    def primary(raw):
        labels = split_labels(raw)
        return labels[0] if labels else 'Other'

    frequency = series.apply(primary).value_counts()
    rare = set(frequency.loc[frequency < 2].index)

    def normalize(raw):
        labels = split_labels(raw)
        if labels and labels[0] not in rare:
            return ';'.join(labels)
        return 'Other'

    return series.apply(normalize)


# Write one training CSV per label column; processed_paths maps column -> file
processed_paths = {}
for col in ('topic', 'sub_topic'):
    out_path = PROCESSED_DIR / f'KCC_MarMay2025_{col}_train.csv'
    # Work on a copy so df_full itself is never modified
    df_copy = df_full.copy()
    df_copy[col] = stabilize(ensure_labels(df_copy, col))
    df_copy.to_csv(out_path, encoding='utf-8-sig', index=False)
    processed_paths[col] = out_path
    print(f'Wrote {out_path}')

Wrote \content\Kisaan\Datasets\processed\KCC_MarMay2025_topic_train.csv
Wrote \content\Kisaan\Datasets\processed\KCC_MarMay2025_sub_topic_train.csv
Wrote \content\Kisaan\Datasets\processed\KCC_MarMay2025_sub_topic_train.csv


In [None]:
# 6A. Hotfix: ensure float32 labels in training script (for BCEWithLogits)
from pathlib import Path
script_path = PROJECT_DIR / 'src' / 'train_topic_subtopic_peft.py'

# (old, new) source snippets. A pair is skipped when `new` is already present,
# so re-running the cell never stacks a second `.astype(...)` on a patched call.
# NOTE(review): the patched code references `np`; this assumes the training
# script already imports numpy as np - confirm.
_FLOAT32_CAST = '.astype(np.float32)'
_HOTFIX_REPLACEMENTS = [
    ('mlb.fit_transform(y_train)', 'mlb.fit_transform(y_train)' + _FLOAT32_CAST),
    ('mlb.transform(y_val)', 'mlb.transform(y_val)' + _FLOAT32_CAST),
    ('mlb.transform(y_test)', 'mlb.transform(y_test)' + _FLOAT32_CAST),
    ('Dataset.from_dict({"text": X_train, "labels": list(y_train_bin)})',
     'Dataset.from_dict({"text": X_train, "labels": [row.tolist() for row in y_train_bin]})'),
    ('Dataset.from_dict({"text": X_val,   "labels": list(y_val_bin)})',
     'Dataset.from_dict({"text": X_val,   "labels": [row.tolist() for row in y_val_bin]})'),
    ('Dataset.from_dict({"text": X_test,  "labels": list(y_test_bin)})',
     'Dataset.from_dict({"text": X_test,  "labels": [row.tolist() for row in y_test_bin]})'),
]

if script_path.exists():
    _original_txt = script_path.read_text(encoding='utf-8')
    txt = _original_txt
    for _old, _new in _HOTFIX_REPLACEMENTS:
        if _new not in txt:
            txt = txt.replace(_old, _new)
    # Report (and write) a change only when the text really differs
    changed = txt != _original_txt
    if changed:
        script_path.write_text(txt, encoding='utf-8')
        print('Applied float32 labels hotfix to training script.')
    else:
        print('Training script already ensures float32 labels. No changes made.')
else:
    print('Training script not found at', script_path)


In [None]:
# 7. Train Topic head
# Inputs: the processed topic CSV recorded in processed_paths by the
# preprocessing cell, plus BATCH_SIZE / MAX_LENGTH from the GPU tuning cell.
# MODELS_DIR/topic is created here and passed to the script as --out_dir.
# The script path is relative, so the working directory must be PROJECT_DIR
# (set by the earlier `%cd {PROJECT_DIR}`).
topic_csv = processed_paths['topic']
topic_out = MODELS_DIR / 'topic'
topic_out.mkdir(parents=True, exist_ok=True)
!python src/train_topic_subtopic_peft.py --data_csv "{topic_csv}" --out_dir "{topic_out}" --label_col topic --text_col QueryText --base_model xlm-roberta-base --epochs 4 --batch_size {BATCH_SIZE} --max_length {MAX_LENGTH} --lr 2e-5


Map:   0%|          | 0/33668 [00:00<?, ? examples/s]
Map:  18%|█▊        | 6000/33668 [00:00<00:00, 49915.45 examples/s]
Map:  33%|███▎      | 11000/33668 [00:00<00:00, 34011.90 examples/s]
Map:  48%|████▊     | 16000/33668 [00:00<00:00, 34536.39 examples/s]
Map:  62%|██████▏   | 21000/33668 [00:00<00:00, 36748.25 examples/s]
Map:  80%|████████  | 27000/33668 [00:00<00:00, 40041.30 examples/s]
Map:  98%|█████████▊| 33000/33668 [00:00<00:00, 42449.65 examples/s]
Map: 100%|██████████| 33668/33668 [00:00<00:00, 39895.26 examples/s]

Map:   0%|          | 0/4209 [00:00<?, ? examples/s]
Map: 100%|██████████| 4209/4209 [00:00<00:00, 48542.46 examples/s]

Map:   0%|          | 0/4209 [00:00<?, ? examples/s]
Map:  48%|████▊     | 2000/4209 [00:00<00:00, 16148.46 examples/s]
Map: 100%|██████████| 4209/4209 [00:00<00:00, 25131.11 examples/s]
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classi

In [None]:
# 8. Train Sub-topic head
# Same invocation as the topic head, but with the processed sub_topic CSV,
# --label_col sub_topic, and MODELS_DIR/subtopic as --out_dir.
# The script path is relative, so the working directory must be PROJECT_DIR
# (set by the earlier `%cd {PROJECT_DIR}`).
sub_csv = processed_paths['sub_topic']
sub_out = MODELS_DIR / 'subtopic'
sub_out.mkdir(parents=True, exist_ok=True)
!python src/train_topic_subtopic_peft.py --data_csv "{sub_csv}" --out_dir "{sub_out}" --label_col sub_topic --text_col QueryText --base_model xlm-roberta-base --epochs 4 --batch_size {BATCH_SIZE} --max_length {MAX_LENGTH} --lr 2e-5


Map:   0%|          | 0/33668 [00:00<?, ? examples/s]
Map:  18%|█▊        | 6000/33668 [00:00<00:00, 46840.97 examples/s]
Map:  33%|███▎      | 11000/33668 [00:00<00:00, 33553.92 examples/s]
Map:  50%|█████     | 17000/33668 [00:00<00:00, 38868.10 examples/s]
Map:  68%|██████▊   | 23000/33668 [00:00<00:00, 41860.14 examples/s]
Map:  86%|████████▌ | 29000/33668 [00:00<00:00, 42770.25 examples/s]
Map: 100%|██████████| 33668/33668 [00:00<00:00, 41328.39 examples/s]
Map: 100%|██████████| 33668/33668 [00:00<00:00, 40687.92 examples/s]

Map:   0%|          | 0/4209 [00:00<?, ? examples/s]
Map: 100%|██████████| 4209/4209 [00:00<00:00, 43178.38 examples/s]

Map:   0%|          | 0/4209 [00:00<?, ? examples/s]
Map:  48%|████▊     | 2000/4209 [00:00<00:00, 15391.46 examples/s]
Map: 100%|██████████| 4209/4209 [00:00<00:00, 23739.84 examples/s]
Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classi

In [9]:
# 9. Run inference on combined CSV
scored_csv = PROJECT_DIR / 'Datasets' / 'KCC_MarMay2025_scored.csv'
!python src/predict_local.py --data_csv "{DATASET_PATH}" --model_topic "{topic_out}" --model_subtopic "{sub_out}" --text_col QueryText --out_csv "{scored_csv}" --device auto --batch_size 64
print('Scored CSV:', scored_csv)

Scored CSV: \content\Kisaan\Datasets\KCC_MarMay2025_scored.csv


Traceback (most recent call last):
  File "c:\content\Kisaan\src\predict_local.py", line 94, in <module>
    main()
  File "c:\content\Kisaan\src\predict_local.py", line 67, in main
    tok_t, mdl_t, labels_t, thr_t = load_head(Path(args.model_topic))
                                    ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\content\Kisaan\src\predict_local.py", line 25, in load_head
    labels = json.loads((model_dir / "labels.json").read_text(encoding="utf-8"))
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python311\Lib\pathlib.py", line 1058, in read_text
    with self.open(mode='r', encoding=encoding, errors=errors) as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\Dell\AppData\Local\Programs\Python\Python311\Lib\pathlib.py", line 1044, in open
    return io.open(self, mode, buffering, encoding, errors, newline)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

In [10]:
# 10. Preview predictions
import pandas as pd
# Read the scored CSV written by the inference cell
df_scored = pd.read_csv(scored_csv, encoding='utf-8-sig')
# First five rows, restricted to the query text and the two predicted labels
_preview = df_scored.head()[['QueryText', 'pred_topic', 'pred_sub_topic']]
print(_preview)

FileNotFoundError: [Errno 2] No such file or directory: '\\content\\Kisaan\\Datasets\\KCC_MarMay2025_scored.csv'

## Notes
- If you need faster experiments, reduce `--epochs` to 1.
- After training, download `models/topic`, `models/subtopic`, and the scored CSV for local inference.