# **Converting the dataset to CSV**

In [None]:
!pip install pillow pytesseract pandas tqdm scikit-learn --quiet
!apt-get update >/dev/null 2>&1 && apt-get install -y tesseract-ocr >/dev/null 2>&1

import os
from pathlib import Path
from PIL import Image, ImageOps, ImageSequence
import pytesseract
import pandas as pd
from tqdm import tqdm
from IPython.display import FileLink, display
from sklearn.model_selection import train_test_split
import numpy as np

# ---------- OCR helper ----------
def ocr_tiff(path, resize_max=2500, preprocess=True):
    """OCR every page of a (possibly multi-page) image file.

    Args:
        path: Path of the image file to read.
        resize_max: Longest side, in pixels, a page may have before it is
            downscaled (aspect ratio preserved) prior to OCR.
        preprocess: When true, convert each page to grayscale before OCR.

    Returns:
        A ``(text, ok)`` tuple. ``text`` is the stripped text of all
        non-empty pages joined by blank lines. ``ok`` is ``False`` only when
        the file could not be opened, in which case ``text`` is ``""``.
    """
    try:
        img = Image.open(path)
    except Exception:
        # Best effort: an unreadable file yields an empty document instead of
        # aborting the whole batch.
        return "", False

    pages = []
    # Close the underlying file once all pages are read; otherwise a large
    # batch keeps one open handle per processed document.
    with img:
        for page in ImageSequence.Iterator(img):
            page = page.convert("RGB")
            longest = max(page.size)
            if longest > resize_max:
                scale = resize_max / longest
                # Clamp to 1 px so an extreme aspect ratio cannot round a
                # side down to 0, which resize() rejects.
                new_size = (
                    max(1, int(page.width * scale)),
                    max(1, int(page.height * scale)),
                )
                page = page.resize(new_size, Image.LANCZOS)
            if preprocess:
                page = ImageOps.grayscale(page)
            try:
                txt = pytesseract.image_to_string(page, config="--psm 3")
            except Exception:
                # Retry once with tesseract's default configuration.
                txt = pytesseract.image_to_string(page)
            pages.append(txt.strip())
    return "\n\n".join(t for t in pages if t), True

# ---------- collect rows from a directory of label subfolders ----------
def rows_from_directory(root_dir, exts=(".tif", ".tiff"), save_texts=False, texts_out_dir="/kaggle/working/texts"):
    """OCR every image under ``root_dir/<label>/`` and return one row per file.

    Args:
        root_dir: Directory whose immediate subfolders are the class labels.
        exts: File suffix(es) to process, matched case-insensitively. A
            single string is treated as one suffix.
        save_texts: When true, also write each document's OCR text to a
            ``.txt`` file for inspection.
        texts_out_dir: Root folder for those ``.txt`` files. Each file is
            written to ``<texts_out_dir>/<root name>/<label>/<relative path>``
            so that equally named images in different labels or splits do
            not overwrite each other.

    Returns:
        A list of dicts with keys ``filepath``, ``label``, ``filetype``,
        ``text`` and ``ocr_used``.

    Raises:
        FileNotFoundError: ``root_dir`` is not an existing directory.
        SystemExit: ``root_dir`` contains no label subfolders.
    """
    root = Path(root_dir)
    if not root.is_dir():
        raise FileNotFoundError(f"Directory not found: {root_dir}")

    label_dirs = sorted(d for d in root.iterdir() if d.is_dir())
    if not label_dirs:
        raise SystemExit(f"No label subfolders found under {root_dir}. Each class must be a subfolder.")

    # A bare string would otherwise turn the membership test below into a
    # substring check (matching e.g. files with no suffix at all).
    if isinstance(exts, str):
        exts = (exts,)
    wanted = {e.lower() for e in exts}

    texts_root = Path(texts_out_dir)
    if save_texts:
        texts_root.mkdir(parents=True, exist_ok=True)

    rows = []
    for lab in label_dirs:
        tiffs = sorted(p for p in lab.rglob("*") if p.is_file() and p.suffix.lower() in wanted)
        for f in tqdm(tiffs, desc=f"Processing {lab.name}", unit="file"):
            text, ocr_used = ocr_tiff(f)
            rows.append({
                "filepath": str(f.resolve()),
                "label": lab.name,
                "filetype": f.suffix.lower().lstrip("."),
                "text": text,
                "ocr_used": bool(ocr_used)
            })
            if save_texts:
                # Mirror the source layout; a flat "<stem>.txt" name collides
                # whenever two labels (or splits) contain the same file name.
                outname = texts_root / root.name / lab.name / f.relative_to(lab).with_suffix(".txt")
                try:
                    outname.parent.mkdir(parents=True, exist_ok=True)
                    outname.write_text(text or "", encoding="utf-8")
                except OSError as err:
                    # The dump is only a debugging aid: report and carry on.
                    print(f"Warning: could not write {outname}: {err}")
    return rows

# ---------- MAIN ----------
# Interactive driver: OCR the TRAIN / VAL / TEST directory trees, derive a
# validation split from TRAIN when no VAL rows are available, and write
# train.csv / val.csv / test.csv plus a combined manifest.csv.
print("Provide full paths to your TRAIN / VAL / TEST directories (each contains label subfolders).")
train_dir = input("TRAIN directory : ").strip()
val_dir   = input("VAL directory : ").strip()
test_dir  = input("TEST  directory : ").strip()
save_texts_ans = input("Save per-document .txt files for inspection? (y/N): ").strip().lower()
save_texts = save_texts_ans == "y"

if not train_dir:
    raise SystemExit("TRAIN directory is required. Provide the full path (e.g. /kaggle/input/your-dataset/train).")

# Process TRAIN
print("\nProcessing TRAIN directory:", train_dir)
rows_train = rows_from_directory(train_dir, save_texts=save_texts)
for r in rows_train:
    r["_source_split"] = "train"

# Process VAL if given; otherwise it is created from the TRAIN rows below
rows_val = []
if val_dir:
    print("\nProcessing VAL directory:", val_dir)
    rows_val = rows_from_directory(val_dir, save_texts=save_texts)
    for r in rows_val:
        r["_source_split"] = "val"
else:
    print("\nNo VAL directory provided — will create stratified 10% validation from TRAIN after OCR.")

# Process TEST if given
rows_test = []
if test_dir:
    print("\nProcessing TEST directory:", test_dir)
    rows_test = rows_from_directory(test_dir, save_texts=save_texts)
    for r in rows_test:
        r["_source_split"] = "test"

# Combine and possibly split
df_train = pd.DataFrame(rows_train)
df_val = pd.DataFrame(rows_val) if len(rows_val) else pd.DataFrame(columns=df_train.columns)
df_test = pd.DataFrame(rows_test) if len(rows_test) else pd.DataFrame(columns=df_train.columns)

# TRAIN rows removed before the split because OCR produced no text. They are
# kept here so the manifest still lists every processed file.
df_dropped = pd.DataFrame(columns=df_train.columns)

# If there are no val rows, create a stratified split from df_train
if df_val.empty:
    if df_train.empty:
        # No rows means no "text" column either; indexing it would raise KeyError.
        print("Warning: no TRAIN files were found; val split skipped.")
    else:
        has_text = df_train["text"].str.strip() != ""
        df_nonempty = df_train[has_text].copy()
        if df_nonempty.empty:
            print("Warning: OCR produced empty text for all TRAIN files; val split skipped.")
        else:
            # we want 10% of the non-empty train rows as validation
            split = None
            try:
                split = train_test_split(df_nonempty, test_size=0.10, stratify=df_nonempty["label"], random_state=42)
            except ValueError:
                # fallback to random split without stratify if some classes have too few examples
                try:
                    split = train_test_split(df_nonempty, test_size=0.10, random_state=42)
                except ValueError as err:
                    # e.g. a single usable row: nothing sensible to hold out.
                    print(f"Warning: could not create a validation split ({err}); val split skipped.")
            if split is not None:
                train_df, val_df = split
                df_dropped = df_train[~has_text].reset_index(drop=True)
                df_train = train_df.reset_index(drop=True)
                df_val = val_df.reset_index(drop=True)

# Save CSVs to /kaggle/working
train_out = "/kaggle/working/train.csv"
val_out   = "/kaggle/working/val.csv"
test_out  = "/kaggle/working/test.csv"
manifest_out = "/kaggle/working/manifest.csv"
Path(manifest_out).parent.mkdir(parents=True, exist_ok=True)

# Drop the helper column from every frame, including empty ones; otherwise an
# empty val/test frame carries it into the manifest as an all-NaN column.
df_train = df_train.drop(columns=["_source_split"], errors="ignore")
df_val = df_val.drop(columns=["_source_split"], errors="ignore")
df_test = df_test.drop(columns=["_source_split"], errors="ignore")
df_dropped = df_dropped.drop(columns=["_source_split"], errors="ignore")

if not df_train.empty:
    df_train.to_csv(train_out, index=False, encoding="utf-8")
    print(f"\nSaved: {train_out}  rows: {df_train.shape[0]}")
    display(FileLink(train_out))
else:
    print("\nNo train rows to save.")

if not df_val.empty:
    df_val.to_csv(val_out, index=False, encoding="utf-8")
    print(f"Saved: {val_out}  rows: {df_val.shape[0]}")
    display(FileLink(val_out))
else:
    print("No val rows to save.")

if not df_test.empty:
    df_test.to_csv(test_out, index=False, encoding="utf-8")
    print(f"Saved: {test_out}  rows: {df_test.shape[0]}")
    display(FileLink(test_out))
else:
    print("No test rows to save (test dir was not provided).")

# Combined manifest (all rows, including TRAIN rows dropped for empty OCR text)
manifest_parts = [
    frame.assign(_source=tag)
    for tag, frame in (
        ("train", df_train),
        ("val", df_val),
        ("test", df_test),
        ("train_dropped_empty_text", df_dropped),
    )
    if not frame.empty
]
if manifest_parts:
    df_all = pd.concat(manifest_parts, ignore_index=True, sort=False)
else:
    df_all = pd.DataFrame(columns=list(df_train.columns) + ["_source"])
df_all.to_csv(manifest_out, index=False, encoding="utf-8")
print(f"\nSaved manifest: {manifest_out}  total rows: {df_all.shape[0]}")
display(FileLink(manifest_out))

print("\nDone — CSVs are in /kaggle/working/.")


In [None]:
# Kaggle-ready: generate ner_train.csv & ner_val.csv from provided train.csv & val.csv
# Paste into a Kaggle notebook cell and run.

!pip install -q spacy pandas scikit-learn tqdm
!python -m spacy download en_core_web_sm >/dev/null 2>&1

import re
from pathlib import Path
from tqdm.auto import tqdm
import pandas as pd
import spacy

# --------------- CONFIG ---------------
TRAIN_CSV = "/kaggle/working/train.csv"   # input; needs a 'text' column
VAL_CSV   = "/kaggle/working/val.csv"     # input; needs a 'text' column
OUT_TRAIN = "ner_train.csv"               # token-level output for TRAIN
OUT_VAL   = "ner_val.csv"                 # token-level output for VAL
SPACY_MODEL = "en_core_web_sm"
BATCH_SIZE = 16                           # batch size handed to nlp.pipe
# --------------------------------------

# Fail fast if either input file is missing (TRAIN is checked first).
for _csv_path, _csv_name in ((TRAIN_CSV, "train.csv"), (VAL_CSV, "val.csv")):
    if not Path(_csv_path).exists():
        raise SystemExit(f"{_csv_path} not found. Upload {_csv_name} to the working directory.")

df_train = pd.read_csv(TRAIN_CSV)
df_val   = pd.read_csv(VAL_CSV)

# Both frames must expose the column that gets annotated.
for _frame, _csv_name in ((df_train, "train.csv"), (df_val, "val.csv")):
    if "text" not in _frame.columns:
        raise SystemExit(f"{_csv_name} must contain a 'text' column.")

# Load the spaCy pipeline; add a sentencizer component unless one is present.
nlp = spacy.load(SPACY_MODEL)
_has_sentencizer = "sentencizer" in nlp.pipe_names
if not _has_sentencizer:
    nlp.add_pipe("sentencizer")

# Regex rules that supplement spaCy NER for common document fields.
email_re = re.compile(
    r"[A-Za-z0-9+_.-]+@[A-Za-z0-9.-]+"
)
phone_re = re.compile(
    r"(\+?\d{1,3}[-\s]?)?(\d{10}|\d{3}[-\s]\d{3}[-\s]\d{4})"
)
invoice_re = re.compile(
    r"(INV[-_\s]?\d+|Invoice\s*(No|#|Number)[:\s]*[A-Za-z0-9\-_/]+)",
    re.IGNORECASE,
)
money_re = re.compile(
    r"([₹$€£]\s?\d[\d,]*(?:\.\d{1,2})?)"
)

def token_indices_from_span(start, end, tokens):
    """Collect positions in ``tokens`` whose text overlaps chars [start, end).

    Each token must expose ``idx`` (its start offset) and ``text``. The scan
    stops at the first token that begins at or after ``end``, so ``tokens``
    is expected to be ordered by offset.
    """
    overlapping = []
    for position, token in enumerate(tokens):
        token_start = token.idx
        token_end = token_start + len(token.text)
        if token_end > start:
            # Token reaches into the span unless it starts past the end.
            if token_start >= end:
                break
            overlapping.append(position)
    return overlapping

def _tag_span(labels, idxs, name):
    """Write B-/I- tags for entity ``name`` over token positions ``idxs``."""
    if not idxs:
        return
    labels[idxs[0]] = f"B-{name}"
    for j in idxs[1:]:
        labels[j] = f"I-{name}"


def _repair_bio(labels):
    """Turn every orphan ``I-X`` (not preceded by ``B-X``/``I-X``) into ``B-X``."""
    prev_type = None
    for i, lab in enumerate(labels):
        if lab == "O":
            prev_type = None
            continue
        prefix, _, ent_type = lab.partition("-")
        if prefix == "I" and ent_type != prev_type:
            labels[i] = f"B-{ent_type}"
        prev_type = ent_type
    return labels


def doc_to_bio_rows(doc, sent_start_id=0):
    """Convert a spaCy Doc to BIO token rows, with regex matches overriding NER.

    Args:
        doc: Parsed spaCy ``Doc`` with sentence boundaries set.
        sent_start_id: Sentence id assigned to the first sentence of ``doc``.

    Returns:
        ``(rows, last_sentence_id)``. ``rows`` is a list of
        ``(sentence_id, token_text, label)`` tuples. ``last_sentence_id`` is
        the id of the final sentence, or ``sent_start_id`` itself when the
        doc has no sentences.
    """
    rows = []
    sid = sent_start_id

    # Regex override spans keyed by absolute (start, end) char offsets. If
    # two patterns match the exact same span, the later pattern wins.
    overrides = {}
    for rgx, lab in [(email_re, "EMAIL"), (phone_re, "PHONE"), (invoice_re, "INVOICE_ID"), (money_re, "MONEY")]:
        for m in rgx.finditer(doc.text):
            overrides[(m.start(), m.end())] = lab

    for sent in doc.sents:
        tokens = list(sent)
        labels = ["O"] * len(tokens)

        # spaCy entities first
        for ent in sent.ents:
            _tag_span(labels, token_indices_from_span(ent.start_char, ent.end_char, tokens), ent.label_)

        # Regex overrides next; they may replace spaCy labels.
        for (s, e), lab in overrides.items():
            if e <= sent.start_char or s >= sent.end_char:
                continue
            # Clip the match to this sentence before mapping it to tokens.
            clipped_start = max(s, sent.start_char)
            clipped_end = min(e, sent.end_char)
            _tag_span(labels, token_indices_from_span(clipped_start, clipped_end, tokens), lab)

        # An override that overwrote only part of an entity leaves its tail
        # as "I-X" with no opening "B-X"; fix that so the sequence is valid.
        _repair_bio(labels)

        # append token rows for this sentence
        for tok, lab in zip(tokens, labels):
            rows.append((sid, tok.text, lab))
        sid += 1

    last_sid = sid - 1 if sid > sent_start_id else sent_start_id
    return rows, last_sid

def annotate_texts(texts, start_sid=0):
    """Annotate ``texts`` with the module-level ``nlp`` pipeline.

    Documents are streamed through ``nlp.pipe`` in batches of ``BATCH_SIZE``
    and converted with ``doc_to_bio_rows``; sentence ids continue from one
    document to the next.

    Returns:
        ``(rows, next_sid)``: the concatenated token rows and the sentence
        id following the last one reported (``start_sid`` if ``texts`` is
        empty).
    """
    collected = []
    next_sid = start_sid
    for parsed in nlp.pipe(texts, batch_size=BATCH_SIZE):
        doc_rows, last_used = doc_to_bio_rows(parsed, sent_start_id=next_sid)
        collected += doc_rows
        next_sid = last_used + 1
    return collected, next_sid

# Annotate TRAIN
# Empty text cells come back from read_csv as NaN; fill them first, otherwise
# astype(str) turns each one into the literal word "nan" and it is annotated
# as if it were document text.
train_texts = df_train["text"].fillna("").astype(str).tolist()
print(f"Annotating {len(train_texts)} training documents ...")
train_rows, next_sid = annotate_texts(train_texts, start_sid=0)

# Annotate VAL (start sentence ids after train to keep unique sentence ids across datasets)
# NOTE: next_sid is already the first unused id, so the +1 only leaves a
# one-id gap between the two files.
val_texts = df_val["text"].fillna("").astype(str).tolist()
print(f"Annotating {len(val_texts)} validation documents ...")
val_rows, _ = annotate_texts(val_texts, start_sid=next_sid+1)

# Save CSVs (one row per token)
df_ner_train = pd.DataFrame(train_rows, columns=["sentence_id","word","label"])
df_ner_val   = pd.DataFrame(val_rows, columns=["sentence_id","word","label"])

df_ner_train.to_csv(OUT_TRAIN, index=False)
df_ner_val.to_csv(OUT_VAL, index=False)

print(f"\nSaved {OUT_TRAIN} ({len(df_ner_train)} rows) and {OUT_VAL} ({len(df_ner_val)} rows).")
print("\nSample (ner_train.csv):")
print(df_ner_train.head(30).to_string(index=False))

print("\nDone — open these CSVs in Doccano/LabelStudio to correct labels before training NER for best results.")
