In [22]:
from pathlib import Path
import csv, re, math, unicodedata, time, json, random
from collections import defaultdict, Counter
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

DATA_DIR = Path("csv-datasets")
GLOVE_OUT = Path("glove-outcomes")
GLOVE_OUT.mkdir(parents=True, exist_ok=True)
LOWERCASE = True
EMBEDDING_DIM = 50
LEARNING_RATE = 0.05
NUM_EPOCHS = 5
WINDOW_SIZE = 4
XMAX = 100.0
ALPHA = 0.75
RNG_SEED = 42

In [None]:
def nfc(s):
    return unicodedata.normalize("NFC", s)

def collapse_ws(s):
    return re.sub(r"\s+", " ", s).strip()

SPLIT_EN = re.compile(r"(?<=[.!?])\s+")
SPLIT_HI = re.compile(r"(?:।|\?|!)\s+")

def sent_split(text, lang):
    rx = SPLIT_EN if lang == "en" else SPLIT_HI
    return [s for s in rx.split(text) if s.strip()]

from nltk.tokenize import RegexpTokenizer
tok_en = RegexpTokenizer(r"[A-Za-z]+'[A-Za-z]+|[A-Za-z]+|\d+")
tok_hi = RegexpTokenizer(r"[\w]+")

def word_tokens(text, lang):
    return tok_en.tokenize(text) if lang == "en" else tok_hi.tokenize(text)

def normalize_text(text, lowercase=True):
    t = collapse_ws(nfc(text))
    return t.lower() if lowercase else t

In [24]:
def read_csv_labeled(path: Path):
    texts, labels = [], []
    with path.open("r", encoding="utf-8", newline="") as f:
        rdr = csv.DictReader(f)
        for r in rdr:
            texts.append(r["Text"])
            labels.append(r["Categories"])
    return texts, labels

def make_lm_sentences(texts, lang, lowercase=True):
    out = []
    for txt in texts:
        t = normalize_text(txt, lowercase)
        for s in sent_split(t, lang):
            toks = word_tokens(s, lang)
            if toks:
                out.append(toks)
    return out

def build_cooccurrence(sentences, window_size=4):
    vocab_set = set()
    cooc = defaultdict(float)
    token_freq = Counter()
    for words in sentences:
        token_freq.update(words)
        n = len(words)
        for i in range(n):
            wi = words[i]
            vocab_set.add(wi)
            left = max(0, i - window_size)
            right = min(n, i + window_size + 1)
            for j in range(left, right):
                if i == j:
                    continue
                wj = words[j]
                cooc[(wi, wj)] += 1.0 / abs(i - j)
    vocab = sorted(vocab_set)
    return vocab, cooc, token_freq

In [25]:
def f_weight(x):
    return (x / XMAX) ** ALPHA if x < XMAX else 1.0

def init_glove_params(vocab, dim=50, seed=RNG_SEED):
    rng = np.random.default_rng(seed)
    W  = {w: rng.normal(0.0, 0.1, size=dim).astype(np.float64) for w in vocab}
    C  = {w: rng.normal(0.0, 0.1, size=dim).astype(np.float64) for w in vocab}
    bw = {w: 0.0 for w in vocab}
    bc = {w: 0.0 for w in vocab}
    return W, C, bw, bc

def train_glove(cooc, W, C, bw, bc, lr=LEARNING_RATE, epochs=NUM_EPOCHS, seed=RNG_SEED, verbose=True):
    rng = np.random.default_rng(seed)
    pairs = list(cooc.items())
    for ep in range(1, epochs+1):
        rng.shuffle(pairs)
        loss = 0.0
        for (wi, wj), xij in pairs:
            wi_vec = W[wi]; wj_vec = C[wj]
            bi = bw[wi];  bj = bc[wj]

            pred = float(np.dot(wi_vec, wj_vec) + bi + bj)
            diff = pred - math.log(xij)
            ww = f_weight(xij)

            loss += 0.5 * ww * (diff**2)

            g = ww * diff
            grad_wi = g * wj_vec
            grad_wj = g * wi_vec

            W[wi]  -= lr * grad_wi
            C[wj]  -= lr * grad_wj
            bw[wi] -= lr * g
            bc[wj] -= lr * g
        if verbose:
            print(f"Epoch={ep}/{epochs}  Loss={loss:.6f}")
    return W, C, bw, bc

def final_embeddings(W, C):
    keys = list(W.keys())
    emb = np.stack([W[k] + C[k] for k in keys])
    return keys, emb


def save_embeddings(lang, size, vocab, emb):
    out_npz = GLOVE_OUT / f"{lang}_{size}_embeddings.npz"
    out_vocab = GLOVE_OUT / f"{lang}_{size}_vocab.txt"
    np.savez_compressed(out_npz, vocab=np.array(vocab, dtype=object), emb=emb.astype(np.float32))
    with out_vocab.open("w", encoding="utf-8") as f:
        for w in vocab:
            f.write(w + "\n")
    return out_npz, out_vocab

In [26]:
def build_word_lookup(vocab, emb):
    w2i = {w:i for i,w in enumerate(vocab)}
    return w2i, emb

def sentence_embedding(text, lang, w2i, emb, lowercase=True):
    toks = []
    t = normalize_text(text, lowercase)
    for s in sent_split(t, lang):
        toks.extend(word_tokens(s, lang))
    vecs = [emb[w2i[w]] for w in toks if w in w2i]
    if not vecs:
        return np.zeros(emb.shape[1], dtype=np.float64)
    return np.mean(np.vstack(vecs), axis=0)

In [27]:
def run_single_experiment(lang: str, size: str):
    sub = "english" if lang == "en" else "hindi"
    train_csv = DATA_DIR / sub / f"{sub}_{size}.csv"
    test_csv  = DATA_DIR / sub / f"{sub}_test.csv"
    print(f"{sub}_{size} - {sub}_test\n")

    t0 = time.perf_counter()
    # 1) Load & tokenize train
    train_texts, train_labels = read_csv_labeled(train_csv)
    train_sents = make_lm_sentences(train_texts, lang, LOWERCASE)
    t_tok = time.perf_counter()

    # 2) Co-occurrence
    vocab, cooc, _ = build_cooccurrence(train_sents, WINDOW_SIZE)
    t_co = time.perf_counter()

    # 3) Train GloVe
    W, C, bw, bc = init_glove_params(vocab, EMBEDDING_DIM, seed=RNG_SEED)
    W, C, bw, bc = train_glove(cooc, W, C, bw, bc, lr=LEARNING_RATE, epochs=NUM_EPOCHS, seed=RNG_SEED, verbose=True)
    t_tr = time.perf_counter()

    # 4) Final Embeddings + save
    vocab_list, emb = final_embeddings(W, C)
    out_npz, out_vocab = save_embeddings(sub, size, vocab_list, emb)

    # 5) Vectorize train/test
    test_texts, test_labels = read_csv_labeled(test_csv)
    t_feat0 = time.perf_counter()
    w2i, emb_mat = build_word_lookup(vocab_list, emb)
    X_train = np.vstack([sentence_embedding(txt, lang, w2i, emb_mat, LOWERCASE) for txt in train_texts])
    X_test  = np.vstack([sentence_embedding(txt, lang, w2i, emb_mat, LOWERCASE) for txt in test_texts])
    t_feat1 = time.perf_counter()

    # 6) Train classifier
    t_cls0 = time.perf_counter()
    clf = LogisticRegression(solver="saga", max_iter=300)
    clf.fit(X_train, train_labels)
    t_cls1 = time.perf_counter()

    # 7) Evaluate
    y_pred = clf.predict(X_test)
    acc  = accuracy_score(test_labels, y_pred)
    prec = precision_score(test_labels, y_pred, average="macro", zero_division=0)
    rec  = recall_score(test_labels, y_pred, average="macro", zero_division=0)
    f1   = f1_score(test_labels, y_pred, average="macro", zero_division=0)

    timings = {
        "tokenize_sec": t_tok - t0,
        "cooc_sec": t_co - t_tok,
        "train_sec": t_tr - t_co,
        "feature_sec": t_feat1 - t_feat0,
        "clf_train_sec": t_cls1 - t_cls0,
        "total_sec": t_cls1 - t0
    }
    metrics = {"Acc": float(acc), "Prec": float(prec), "Recall": float(rec), "F1": float(f1)}

    print(f"[{sub}_{size}] Metrics: {metrics}")
    print(f"[{sub}_{size}] Timings (s): {timings}")

    # 8) Save run result JSON
    res_json = GLOVE_OUT / f"results_{sub}_{size}.json"
    with res_json.open("w", encoding="utf-8") as f:
        json.dump({
            "lang": sub,
            "size": size,
            "metrics": metrics,
            "timings": timings
        }, f, ensure_ascii=False, indent=2)
    print(f"Saved: {res_json}")
    return {"lang": sub, "size": size, "metrics": metrics, "timings": timings}

In [28]:
random.seed(RNG_SEED)
np.random.seed(RNG_SEED)

print("Glove + Logistic Regression")
todo = [
    ("en","2500"), ("en","15000"), ("en","30000"),
    ("hi","2500"), ("hi","15000"), ("hi","30000"),
]
all_results = []
for lang, size in todo:
    res = run_single_experiment(lang, size)
    print()
    all_results.append(res)

Glove + Logistic Regression
english_2500 - english_test

Epoch=1/5  Loss=41307.345871
Epoch=2/5  Loss=26360.474205
Epoch=3/5  Loss=21333.858986
Epoch=4/5  Loss=18789.498185
Epoch=5/5  Loss=17096.100725
[english_2500] Metrics: {'Acc': 0.2451853050655446, 'Prec': 0.0013579112638496928, 'Recall': 0.0032570167643570704, 'F1': 0.0018813081323930624}
[english_2500] Timings (s): {'tokenize_sec': 4.838905300013721, 'cooc_sec': 3.3478284999728203, 'train_sec': 79.30883130000439, 'feature_sec': 7.048829399980605, 'clf_train_sec': 18.19417229993269, 'total_sec': 113.49654329998884}
Saved: glove-outcomes\results_english_2500.json

english_15000 - english_test

Epoch=1/5  Loss=263028.978827
Epoch=2/5  Loss=167110.927128
Epoch=3/5  Loss=141259.752241
Epoch=4/5  Loss=124376.689225
Epoch=5/5  Loss=112610.020840
[english_15000] Metrics: {'Acc': 0.28855801909694123, 'Prec': 0.005755672644408116, 'Recall': 0.005124240164740214, 'F1': 0.004200307697232256}
[english_15000] Timings (s): {'tokenize_sec': 6.3



[hindi_2500] Metrics: {'Acc': 0.2243, 'Prec': 0.034512738860322904, 'Recall': 0.030723259457458976, 'F1': 0.023913177034079785}
[hindi_2500] Timings (s): {'tokenize_sec': 0.20848449994809926, 'cooc_sec': 0.8926194000523537, 'train_sec': 6.580366399954073, 'feature_sec': 7.547845099936239, 'clf_train_sec': 7.504426900064573, 'total_sec': 23.041300100041553}
Saved: glove-outcomes\results_hindi_2500.json

hindi_15000 - hindi_test

Epoch=1/5  Loss=25276.139193
Epoch=2/5  Loss=13818.812897
Epoch=3/5  Loss=11608.385477
Epoch=4/5  Loss=9915.649718
Epoch=5/5  Loss=8479.549426




[hindi_15000] Metrics: {'Acc': 0.34103333333333335, 'Prec': 0.05475782613061436, 'Recall': 0.055935698148649196, 'F1': 0.04850995508412934}
[hindi_15000] Timings (s): {'tokenize_sec': 1.1657862999709323, 'cooc_sec': 4.970735699986108, 'train_sec': 6.601880800095387, 'feature_sec': 9.478118999977596, 'clf_train_sec': 43.84306139999535, 'total_sec': 66.32599689997733}
Saved: glove-outcomes\results_hindi_15000.json

hindi_30000 - hindi_test

Epoch=1/5  Loss=36929.438454
Epoch=2/5  Loss=20866.370931
Epoch=3/5  Loss=17562.649332
Epoch=4/5  Loss=14790.620442
Epoch=5/5  Loss=12521.217120
[hindi_30000] Metrics: {'Acc': 0.41126666666666667, 'Prec': 0.11377371231771151, 'Recall': 0.08375792572382872, 'F1': 0.0845313043199627}
[hindi_30000] Timings (s): {'tokenize_sec': 2.177713099983521, 'cooc_sec': 9.093387400032952, 'train_sec': 6.892722499906085, 'feature_sec': 11.295224100002088, 'clf_train_sec': 83.61398270004429, 'total_sec': 113.30818529997487}
Saved: glove-outcomes\results_hindi_30000.js