The purpose of this notebook is to determine how many tokens (vocabulary features) are needed for optimal language classification.

In [1]:
import os
import logging
import pandas as pd
import numpy as np
import gc
# Timestamped INFO-level logging so progress of long-running cells is visible.
logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO)


# Root of the data tree. Set the MACOCU_DATA_DIR environment variable to run
# the notebook on another machine; the default is the original location, so
# the three directories below are unchanged when the variable is not set.
data_dir = os.environ.get("MACOCU_DATA_DIR", "/home/peterr/macocu/taskB/data")
raw_dir = os.path.join(data_dir, "raw")
interim_dir = os.path.join(data_dir, "interim")
final_dir = os.path.join(data_dir, "final")

In [2]:
import parse
from typing import List


def get_N_tokens(N=5000, path="/home/peterr/macocu/taskB/task4/toy_tokens.csv") -> set:
    """Select the tokens that best discriminate between the four web corpora.

    The CSV at ``path`` is read with its first column as the index (the
    tokens). It must contain the raw-count columns ``hrwac_head_pp``,
    ``srwac_head_pp``, ``cnrwac_head_pp`` and ``bswac_head_pp``. Counts are
    turned into per-million relative frequencies; for each of the 12 ordered
    pairs of corpora the add-one-smoothed frequency ratio is computed, and the
    ``N`` tokens with the highest ratio are kept. The union over all pairs is
    returned.

    Args:
        N: number of top-ranked tokens kept per ordered corpus pair.
        path: location of the token-count CSV.

    Returns:
        Set of selected tokens (at most ``12 * N`` elements). Index entries
        that pandas parsed as missing values (NaN) are excluded.

    Raises:
        KeyError: if one of the four expected count columns is absent.
    """
    from itertools import permutations

    corpus_columns = (
        "hrwac_head_pp",
        "srwac_head_pp",
        "cnrwac_head_pp",
        "bswac_head_pp",
    )
    # Additive smoothing keeps the ratio finite when a token is absent from
    # the denominator corpus.
    smoothing = 1

    df = pd.read_csv(path, index_col=0)

    # Normalise raw counts to occurrences per million tokens so corpora of
    # different sizes are comparable.
    per_million = {
        column: df[column] * 1e6 / df[column].sum() for column in corpus_columns
    }

    important_features = set()
    for numerator, denominator in permutations(corpus_columns, 2):
        ratio = (per_million[numerator] + smoothing) / (per_million[denominator] + smoothing)
        top_tokens = ratio.sort_values(ascending=False).index[:N].values
        important_features.update(top_tokens)

    # Tokens such as "nan"/"null" are parsed by read_csv as NaN. NaN never
    # compares equal to itself, so filter by predicate instead of relying on
    # set.remove(np.nan) finding the identical object.
    return {token for token in important_features if not pd.isna(token)}


def read_and_split_file(path: str) -> List[str]:
    """Read a text file and split it into documents separated by blank lines.

    Every line consisting of just a newline ends the current document. Within
    a document, each line is split on single spaces and every word rejected by
    the project helper ``utils.is_alpha`` is replaced by a space (the original
    inline comment says this keeps only lowercase alphabetical words; the
    helper is not visible here, so confirm against ``utils``).

    A final document that is not followed by a blank line is also returned,
    provided it contains any non-whitespace text.

    Args:
        path: file to read.

    Returns:
        One whitespace-joined string per document, in file order.
    """
    # Imported once per call rather than once per line.
    from utils import is_alpha

    texts = []
    pieces = []
    with open(path, "r") as f:
        for line in f:
            if line == "\n":
                # Document boundary: flush what has been collected so far.
                texts.append("".join(pieces))
                pieces = []
                continue
            # The newline becomes a trailing space, which also separates this
            # line's last word from the first word of the next line.
            words = line.replace("\n", " ").split(" ")
            pieces.append(" ".join(w if is_alpha(w) else " " for w in words))

    # Flush the last document when the file does not end with a blank line.
    trailing = "".join(pieces)
    if trailing.strip():
        texts.append(trailing)
    return texts


# Build the training frame: one row per document read from the four
# "*_tail_pp" files in interim_dir, labelled with the language of its file.
texts, labels = list(), list()

# File names (no extension) inside interim_dir; order must match `langs` below
# because the two lists are paired positionally with zip().
files = [
    "bswac_tail_pp",
    "cnrwac_tail_pp",
    "hrwac_tail_pp",
    "srwac_tail_pp"]

# Label assigned to every document of the file at the same position in `files`
# (note that the "cnrwac" file is labelled "me").
langs = ["bs", "me", "hr", "sr"]

for file, lang in zip(files, langs):
    full_path = os.path.join(interim_dir, file)
    current_texts = read_and_split_file(full_path)
    len_cur_texts = len(current_texts)
    texts.extend(current_texts)
    # Repeat the label once per document so texts and labels stay aligned.
    labels.extend([lang]*len_cur_texts)

train = pd.DataFrame(data={"text": texts, "labels": labels})

# The lists are now held by `train`; drop the notebook-level names.
del texts, labels

# Build the evaluation frame from the train/test/dev ".fasttxt" files in
# final_dir; all three splits are concatenated into a single list of lines.
SETimes = list()
for split in ["train", "test", "dev"]:
    with open(os.path.join(final_dir, f"{split}.fasttxt"), "r") as f:
        lines = f.readlines()
        SETimes.extend(lines)

# Each line is expected to look like "__label__<lang> <text>".
p = parse.compile("__label__{lang} {text}")
# NOTE: `langs` is rebound here; it previously held the training label list.
langs = list()
texts = list()

for line in SETimes:
    results = p.parse(line)
    if not results:
        # Lines that do not match the pattern are logged and skipped.
        logging.error(f"Error parsing line {line}")
        continue
    langs.append(results["lang"])
    texts.append(results["text"])

eval_df = pd.DataFrame(data={"text": texts, "labels": langs})

# Drop the temporaries from the notebook namespace.
# NOTE(review): `line` is only bound if SETimes was non-empty, so this `del`
# raises NameError when all three files are empty.
del texts, langs, SETimes, line, lines, p


def get_stats(N: int):
    """Train and evaluate a Gaussian Naive Bayes classifier for one vocabulary size.

    The vocabulary is ``get_N_tokens(N)``. Documents from the notebook-level
    ``train`` frame are vectorised as binary bag-of-words features and used to
    fit the classifier, which is then scored on the notebook-level ``eval_df``
    frame.

    Args:
        N: number of top tokens per corpus pair passed to ``get_N_tokens``.

    Returns:
        dict with keys:
            ``N``: the value passed in.
            ``microF1``, ``macroF1``, ``accuracy``: evaluation scores.
            ``cm``: confusion matrix with rows/columns ordered hr, bs, sr, me.
            ``overall_time``: seconds from start until the metrics are computed.
            ``vectorizer_fitting``: seconds spent building the vocabulary and
                vectorising both frames.
            ``training_time``: seconds spent in ``fit`` (including densifying
                the training matrix).
            ``predicting_time``: seconds spent in ``predict`` (including
                densifying the evaluation matrix).
    """
    import gc
    import time
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
    from sklearn.naive_bayes import GaussianNB

    # Fixed label order so metrics and the confusion matrix are comparable
    # across runs.
    LABELS = ["hr", "bs", "sr", "me"]

    gc.collect()
    start = time.time()
    vectorizer = CountVectorizer(
        vocabulary=get_N_tokens(N), lowercase=True, binary=True)

    train_vectors = vectorizer.fit_transform(train.text)
    train_labels = train.labels

    # The vocabulary is fixed in the constructor, so the evaluation set only
    # needs transform(); the vectoriser must never be (re)fitted on it.
    test_vectors = vectorizer.transform(eval_df.text)
    y_true = eval_df.labels

    clf = GaussianNB()
    train_start = time.time()
    # GaussianNB needs dense input, hence toarray().
    clf.fit(train_vectors.toarray(), train_labels)
    predict_start = time.time()
    y_pred = clf.predict(test_vectors.toarray())
    predict_end = time.time()

    macro = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    micro = f1_score(y_true, y_pred, labels=LABELS, average="micro")
    acc = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=LABELS)
    return {
        "N": N,
        "microF1": micro,
        "macroF1": macro,
        "accuracy": acc,
        "overall_time": time.time() - start,
        "cm": cm,
        "vectorizer_fitting": train_start - start,
        "training_time": predict_start - train_start,
        "predicting_time": predict_end - predict_start
    }


In [4]:
# Sweep the vocabulary size over 15 log-spaced values between 1e2 and 1e5 and
# collect one stats dict per successful run.
results = []
# `np.int` was removed in NumPy 1.24; it was only an alias for the builtin
# `int`, so this yields the same dtype.
Ns = np.logspace(2, 5, 15, dtype=int)
for N in Ns:
    try:
        gc.collect()
        logging.info(f"{N=}")
        cur_result = get_stats(N)
        logging.info("Done!")
        results.append(cur_result)
    except Exception as e:
        # Keep sweeping when one size fails (e.g. out of memory), but log the
        # full traceback so the failure can be diagnosed.
        logging.exception(f"For {N=} got Exception: {e}")

2022-01-03 13:01:19,700 - N=100
2022-01-03 13:01:20,650 - NumExpr defaulting to 8 threads.
