The purpose of this notebook is to evaluate the classifiers we produced in the first part on test splits.

For the classifier with feature selection we take 70 tokens per language pair for SETimes and 167 tokens per language pair for Twitter.

In [2]:
import pandas as pd
# Raise the pandas display limits so every row and the long confusion-matrix
# strings are rendered without truncation.
pd.options.display.max_rows = 200
pd.options.display.max_colwidth = 200

# Hyperparameter-search results, one CSV per corpus.
twitter = pd.read_csv("1_part_TWITTER_hyperparams_optimization.csv")
setimes = pd.read_csv("1_part_SETIMES_hyperparams_optimization.csv")

In [3]:
# Positional indices of the Twitter runs with the highest macro and micro F1.
best_macro, best_micro = (
    twitter[metric].argmax() for metric in ("macroF1", "microF1")
)
twitter.iloc[[best_macro, best_micro]]

Unnamed: 0.1,Unnamed: 0,N,macroF1,microF1,accuracy,cm,dev
15,15,167,0.80441,0.913043,0.913043,[[ 50 3 0 0]\n [ 1 41 0 3]\n [ 1 3 232 0]\n [ 0 15 6 13]],Twitter train
15,15,167,0.80441,0.913043,0.913043,[[ 50 3 0 0]\n [ 1 41 0 3]\n [ 1 3 232 0]\n [ 0 15 6 13]],Twitter train


In the Twitter case above, the best macro F1 and the best micro F1 are achieved by the same configuration (N=167).

In [4]:
# Positional indices of the SETimes runs with the highest macro and micro F1.
best_macro, best_micro = (
    setimes[metric].argmax() for metric in ("macroF1", "microF1")
)
setimes.iloc[[best_macro, best_micro]]

Unnamed: 0.1,Unnamed: 0,N,macroF1,microF1,accuracy,cm,dev
21,21,70,0.60187,0.735829,0.735829,[[1651 815 0 90]\n [ 18 1715 0 823]\n [ 0 13 2060 189]\n [ 0 0 0 0]],SET train
33,33,215,0.591024,0.74288,0.74288,[[1369 1123 0 64]\n [ 4 1986 0 566]\n [ 0 18 2123 121]\n [ 0 0 0 0]],SET train


In the SETimes case the two optima differ (N=70 maximises macro F1, N=215 maximises micro F1); we take N=70, the value with the maximal macro F1.

# Evaluation

In [5]:
from utils import transliterate, is_alpha, load_SET_dataset, load_twitter_dataset, get_N_tokens, LABELS, read_and_split_file

def load_tails(interim_dir="/home/peterr/macocu/taskB/data/interim"):
    """Load the four web-corpus "tail" files into one labelled DataFrame.

    Parameters
    ----------
    interim_dir : str
        Directory holding the tail files. Defaults to the location used
        originally, so existing ``load_tails()`` calls behave as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``text`` and ``labels``; every text read from a file is
        labelled with that file's language code (``bs``, ``me``, ``hr``, ``sr``).
    """
    # `os` is not imported anywhere else in the notebook, so import it here;
    # without it the function raises NameError on a fresh kernel.
    import os

    files = ["bswac_tail_pp", "cnrwac_tail_pp_corrected_2", "hrwac_tail_pp", "srwac_tail_pp"]
    langs = ["bs", "me", "hr", "sr"]

    texts, labels = [], []
    for file, lang in zip(files, langs):
        # read_and_split_file comes from utils; presumably it returns the list
        # of text instances contained in the file -- TODO confirm.
        current_texts = read_and_split_file(os.path.join(interim_dir, file))
        texts.extend(current_texts)
        labels.extend([lang] * len(current_texts))

    return pd.DataFrame(data={"text": texts, "labels": labels})

train = load_tails()
SET = load_SET_dataset().rename(columns={"language": "labels"})

# Rows flagged "train" form the development set; everything else is the test set.
_set_is_train = SET.split == "train"
dev_df = SET.loc[_set_is_train, ["text", "labels"]]
test_df = SET.loc[~_set_is_train, ["text", "labels"]]

def train_clf(N=1000):
    """Fit a LinearSVC on the web-corpus tails using a fixed vocabulary.

    Parameters
    ----------
    N : int
        Passed to ``get_N_tokens`` to obtain the vocabulary for the vectorizer.

    Returns
    -------
    tuple
        ``(clf, vectorizer)``: the fitted classifier and the CountVectorizer
        whose features it was trained on.
    """
    import gc
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.svm import LinearSVC

    gc.collect()
    tails_df = load_tails()

    # Binary, lowercased bag-of-words restricted to the selected tokens.
    vectorizer = CountVectorizer(
        vocabulary=get_N_tokens(N),
        lowercase=True,
        binary=True,
    )
    dense_features = vectorizer.fit_transform(tails_df.text).toarray()

    clf = LinearSVC(dual=False)
    clf.fit(dense_features, tails_df.labels)

    return clf, vectorizer

def eval_clf(clf, eval_df, vectorizer, N=None):
    """Score a fitted classifier on a labelled evaluation DataFrame.

    Parameters
    ----------
    clf
        Fitted estimator exposing ``predict``.
    eval_df : pandas.DataFrame
        Must provide ``text`` and ``labels`` columns.
    vectorizer
        The vectorizer the classifier was trained with.
    N : optional
        Value stored under the ``"N"`` key of the result. When omitted, the
        notebook-level ``N`` is used if it exists (the previous behaviour),
        otherwise ``None`` is recorded instead of raising NameError.

    Returns
    -------
    dict
        Keys ``N``, ``macroF1``, ``microF1``, ``accuracy`` and ``cm``
        (confusion matrix with rows/columns ordered as ``LABELS``).
    """
    from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

    if N is None:
        # Backward compatibility: earlier calls relied on a global N.
        N = globals().get("N")

    # Only transform here: evaluation data must never (re)fit the vectorizer.
    # With a fixed vocabulary this yields the same matrix as before.
    test_vectors = vectorizer.transform(eval_df.text)
    y_true = eval_df.labels
    y_pred = clf.predict(test_vectors.toarray())

    return dict(
        N=N,
        macroF1=f1_score(y_true, y_pred, labels=LABELS, average="macro"),
        microF1=f1_score(y_true, y_pred, labels=LABELS, average="micro"),
        accuracy=accuracy_score(y_true, y_pred),
        cm=confusion_matrix(y_true, y_pred, labels=LABELS),
    )



## SETimes

In [7]:
# N is kept as a notebook-level name on purpose: besides being passed to
# train_clf, it is read by eval_clf to fill the "N" entry of the result dict.
N = 70
clf, vectorizer = train_clf(N=N)
# test_df is expected to be the SETimes non-train split at this point
# (assumes the cells are executed top to bottom).
rezdict = eval_clf(clf, test_df, vectorizer)
print(rezdict)

{'N': 70, 'macroF1': 0.6033297686474621, 'microF1': 0.7356687898089171, 'accuracy': 0.7356687898089171, 'cm': array([[430, 186,   0,  24],
       [  3, 408,   0, 228],
       [  0,   1, 548,  56],
       [  0,   0,   0,   0]])}


## Twitter

In [8]:
# Join each entry of the `tweets` column into a single space-separated string
# stored under `text`.
tw = (
    load_twitter_dataset()
    .rename(columns={"language": "labels"})
    .assign(text=lambda frame: frame.tweets.apply(lambda parts: " ".join(parts)))
)

# Rows flagged "train" form the development set; everything else is the test set.
_tw_is_train = tw.split == "train"
dev_df = tw.loc[_tw_is_train, ["text", "labels"]]
test_df = tw.loc[~_tw_is_train, ["text", "labels"]]

# N is kept as a notebook-level name on purpose: besides being passed to
# train_clf, it is read by eval_clf to fill the "N" entry of the result dict.
N=167
clf, vectorizer = train_clf(N=N)
# test_df now refers to the Twitter non-train split assigned just above.
rezdict = eval_clf(clf, test_df, vectorizer)
print(rezdict)

{'N': 167, 'macroF1': 0.7376482549826837, 'microF1': 0.8902439024390244, 'accuracy': 0.8902439024390244, 'cm': array([[ 33,   3,   0,   0],
       [  1,  25,   2,   2],
       [  0,   2, 156,   0],
       [  1,  11,   5,   5]])}
