In [1]:
from utils import transliterate, is_alpha, load_SET_dataset, load_twitter_dataset, get_N_tokens, LABELS, read_and_split_file
import pandas as pd

def load_tails():
    interim_dir = "/home/peterr/macocu/taskB/data/interim"
    texts, labels = list(), list()
    files = ["bswac_tail_pp", "cnrwac_tail_pp_corrected_2", "hrwac_tail_pp", "srwac_tail_pp"]
    langs = ["bs", "me", "hr", "sr"]

    for file, lang in zip(files, langs):
        full_path = os.path.join(interim_dir, file)
        current_texts = read_and_split_file(full_path)
        len_cur_texts = len(current_texts)
        texts.extend(current_texts)
        labels.extend([lang] * len_cur_texts)

    return pd.DataFrame(data={"text": texts, "labels": labels})

train = load_tails()
SET = load_SET_dataset().rename(columns={"language":"labels"})

dev_df = SET.loc[SET.split == "train", ["text", "labels"]]
test_df = SET.loc[SET.split != "train", ["text", "labels"]]



In [5]:
def train_clf(N, train):
    from sklearn.svm import LinearSVC
    import gc
    from sklearn.feature_extraction.text import CountVectorizer
    gc.collect()
    clf = LinearSVC(dual=False)
    vectorizer = CountVectorizer(lowercase=True, binary=True, ngram_range=(3,3), analyzer="char", max_features=N)
    train_vectors = vectorizer.fit_transform(train.text)
    train_labels = train.labels
    clf.fit(train_vectors.toarray(), train_labels)

    return clf, vectorizer

def eval_clf(clf, eval_df, vectorizer):
    from sklearn.metrics import (
        f1_score,
        ConfusionMatrixDisplay,
        confusion_matrix,
        accuracy_score,
    )
    test_vectors = vectorizer.fit_transform(eval_df.text)
    y_true = eval_df.labels
    y_pred = clf.predict(test_vectors.toarray())
    macro = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    micro = f1_score(y_true, y_pred, labels=LABELS, average="micro")
    acc = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=LABELS)
    returndict = dict(
        N=N,
        macroF1=macro,
        microF1=micro,
        accuracy=acc,
        cm = cm,

    )
    return returndict
import numpy as np
#results = list()
for N in [15000, 20000]:
    try:
        clf, vectorizer = train_clf(N, train)
        rezdict = eval_clf(clf, dev_df, vectorizer)
        rezdict["dev"] = "SET train"
        results.append(rezdict)
    except MemoryError:
        print("Failed at", N, ",  quitting.")
        break
    finally:
        pd.DataFrame(data=results).to_csv("2_part_SETIMES_hyperparams_optimization_3gram.csv")

ValueError: X has 20000 features per sample; expecting 19293

In [6]:
import pandas as pd

print(pd.read_csv("2_part_SETIMES_hyperparams_optimization_3gram.csv").to_markdown())

|    |   Unnamed: 0 |     N |    macroF1 |   microF1 |   accuracy | cm                      | dev       |
|---:|-------------:|------:|-----------:|----------:|-----------:|:------------------------|:----------|
|  0 |            0 |   100 | 0.00183192 | 0.0012205 |  0.0012205 | [[   4    1    3 2548]  | SET train |
|    |              |       |            |           |            |  [   1    2    3 2550]  |           |
|    |              |       |            |           |            |  [   1    0    3 2258]  |           |
|    |              |       |            |           |            |  [   0    0    0    0]] |           |
|  1 |            1 |   500 | 0.0799999  | 0.0735015 |  0.0735015 | [[ 413   20  122 2001]  | SET train |
|    |              |       |            |           |            |  [ 479   24   95 1958]  |           |
|    |              |       |            |           |            |  [ 301    8  105 1848]  |           |
|    |              |       |            |    

In [4]:
results

[{'N': 100,
  'macroF1': 0.0018319197539269265,
  'microF1': 0.0012205044751830757,
  'accuracy': 0.0012205044751830757,
  'cm': array([[   4,    1,    3, 2548],
         [   1,    2,    3, 2550],
         [   1,    0,    3, 2258],
         [   0,    0,    0,    0]]),
  'dev': 'SET train'},
 {'N': 500,
  'macroF1': 0.07999991948317281,
  'microF1': 0.0735014917276919,
  'accuracy': 0.0735014917276919,
  'cm': array([[ 413,   20,  122, 2001],
         [ 479,   24,   95, 1958],
         [ 301,    8,  105, 1848],
         [   0,    0,    0,    0]]),
  'dev': 'SET train'},
 {'N': 1000,
  'macroF1': 0.1604714433193727,
  'microF1': 0.19066992134526717,
  'accuracy': 0.19066992134526717,
  'cm': array([[ 896,   66,  507, 1087],
         [1138,   99,  354,  965],
         [ 702,   81,  411, 1068],
         [   0,    0,    0,    0]]),
  'dev': 'SET train'},
 {'N': 5000,
  'macroF1': 0.22397422750275797,
  'microF1': 0.3041768375372932,
  'accuracy': 0.3041768375372932,
  'cm': array([[ 634,  4