# Pars prima: SETimes dataset

Feature selection dataset - web data (as was used until now)


Training dataset - web data (as was used until now)


Hyperparameter selection dataset - train split


Evaluation dataset - dev+test split; log micro- and macro-F1 and confusion matrices


In [17]:
from utils import transliterate, is_alpha, load_SET_dataset, load_twitter_dataset, get_N_tokens, LABELS, read_and_split_file
import pandas as pd
# Token count set at cell level. The hyperparameter sweep further down
# rebinds this same name as its loop variable (`for N in Ns`).
N = 5000

def load_tails():
    """Load the four web-corpus "tail" files into one labelled DataFrame.

    Every file listed below is read with ``read_and_split_file`` and each
    item it returns is tagged with the language code paired with that file.

    Returns:
        pandas.DataFrame with two columns: ``text`` (the items read from the
        files, in file order) and ``labels`` (the language code of the file
        each item came from).
    """
    # Imported locally because the notebook's import cell does not bring
    # ``os`` into scope; without it ``os.path.join`` raises NameError on a
    # fresh kernel.
    import os

    interim_dir = "/home/peterr/macocu/taskB/data/interim"
    # File name -> language label; insertion order fixes the row order.
    sources = {
        "bswac_tail_pp": "bs",
        "cnrwac_tail_pp_corrected_2": "me",
        "hrwac_tail_pp": "hr",
        "srwac_tail_pp": "sr",
    }

    texts, labels = [], []
    for file_name, lang in sources.items():
        current_texts = read_and_split_file(os.path.join(interim_dir, file_name))
        texts.extend(current_texts)
        # One label per item read from this file.
        labels.extend([lang] * len(current_texts))

    return pd.DataFrame(data={"text": texts, "labels": labels})

# Web-data training frame and a 1000-token list, built once at cell level.
train = load_tails()
tokens = get_N_tokens(N=1000)

# SETimes data, with its "language" column renamed to match the "labels"
# column name used by the training frame.
SET = load_SET_dataset().rename(columns={"language": "labels"})

# Rows of the SETimes "train" split are used for hyperparameter selection;
# every other row is kept for the final evaluation.
dev_df = SET.loc[SET["split"].eq("train"), ["text", "labels"]]
test_df = SET.loc[SET["split"].ne("train"), ["text", "labels"]]


In [21]:
def train_clf(N=1000):
    """Train a linear SVM on the web "tails" data with an N-token vocabulary.

    Args:
        N: value passed to ``get_N_tokens``; the tokens it returns become the
            fixed vocabulary of the vectorizer.

    Returns:
        Tuple ``(clf, vectorizer)``: the fitted ``LinearSVC`` and the
        ``CountVectorizer`` (binary, lowercasing, fixed vocabulary) that
        produced its training features.
    """
    import gc

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.svm import LinearSVC

    train = load_tails()
    # Free whatever earlier calls left behind before building the features.
    gc.collect()

    # The vocabulary is fetched once; the original fetched it twice and
    # discarded the first result.
    vocabulary = get_N_tokens(N)
    vectorizer = CountVectorizer(vocabulary=vocabulary, lowercase=True, binary=True)
    train_vectors = vectorizer.fit_transform(train.text)

    clf = LinearSVC(dual=False)
    # LinearSVC accepts the sparse matrix directly. Converting it to a dense
    # array allocates n_documents x N cells, which is prohibitive for the
    # larger vocabulary sizes.
    clf.fit(train_vectors, train.labels)

    return clf, vectorizer

def eval_clf(clf, eval_df, vectorizer, N=None):
    """Score a fitted classifier on a labelled DataFrame.

    Args:
        clf: fitted classifier exposing ``predict``.
        eval_df: DataFrame with a ``text`` column and a ``labels`` column.
        vectorizer: the vectorizer that produced the classifier's training
            features.
        N: vocabulary size to record in the result. When omitted, the size
            of ``vectorizer.vocabulary_`` is recorded, so the value no longer
            depends on a module-level variable named ``N``.

    Returns:
        dict with keys ``N``, ``macroF1``, ``microF1``, ``accuracy`` and
        ``cm`` (confusion matrix with rows/columns ordered as ``LABELS``).
    """
    from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

    # Only transform: evaluation data must never refit the vectorizer.
    # The sparse result is passed straight to ``predict``; no dense copy
    # is needed.
    test_vectors = vectorizer.transform(eval_df.text)
    if N is None:
        N = len(vectorizer.vocabulary_)

    y_true = eval_df.labels
    y_pred = clf.predict(test_vectors)

    return dict(
        N=N,
        macroF1=f1_score(y_true, y_pred, labels=LABELS, average="macro"),
        microF1=f1_score(y_true, y_pred, labels=LABELS, average="micro"),
        accuracy=accuracy_score(y_true, y_pred),
        cm=confusion_matrix(y_true, y_pred, labels=LABELS),
    )



In [23]:
import numpy as np

# 30 log-spaced vocabulary sizes between 10 and 100000. The builtin ``int``
# replaces ``np.int``, a plain alias of it that was removed in NumPy 1.24.
Ns = np.logspace(1, 5, 30, dtype=int)
results = list()
for N in Ns:
    try:
        clf, vectorizer = train_clf(N=N)
        rezdict = eval_clf(clf, dev_df, vectorizer)
    except Exception as exc:
        # Keep the sweep best-effort, but report which size failed and why
        # instead of dropping it silently. KeyboardInterrupt is no longer
        # swallowed, so the run can be stopped.
        print(f"N={N}: skipped, training/evaluation failed: {exc!r}")
        continue
    # Record the swept size explicitly alongside the scores.
    rezdict["N"] = N
    rezdict["dev"] = "SET train"
    results.append(rezdict)

results


[{'N': 10,
  'macroF1': 0.47089707221419697,
  'microF1': 0.4888798481149986,
  'accuracy': 0.4888798481149986,
  'cm': array([[1667,  216,    5,  668],
         [  59,  753,   14, 1730],
         [   1,   49, 1185, 1027],
         [   0,    0,    0,    0]]),
  'dev': 'SET train'},
 {'N': 13,
  'macroF1': 0.5376364466739568,
  'microF1': 0.5951993490642798,
  'accuracy': 0.5951993490642798,
  'cm': array([[1845,  220,    5,  486],
         [  46,  871,   11, 1628],
         [   2,   26, 1673,  561],
         [   0,    0,    0,    0]]),
  'dev': 'SET train'},
 {'N': 18,
  'macroF1': 0.551058162690964,
  'microF1': 0.6254407377271495,
  'accuracy': 0.6254407377271495,
  'cm': array([[1812,  305,    4,  435],
         [  27,  854,    7, 1668],
         [   2,   18, 1946,  296],
         [   0,    0,    0,    0]]),
  'dev': 'SET train'},
 {'N': 25,
  'macroF1': 0.5926963606079656,
  'microF1': 0.7028749660970979,
  'accuracy': 0.7028749660970979,
  'cm': array([[1781,  582,    1,  192],
  

In [25]:
# Persist one row per evaluated vocabulary size.
pd.DataFrame(results).to_csv(
    "1_part_SETIMES_hyperparams_optimization.csv"
)