In [2]:
from pathlib import Path
from utils import *
# Sanity check: fetch the token list for N=3 and show how many entries it has.
tokens = get_N_tokens(3)
len(tokens)

27

In [3]:
import re

import pandas as pd

# Build the training frame from every "*wac_tail_pp" file in data/interim.
# sorted() pins the file order: Path.glob yields entries in filesystem order,
# which would otherwise make the row order (and the shuffle below) vary
# between machines.
tails = sorted(Path("./data/interim/").glob("*wac_tail_pp"))
texts = list()
labels = list()
for file in tails:
    # Derive the label from the bare file name rather than from str(file):
    # matching on a "/" separator breaks on platforms that use "\\".
    match = re.fullmatch(r"([a-z]+)wac_tail_pp", file.name)
    if match is None:
        raise ValueError(f"Cannot derive a label from file name: {file.name}")
    label = match.group(1)
    # NOTE(review): the label is whatever lowercase prefix precedes
    # "wac_tail_pp". get_stats scores against "hr", "bs", "sr", "me" --
    # confirm every corpus prefix is one of those, otherwise predictions for
    # that class are left out of the F1 scores and the confusion matrix.
    new_texts = read_and_split_file(str(file))
    texts.extend(new_texts)
    labels.extend([label for _ in new_texts])

# Shuffle the rows; the fixed seed keeps the order reproducible across runs.
train_df = pd.DataFrame(data={
    "labels": labels,
    "text": texts
}).sample(frac=1, random_state=42)

    

In [4]:
# Load the SETimes fastText files into one frame.
# NOTE(review): "[t,d]" is a glob character class matching "t", "," or "d";
# the comma is probably unintended, and "t" matches every split name that
# starts with "t" -- confirm which splits are meant to be included.
# sorted() pins the concatenation order (glob order is filesystem-dependent).
SETimes = sorted(Path("data/interim/").glob("SETimes_[t,d]*.fasttext"))
setimes_df = pd.concat([
    load_fasttext(str(i)) for i in SETimes
]).sample(frac=1, random_state=42)  # seeded shuffle: reproducible row order

In [5]:
# Load every Twitter fastText file into one frame.
# sorted() pins the concatenation order (glob order is filesystem-dependent).
twitter_paths = sorted(Path("data/interim/").glob("Twitter*.fasttext"))
twitter_df = pd.concat([
    load_fasttext(str(i)) for i in twitter_paths
]).sample(frac=1, random_state=42)  # seeded shuffle: reproducible row order

In [6]:

def get_stats(N: int, train_df: pd.DataFrame, 
              eval_df: pd.DataFrame,
              classifier_type: str = "LinearSVC",
              labels: tuple = ("hr", "bs", "sr", "me"),
              ) -> dict:
    """Train a binary bag-of-words classifier and score it on ``eval_df``.

    The vocabulary is fixed to ``get_N_tokens(N)``; texts are lowercased and
    encoded as binary presence vectors.

    Args:
        N: Passed to ``get_N_tokens`` to build the vocabulary; echoed back in
            the result.
        train_df: Frame with ``text`` and ``labels`` columns used for fitting.
        eval_df: Frame with ``text`` and ``labels`` columns used for scoring.
        classifier_type: ``"LinearSVC"`` or ``"NaiveBayes"`` (GaussianNB).
        labels: Label set, in order, used for the F1 scores and for the
            rows/columns of the confusion matrix. Defaults to the previously
            hard-coded ``("hr", "bs", "sr", "me")``.

    Returns:
        dict with keys ``N``, ``microF1``, ``macroF1``, ``accuracy``, ``cm``
        (confusion matrix ordered as ``labels``), ``y_true``, ``y_pred``
        (plain lists) and ``classifier`` (``str(type(clf))``).

    Raises:
        AttributeError: If ``classifier_type`` is not one of the two supported
            names (exception type kept for backward compatibility).
    """
    import warnings

    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
    from sklearn.naive_bayes import GaussianNB
    from sklearn.svm import LinearSVC

    # Validate the cheap argument first so a typo fails before any
    # vectorization work is done.
    if classifier_type == "LinearSVC":
        clf = LinearSVC(dual=False)
    elif classifier_type == "NaiveBayes":
        clf = GaussianNB()
    else:
        raise AttributeError(f"Got weird classifier_type: {classifier_type}, expected either LinearSVC or NaiveBayes")

    vectorizer = CountVectorizer(
        vocabulary=get_N_tokens(N), lowercase=True, binary=True)

    train_vectors = vectorizer.fit_transform(train_df.text)
    # GaussianNB needs a dense array, hence .toarray() for both classifiers.
    clf.fit(train_vectors.toarray(), train_df.labels)

    # Evaluation data is only transformed, never used to (re)fit the
    # vectorizer. With a fixed vocabulary the resulting matrix is the same.
    test_vectors = vectorizer.transform(eval_df.text)
    y_true = eval_df.labels
    y_pred = clf.predict(test_vectors.toarray())

    label_list = list(labels)

    # Samples whose true or predicted label is not in `label_list` still count
    # towards accuracy but are left out of the F1 scores and the confusion
    # matrix. Surface that instead of letting the metrics silently disagree.
    outside = (set(y_true.tolist()) | set(y_pred.tolist())) - set(label_list)
    if outside:
        warnings.warn(
            f"Labels outside the scored set {label_list}: "
            f"{sorted(outside, key=str)}; they are excluded from the F1 "
            "scores and the confusion matrix."
        )

    macro = f1_score(y_true, y_pred, labels=label_list, average="macro")
    micro = f1_score(y_true, y_pred, labels=label_list, average="micro")
    acc = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=label_list)
    return {
        "N": N,
        "microF1": micro,
        "macroF1": macro,
        "accuracy": acc,
        "cm": cm,
        "y_true": y_true.tolist(),
        "y_pred": y_pred.tolist(),
        "classifier": str(type(clf))
    }

# Train on train_df and score on the Twitter frame. The returned dict also
# carries the full y_true / y_pred lists, so the displayed value is long.
get_stats(N=100, train_df=train_df, eval_df=twitter_df)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'N': 100,
 'microF1': 0.9676258992805756,
 'macroF1': 0.7046275304751926,
 'accuracy': 0.96415770609319,
 'cm': array([[ 82,   6,   0,   0],
        [  1,  67,   4,   0],
        [  2,   3, 389,   0],
        [  0,   0,   0,   0]]),
 'y_true': ['hr',
  'sr',
  'sr',
  'hr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'hr',
  'hr',
  'sr',
  'hr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'bs',
  'sr',
  'bs',
  'sr',
  'sr',
  'sr',
  'bs',
  'sr',
  'bs',
  'sr',
  'sr',
  'sr',
  'sr',
  'hr',
  'sr',
  'hr',
  'sr',
  'sr',
  'bs',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'bs',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'bs',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'bs',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'bs',
  'bs',
  'sr',
  'sr',
  'sr',
  'sr',
  'hr',
  'sr',
  'hr',
  'sr',
  'sr',
  'sr',
  'hr',
  'hr',
  'sr',
  'sr',
  'bs',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'sr',
  'hr',
  'sr',
  's

In [7]:
# Train on train_df and score on the SETimes frame. The returned dict also
# carries the full y_true / y_pred lists, so the displayed value is long.
get_stats(N=100, train_df=train_df, eval_df=setimes_df)

  _warn_prf(average, "true nor predicted", "F-score is", len(true_sum))


{'N': 100,
 'microF1': 0.7971583689499827,
 'macroF1': 0.6025937630244177,
 'accuracy': 0.7454093756750918,
 'cm': array([[1967, 1132,    0,    0],
        [   8, 2244,    0,    0],
        [   0,   15, 2690,    0],
        [   0,    0,    0,    0]]),
 'y_true': ['sr',
  'bs',
  'sr',
  'hr',
  'bs',
  'hr',
  'hr',
  'sr',
  'sr',
  'bs',
  'hr',
  'sr',
  'sr',
  'bs',
  'bs',
  'bs',
  'bs',
  'hr',
  'hr',
  'sr',
  'sr',
  'hr',
  'sr',
  'sr',
  'hr',
  'bs',
  'sr',
  'hr',
  'sr',
  'hr',
  'bs',
  'bs',
  'sr',
  'hr',
  'bs',
  'hr',
  'bs',
  'bs',
  'bs',
  'sr',
  'bs',
  'bs',
  'sr',
  'sr',
  'sr',
  'sr',
  'hr',
  'bs',
  'sr',
  'hr',
  'hr',
  'sr',
  'bs',
  'hr',
  'bs',
  'sr',
  'bs',
  'sr',
  'bs',
  'bs',
  'hr',
  'bs',
  'sr',
  'bs',
  'bs',
  'hr',
  'bs',
  'bs',
  'sr',
  'bs',
  'bs',
  'bs',
  'hr',
  'bs',
  'sr',
  'sr',
  'sr',
  'hr',
  'bs',
  'bs',
  'bs',
  'hr',
  'hr',
  'bs',
  'hr',
  'bs',
  'sr',
  'bs',
  'sr',
  'sr',
  'hr',
  'sr',
  