Data preparation

In [1]:
import os

import pandas as pd

from utils import LABELS, load_SET_dataset, load_twitter_dataset, read_and_split_file
def load_tails(interim_dir="/home/peterr/macocu/taskB/data/interim"):
    """Assemble the training frame from the four ``*_tail_pp`` files.

    Each file is read with ``read_and_split_file`` and every text it yields is
    tagged with the language code paired with that file.

    Parameters
    ----------
    interim_dir : str, optional
        Directory that contains the tail files. The default is the path that
        used to be hard-coded, so ``load_tails()`` keeps its old behaviour.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``text`` (the texts, in file order) and ``labels``
        (one of ``"bs"``, ``"me"``, ``"hr"``, ``"sr"``).
    """
    # File name -> language code; the order here fixes the row order of the output.
    files = ["bswac_tail_pp", "cnrwac_tail_pp_corrected_2", "hrwac_tail_pp", "srwac_tail_pp"]
    langs = ["bs", "me", "hr", "sr"]

    texts, labels = list(), list()
    for file, lang in zip(files, langs):
        # `os` comes from the notebook's import cell.
        full_path = os.path.join(interim_dir, file)
        current_texts = read_and_split_file(full_path)
        texts.extend(current_texts)
        # One label per text read from this file.
        labels.extend([lang] * len(current_texts))

    return pd.DataFrame(data={"text": texts, "labels": labels})

# Training frame (columns "text" and "labels"); train_model() below reads the
# global name `train`, so this cell must run before any training.
train = load_tails()

In [2]:
# --- SETimes: rows whose split is "train" form the dev frame, all others the test frame.
SET = load_SET_dataset().rename(columns={"language": "labels"})
set_is_train = SET["split"] == "train"
set_not_train = SET["split"] != "train"
SET_dev_df = SET.loc[set_is_train, ["text", "labels"]]
SET_test_df = SET.loc[set_not_train, ["text", "labels"]]

# --- Twitter: each row holds a sequence of tweets; join them into one text per row.
tw = (
    load_twitter_dataset()
    .rename(columns={"language": "labels"})
    .assign(text=lambda frame: frame["tweets"].apply(" ".join))
)
tw_is_train = tw["split"] == "train"
tw_not_train = tw["split"] != "train"
tw_dev_df = tw.loc[tw_is_train, ["text", "labels"]]
tw_test_df = tw.loc[tw_not_train, ["text", "labels"]]

In [32]:
def train_model(N, output_dir="outputs/", train_df=None):
    """Fine-tune ``classla/bcms-bertic`` as a sequence classifier.

    Parameters
    ----------
    N : int
        Number of training epochs.
    output_dir : str, optional
        Directory handed to simpletransformers as ``output_dir``.
    train_df : pandas.DataFrame, optional
        Training data with ``text`` and ``labels`` columns. When omitted, the
        notebook-level ``train`` frame is used, which is what this function
        always did before the parameter existed.

    Returns
    -------
    ClassificationModel
        The model after ``train_model`` has been run on the training data.
    """
    from simpletransformers.classification import ClassificationModel, ClassificationArgs

    if train_df is None:
        # Backward-compatible fallback to the global training frame.
        train_df = train

    model_args = ClassificationArgs()
    model_args.num_train_epochs = N
    model_args.output_dir = output_dir
    model_args.train_batch_size = 32
    # Must be a plain bool: a trailing comma here previously stored the tuple (True,).
    model_args.save_model_every_epoch = True
    model_args.max_seq_length = 512
    model_args.labels_list = LABELS
    # NOTE(review): overwrite_output_dir is left at the library default; re-running
    # into an already populated output_dir may be refused -- confirm before a re-run.

    model = ClassificationModel(
        "electra",
        "classla/bcms-bertic",
        num_labels=len(LABELS),
        use_cuda=True,
        args=model_args,
    )
    model.train_model(train_df)
    return model

def eval_model(model, df):
    """Score ``model`` on ``df`` and return F1 scores plus the confusion matrix.

    Parameters
    ----------
    model
        Object whose ``predict(list_of_texts)`` returns a sequence whose first
        element holds the predicted labels.
    df : pandas.DataFrame
        Evaluation data with ``text`` and ``labels`` columns.

    Returns
    -------
    dict
        ``macroF1`` and ``microF1`` (F1 averaged over ``LABELS``) and ``CM``
        (confusion matrix with rows/columns ordered as ``LABELS``).
    """
    from sklearn.metrics import confusion_matrix, f1_score

    y_true = df.labels
    # Predict once only: inference over the whole frame is the expensive step.
    y_pred = model.predict(df.text.tolist())[0]

    macro = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    micro = f1_score(y_true, y_pred, labels=LABELS, average="micro")
    cm = confusion_matrix(y_true, y_pred, labels=LABELS)

    return dict(
        macroF1=macro,
        microF1=micro,
        CM=cm,
    )

Main loop:

In [34]:
# One row of `results` per (epoch count, evaluation set) pair.
results = list()

# (dataset name, split name, frame) in the order the rows should appear.
eval_sets = [
    ("Twitter", "dev", tw_dev_df),
    ("Twitter", "test", tw_test_df),
    ("SETimes", "dev", SET_dev_df),
    ("SETimes", "test", SET_test_df),
]

for N in [1, 2, 3]:
    # A fresh model is trained for every epoch count, each into its own directory.
    model = train_model(N, output_dir=f"outputs/{N}epochs/")

    for dataset, split, eval_df in eval_sets:
        rezdict = eval_model(model, eval_df)
        rezdict["dataset"] = dataset
        rezdict["split"] = split
        rezdict["clf"] = f"Bertic {N} epochs"
        results.append(rezdict)

Some weights of the model checkpoint at classla/bcms-bertic were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense_prediction.bias', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.weight', 'discriminator_predictions.dense.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at classla/bcms-bertic and are newly initialized: ['classifier.bias', 'classifier.weight', '

  0%|          | 0/331725 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/10367 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(


In [None]:
# Turn the collected metric dicts into one table, write it to disk, then display it.
results_df = pd.DataFrame(data=results)
results_df.to_csv("results_bertic.csv")
results_df

Unnamed: 0,macroF1,microF1,CM,dataset,split,clf
0,0.042289,0.092391,"[[0, 0, 0, 53], [0, 0, 0, 45], [0, 0, 0, 236],...",Twitter,dev,Bertic 1 epochs
1,0.041045,0.089431,"[[0, 0, 0, 36], [0, 0, 0, 30], [0, 0, 0, 158],...",Twitter,test,Bertic 1 epochs
2,0.0,0.0,"[[0, 0, 0, 2556], [0, 0, 0, 2556], [0, 0, 0, 2...",SETimes,dev,Bertic 1 epochs
3,0.0,0.0,"[[0, 0, 0, 640], [0, 0, 0, 639], [0, 0, 0, 605...",SETimes,test,Bertic 1 epochs
4,0.132057,0.163043,"[[45, 8, 0, 0], [31, 14, 0, 0], [158, 77, 0, 1...",Twitter,dev,Bertic 2 epochs
5,0.112981,0.158537,"[[31, 5, 0, 0], [22, 8, 0, 0], [102, 56, 0, 0]...",Twitter,test,Bertic 2 epochs
6,0.137606,0.329672,"[[7, 2549, 0, 0], [1, 2424, 0, 131], [0, 1367,...",SETimes,dev,Bertic 2 epochs
7,0.136172,0.323779,"[[1, 639, 0, 0], [1, 609, 0, 29], [0, 362, 0, ...",SETimes,test,Bertic 2 epochs
8,0.423726,0.611413,"[[14, 39, 0, 0], [4, 29, 7, 5], [0, 51, 178, 7...",Twitter,dev,Bertic 3 epochs
9,0.375476,0.597561,"[[7, 29, 0, 0], [1, 24, 3, 2], [0, 33, 116, 9]...",Twitter,test,Bertic 3 epochs
