Data preparation

In [1]:
import pandas as pd
from utils import LABELS, load_SET_dataset, load_twitter_dataset, read_and_split_file
def load_tails():
    interim_dir = "/home/peterr/macocu/taskB/data/interim"
    texts, labels = list(), list()
    files = ["bswac_tail_pp", "cnrwac_tail_pp_corrected_2", "hrwac_tail_pp", "srwac_tail_pp"]
    langs = ["bs", "me", "hr", "sr"]

    for file, lang in zip(files, langs):
        full_path = os.path.join(interim_dir, file)
        current_texts = read_and_split_file(full_path)
        len_cur_texts = len(current_texts)
        texts.extend(current_texts)
        labels.extend([lang] * len_cur_texts)

    return pd.DataFrame(data={"text": texts, "labels": labels})

train = load_tails()

In [2]:
SET = load_SET_dataset().rename(columns={"language":"labels"})
SET_dev_df = SET.loc[SET.split == "train", ["text", "labels"]]
SET_test_df = SET.loc[SET.split != "train", ["text", "labels"]]

tw = load_twitter_dataset().rename(columns={"language":"labels"})
tw["text"] = tw.tweets.apply(lambda l: " ".join(l))
tw_dev_df = tw.loc[tw.split == "train", ["text", "labels"]]
tw_test_df = tw.loc[tw.split != "train", ["text", "labels"]]

In [3]:
def train_model(N, output_dir="outputs/"):
    from simpletransformers.classification import ClassificationModel, ClassificationArgs
    model_args = ClassificationArgs()
    model_args.num_train_epochs = N
    model_args.overwrite_output_dir = True
    #model_args.output_dir = output_dir
    model_args.train_batch_size = 32
    model_args.no_cache = True
    model_args.no_save = True
    model_args.save_steps = -1
    model_args.save_model_every_epoch = True,
    model_args.max_seq_length = 512
    model_args.labels_list = LABELS


    model = ClassificationModel("electra", "classla/bcms-bertic",
                                num_labels = len(LABELS),
                                use_cuda = True,
                                args = model_args,
                                )
    model.train_model(train )
    return model

def eval_model(model, df):
    y_true = df.labels
    y_pred = model.predict(df.text.tolist())[0]

    from sklearn.metrics import (
        f1_score,
        confusion_matrix,
        accuracy_score,
    )
    y_true = df.labels
    y_pred = model.predict(df.text.tolist())[0]
    macro = f1_score(y_true, y_pred, labels=LABELS, average="macro")
    micro = f1_score(y_true, y_pred, labels=LABELS, average="micro")
    acc = accuracy_score(y_true, y_pred)
    cm = confusion_matrix(y_true, y_pred, labels=LABELS)
    returndict = dict(
        macroF1=macro,
        microF1=micro,
        CM = cm,
         )
    return returndict

Main loop:

In [4]:
results = list()
for N in [1,2,3]:
    model = train_model(N, output_dir=f"outputs/{N}epochs/")
    
    dataset = "Twitter"
    split = "dev"
    rezdict = eval_model(model, tw_dev_df)
    rezdict["dataset"] = dataset
    rezdict["split"] = split
    rezdict["clf"] = f"Bertic {N} epochs"
    results.append(rezdict)
    
    dataset = "Twitter"
    split = "test"
    rezdict = eval_model(model, tw_test_df)
    rezdict["dataset"] = dataset
    rezdict["split"] = split
    rezdict["clf"] = f"Bertic {N} epochs"
    results.append(rezdict)

    dataset = "SETimes"
    split = "dev"
    rezdict = eval_model(model, SET_dev_df)
    rezdict["dataset"] = dataset
    rezdict["split"] = split
    rezdict["clf"] = f"Bertic {N} epochs"
    results.append(rezdict)

    dataset = "SETimes"
    split = "test"
    rezdict = eval_model(model, SET_test_df)
    rezdict["dataset"] = dataset
    rezdict["split"] = split
    rezdict["clf"] = f"Bertic {N} epochs"
    results.append(rezdict)
    pd.DataFrame(data=results).to_csv("results_bertic.csv")

Some weights of the model checkpoint at classla/bcms-bertic were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at classla/bcms-bertic and are newly initialized: ['pooler.dense.bias', 'classifier.bias', '

  0%|          | 0/331725 [00:00<?, ?it/s]

Epoch:   0%|          | 0/1 [00:00<?, ?it/s]

Running Epoch 0 of 1:   0%|          | 0/10367 [00:00<?, ?it/s]

  torch.nn.utils.clip_grad_norm_(


  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/368 [00:00<?, ?it/s]

  0%|          | 0/46 [00:00<?, ?it/s]

  0%|          | 0/246 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/246 [00:00<?, ?it/s]

  0%|          | 0/31 [00:00<?, ?it/s]

  0%|          | 0/7374 [00:00<?, ?it/s]

  0%|          | 0/922 [00:00<?, ?it/s]

  0%|          | 0/7374 [00:00<?, ?it/s]

  0%|          | 0/922 [00:00<?, ?it/s]

  0%|          | 0/1884 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

  0%|          | 0/1884 [00:00<?, ?it/s]

  0%|          | 0/236 [00:00<?, ?it/s]

Some weights of the model checkpoint at classla/bcms-bertic were not used when initializing ElectraForSequenceClassification: ['discriminator_predictions.dense.weight', 'discriminator_predictions.dense_prediction.weight', 'discriminator_predictions.dense.bias', 'discriminator_predictions.dense_prediction.bias']
- This IS expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing ElectraForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of ElectraForSequenceClassification were not initialized from the model checkpoint at classla/bcms-bertic and are newly initialized: ['pooler.dense.bias', 'classifier.bias', '

  0%|          | 0/331725 [00:00<?, ?it/s]

Epoch:   0%|          | 0/2 [00:00<?, ?it/s]

Running Epoch 0 of 2:   0%|          | 0/10367 [00:00<?, ?it/s]

In [None]:
pd.DataFrame(data=results).to_csv("results_bertic.csv")
pd.DataFrame(data=results)