In [1]:

import json
import os
import torch
import pandas as pd

# Optimisation hyper-parameters shared by every experiment in this notebook.
lr = 2e-5
epochs = 3
# Hugging Face model id; used for both the tokenizer and the classifier weights.
checkpoint = "distilbert-base-cased"

# The four lists below are parallel: index i of each list describes one
# experiment (train CSV, test CSV, validation CSV, output folder for the model).
# A path containing "cat" selects the single-column ("paragraph") data layout;
# every other path uses the two-column ("paragraph1"/"paragraph2") layout.
#
# NOTE(review): each MH-* / MHE-* pair below (e.g. index 1 and index 4) points
# at exactly the same train/validation/test files, so the paired runs differ
# only in their output folder. Confirm whether the MH and MHE variants were
# meant to read different CSVs.
train_sets = [
    '../data/train_cat.csv',
    '../data/train.csv',
    '../data/train_aug.csv',
    '../data/train_augcat.csv',
    '../data/train.csv',
    '../data/train_aug.csv',
    '../data/train_augcat.csv',
    '../data/train_cat.csv',
]
test_sets = [
    '../data/test_cat.csv',
    '../data/test.csv',
    '../data/test.csv',
    '../data/test_cat.csv',
    '../data/test.csv',
    '../data/test.csv',
    '../data/test_cat.csv',
    '../data/test_cat.csv',
]
validation_sets = [
    '../data/val_cat.csv',
    '../data/validation.csv',
    '../data/validation.csv',
    '../data/val_cat.csv',
    '../data/validation.csv',
    '../data/validation.csv',
    '../data/val_cat.csv',
    '../data/val_cat.csv',
]

# Output folder for each fine-tuned model. The training cell indexes this list,
# but it was not defined anywhere in the notebook, so a fresh-kernel run failed
# with NameError. The values are reconstructed, in order, from the
# "Results for ..." lines in this notebook's saved output.
pretrained_model_folder = [
    'pretrained/MH-cat',
    'pretrained/MHE',
    'pretrained/MH-aug',
    'pretrained/MH-augcat',
    'pretrained/MH',
    'pretrained/MHE-aug',
    'pretrained/MHE-augcat',
    'pretrained/MHE-cat',
]




In [2]:
from datasets import DatasetDict, Dataset

def create_data(train_p, val_p, test_p):
    """Load the three CSV files and package them as a ``DatasetDict``.

    The test CSV is split by its ``difficulty`` column into three separate
    splits, so the returned dictionary has the keys ``train``, ``validation``,
    ``test_easy``, ``test_medium`` and ``test_hard``.

    Args:
        train_p: Path of the training CSV. If the path string contains "cat",
            the single-text layout (``paragraph``, ``label``) is used for every
            split; otherwise the pair layout (``paragraph1``, ``paragraph2``,
            ``label``) is used.
        val_p: Path of the validation CSV.
        test_p: Path of the test CSV; must contain a ``difficulty`` column.

    Returns:
        A ``DatasetDict`` holding only the text column(s) and ``label`` for
        each of the five splits.
    """
    frames = {
        "train": pd.read_csv(train_p),
        "validation": pd.read_csv(val_p),
    }

    # One test split per difficulty level found in the "difficulty" column.
    test_frame = pd.read_csv(test_p)
    for level in ("easy", "medium", "hard"):
        frames[f"test_{level}"] = test_frame[test_frame["difficulty"] == level]

    # The layout is decided by the training path alone and applied to all splits.
    if "cat" in train_p:
        wanted = ["paragraph", "label"]
    else:
        wanted = ["paragraph1", "paragraph2", "label"]

    return DatasetDict({
        split: Dataset.from_dict({name: frame[name] for name in wanted})
        for split, frame in frames.items()
    })

from transformers import AutoTokenizer, DataCollatorWithPadding
from torch.utils.data import DataLoader


def tokenize_encode(checkpoint, raw_datasets, is_cat=False, batch_size=8):
    """Tokenize every split of ``raw_datasets`` and wrap each in a DataLoader.

    Args:
        checkpoint: Model id or path passed to ``AutoTokenizer.from_pretrained``.
        raw_datasets: Mapping of split name to dataset. It must contain the
            splits ``train``, ``validation``, ``test_easy``, ``test_medium``
            and ``test_hard``, each with a ``label`` column plus the text
            column(s) selected by ``is_cat``.
        is_cat: When true, each example has a single ``paragraph`` column;
            otherwise it has ``paragraph1`` and ``paragraph2``, which are
            tokenized together as a pair.
        batch_size: Batch size used for all five loaders. Defaults to 8, the
            value that was previously hard-coded.

    Returns:
        A 5-tuple ``(train, validation, test_easy, test_medium, test_hard)`` of
        ``DataLoader`` objects. Only the training loader shuffles.
    """
    tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # The only difference between the two layouts is which text columns exist.
    text_columns = ["paragraph"] if is_cat else ["paragraph1", "paragraph2"]

    def tokenize_function(sample):
        # One positional argument for single texts, two for text pairs.
        return tokenizer(*(sample[col] for col in text_columns), truncation=True)

    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    # Padding is deferred to batch time by the collator.
    collator = DataCollatorWithPadding(tokenizer=tokenizer)

    # Drop the raw text, rename the target to the "labels" keyword that the
    # model is later called with, and return torch tensors when indexing.
    for key in list(tokenized_datasets.keys()):
        tokenized_datasets[key] = (
            tokenized_datasets[key]
            .remove_columns(text_columns)
            .rename_column("label", "labels")
            .with_format("torch")
        )

    def make_loader(split, shuffle=False):
        return DataLoader(
            tokenized_datasets[split],
            shuffle=shuffle,
            batch_size=batch_size,
            collate_fn=collator,
        )

    train_dataloader = make_loader("train", shuffle=True)
    eval_dataloader = make_loader("validation")
    test_easy_loader = make_loader("test_easy")
    test_medium_loader = make_loader("test_medium")
    test_hard_loader = make_loader("test_hard")

    return train_dataloader, eval_dataloader, test_easy_loader, test_medium_loader, test_hard_loader

from tqdm.auto import tqdm

def train_loop(model, train_dataloader, num_training_steps, num_epochs, device, optimizer, lr_scheduler):
    """Run ``num_epochs`` passes over ``train_dataloader`` and return the model.

    Args:
        model: Model that is called with the batch as keyword arguments and
            whose output exposes a ``loss`` attribute.
        train_dataloader: Iterable of dict batches whose values have ``.to``.
        num_training_steps: Total used only to size the progress bar.
        num_epochs: Number of passes over ``train_dataloader``.
        device: Device every batch value is moved to before the forward pass.
        optimizer: Stepped and zeroed once per batch.
        lr_scheduler: Stepped once per batch, after the optimizer.

    Returns:
        The same ``model`` object, updated in place by the optimizer.
    """
    bar = tqdm(range(num_training_steps))

    # Training mode is set once; nothing in this loop switches it back.
    model.train()
    for _ in range(num_epochs):
        for raw_batch in train_dataloader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}

            # Forward pass, then back-propagate the loss reported by the model.
            model(**inputs).loss.backward()

            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            bar.update(1)

    return model

import evaluate

def eval(model, eval_dataloader, device):
    """Score ``model`` on ``eval_dataloader`` with the GLUE/MRPC metric.

    The function name shadows Python's built-in ``eval``; it is kept because
    the training cell calls it by this name.

    Args:
        model: Model called with the batch as keyword arguments; its output
            must expose ``logits``.
        eval_dataloader: Iterable of dict batches that include a ``labels`` key.
        device: Device every batch value is moved to before the forward pass.

    Returns:
        Whatever ``metric.compute()`` returns for the "glue"/"mrpc" metric
        (the saved output of this notebook shows ``accuracy`` and ``f1`` keys).
    """
    scorer = evaluate.load("glue", "mrpc")

    # Leaves the model in evaluation mode when the function returns.
    model.eval()
    with torch.no_grad():
        for raw_batch in eval_dataloader:
            inputs = {name: tensor.to(device) for name, tensor in raw_batch.items()}
            # Predicted class = index of the largest logit along the last axis.
            predicted = model(**inputs).logits.argmax(dim=-1)
            scorer.add_batch(predictions=predicted, references=inputs["labels"])

    return scorer.compute()

In [3]:
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler

# Use the GPU when PyTorch can see one, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# One independent fine-tuning run per entry of the parallel configuration
# lists: build the data, train a fresh classifier, save it, then report the
# metric on the easy / medium / hard test splits.
for i, train_path in enumerate(train_sets):
    # The same "cat" substring rule is applied inside create_data.
    is_cat = "cat" in train_path
    raw_datasets = create_data(train_path, validation_sets[i], test_sets[i])

    # NOTE(review): eval_dataloader (the validation split) is built but not
    # consumed anywhere in this cell.
    (
        train_dataloader,
        eval_dataloader,
        test_easy_loader,
        test_medium_loader,
        test_hard_loader,
    ) = tokenize_encode(checkpoint, raw_datasets, is_cat)

    # Every run starts again from the base checkpoint with a 2-label head.
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
    model = model.to(device)

    # NOTE(review): AdamW is imported from transformers in this cell; newer
    # transformers releases deprecate it in favour of torch.optim.AdamW.
    # Confirm against the installed version before upgrading.
    optimizer = AdamW(model.parameters(), lr=lr)

    # Linear schedule with no warm-up, spanning every optimizer step of the run.
    num_training_steps = epochs * len(train_dataloader)
    lr_scheduler = get_scheduler(
        "linear",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=num_training_steps,
    )

    model = train_loop(
        model,
        train_dataloader,
        num_training_steps,
        epochs,
        device,
        optimizer,
        lr_scheduler,
    )

    model.save_pretrained(pretrained_model_folder[i])

    # All three test splits are scored before anything is printed.
    scores = {
        "easy": eval(model, test_easy_loader, device),
        "medium": eval(model, test_medium_loader, device),
        "hard": eval(model, test_hard_loader, device),
    }

    print(f"Results for {pretrained_model_folder[i]}")
    for difficulty, score in scores.items():
        print(f"{difficulty}: {score}")
    print("\n\n")

    # Drop the references held by this iteration and release cached GPU
    # memory before the next model is loaded.
    del model, optimizer, lr_scheduler
    torch.cuda.empty_cache()
    


Map:   0%|          | 0/38574 [00:00<?, ? examples/s]

Map:   0%|          | 0/5122 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/14466 [00:00<?, ?it/s]

Results for pretrained/MH-cat
easy: {'accuracy': 0.7753351206434317, 'f1': 0.8625778943916038}
medium: {'accuracy': 0.7924932975871314, 'f1': 0.810757946210269}
hard: {'accuracy': 0.636461126005362, 'f1': 0.6266519823788547}





Map:   0%|          | 0/51962 [00:00<?, ? examples/s]

Map:   0%|          | 0/5599 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/19488 [00:00<?, ?it/s]

Results for pretrained/MHE
easy: {'accuracy': 0.9050938337801608, 'f1': 0.9461187214611873}
medium: {'accuracy': 0.8091152815013405, 'f1': 0.8309591642924976}
hard: {'accuracy': 0.6348525469168901, 'f1': 0.6110793832095945}





Map:   0%|          | 0/103924 [00:00<?, ? examples/s]

Map:   0%|          | 0/5599 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/38973 [00:00<?, ?it/s]

Results for pretrained/MH-aug
easy: {'accuracy': 0.9093833780160858, 'f1': 0.9484598963098506}
medium: {'accuracy': 0.7908847184986595, 'f1': 0.8163841807909604}
hard: {'accuracy': 0.6343163538873995, 'f1': 0.6333333333333333}





Map:   0%|          | 0/77148 [00:00<?, ? examples/s]

Map:   0%|          | 0/5122 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/28932 [00:00<?, ?it/s]

Results for pretrained/MH-augcat
easy: {'accuracy': 0.7630026809651475, 'f1': 0.853253652058433}
medium: {'accuracy': 0.797319034852547, 'f1': 0.8166828322017459}
hard: {'accuracy': 0.6203753351206435, 'f1': 0.6105610561056105}





Map:   0%|          | 0/51962 [00:00<?, ? examples/s]

Map:   0%|          | 0/5599 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/19488 [00:00<?, ?it/s]

Results for pretrained/MH
easy: {'accuracy': 0.9158176943699732, 'f1': 0.95220700152207}
medium: {'accuracy': 0.8037533512064343, 'f1': 0.8271954674220963}
hard: {'accuracy': 0.6418230563002681, 'f1': 0.6204545454545455}





Map:   0%|          | 0/103924 [00:00<?, ? examples/s]

Map:   0%|          | 0/5599 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/38973 [00:00<?, ?it/s]

Results for pretrained/MHE-aug
easy: {'accuracy': 0.9115281501340483, 'f1': 0.9495567104860899}
medium: {'accuracy': 0.7860589812332439, 'f1': 0.8127639605818865}
hard: {'accuracy': 0.6230563002680966, 'f1': 0.613948380010983}





Map:   0%|          | 0/77148 [00:00<?, ? examples/s]

Map:   0%|          | 0/5122 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/28932 [00:00<?, ?it/s]

Results for pretrained/MHE-augcat
easy: {'accuracy': 0.7640750670241286, 'f1': 0.8536260811709914}
medium: {'accuracy': 0.7833780160857908, 'f1': 0.8011811023622047}
hard: {'accuracy': 0.6176943699731904, 'f1': 0.5969474279253816}





Map:   0%|          | 0/38574 [00:00<?, ? examples/s]

Map:   0%|          | 0/5122 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Map:   0%|          | 0/1865 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/14466 [00:00<?, ?it/s]

Results for pretrained/MHE-cat
easy: {'accuracy': 0.7951742627345845, 'f1': 0.875893437296946}
medium: {'accuracy': 0.7898123324396783, 'f1': 0.8072763028515241}
hard: {'accuracy': 0.6407506702412868, 'f1': 0.624439461883408}



