# Data Preparation Dependencies

In [2]:
import json
import os
import torch
import pandas as pd

# Data preparation

In [4]:
# Training and validation only use the two harder difficulty levels.
train_levels = ["medium", "hard"]

df_train = pd.read_csv("../FINAL-DATA/train_aug.csv")
df_train = df_train[df_train["difficulty"].isin(train_levels)]

df_val = pd.read_csv("../FINAL-DATA/validation_balanced.csv")
df_val = df_val[df_val["difficulty"].isin(train_levels)]

# The test set is loaded whole and then split into one frame per difficulty.
df_test = pd.read_csv("../FINAL-DATA/test.csv")
df_test_easy, df_test_medium, df_test_hard = (
    df_test.loc[df_test["difficulty"] == level]
    for level in ("easy", "medium", "hard")
)

In [5]:
# Importing dataset creation dependencies
from datasets import DatasetDict, Dataset

# Columns copied from each pandas frame into the Hugging Face datasets
columns = ["paragraph1", "paragraph2", "label"]

# The pandas frame backing each split, in the order the splits are created
split_frames = {
    "train": df_train,
    "validation": df_val,
    "test_easy": df_test_easy,
    "test_medium": df_test_medium,
    "test_hard": df_test_hard,
}

# Creating raw dataset: every split is built the same way from the same columns
raw_datasets = DatasetDict({
    split_name: Dataset.from_dict({column: frame[column] for column in columns})
    for split_name, frame in split_frames.items()
})

  from .autonotebook import tqdm as notebook_tqdm


In [6]:
# Display the splits with their features and row counts
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['paragraph1', 'paragraph2', 'label'],
        num_rows: 73308
    })
    validation: Dataset({
        features: ['paragraph1', 'paragraph2', 'label'],
        num_rows: 5122
    })
    test_easy: Dataset({
        features: ['paragraph1', 'paragraph2', 'label'],
        num_rows: 1865
    })
    test_medium: Dataset({
        features: ['paragraph1', 'paragraph2', 'label'],
        num_rows: 1865
    })
    test_hard: Dataset({
        features: ['paragraph1', 'paragraph2', 'label'],
        num_rows: 1865
    })
})

# Tokenizing and Encoder

In [7]:
from transformers import AutoTokenizer, DataCollatorWithPadding
# Pretrained checkpoint name; also used further down to load the classification model
checkpoint = "bert-base-cased"
# Tokenizer that belongs to the chosen checkpoint
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [8]:
def tokenize_function(sample):
    """Tokenize the two paragraphs of `sample` together as one paired input.

    Args:
        sample: Mapping providing the "paragraph1" and "paragraph2" entries.

    Returns:
        Whatever the module-level `tokenizer` returns for the pair, called
        with `truncation=True`.
    """
    first_paragraph = sample["paragraph1"]
    second_paragraph = sample["paragraph2"]
    return tokenizer(first_paragraph, second_paragraph, truncation=True)

In [9]:
# Tokenize every split in batches; the tokenizer outputs are added as extra columns
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Collator used by the data loaders below to pad the samples of a batch
collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/73308 [00:00<?, ? examples/s]

Map: 100%|██████████| 73308/73308 [00:05<00:00, 13583.21 examples/s]
Map: 100%|██████████| 5122/5122 [00:00<00:00, 13943.56 examples/s]
Map: 100%|██████████| 1865/1865 [00:00<00:00, 11085.62 examples/s]
Map: 100%|██████████| 1865/1865 [00:00<00:00, 9696.88 examples/s]
Map: 100%|██████████| 1865/1865 [00:00<00:00, 13617.51 examples/s]


In [10]:
# Testing mapping: list the column names of every split after tokenization
print(tokenized_datasets.column_names)

{'train': ['paragraph1', 'paragraph2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['paragraph1', 'paragraph2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'test_easy': ['paragraph1', 'paragraph2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'test_medium': ['paragraph1', 'paragraph2', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'test_hard': ['paragraph1', 'paragraph2', 'label', 'input_ids', 'token_type_ids', 'attention_mask']}


In [11]:
# ONLY FOR WHEN NOT USING THE TRAINER API
#
# Post process removal: drop the raw text columns, rename "label" to "labels"
# and return torch tensors. Each step is guarded on the columns that are
# actually present, so re-running this cell leaves the datasets unchanged
# instead of failing on columns that are already gone.
for key in list(tokenized_datasets.keys()):
    split = tokenized_datasets[key]

    text_columns = [
        name for name in ("paragraph1", "paragraph2") if name in split.column_names
    ]
    if text_columns:
        split = split.remove_columns(text_columns)

    if "label" in split.column_names:
        split = split.rename_column("label", "labels")

    tokenized_datasets[key] = split.with_format("torch")

In [12]:
# Check which columns remain on a split after post-processing
tokenized_datasets["test_easy"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [13]:
from torch.utils.data import DataLoader


def _make_loader(split_name, shuffle=False):
    """Build a batch-size-8 DataLoader over one tokenized split, collated by `collator`."""
    return DataLoader(
        tokenized_datasets[split_name],
        shuffle=shuffle,
        batch_size=8,
        collate_fn=collator,
    )


# Only the training loader shuffles; validation and test loaders keep dataset order.
train_dataloader = _make_loader("train", shuffle=True)
eval_dataloader = _make_loader("validation")
test_easy_loader = _make_loader("test_easy")
test_medium_loader = _make_loader("test_medium")
test_hard_loader = _make_loader("test_hard")

# Sanity check: take a single training batch and show the shape of each tensor.
batch = next(iter(train_dataloader))
{k: v.shape for k, v in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 119]),
 'token_type_ids': torch.Size([8, 119]),
 'attention_mask': torch.Size([8, 119])}

In [14]:
# Inspect the first collated batch of the easy test loader
next(iter(test_easy_loader))

{'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1]), 'input_ids': tensor([[  101, 14060,   119,  ...,     0,     0,     0],
        [  101, 20834,   117,  ...,     0,     0,     0],
        [  101,  4785,   119,  ...,  2182,   119,   102],
        ...,
        [  101,  3841,  1103,  ...,     0,     0,     0],
        [  101,  1249,  1677,  ...,     0,     0,     0],
        [  101,   146,   112,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [15]:
from transformers import AutoModelForSequenceClassification

# Sequence-classification model loaded from the same checkpoint as the tokenizer,
# configured for two labels
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Sanity check: forward pass on the sample training batch drawn earlier.
# The batch carries "labels", so the output exposes a loss next to the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.9308, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [17]:
# `transformers.AdamW` is deprecated upstream in favour of PyTorch's own AdamW.
# eps and weight_decay are spelled out to match the defaults of the transformers
# optimizer this replaces (torch's own defaults are eps=1e-8, weight_decay=0.01),
# so the optimisation behaviour is unchanged.
optimizer = torch.optim.AdamW(
    model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)



In [18]:
from transformers import get_scheduler

# Number of passes over the training data
num_epochs = 5
# One scheduler step is taken per training batch, so the schedule spans
# epochs x batches-per-epoch steps
num_training_steps = num_epochs * len(train_dataloader)
# "linear" schedule without warmup steps, attached to the optimizer above
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

45820


In [19]:
# Use the GPU when CUDA is available, otherwise stay on the CPU.
# (torch is already imported at the top of the notebook.)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cpu')

In [20]:
from tqdm.auto import tqdm

# One progress-bar tick per optimisation step across all epochs
progress_bar = tqdm(range(num_training_steps))

# Switch the model to training mode before the optimisation loop
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model lives on
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch contains "labels", so the model output exposes the loss
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear the
        # gradients so they do not accumulate into the next step
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 1/45820 [00:02<28:27:31,  2.24s/it]

KeyboardInterrupt: 

In [77]:
import evaluate

# Score the validation split with the metric of the GLUE "mrpc" configuration.
model.eval()
metric = evaluate.load("glue", "mrpc")

for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed while scoring
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.7729140611041629, 'f1': 0.8166738785518535}

In [19]:
# Persist the model so it can be reloaded below; only the model is saved here.
# NOTE(review): absolute, user-specific path, and the directory is named "RoBERTa"
# although `checkpoint` is "bert-base-cased" - confirm the intended location.
model.save_pretrained("/home/jarl/LP2-multi-author-writing-style-detection/RoBERTa/pretrained")

In [73]:
# Reload the model from the directory it was saved to and move it to the active device
model = AutoModelForSequenceClassification.from_pretrained("/home/jarl/LP2-multi-author-writing-style-detection/RoBERTa/pretrained").to(device)


In [84]:
import evaluate

# Score the easy test split with a metric object created for this cell.
model.eval()
metric = evaluate.load("glue", "mrpc")

for batch in test_easy_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed while scoring
    with torch.no_grad():
        outputs = model(**batch)
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

print(f"metrics easy test: {metric.compute()}")

{'labels': tensor([1, 1, 1, 0, 1, 1, 1, 1], device='cuda:0'), 'input_ids': tensor([[  101,   157, 12507,  ...,     0,     0,     0],
        [  101,  1130,  1103,  ..., 22852,   119,   102],
        [  101,   146,  1108,  ...,     0,     0,     0],
        ...,
        [  101,   786,  3291,  ...,     0,     0,     0],
        [  101,  1337,   112,  ...,     0,     0,     0],
        [  101, 24107,   119,  ...,     0,     0,     0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]], device='cuda:0')}
{'labels': tensor([0, 0, 1, 1, 1, 1, 

In [81]:
import evaluate

# Score the medium test split.
# A fresh metric and an explicit eval mode make this cell independent of
# whichever evaluation cell ran before it: nothing accumulated in a metric
# object created elsewhere can leak into this score.
metric = evaluate.load("glue", "mrpc")
model.eval()

for batch in test_medium_loader:
    # Move every tensor of the batch to the device the model lives on
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed while scoring
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

print(f"metrics medium test: {metric.compute()}")

metrics medium test: {'accuracy': 1.0, 'f1': 1.0}


In [82]:
import evaluate

# Score the hard test split.
# A fresh metric and an explicit eval mode make this cell independent of
# whichever evaluation cell ran before it: nothing accumulated in a metric
# object created elsewhere can leak into this score.
metric = evaluate.load("glue", "mrpc")
model.eval()

for batch in test_hard_loader:
    # Move every tensor of the batch to the device the model lives on
    batch = {k: v.to(device) for k, v in batch.items()}
    # No gradients are needed while scoring
    with torch.no_grad():
        outputs = model(**batch)

    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

print(f"metrics hard test: {metric.compute()}")

metrics hard test: {'accuracy': 0.998, 'f1': 0.9977116704805492}
