# Data Preparation Dependencies

In [20]:
import json
import os
import torch
import pandas as pd

# Data preparation

In [21]:
# Training split: the whole CSV is used; no difficulty filter is applied.
df_train = pd.read_csv("../FINAL-DATA/train_augcat.csv")
print(len(df_train))

# Validation split: likewise loaded without any difficulty filter.
df_val = pd.read_csv("../FINAL-DATA/val_cat.csv")
print(len(df_val))

# Test split: loaded whole, then partitioned by difficulty level below.
df_test = pd.read_csv("../FINAL-DATA/test_cat.csv")
print(len(df_test))

# One frame per value of the "difficulty" column, so that every level can be
# evaluated on its own later in the notebook.
df_test_easy, df_test_medium, df_test_hard = (
    df_test.loc[df_test["difficulty"] == level]
    for level in ("easy", "medium", "hard")
)

77148
5122
5595


In [3]:
# Dataset-construction dependencies
from datasets import DatasetDict, Dataset

# Columns copied from each DataFrame into its Dataset
columns = ["paragraph", "label"]

# Split name -> DataFrame backing that split (insertion order is kept)
split_frames = {
    "train": df_train,
    "validation": df_val,
    "test_easy": df_test_easy,
    "test_medium": df_test_medium,
    "test_hard": df_test_hard,
}

# Build the raw DatasetDict: one Dataset per split, restricted to `columns`
raw_datasets = DatasetDict({
    split_name: Dataset.from_dict({col: frame[col] for col in columns})
    for split_name, frame in split_frames.items()
})

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Display the DatasetDict summary (features and row count of every split).
raw_datasets

DatasetDict({
    train: Dataset({
        features: ['paragraph', 'label'],
        num_rows: 77148
    })
    validation: Dataset({
        features: ['paragraph', 'label'],
        num_rows: 10244
    })
    test_easy: Dataset({
        features: ['paragraph', 'label'],
        num_rows: 1865
    })
    test_medium: Dataset({
        features: ['paragraph', 'label'],
        num_rows: 1865
    })
    test_hard: Dataset({
        features: ['paragraph', 'label'],
        num_rows: 1865
    })
})

# Tokenizing and Encoder

In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding
# Name of the pretrained checkpoint; the same name is reused further down to
# load the sequence-classification model.
checkpoint = "bert-base-cased"
# Load the tokenizer that belongs to that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
def tokenize_function(sample):
    """Tokenize the 'paragraph' field of a sample (or batch of samples).

    Args:
        sample: Mapping with a 'paragraph' entry; its value is handed to the
            module-level ``tokenizer`` unchanged.

    Returns:
        Whatever ``tokenizer`` returns when called with truncation enabled.
    """
    paragraphs = sample['paragraph']
    encoded = tokenizer(paragraphs, truncation=True)
    return encoded

In [7]:
# Apply tokenize_function to every split; batched=True hands the function
# groups of rows per call instead of a single row.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Padding collator built from the tokenizer; passed as collate_fn to the
# DataLoaders created later.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/77148 [00:00<?, ? examples/s]

Map: 100%|██████████| 77148/77148 [00:05<00:00, 13480.19 examples/s]
Map: 100%|██████████| 10244/10244 [00:00<00:00, 14146.87 examples/s]
Map: 100%|██████████| 1865/1865 [00:00<00:00, 11821.90 examples/s]
Map: 100%|██████████| 1865/1865 [00:00<00:00, 15565.40 examples/s]
Map: 100%|██████████| 1865/1865 [00:00<00:00, 14892.48 examples/s]


In [8]:
# Sanity check of the mapping: list the column names of every split
print(tokenized_datasets.column_names)

{'train': ['paragraph', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'validation': ['paragraph', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'test_easy': ['paragraph', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'test_medium': ['paragraph', 'label', 'input_ids', 'token_type_ids', 'attention_mask'], 'test_hard': ['paragraph', 'label', 'input_ids', 'token_type_ids', 'attention_mask']}


In [9]:
"""
ONLY FOR WHEN NOT USING THE TRAINER API
"""
# Post process removal
# Prepare every split for a plain PyTorch loop: drop the raw text column,
# rename "label" to "labels" (the keyword the model's forward pass is given
# via model(**batch)), and return torch tensors.
# Each step is guarded on the current column names so that re-running this
# cell is harmless; without the guards a second run raises because
# "paragraph" and "label" no longer exist.
for key in tokenized_datasets.keys():
    split = tokenized_datasets[key]
    if "paragraph" in split.column_names:
        split = split.remove_columns(["paragraph"])
    if "label" in split.column_names:
        split = split.rename_column("label", "labels")
    tokenized_datasets[key] = split.with_format("torch")

In [10]:
# Confirm the post-processed column set on one split.
tokenized_datasets["test_easy"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [11]:
from torch.utils.data import DataLoader

# Settings shared by every loader: batches of 8, collated with `collator`.
loader_kwargs = {"batch_size": 8, "collate_fn": collator}

# Only the training loader shuffles; the evaluation loaders keep dataset order.
train_dataloader = DataLoader(tokenized_datasets["train"], shuffle=True, **loader_kwargs)
eval_dataloader = DataLoader(tokenized_datasets["validation"], **loader_kwargs)

# One loader per test difficulty level.
test_easy_loader = DataLoader(tokenized_datasets["test_easy"], **loader_kwargs)
test_medium_loader = DataLoader(tokenized_datasets["test_medium"], **loader_kwargs)
test_hard_loader = DataLoader(tokenized_datasets["test_hard"], **loader_kwargs)

# Pull a single training batch and show the shape of each tensor in it.
for batch in train_dataloader:
    break
{name: tensor.shape for name, tensor in batch.items()}

{'labels': torch.Size([8]),
 'input_ids': torch.Size([8, 126]),
 'token_type_ids': torch.Size([8, 126]),
 'attention_mask': torch.Size([8, 126])}

In [12]:
# Inspect the first batch produced by the easy-test loader.
next(iter(test_easy_loader))

{'labels': tensor([1, 1, 1, 1, 1, 1, 1, 1]), 'input_ids': tensor([[  101, 14060,   119,  ...,     0,     0,     0],
        [  101, 20834,   117,  ...,     0,     0,     0],
        [  101,  4785,   119,  ...,  2182,   119,   102],
        ...,
        [  101,  3841,  1103,  ...,     0,     0,     0],
        [  101,  1249,  1677,  ...,     0,     0,     0],
        [  101,   146,   112,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [13]:
from transformers import AutoModelForSequenceClassification

# Load the checkpoint with a sequence-classification head configured for two labels.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
# Sanity check: run the training batch fetched earlier through the model and
# print the loss together with the shape of the logits.
outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

tensor(0.7062, grad_fn=<NllLossBackward0>) torch.Size([8, 2])


In [15]:
# transformers.AdamW is deprecated and no longer shipped in recent
# transformers releases; the PyTorch implementation is its replacement.
from torch.optim import AdamW

# eps and weight_decay are pinned to the values transformers.AdamW used by
# default (1e-6 and 0.0), because torch.optim.AdamW defaults to 1e-8 and 0.01.
optimizer = AdamW(model.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0)



In [16]:
from transformers import get_scheduler

# Total number of optimizer steps: batches per epoch times number of epochs.
num_epochs = 5
num_training_steps = len(train_dataloader) * num_epochs

# "linear" schedule over the full run, with no warm-up steps.
scheduler_settings = {
    "optimizer": optimizer,
    "num_warmup_steps": 0,
    "num_training_steps": num_training_steps,
}
lr_scheduler = get_scheduler("linear", **scheduler_settings)
print(num_training_steps)

48220


In [17]:
import torch

# Use CUDA when it is available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cpu')

In [19]:
from tqdm.auto import tqdm

# Progress bar sized to the total number of optimizer steps; advanced by one
# after every batch.
progress_bar = tqdm(range(num_training_steps))

# Switch the model to training mode before the loop.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the device the model was moved to.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch carries "labels", and the loss is read from the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear
        # the gradients before the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 10/48220 [02:23<191:32:19, 14.30s/it]


KeyboardInterrupt: 

In [None]:
import evaluate

# Score the model on the validation split with the GLUE/MRPC metric.
metric = evaluate.load("glue", "mrpc")
model.eval()
with torch.no_grad():
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit.
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

In [None]:
# Persist the model weights and config to disk.
# NOTE(review): this is an absolute, user-specific path, and the directory is
# named "RoBERTa" although `checkpoint` is "bert-base-cased" — confirm the
# intended destination. The reload cell reads from this same path.
model.save_pretrained("/home/jarl/LP2-multi-author-writing-style-detection/RoBERTa/pretrained")

In [None]:
# Reload the model from the directory written by save_pretrained and move it to `device`.
# NOTE(review): absolute, user-specific path — keep in sync with the save cell.
model = AutoModelForSequenceClassification.from_pretrained("/home/jarl/LP2-multi-author-writing-style-detection/RoBERTa/pretrained").to(device)


In [None]:
import evaluate

# Score the model on the easy test split with the GLUE/MRPC metric.
metric = evaluate.load("glue", "mrpc")

model.eval()
with torch.no_grad():
    for batch in test_easy_loader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # Predicted class = index of the largest logit.
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

print(f"metrics easy test: {metric.compute()}")

In [None]:
import evaluate

# Score the model on the medium test split.
# A fresh metric object is created here so this cell never aggregates
# predictions that were added while scoring another split, and model.eval()
# is called explicitly so the cell does not depend on an earlier cell having
# put the model in evaluation mode.
metric = evaluate.load("glue", "mrpc")

model.eval()
for batch in test_medium_loader:
    # Move every tensor of the batch to the device the model lives on.
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

print(f"metrics medium test: {metric.compute()}")

In [None]:
import evaluate

# Score the model on the hard test split.
# A fresh metric object is created here so this cell never aggregates
# predictions that were added while scoring another split, and model.eval()
# is called explicitly so the cell does not depend on an earlier cell having
# put the model in evaluation mode.
metric = evaluate.load("glue", "mrpc")

model.eval()
for batch in test_hard_loader:
    # Move every tensor of the batch to the device the model lives on.
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)

    # Predicted class = index of the largest logit.
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch["labels"])

print(f"metrics hard test: {metric.compute()}")