In [1]:
import json
import os
import torch
import pandas as pd

# Data preparation

In [2]:
# Load the training and validation splits from their CSV files.
df_train, df_val = (
    pd.read_csv(csv_path)
    for csv_path in ("../data/train.csv", "../data/validation.csv")
)

In [3]:
# Importing dataset creation dependencies
from datasets import DatasetDict, Dataset

# Columns carried over from the data frames into every dataset split
columns = ["paragraph1", "paragraph2", "label"]

# Build one Dataset per split from the same set of columns, then bundle the
# splits into a DatasetDict keyed by split name.
raw_datasets = DatasetDict({
    split_name: Dataset.from_dict({column: frame[column] for column in columns})
    for split_name, frame in (("train", df_train), ("validation", df_val))
})

# Tokenization and encoding

In [4]:
from transformers import AutoTokenizer, DataCollatorWithPadding
# Name of the pretrained checkpoint; the tokenizer below is loaded from it.
checkpoint = "distilbert-base-cased"
# Tokenizer matching the checkpoint, used by tokenize_function and the collator.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(sample):
    """Tokenize the two paragraphs of a sample as a single paired input.

    `sample` must expose "paragraph1" and "paragraph2"; both are handed to the
    module-level `tokenizer` together, with truncation enabled. The tokenizer's
    output is returned unchanged.
    """
    first_paragraph = sample["paragraph1"]
    second_paragraph = sample["paragraph2"]
    return tokenizer(first_paragraph, second_paragraph, truncation=True)

# Apply tokenize_function to every split, feeding it batches of rows.
tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
# Collator built on the same tokenizer; passed to the DataLoaders as collate_fn.
collator = DataCollatorWithPadding(tokenizer=tokenizer)

Map:   0%|          | 0/51962 [00:00<?, ? examples/s]

Map:   0%|          | 0/5599 [00:00<?, ? examples/s]

In [5]:
# ONLY FOR WHEN NOT USING THE TRAINER API
#
# Post-process every split for a hand-written PyTorch loop: drop the raw text
# columns, rename "label" to "labels", and switch the output format to torch.
# Each step is guarded on the split's current column names so that re-running
# this cell is harmless instead of failing on columns that are already gone.
for key in list(tokenized_datasets.keys()):
    dataset = tokenized_datasets[key]

    # Only remove the text columns that are still present.
    text_columns = [
        name for name in ("paragraph1", "paragraph2") if name in dataset.column_names
    ]
    if text_columns:
        dataset = dataset.remove_columns(text_columns)

    # Rename once; on a re-run the column is already called "labels".
    if "label" in dataset.column_names:
        dataset = dataset.rename_column("label", "labels")

    tokenized_datasets[key] = dataset.with_format("torch")

In [6]:
from torch.utils.data import DataLoader

# Training batches: reshuffled on every pass, assembled by the shared collator.
train_dataloader = DataLoader(
    dataset=tokenized_datasets["train"],
    batch_size=8,
    shuffle=True,
    collate_fn=collator,
)

# Validation batches: same batch size and collator, default (unshuffled) order.
eval_dataloader = DataLoader(
    dataset=tokenized_datasets["validation"],
    batch_size=8,
    collate_fn=collator,
)

## Model and Training

In [7]:
from transformers import AutoModelForSequenceClassification
from transformers import AdamW
from transformers import get_scheduler


# Sequence classifier with two output labels, loaded from the checkpoint.
model = AutoModelForSequenceClassification.from_pretrained(
    checkpoint,
    num_labels=2,
)

# Optimizer over all model parameters.
optimizer = AdamW(model.parameters(), lr=2e-5)

# One scheduler step is taken per training batch, so the schedule length is
# (batches per epoch) x (number of epochs).
num_epochs = 10
num_training_steps = len(train_dataloader) * num_epochs

# Linear schedule without warmup, spanning the whole run.
lr_scheduler = get_scheduler(
    name="linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
import torch

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
device

device(type='cuda')

In [9]:
from tqdm.auto import tqdm

# Progress bar with one tick per optimizer step across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before the loop.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor of the batch to the selected device.
        batch = {k: v.to(device) for k, v in batch.items()}
        # The batch is unpacked as keyword arguments; the loss is read from
        # the model output.
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        # Apply the update, advance the learning-rate schedule, then clear
        # the gradients so they do not accumulate into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

  0%|          | 0/64960 [00:00<?, ?it/s]

In [10]:
import evaluate

# Accumulate predictions over the validation set and report the metric values.
metric = evaluate.load("glue", "mrpc")
model.eval()
with torch.no_grad():  # no gradients are needed for evaluation
    for batch in eval_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)

        # The predicted class is the index of the largest logit.
        logits = outputs.logits
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

{'accuracy': 0.65868905161636, 'f1': 0.65929755749688}

In [11]:
# Save the trained model with save_pretrained.
# NOTE(review): the target is an absolute, machine-specific path, and its
# directory is named "RoBERTa" although the checkpoint used in this notebook
# is distilbert-base-cased -- confirm both are intended.
model.save_pretrained("/home/jarl/LP2-multi-author-writing-style-detection/RoBERTa/pretrained")

In [12]:
# Rebind `model` to a classifier loaded from the saved directory and move it
# to the selected device.
# NOTE(review): absolute, machine-specific path -- confirm it exists on the
# machine running this notebook.
model = AutoModelForSequenceClassification.from_pretrained("/home/jarl/LP2-multi-author-writing-style-detection/RoBERTa/pretrained").to(device)


In [13]:
import wandb


In [14]:
import pprint

# Objective of the sweep: maximise the logged "f1" value.
metric = {"name": "f1", "goal": "maximize"}

# Discrete candidate values for each hyperparameter.
parameters_dict = {
    "batch_size": {"values": [8, 16, 32]},
    "learning_rate": {"values": [5e-5, 4e-5, 3e-5, 2e-5]},
    "epochs": {"values": [3, 4, 5]},
}

# Full sweep definition: random search over the parameters above.
sweep_config = {
    "method": "random",
    "metric": metric,
    "parameters": parameters_dict,
}

pprint.pprint(sweep_config)

{'method': 'random',
 'metric': {'goal': 'maximize', 'name': 'f1'},
 'parameters': {'batch_size': {'values': [8, 16, 32]},
                'epochs': {'values': [3, 4, 5]},
                'learning_rate': {'values': [5e-05, 4e-05, 3e-05, 2e-05]}}}


In [15]:
from tqdm.auto import tqdm
import evaluate


# Module-level progress bar whose total is the global num_training_steps.
progress_bar = tqdm(range(num_training_steps))

def train(config=None):
    """Run one sweep trial: fine-tune a fresh model and log validation metrics.

    Intended to be passed to ``wandb.agent``. Inside the run, the
    hyperparameters are read from ``wandb.config`` (``batch_size``,
    ``learning_rate`` and ``epochs``).

    Args:
        config: Optional configuration forwarded to ``wandb.init``.

    Reads the module-level ``tokenized_datasets``, ``collator``,
    ``eval_dataloader``, ``checkpoint`` and ``device``. Logs the dictionary
    returned by the evaluation metric to the active run; returns nothing.
    """
    with wandb.init(config=config):
        config = wandb.config

        # Build the training loader per run so the swept batch size is
        # actually applied instead of the fixed global loader.
        run_dataloader = DataLoader(
            tokenized_datasets["train"],
            shuffle=True,
            batch_size=config.batch_size,
            collate_fn=collator,
        )
        # The schedule length must match this run's epochs and batch count;
        # otherwise the linear decay never reaches its end within the run.
        run_training_steps = config.epochs * len(run_dataloader)

        model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2).to(device)
        optimizer = AdamW(model.parameters(), lr=config.learning_rate)
        lr_scheduler = get_scheduler(
            "linear",
            optimizer=optimizer,
            num_warmup_steps=0,
            num_training_steps=run_training_steps,
        )

        # One bar per run, sized for this run only.
        run_progress = tqdm(range(run_training_steps))

        # from_pretrained hands back the model in evaluation mode, so training
        # mode has to be enabled explicitly (otherwise dropout stays disabled).
        model.train()
        for epoch in range(config.epochs):
            for batch in run_dataloader:
                batch = {k: v.to(device) for k, v in batch.items()}
                outputs = model(**batch)
                loss = outputs.loss
                loss.backward()

                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                run_progress.update(1)
        run_progress.close()

        # Score the trained model on the validation split.
        metric = evaluate.load("glue", "mrpc")
        model.eval()
        for batch in eval_dataloader:
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)

            logits = outputs.logits
            predictions = torch.argmax(logits, dim=-1)
            metric.add_batch(predictions=predictions, references=batch["labels"])

        # The logged dictionary carries the "f1" key that the sweep optimises.
        wandb.log(metric.compute())

  0%|          | 0/64960 [00:00<?, ?it/s]

In [None]:
# Register the sweep definition with the project, then have an agent execute
# `train` for five runs of that sweep.
sweep_id = wandb.sweep(sweep=sweep_config, project="authorship-detection-sweep")

wandb.agent(sweep_id, function=train, count=5)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


Create sweep with ID: ox6p3mie
Sweep URL: https://wandb.ai/jarlku/authorship-detection-sweep/sweeps/ox6p3mie


[34m[1mwandb[0m: Agent Starting Run: k7w6yngl with config:
[34m[1mwandb[0m: 	batch_size: 32
[34m[1mwandb[0m: 	epochs: 4
[34m[1mwandb[0m: 	learning_rate: 4e-05
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mjarlsoeren[0m ([33mjarlku[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Ctrl + C detected. Stopping sweep.


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
