In [None]:
# Install the third-party packages this notebook imports.
# `%pip` (rather than `!pip`) installs into the environment of the running
# kernel, so the packages are importable by the cells below.
%pip install transformers datasets evaluate

## Use Hugging Face

In [19]:
import evaluate
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

In [15]:
# Hyperparameters
NUM_EPOCHS = 1  # number of passes over the training DataLoader
BATCH_SIZE = 16  # batch size of both the training and the validation DataLoader
LEARNING_RATE = 5e-5  # learning rate passed to the AdamW optimizer
PRETRAINED_MODEL_NAME = "bert-base-cased"  # checkpoint used for both the tokenizer and the model

In [None]:
# Read the training and validation splits from CSV files in the working directory.
# NOTE(review): later cells read a "text" column and a "label" column from these
# frames — confirm both CSVs provide them.
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')
# print(train_data.head())

# Wrap the DataFrames in Hugging Face Dataset objects
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)

# Load the tokenizer that belongs to the pre-trained checkpoint
tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Uses the module-level ``tokenizer`` with ``padding="max_length"`` and
    ``truncation=True`` and returns the tokenizer's output unchanged.
    """
    texts = examples["text"]
    encoded = tokenizer(texts, padding="max_length", truncation=True)
    return encoded

In [25]:
# Do necessary actions to process the dataset
def prepare_dataset(dataset):
    """Turn a raw dataset into one whose batches can be unpacked into the model.

    Steps: tokenize in batches with ``tokenize_function``, drop the raw "text"
    column, rename "label" to "labels", and switch the output format to
    "torch".

    Args:
        dataset: dataset exposing "text" and "label" columns.

    Returns:
        The tokenized dataset, formatted as "torch".
    """
    model_ready = (
        dataset.map(tokenize_function, batched=True)
        .remove_columns(["text"])
        .rename_column("label", "labels")
    )
    # set_format is called for its side effect on the dataset, so it is kept
    # out of the chain above.
    model_ready.set_format("torch")
    return model_ready

# Tokenize both splits (training first, then validation).
train_prepared_dataset = prepare_dataset(train_dataset)
val_prepared_dataset = prepare_dataset(val_dataset)

# Only the training loader shuffles; both use the same batch size.
train_dataloader = DataLoader(
    dataset=train_prepared_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
val_dataloader = DataLoader(dataset=val_prepared_dataset, batch_size=BATCH_SIZE)

# Show the columns and row count of the prepared training split.
print(train_prepared_dataset)

  0%|          | 0/8 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

Dataset({
    features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 7537
})


In [None]:
# Load the checkpoint as a sequence classifier with two output labels.
model = AutoModelForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=2)

optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)

# TODO add LR scheduler, e.g. transformers.get_scheduler("linear", ...) with
# num_training_steps = NUM_EPOCHS * len(train_dataloader), and call
# lr_scheduler.step() right after optimizer.step().

# Train on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

torch.cuda.empty_cache()

model.train()
for epoch in range(NUM_EPOCHS):
    # Build the progress bar inside the epoch loop: a tqdm wrapper created once
    # outside would already be finished when the second epoch starts.
    pbar = tqdm(train_dataloader)
    for batch in pbar:
        # batch is a dictionary of keys: labels, input_ids, token_type_ids, attention_mask
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        # Clear the gradients so they do not accumulate into the next step.
        optimizer.zero_grad()
        pbar.set_description(f"train loss: {loss.item()}")

In [13]:
# Score the trained model on the validation split.
model.eval()

# We are interested in the F1 score
metric = evaluate.load("f1")

# No gradients are needed here, so the whole pass runs under no_grad.
with torch.no_grad():
    for batch in val_dataloader:
        batch = {name: tensor.to(device) for name, tensor in batch.items()}
        outputs = model(**batch)
        logits = outputs.logits
        # The predicted class is the index of the largest logit.
        predictions = logits.argmax(dim=-1)
        metric.add_batch(predictions=predictions, references=batch["labels"])

metric.compute()

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

{'f1': 0.02247191011235955}