In [1]:
!pip install transformers
!pip install datasets
!pip install evaluate

[0m

## Use Hugging Face

In [2]:
import evaluate
import pandas as pd
import numpy as np
import torch
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, get_scheduler
from torch.utils.data import DataLoader
from torch.optim import AdamW
from tqdm import tqdm

from train import get_dataloaders, train_model, calculate_f1

In [3]:
# --- Training hyperparameters ---
NUM_EPOCHS = 1
BATCH_SIZE = 16
LEARNING_RATE = 5e-5
USE_LR_SCHEDULER = True
PRETRAINED_MODEL_NAME = "roberta-base"

# --- Class-rebalancing switches ---
DOWNSAMPLE_LABEL_0 = True
DOWNSAMPLE_FRAC = 0.2  # fraction of label-0 rows kept (0.2 -> 20%)
UPSAMPLE_LABEL_1 = False  # left off: flagged as buggy (train_dataloader seems to get an extra field?)
UPSAMPLE_TIMES = 2  # 2 -> label-1 rows appear twice

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Settings bundle handed to the helpers imported from `train`.
args = dict(
    NUM_EPOCHS=NUM_EPOCHS,
    BATCH_SIZE=BATCH_SIZE,
    LEARNING_RATE=LEARNING_RATE,
    USE_LR_SCHEDULER=USE_LR_SCHEDULER,
    PRETRAINED_MODEL_NAME=PRETRAINED_MODEL_NAME,
)

cuda


In [4]:
# Load the train/validation splits from CSV files in the working directory.
train_data = pd.read_csv('train_data.csv')
val_data = pd.read_csv('val_data.csv')
print("Original training data numbers:")
print(train_data.label.value_counts())

# Downsample the majority class: keep only DOWNSAMPLE_FRAC of the label-0 rows
# (fixed random_state so the same rows are picked on every run).
if DOWNSAMPLE_LABEL_0:
    train_0 = train_data[train_data["label"] == 0].sample(frac=DOWNSAMPLE_FRAC, random_state=42)
    train_1 = train_data[train_data["label"] == 1]
    train_data = pd.concat([train_0, train_1], axis=0).reset_index(drop=True)
    print("After downsampling:")
    print(train_data.label.value_counts())

# Upsample the minority class: repeat the label-1 rows UPSAMPLE_TIMES times.
if UPSAMPLE_LABEL_1:
    train_0 = train_data[train_data['label'] == 0]
    train_1 = train_data[train_data['label'] == 1]

    # At least one copy of the label-1 rows is always kept, as before.
    copies = max(UPSAMPLE_TIMES, 1)
    # ignore_index=True rebuilds a clean 0..n-1 index, matching the downsampling
    # branch. Without it the frame keeps a duplicated, non-Range index, which
    # `datasets.Dataset.from_pandas` stores as an extra `__index_level_0__`
    # column. That is the likely source of the "extra field" seen in
    # train_dataloader; get_dataloaders is not visible here, so TODO confirm.
    train_data = pd.concat([train_0] + [train_1] * copies, axis=0, ignore_index=True)
    print("After upsampling:")
    print(train_data.label.value_counts())

# Build the batched loaders for both splits with the helper from `train`.
train_dataloader, val_dataloader = get_dataloaders(args, train_data, val_data)

Original training data numbers:
0    6831
1     706
Name: label, dtype: int64
After downsampling:
0    1366
1     706
Name: label, dtype: int64




  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [21]:
# Fine-tune on the training batches. `model_name` is forwarded to train_model
# (presumably the name the trained model is stored under — TODO confirm).
model_name = "model"
model = train_model(
    args,
    device,
    train_dataloader,
    model_name=model_name,
)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'roberta.pooler.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Epoch: 0


train loss: 0.5824371576309204: 100%|██████████| 130/130 [03:10<00:00,  1.46s/it] 


In [7]:
# Score the model identified by `model_name` on the validation batches and report F1.
model_name = "model"
f1 = calculate_f1(
    model_name,
    device,
    val_dataloader,
)
print("F1 score: {}".format(f1))

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.77k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.55k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.36k [00:00<?, ?B/s]

F1 score: 0.5069444444444444
