In [1]:
#pip install --upgrade wandb

In [2]:
import os
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

In [3]:
import os
os.environ['TORCH_USE_CUDA_DSA'] = '1'

In [4]:
import wandb
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from transformers import get_linear_schedule_with_warmup
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
import torch
from datasets import load_dataset
from tqdm import tqdm

In [5]:
config = {
    "learning_rate": 5e-5,
    "batch_size": 8,
    "epochs": 3,
    "model_name": "gpt2",
    "max_length": 512
}

In [6]:
wandb.init(project="gpt2-sentiment-analysis", name='frustrated-student2')

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maamjad[0m ([33mtab-llm-finetuning[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [7]:
# Load dataset
dataset = load_dataset("csv", data_files={"train": "data/train.csv", "test": "data/test.csv"})

In [8]:
model_name = 'gpt2'

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2ForSequenceClassification.from_pretrained(model_name, num_labels=2)


# print memory footprint
print("Memory footprint of gpt2: ", model.num_parameters() * 4 / 1024 / 1024, "MB")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Memory footprint of gpt2:  474.7060546875 MB


In [17]:
# Tokenize function
def tokenize_function(example):
    return tokenizer(example["review"], padding="max_length", truncation=True)

# Apply tokenization
tokenized_datasets = dataset.map(tokenize_function, batched=True)
train_dataset = tokenized_datasets["train"]
test_dataset = tokenized_datasets["test"]


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

In [10]:
# Convert to torch tensors and create DataLoader
input_ids = torch.tensor(train_dataset['input_ids'])
attention_masks = torch.tensor(train_dataset['attention_mask'])
labels = torch.tensor([1 if sentiment == 'positive' else 0 for sentiment in train_dataset['sentiment']])

In [11]:
len(input_ids)

30000

In [20]:
input_ids.unique(return_counts=True)

(tensor([    1,     6,     7,  ..., 50254, 50255, 50257]),
 tensor([  21684,    9450,    2225,  ...,       3,       2, 7676872]))

In [12]:
len(attention_masks)

30000

In [19]:
attention_masks.unique(return_counts=True)

(tensor([0, 1]), tensor([7676872, 7683128]))

In [13]:
len(labels)

30000

In [14]:
labels

tensor([0, 1, 1,  ..., 1, 0, 0])

In [18]:
labels.unique(return_counts=True)

(tensor([0, 1]), tensor([15065, 14935]))

In [25]:
BATCH_SIZE = 8
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
#input_ids.to(device)
#labels.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

In [26]:
# Training
EPOCHS = 4

for epoch in range(EPOCHS):
    model.train()
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False, disable=False)

    for batch in progress_bar:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        progress_bar.set_postfix({'training_loss': f'{loss.item():.3f}'})
        
    # Log to W&B
    avg_train_loss = total_loss / len(train_dataloader)
    wandb.log({"epoch": epoch, "loss": avg_train_loss})
        

# Save the fine-tuned model
model.save_pretrained('data/result/fine_tuned_gpt2_model')
tokenizer.save_pretrained('data/result/fine_tuned_gpt2_model')

wandb.finish()

                                                                                             

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
