In [1]:
import torch

# Free up GPU memory
torch.cuda.empty_cache()

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
#!pip install transformers accelerate bitsandbytes datasets

In [None]:
#!pip install wandb

In [2]:
import os
import time
import datetime

import pandas as pd
import seaborn as sns
import numpy as np
import random



import torch
from torch.utils.data import Dataset, DataLoader, random_split, RandomSampler, SequentialSampler
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import GPT2Tokenizer, GPT2Config, GPT2ForSequenceClassification
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import Trainer, TrainingArguments
from datasets import Dataset
from datasets import load_dataset

In [3]:
from torch.utils.data import DataLoader, RandomSampler, TensorDataset
from tqdm import tqdm

In [4]:
import wandb

In [5]:
wandb.init(project="gpt2-sentiment-analysis", name='ayesha1')

[34m[1mwandb[0m: Currently logged in as: [33maamjad[0m ([33mtab-llm-finetuning[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [6]:
!nvidia-smi

Sun Feb 25 15:55:47 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   66C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [7]:
# Load the dataset from CSV files
dataset= load_dataset("csv", data_files={"train": "/content/drive/MyDrive/Colab Notebooks/data/train.csv", "test": "/content/drive/MyDrive/Colab Notebooks/data/test.csv"})

In [8]:
gpt2_model = AutoModelForSequenceClassification.from_pretrained("gpt2")
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")
gpt2_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
# print memory footprint
print("Memory footprint of gpt2_model: ", gpt2_model.num_parameters() * 4 / 1024 / 1024, "MB")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Memory footprint of gpt2_model:  474.7060546875 MB


In [9]:

# Tokenize the text data
def tokenize_function(example):
    return gpt2_tokenizer(example["review"], padding=True, truncation=True)

# Tokenize train and test datasets
train_dataset = dataset["train"].map(tokenize_function, batched=True)
test_dataset = dataset["test"].map(tokenize_function, batched=True)

# Optionally, convert labels to numerical format if necessary
# For example, if sentiment labels are strings ("positive", "negative"), convert them to integers (0, 1)
train_dataset = train_dataset.map(lambda examples: {"label": 1 if examples["sentiment"] == "positive" else 0})
test_dataset = test_dataset.map(lambda examples: {"label": 1 if examples["sentiment"] == "positive" else 0})

In [10]:
# Prepare the tokenized training data
#input_ids = torch.tensor(train_dataset['input_ids'])
#attention_masks = torch.tensor(train_dataset['attention_mask'])


In [18]:
len(input_ids)

30000

In [19]:
input_ids.unique(return_counts=True)

(tensor([    0,     1,     2,  ..., 50254, 50255, 50257]),
 tensor([   15348,    24121,       92,  ...,        3,        3, 21930076]))

In [20]:
len(attention_masks)

30000

In [22]:
#BATCH_SIZE = 8
#train_dataset = TensorDataset(input_ids, attention_masks, labels)
#train_sampler = RandomSampler(train_dataset)
#train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

#device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#gpt2_model.to(device)
#input_ids.to(device)
#labels.to(device)

# Optimizer
#optimizer = AdamW(gpt2_model.parameters(), lr=5e-5)



In [11]:
import torch.nn as nn

# ...

class YourModel(nn.Module):
  def __init__(self):
    super(YourModel, self).__init__()
    self.dropout = nn.Dropout(p=0.1)
    def forward(self, inputs):
       x = self.dropout(x)
       return x

In [14]:
# Define the model configuration
MODEL_NAME = 'gpt2'
NUM_LABELS = 2

# Load the pre-trained RoBERTa tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained(MODEL_NAME)
model = GPT2ForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_LABELS)

# Prepare the tokenized training data
#input_ids = torch.tensor(train_dataset['input_ids'])
#attention_masks = torch.tensor(train_dataset['attention_mask'])

# Convert sentiment labels to numerical format
#label_map = {"positive": 1, "negative": 0}  # Define a mapping from string labels to numerical labels
#labels = torch.tensor([label_map[sentiment] for sentiment in train_dataset['sentiment']])

# Define the batch size and create DataLoader
BATCH_SIZE = 8
train_dataset = TensorDataset(input_ids, attention_masks, labels)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

# Specify GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define the optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, eps=1e-8)

# Fine-tuning the model
EPOCHS = 2

for epoch in range(EPOCHS):
    model.train()

    # Initialize progress bar
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch + 1}/{EPOCHS}', leave=False, disable=False)

    for batch in progress_bar:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0],
                  'attention_mask': batch[1],
                  'labels': batch[2]}#

        optimizer.zero_grad()
        outputs = model(**inputs)
        loss = outputs.loss
        loss.backward()

        optimizer.step()

        progress_bar.set_postfix({'training_loss': f'{loss.item():.3f}'})

    # Evaluate the model if needed
    # ...
# Save the fine-tuned model
model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/results')
tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/results')

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [14]:
print(f"Model device: {next(model.parameters()).device}")
print(f"Input device: {next(train_dataloader).input_ids.device}")

Model device: cpu


TypeError: 'DataLoader' object is not an iterator

In [24]:
# Training
EPOCHS = 4

for epoch in range(EPOCHS):
    gpt2_model.train()
    progress_bar = tqdm(train_dataloader, desc=f'Epoch {epoch+1}/{EPOCHS}', leave=False, disable=False)

    total_train_loss = 0

    for batch in progress_bar:
        batch = tuple(t.to(device) for t in batch)
        inputs = {'input_ids': batch[0], 'attention_mask': batch[1], 'labels': batch[2]}

        optimizer.zero_grad()
        outputs = gpt2_model(**inputs)
        train_loss = outputs.loss
        train_loss.backward()
        optimizer.step()

        total_train_loss += train_loss.item()

        progress_bar.set_postfix({'training_loss': f'{train_loss.item():.3f}'})

    # Log to W&B
    avg_train_loss = total_train_loss / len(train_dataloader)
    wandb.log({"epoch": epoch, "training_loss": avg_train_loss})


# Save the fine-tuned model
gpt2_model.save_pretrained('/content/drive/MyDrive/Colab Notebooks/results')
gpt2_tokenizer.save_pretrained('/content/drive/MyDrive/Colab Notebooks/results')

wandb.finish()



RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
