In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm
import pandas as pd

In [2]:
# Define the dataset class
class MovieReviewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=270):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = torch.tensor(self.labels[idx])
        encoding = self.tokenizer(text, truncation=True, max_length=self.max_length, padding='max_length', return_tensors='pt', return_attention_mask=True)

        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': label
        }

In [3]:
df = pd.read_csv('data/train.csv')  

texts = df['review'].tolist()
# Map string labels to integers
label_mapping = {"positive": 1, "negative": 0}
labels = df['sentiment'].map(label_mapping).tolist()

# Display a sample of the data
print("Sample Data:")
for text, label in zip(texts[:5], labels[:5]):
    print(f"Text: {text}\tLabel: {label}")


Sample Data:
Text: SAPS AT SEA Aspect ratio . Sound format Mono (Black and white) Suffering from 'hornophobia Ollie embarks on a 'restful' boat trip, but he and Stan get mixed up with an escaped nvict (Rychard Cramer). Chaos ensues. This feature length medy - an OK entry which nonetheless unspools like a mere imitation of Laurel and Hardy's best work - marked the final llaboration between L H and producer Hal Roach. Episodic in structure, the movie culminates in a memorable ocean voyage after The Boys are taken hostage by villainous Cramer (who shoots a seagull to prove how tough he is ). The gags are OK, but inspiration is lacking, perhaps due to the recruitment of actor-turned-director Gordon Douglas, previously responsible for Ollie's first solo effort in the sound era (ZENOBIA, produced in but whose work here lacks a measure of pzazz. Fair, but nothing special. L H regulars Charlie Hall and James Finlayson make guest appearances.	Label: 0
Text: If you want mindless action, hot chic

In [4]:
len(texts)
len(labels)

30000

In [5]:
# Split the dataset into train and validation sets
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2, random_state=42)


In [6]:
# Initialize GPT-2 tokenizer and model for sequence classification
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model = GPT2ForSequenceClassification.from_pretrained('gpt2', num_labels=2)  # 2 for binary classification


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Create datasets and dataloaders
#train_dataset = MovieReviewsDataset(train_texts, train_labels, tokenizer)
#val_dataset = MovieReviewsDataset(val_texts, val_labels, tokenizer)

#train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)
#val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False)

In [8]:
from torch.nn.utils.rnn import pad_sequence

# Create datasets and dataloaders
train_dataset = MovieReviewsDataset(train_texts, train_labels, tokenizer)
val_dataset = MovieReviewsDataset(val_texts, val_labels, tokenizer)

def custom_collate_fn(batch):
    input_ids = pad_sequence([item['input_ids'] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id)
    attention_mask = pad_sequence([item['attention_mask'] for item in batch], batch_first=True, padding_value=0)  # Padding value for attention_mask is usually 0
    labels = torch.stack([item['labels'] for item in batch])
    
    return {'input_ids': input_ids, 'attention_mask': attention_mask, 'labels': labels}


# Create DataLoader with custom collate function
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True, collate_fn=custom_collate_fn)
val_dataloader = DataLoader(val_dataset, batch_size=8, shuffle=False, collate_fn=custom_collate_fn)


In [9]:
# Set up training parameters
#optimizer = AdamW(model.parameters(), lr=1e-5)
num_epochs = 2

In [10]:
# Fine-tune the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for batch in tqdm(train_dataloader, desc=f'Epoch {epoch + 1}'):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        #optimizer.zero_grad()

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        #optimizer.step()

        train_loss += loss.item()

    average_train_loss = train_loss / len(train_dataloader)
    print(f'Epoch {epoch + 1}, Average Training Loss: {average_train_loss:.4f}')

    # Validation
    model.eval()
    val_predictions = []
    val_true_labels = []

    with torch.no_grad():
        for batch in tqdm(val_dataloader, desc='Validation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            predictions = torch.argmax(logits, dim=1).cpu().numpy()

            val_predictions.extend(predictions)
            val_true_labels.extend(labels)

    val_accuracy = accuracy_score(val_true_labels, val_predictions)
    print(f'Validation Accuracy: {val_accuracy:.4f}')
    
# Save the fine-tuned model
model.save_pretrained('data/result/fine_tuned_gpt2')
tokenizer.save_pretrained('data/result/fine_tuned_gpt2')

Epoch 1:   0%|                                                      | 0/3000 [00:00<?, ?it/s]../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [54,0,0], thread: [96,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [54,0,0], thread: [97,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [54,0,0], thread: [98,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [54,0,0], thread: [99,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [54,0,0], thread: [100,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [54,0,0], thread: [101,0,0] Assertion `srcIndex < srcSelectDimSize` f

Epoch 1:   0%|                                                      | 0/3000 [00:00<?, ?it/s]
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [174,0,0], thread: [32,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [174,0,0], thread: [33,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [174,0,0], thread: [34,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [174,0,0], thread: [35,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [174,0,0], thread: [36,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
../aten/src/ATen/native/cuda/Indexing.cu:1146: indexSelectLargeIndex: block: [174,0,0], thread: [37,0,0] Assertion `srcIndex < srcSelectDimSi

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`