In [1]:
import torch
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments, GPT2Config
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, TensorDataset
from transformers import AdamW
from tqdm import tqdm
import pandas as pd
import wandb

In [2]:
# Open a Weights & Biases run for this experiment
wandb.init(
    project="gpt2-sentiment-analysis",
    name="ayesha3",
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33maamjad[0m ([33mtab-llm-finetuning[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Read the train and test splits from their CSV files
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('data/train.csv', 'data/test.csv')
)

In [4]:
# Show the column names of the training frame
train_df.columns

Index(['review', 'sentiment'], dtype='object')

In [5]:
class MoviewReviewData(torch.utils.data.Dataset):
    """Map-style dataset pairing per-example encodings with their labels.

    Args:
        encodings: Mapping from field name to an indexable collection that
            holds one entry per example (it is iterated with ``.items()``).
        labels: Indexable collection with one label per example. Plain
            Python sequences and tensors are both accepted.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _to_tensor(value):
        """Return ``value`` as a new tensor."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every call; clone().detach() gives the same independent copy.
            return value.clone().detach()
        return torch.tensor(value)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``, including ``'labels'``."""
        item = {key: self._to_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._to_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples, taken from the label collection."""
        return len(self.labels)


In [6]:
# Load the pretrained GPT-2 tokenizer
model_name = "gpt2"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token so that the padded
# tokenization further down has a pad token to work with
tokenizer.pad_token = tokenizer.eos_token


In [7]:
# Tokenize both splits with the same settings: pad each split's reviews and
# truncate anything longer than 512 tokens
tokenizer_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = tokenizer(train_df['review'].tolist(), **tokenizer_options)
test_encodings = tokenizer(test_df['review'].tolist(), **tokenizer_options)

In [8]:
# Map the sentiment strings to class ids (negative -> 0, positive -> 1)
sentiment_to_id = {'positive': 1, 'negative': 0}
train_labels = train_df['sentiment'].map(sentiment_to_id).tolist()
test_labels = test_df['sentiment'].map(sentiment_to_id).tolist()

In [9]:
# Wrap each split's encodings and labels in a torch dataset
train_dataset = MoviewReviewData(encodings=train_encodings, labels=train_labels)
test_dataset = MoviewReviewData(encodings=test_encodings, labels=test_labels)


In [10]:
# Two-label classification config whose pad id is the tokenizer's EOS id,
# then GPT-2 with a sequence-classification head built from that config
config = GPT2Config.from_pretrained(
    model_name,
    num_labels=2,
    pad_token_id=tokenizer.eos_token_id,
)
model = GPT2ForSequenceClassification.from_pretrained(model_name, config=config)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Set the model config's padding id to the tokenizer's EOS id
model.config.pad_token_id = tokenizer.eos_token_id

In [None]:
print("Loading training arguments...")
training_args = TrainingArguments(
    output_dir='data/result',  # output directory for model and checkpoints
    num_train_epochs=4,           # total number of training epochs
    evaluation_strategy="steps",      # evaluate on eval_dataset at step intervals (falls back to logging_steps when eval_steps is unset; see TrainingArguments docs)
    per_device_train_batch_size=16,   # batch size per device during training
    per_device_eval_batch_size=16,    # batch size for evaluation
    warmup_steps=500,                # number of warmup steps for learning rate scheduler
    weight_decay=0.01,               # strength of weight decay
    logging_dir='data/logs',            # directory for storing logs
    logging_steps=100,                # log training metrics every 100 steps
    save_steps=1000,                  # write a checkpoint every 1000 steps
    save_total_limit=2,               # keep at most 2 checkpoints on disk
    report_to="wandb",                # send metrics to Weights & Biases
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model
print("Start training...")
trainer.train()

print("Saving finetuned gpt2...")
model.save_pretrained('data/result/fine_tuned_gpt2_model2')
tokenizer.save_pretrained('data/result/fine_tuned_gpt2_model2')

print("Start testing...")
# Put the model in eval mode for the final pass
model.eval()
# Keep the metrics returned by the final evaluation on eval_dataset and make
# them the cell's output so the result of "testing" is visible
eval_metrics = trainer.evaluate()
eval_metrics

Loading training arguments...
Start training...


Step,Training Loss,Validation Loss
100,2.5269,0.646496
200,0.4384,0.332753
300,0.2997,0.246806
400,0.306,0.304943
500,0.3033,0.238848
600,0.2964,0.23013
700,0.2837,0.232711
800,0.2724,0.236149
900,0.2435,0.213369
1000,0.2292,0.238643


In [None]:
# Pull the review text (X) and sentiment label (y) columns out of each frame
X_train, y_train = train_df['review'], train_df['sentiment']
X_test, y_test = test_df['review'], test_df['sentiment']

In [None]:
# Reload the model and tokenizer from the directory they were saved to
fine_tuned_dir = 'data/result/fine_tuned_gpt2_model2'
model = GPT2ForSequenceClassification.from_pretrained(fine_tuned_dir)
tokenizer = GPT2Tokenizer.from_pretrained(fine_tuned_dir)

In [None]:
# Encode sentiment strings as integer class ids (negative -> 0, positive -> 1)
sentiment_to_id = {'positive': 1, 'negative': 0}
y_train_numeric = y_train.map(sentiment_to_id).astype(int)
y_test_numeric = y_test.map(sentiment_to_id).astype(int)

In [None]:
# Convert labels to a tensor. Going through NumPy avoids handing torch a pandas
# Series to iterate element by element.
labels = torch.tensor(y_test_numeric.to_numpy())

# Create a DataLoader for the test set (optional, for batch processing).
# The dataset builds a tensor from each label in __getitem__, so give it plain
# Python ints; passing a tensor would trigger torch's copy-construct warning
# for every example.
test_dataset = MoviewReviewData(test_encodings, labels.tolist())
test_loader = DataLoader(test_dataset, batch_size=16)

def evaluate(model, test_loader):
    """Compute the classification accuracy of ``model`` over ``test_loader``.

    The model is switched to evaluation mode and moved to CUDA when it is
    available, otherwise to CPU.

    Args:
        model: Module called as ``model(input_ids, attention_mask=...)`` whose
            output exposes a ``logits`` attribute with one row per example.
        test_loader: Iterable of batches, each a mapping holding
            ``'input_ids'``, ``'attention_mask'`` and ``'labels'`` tensors.

    Returns:
        Accuracy as a percentage, or ``0.0`` when ``test_loader`` yields no
        examples.
    """
    model.eval()  # Set the model to evaluation mode
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    correct_predictions = 0
    total_predictions = 0

    # Inference only: disable autograd once for the whole loop
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            # Predicted class is the index of the largest logit in each row
            predictions = torch.argmax(outputs.logits, dim=1)
            correct_predictions += (predictions == labels).sum().item()
            total_predictions += labels.size(0)

    # An empty loader would otherwise divide by zero
    if total_predictions == 0:
        return 0.0

    accuracy = (correct_predictions / total_predictions) * 100
    return accuracy

# Score the model on the test loader and report the result as a percentage
accuracy = evaluate(model=model, test_loader=test_loader)
print(f"Model accuracy on the test set: {accuracy:.2f}%")