In [None]:
# Install the notebook's dependencies.
# %pip (instead of !pip) installs into the environment of the running kernel.
# The extras spec is quoted so the shell cannot treat [torch] as a glob pattern,
# and -q keeps the long download log out of the saved notebook.
%pip install -q "transformers[torch]"
%pip install -q datasets

Collecting datasets
  Downloading datasets-2.21.0-py3-none-any.whl.metadata (21 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-2.21.0-py3-none-any.whl (527 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m22.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import pandas as pd
from transformers import Trainer, TrainingArguments


In [None]:
# Fetch the stock "gpt2" checkpoint wrapped for sequence classification, plus its tokenizer.
gpt2_model = AutoModelForSequenceClassification.from_pretrained("gpt2")
gpt2_tokenizer = AutoTokenizer.from_pretrained("gpt2")

# Rough size of the weights in MB: parameter count times 4 bytes each
# (the factor 4 presumably assumes float32 weights — confirm if the dtype changes).
_gpt2_footprint_mb = gpt2_model.num_parameters() * 4 / 1024 / 1024
print("Memory footprint of gpt2: ", _gpt2_footprint_mb, "MB")

In [None]:
# Folder that holds train.csv and test.csv. It points at the Colab Google Drive
# mount location; change this single constant to read the data from elsewhere.
DATA_DIR = '/content/drive/MyDrive'

# Set pad_token to be the same as eos_token so the tokenizer can pad batches
gpt2_tokenizer.pad_token = gpt2_tokenizer.eos_token

# Load the two splits; both are expected to have 'review' and 'sentiment' columns,
# which the following cells read.
train_df = pd.read_csv(f'{DATA_DIR}/train.csv')
test_df = pd.read_csv(f'{DATA_DIR}/test.csv')

In [None]:
# Encode the review text of both splits with one shared set of tokenizer options
# (truncation on, padding on, max_length of 512).
_tokenize_options = dict(truncation=True, padding=True, max_length=512)
train_encodings = gpt2_tokenizer(train_df['review'].tolist(), **_tokenize_options)
test_encodings = gpt2_tokenizer(test_df['review'].tolist(), **_tokenize_options)

In [None]:
# Convert the string sentiment labels to integer class ids.
SENTIMENT_TO_ID = {'positive': 1, 'negative': 0}


def _encode_sentiment(sentiment):
    """Map a Series of sentiment strings to a list of integer class ids.

    Args:
        sentiment: pandas Series whose values are keys of SENTIMENT_TO_ID.

    Returns:
        A list of Python ints, one per row, in the original row order.

    Raises:
        ValueError: if any value is missing or is not a key of SENTIMENT_TO_ID.
            Series.map would otherwise turn such values into NaN, silently
            producing float labels that only fail later, during training.
    """
    encoded = sentiment.map(SENTIMENT_TO_ID)
    unmapped = encoded.isna()
    if unmapped.any():
        offenders = sentiment[unmapped].unique().tolist()
        raise ValueError(f"Unexpected sentiment value(s): {offenders!r}")
    return encoded.astype(int).tolist()


train_labels = _encode_sentiment(train_df['sentiment'])
test_labels = _encode_sentiment(test_df['sentiment'])

In [None]:
class MoviewReviewData(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with one label per example.

    Args:
        encodings: Mapping from field name to a per-example indexable collection;
            every value is indexed with the example index in ``__getitem__``.
        labels: Indexable, sized collection with one label per example. May be a
            plain Python list or an existing tensor.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, with its label under 'labels'."""
        # torch.as_tensor accepts plain Python values as well as existing tensors.
        # torch.tensor(<tensor>) would emit a copy-construct UserWarning on every
        # item whenever the labels were supplied as a tensor.
        item = {key: torch.as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.as_tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples, taken from the label collection."""
        return len(self.labels)

In [None]:
# Wrap each split's encodings and labels in a torch Dataset.
train_dataset = MoviewReviewData(encodings=train_encodings, labels=train_labels)
test_dataset = MoviewReviewData(encodings=test_encodings, labels=test_labels)

In [None]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, AdamW
from tqdm import tqdm
import torch

# Settings for the classifier that is fine-tuned below.
NUM_LABELS = 2  # Assuming binary classification (positive and negative)
model_name = 'gpt2'

# Tokenizer for that checkpoint; the end-of-sequence token doubles as the pad token.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification, Trainer, TrainingArguments, GPT2Config
# Build the classification config: pad with the tokenizer's EOS token id and size
# the classification head from NUM_LABELS rather than repeating the literal 2.
config = GPT2Config.from_pretrained(model_name, pad_token_id=tokenizer.eos_token_id, num_labels=NUM_LABELS)
# Load the pre-trained weights into a sequence-classification model using that config.
model = GPT2ForSequenceClassification.from_pretrained(model_name, config=config)


In [None]:
# Point the model config's padding id at the tokenizer's end-of-sequence id.
# NOTE(review): the config used to load the model in the previous cell was already
# built with pad_token_id=tokenizer.eos_token_id, so this looks redundant —
# confirm before removing.
model.config.pad_token_id = tokenizer.eos_token_id


In [None]:
# Training arguments can be altered for better accuracy.

from transformers import Trainer, TrainingArguments

print("Loading training arguments...")
training_args = TrainingArguments(
    output_dir='data/result',          # Output directory for model and checkpoints
    num_train_epochs=1,                # Train for a single epoch
    # Evaluation during training stays disabled to save time. "no" is the library
    # default, so the argument is omitted instead of being passed under its
    # deprecated name `evaluation_strategy` (renamed `eval_strategy` in newer
    # transformers releases); omitting it works with either naming.
    per_device_train_batch_size=16,    # Batch size per device; lower it if memory is tight
    save_steps=5000,                   # Save a checkpoint every 5000 steps
    logging_steps=500,                 # Log every 500 steps
    warmup_steps=100,                  # Number of warmup steps
    weight_decay=0.01,                 # Weight decay
    logging_dir='data/logs',           # Directory for storing logs
    save_total_limit=1,                # Keep at most one checkpoint on disk
    report_to="none",                  # Disable logging to wandb
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

# Fine-tune the model
print("Start training...")
trainer.train()

# Persist the model and tokenizer side by side so a later cell can reload both
# from the same directory.
print("Saving fine-tuned GPT-2...")
model.save_pretrained('data/result/fine_tuned_gpt2_model')
tokenizer.save_pretrained('data/result/fine_tuned_gpt2_model')

print("Start testing...")
# Evaluate on eval_dataset and show the returned metrics; the value was previously
# discarded because the call was not the last expression of the cell.
eval_metrics = trainer.evaluate()
print(eval_metrics)
model.eval()


In [None]:
# Short aliases for the raw text and label columns of each split.
X_train = train_df['review']
y_train = train_df['sentiment']
X_test = test_df['review']
y_test = test_df['sentiment']

# Reload the fine-tuned model and tokenizer from the directory they were saved to,
# so the evaluation below runs against the persisted artifacts.
model = GPT2ForSequenceClassification.from_pretrained('data/result/fine_tuned_gpt2_model')
tokenizer = GPT2Tokenizer.from_pretrained('data/result/fine_tuned_gpt2_model')

# Integer class ids for both splits; astype(int) raises if a sentiment failed to map.
y_train_numeric = y_train.map({'positive': 1, 'negative': 0}).astype(int)
y_test_numeric = y_test.map({'positive': 1, 'negative': 0}).astype(int)
from torch.utils.data import DataLoader

# Convert labels to a tensor
labels = torch.tensor(y_test_numeric)

# Create a DataLoader for the test set (optional, for batch processing).
# The dataset receives the labels as a plain list: MoviewReviewData wraps every
# label with torch.tensor(), and doing that to an element of an existing tensor
# emits a copy-construct UserWarning for each item.
test_dataset = MoviewReviewData(test_encodings, y_test_numeric.tolist())
test_loader = DataLoader(test_dataset, batch_size=16)

def evaluate(model, test_loader):
    """Compute classification accuracy of ``model`` over ``test_loader``.

    The model is switched to evaluation mode and moved to CUDA when available,
    otherwise to CPU. Each batch must be a mapping with 'input_ids',
    'attention_mask' and 'labels' tensors; the predicted class is the argmax of
    the model output's ``logits`` along dimension 1.

    Args:
        model: Callable module invoked as ``model(input_ids, attention_mask=...)``
            whose result exposes ``.logits``.
        test_loader: Iterable of batches as described above.

    Returns:
        Accuracy as a percentage (correct / total * 100).

    Raises:
        ZeroDivisionError: if ``test_loader`` yields no examples.
    """
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    model.to(target)

    n_hits, n_seen = 0, 0
    for batch in test_loader:
        # Move the three fields the model and the metric need onto the chosen device.
        ids = batch['input_ids'].to(target)
        mask = batch['attention_mask'].to(target)
        gold = batch['labels'].to(target)

        # Inference only: no gradients are needed for scoring.
        with torch.no_grad():
            logits = model(ids, attention_mask=mask).logits
            guessed = torch.argmax(logits, dim=1)
            n_hits += (guessed == gold).sum().item()
            n_seen += gold.size(0)

    return (n_hits / n_seen) * 100

# Score the reloaded model on the test loader and report the percentage.
accuracy = evaluate(model=model, test_loader=test_loader)
print(f"Model accuracy on the test set: {accuracy:.2f}%")

  item['labels'] = torch.tensor(self.labels[idx])


Model accuracy on the test set: 93.81%
