In [5]:
# ===============================
# Step 0: Install & Import Libraries
# ===============================
# pip install transformers torch pandas scikit-learn
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2ForSequenceClassification
from torch.optim import AdamW
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

In [6]:
# ===============================
# Step 1: Generate Dataset
# ===============================

sentences = [
    "I love this movie", "This film was amazing", "I enjoyed every moment",
    "The acting was great", "What a fantastic experience",
    "Absolutely wonderful movie", "The story was touching", "I liked the characters",
    "Very entertaining film", "This movie made me happy",
    
    "I hate this movie", "This film was terrible", "I disliked every moment",
    "The acting was awful", "What a boring experience",
    "Absolutely horrible movie", "The story was weak", "I hated the characters",
    "Very disappointing film", "This movie made me angry"
] * 5  # 100 samples

labels = ["positive"] * 50 + ["negative"] * 50
df = pd.DataFrame({"sentence": sentences, "label": labels})

In [7]:

# ===============================
# Step 2: Encode Labels
# ===============================
label_encoder = LabelEncoder()
df["label_encoded"] = label_encoder.fit_transform(df["label"])  # positive=1, negative=0

In [8]:
# ===============================
# Step 3: Train-Test Split
# ===============================
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["sentence"], df["label_encoded"], test_size=0.2, random_state=42
)

In [9]:
# ===============================
# Step 4: Load GPT-2 Tokenizer & Model
# ===============================
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# GPT-2 has no pad token by default
tokenizer.pad_token = tokenizer.eos_token

model = GPT2ForSequenceClassification.from_pretrained("gpt2", num_labels=2)
model.config.pad_token_id = model.config.eos_token_id

# Use GPU if available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForSequenceClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D(nf=3072, nx=768)
          (c_proj): Conv1D(nf=768, nx=3072)
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (score): Linear(in_features=768, out_features=2, bias=False)
)

In [10]:
# ===============================
# Step 5: Tokenize Dataset
# ===============================
max_len = 32  # max tokens per sentence

def tokenize_texts(texts):
    return tokenizer(
        list(texts),
        padding=True,
        truncation=True,
        max_length=max_len,
        return_tensors="pt"
    )

train_encodings = tokenize_texts(train_texts)
test_encodings = tokenize_texts(test_texts)

In [11]:
# ===============================
# Step 6: Create PyTorch Dataset
# ===============================
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

train_dataset = SentimentDataset(train_encodings, list(train_labels))
test_dataset = SentimentDataset(test_encodings, list(test_labels))

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8, shuffle=False)

In [12]:
# ===============================
# Step 7: Optimizer
# ===============================
optimizer = AdamW(model.parameters(), lr=5e-5)

In [13]:
# ===============================
# Step 8: Training Loop
# ===============================
model.train()
epochs = 3

for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
    print(f"Loss: {loss.item():.4f}")

Epoch 1/3
Loss: 1.1524
Epoch 2/3
Loss: 0.7030
Epoch 3/3
Loss: 0.7836


In [14]:

# ===============================
# Step 9: Evaluation
# ===============================
model.eval()
preds = []
true_labels = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)
        
        outputs = model(input_ids, attention_mask=attention_mask)
        predictions = torch.argmax(outputs.logits, dim=-1)
        
        preds.extend(predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

accuracy = accuracy_score(true_labels, preds)
print(f"Test Accuracy: {accuracy:.4f}")


Test Accuracy: 0.4000


In [15]:
# ===============================
# Step 10: Predict New Sentences
# ===============================
def predict_sentiment(text):
    model.eval()
    encoding = tokenizer(text, truncation=True, padding=True, max_length=max_len, return_tensors="pt")
    encoding = {key: val.to(device) for key, val in encoding.items()}
    with torch.no_grad():
        output = model(**encoding)
        prediction = torch.argmax(output.logits, dim=-1).item()
    return "Positive" if prediction == 1 else "Negative"

# Test
print(predict_sentiment("I really enjoyed this movie"))
print(predict_sentiment("This film was very boring"))


Positive
Positive
