In [1]:
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
import pandas as pd
import transformers
from transformers import AutoTokenizer
from datasets import load_dataset
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
# Load the CommonsenseQA dataset
dataset = load_dataset("commonsense_qa")

# Load the tokenizer
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

# Preprocessing function
def preprocess_function(examples):
    # Concatenate the question with each answer choice
    inputs = [q + " " + " ".join(choices) for q, choices in zip(examples["question"], examples["choices"])]
    return tokenizer(inputs, padding="max_length", truncation=True, max_length=128)

# Apply preprocessing
dataset = dataset.map(preprocess_function, batched=True)

print(dataset)

Map: 100%|██████████| 9741/9741 [00:00<00:00, 11791.88 examples/s]
Map: 100%|██████████| 1221/1221 [00:00<00:00, 12820.07 examples/s]
Map: 100%|██████████| 1140/1140 [00:00<00:00, 13160.88 examples/s]

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9741
    })
    validation: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1221
    })
    test: Dataset({
        features: ['id', 'question', 'question_concept', 'choices', 'answerKey', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1140
    })
})





In [12]:
# Extract input features and labels
features = dataset["train"]

# Define a mapping from answer letters to integers
answer_map = {"A": 0, "B": 1, "C": 2, "D": 3, "E": 4}

# Convert labels using the mapping
labels = [answer_map[ans] for ans in dataset["train"]["answerKey"]]

# Extract input features correctly
features = dataset["train"].remove_columns(["answerKey"])  # Remove labels from features

# Convert dataset features to a dictionary
features = {key: features[key] for key in features.column_names}

# Ensure feature length matches labels length
assert len(features["question"]) == len(labels), "Mismatch between feature and label sizes!"

# Split dataset
train_features, val_features, train_labels, val_labels = train_test_split(
    features, labels, test_size=0.1, random_state=42
)

# Custom PyTorch Dataset class
class CommonsenseQADataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

# Create Dataset objects
train_dataset = CommonsenseQADataset(train_features, train_labels)
val_dataset = CommonsenseQADataset(val_features, val_labels)

# Create DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Define simple classifier
class SimpleClassifier(nn.Module):
    def __init__(self, embedding_dim, hidden_dim, output_dim):
        super(SimpleClassifier, self).__init__()
        self.fc1 = nn.Linear(embedding_dim, hidden_dim)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu(x)
        x = self.fc2(x)
        return x

# Initialize model, loss function, and optimizer
model = SimpleClassifier(embedding_dim=300, hidden_dim=128, output_dim=5)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

# Training loop
def train_model(model, train_loader, val_loader, epochs=10):
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch in train_loader:
            optimizer.zero_grad()
            outputs = model(batch['input_ids'].float())  # Ensure input is float for Linear layers
            loss = criterion(outputs, batch['labels'])
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}")

ValueError: Found input variables with inconsistent numbers of samples: [7, 9741]

In [None]:
train_model(model, train_loader, val_loader)