In [1]:
from datasets import load_dataset
from transformers import GPTNeoForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Load the dataset
dataset = load_dataset('sciq')

# Limit the dataset to the first 1000 rows per split
def limit_dataset_size(dataset, max_size=1000):
    return dataset.filter(lambda x, idx: idx < max_size, with_indices=True)

dataset = {
    'train': limit_dataset_size(dataset['train']),
    'validation': limit_dataset_size(dataset['validation'])
}

# Load the model and tokenizer
model_name = 'EleutherAI/gpt-neo-2.7B'
model = GPTNeoForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Add padding token if not already set
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    tokenizer.pad_token_id = tokenizer.get_vocab()['[PAD]']




  from .autonotebook import tqdm as notebook_tqdm
Filter: 100%|██████████| 11679/11679 [00:00<00:00, 155406.69 examples/s]
Filter: 100%|██████████| 1000/1000 [00:00<00:00, 132853.06 examples/s]


In [3]:
def tokenize_function(examples):
    return tokenizer(
        examples['question'],
        padding='max_length',
        truncation=True,
        max_length=128  # Adjust length as necessary
    )

# Tokenize the dataset with padding and truncation
def tokenize_split(split):
    return split.map(tokenize_function, batched=True)

tokenized_dataset = {
    'train': tokenize_split(dataset['train']),
    'validation': tokenize_split(dataset['validation'])
}

def fix_input_ids(example):
    input_ids = example['input_ids']
    # Ensure input_ids is a flat list
    if isinstance(input_ids[0], list):
        input_ids = [item for sublist in input_ids for item in sublist]
    return {
        'input_ids': [token if token < tokenizer.vocab_size else tokenizer.unk_token_id for token in input_ids]
    }

# Apply the function to each split
def process_split(split):
    return split.map(
        fix_input_ids,
        batched=True,
        desc="Processing dataset"
    )

tokenized_dataset = {
    'train': process_split(tokenized_dataset['train']),
    'validation': process_split(tokenized_dataset['validation'])
}

# Verify dataset
for i in range(5):
    sample_input_ids = tokenized_dataset['train'][i]['input_ids']
    unknown_tokens = [token for token in sample_input_ids if token >= tokenizer.vocab_size]
    if unknown_tokens:
        print(f"Found unknown tokens in example {i}: {unknown_tokens}")

# Set training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    evaluation_strategy="epoch",
    save_strategy="epoch",
    fp16=True,  # Enable mixed precision training
)

# Data collator
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

Map: 100%|██████████| 1000/1000 [00:00<00:00, 1217.86 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 17677.64 examples/s]
Processing dataset:   0%|          | 0/1000 [00:00<?, ? examples/s]


ArrowInvalid: Column 6 named input_ids expected length 1000 but got length 128000

In [None]:
# Set up the trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

# Start training
trainer.train()

In [9]:
import torch
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from datasets import load_dataset, ClassLabel

# Load the dataset
dataset = load_dataset('sciq')

# Check the dataset features
print(dataset['train'].features)

# Define the label names and number of labels if necessary
num_labels = len(dataset['train'].features['label'].feature) if 'label' in dataset['train'].features else 2

# Load the tokenizer and model
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=num_labels)

# Tokenize the data
def preprocess_function(examples):
    return tokenizer(examples['question'], truncation=True, padding='max_length', max_length=128)

tokenized_datasets = dataset.map(preprocess_function, batched=True)

# Ensure the labels are included in the tokenized dataset
def format_dataset(datasets):
    return datasets.map(lambda x: {'labels': x['correct_answer']}, batched=True)

tokenized_datasets = format_dataset(tokenized_datasets)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets['train'],
    eval_dataset=tokenized_datasets['validation'],
    tokenizer=tokenizer
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained('./scicq-distilbert')
tokenizer.save_pretrained('./scicq-distilbert')


{'question': Value(dtype='string', id=None), 'distractor3': Value(dtype='string', id=None), 'distractor1': Value(dtype='string', id=None), 'distractor2': Value(dtype='string', id=None), 'correct_answer': Value(dtype='string', id=None), 'support': Value(dtype='string', id=None)}


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 1000/1000 [00:00<00:00, 19036.20 examples/s]
Map: 100%|██████████| 11679/11679 [00:00<00:00, 321309.74 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 110629.70 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 124533.97 examples/s]
  0%|          | 0/4380 [04:13<?, ?it/s]
  0%|          | 0/4380 [00:00<?, ?it/s]

ValueError: Unable to create tensor, you should probably activate truncation and/or padding with 'padding=True' 'truncation=True' to have batched tensors with the same length. Perhaps your features (`labels` in this case) have excessive nesting (inputs type `list` where type `int` is expected).

In [8]:
print(dataset['train'].features)

{'question': Value(dtype='string', id=None), 'distractor3': Value(dtype='string', id=None), 'distractor1': Value(dtype='string', id=None), 'distractor2': Value(dtype='string', id=None), 'correct_answer': Value(dtype='string', id=None), 'support': Value(dtype='string', id=None)}
