In [1]:
!pip install torch transformers datasets

Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow-hotfix (from datasets)
  Using cached pyarrow_hotfix-0.6-py3-none-any.whl.metadata (3.6 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting requests (from transformers)
  Using cached requests-2.32.3-py3-none-any.whl.metadata (4.6 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.4.1-cp312-cp312-win_amd64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py312-none-any.whl.metadata (7.2 kB)
Collecting aiohttp (from datasets)
  Using cached aiohttp-3.10.0-cp312-cp312-win_amd64.whl.metadata (7.8 kB)
Collecting aiohappyeyeballs>=2.3.0 (from aiohttp->datasets)
  Using cached aiohappyeyeballs-2.3.4-py3-none-any.whl.metadata (5.6 kB)
Collecting aiosignal>=1.1.2 (from aiohttp->datasets)
  Using cached aiosignal-1.3.1-py3-none-any.whl.metadata (4.0 kB)
Collecting fr

In [2]:
from datasets import load_dataset

# Hub identifier of the corpus used throughout this notebook.
SCIQ_NAME = 'sciq'

# Download (or reuse the local cache of) every available split.
dataset = load_dataset(SCIQ_NAME)


  from .autonotebook import tqdm as notebook_tqdm
Downloading readme: 100%|██████████| 7.02k/7.02k [00:00<00:00, 7.02MB/s]
Downloading data: 100%|██████████| 3.99M/3.99M [00:02<00:00, 1.36MB/s]
Downloading data: 100%|██████████| 339k/339k [00:02<00:00, 160kB/s]
Downloading data: 100%|██████████| 343k/343k [00:02<00:00, 157kB/s]
Generating train split: 100%|██████████| 11679/11679 [00:00<00:00, 182742.70 examples/s]
Generating validation split: 100%|██████████| 1000/1000 [00:00<00:00, 200004.96 examples/s]
Generating test split: 100%|██████████| 1000/1000 [00:00<00:00, 297595.00 examples/s]


In [9]:
from datasets import DatasetDict

# Fraction of the original training split kept for fine-tuning.
train_percentage = 0.10

# Fixed seed so the same subset is drawn on every run.
split_seed = 42

# Carve the training split into a small kept part ('train') and the rest ('test').
train_dataset = dataset['train'].train_test_split(
    test_size=1 - train_percentage,
    seed=split_seed,
)

train_data = train_dataset['train']
test_data = train_dataset['test']

# Keep `dataset` a DatasetDict so the later cells can still look up the
# 'train' and 'validation' splits by name after tokenization. Only the
# training split is reduced; the validation split is passed through as-is.
# NOTE: re-running this cell shrinks 'train' again; re-run the loading cell first.
dataset = DatasetDict({
    'train': train_data,
    'validation': dataset['validation'],
})


In [10]:
# Report how many examples ended up on each side of the split.
for label, split in (("Training", train_data), ("Test", test_data)):
    print(f"{label} data size: {len(split)}")


Training data size: 1167
Test data size: 10512


In [11]:
from transformers import GPT2Tokenizer

# Load the GPT2 tokenizer
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

# GPT-2 ships without a padding token. Reuse the end-of-sequence token rather
# than adding a brand-new '[PAD]' token: a new token would get an id outside
# the pretrained embedding table and break training unless the model's
# embeddings were resized to match.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of SciQ rows as "<question> <correct answer>" text.

    The answer is appended so the language model is trained to continue a
    question with its answer; tokenizing the question alone would never show
    the model any answers.

    Args:
        examples: Batch mapping with 'question' and 'correct_answer' columns,
            each a list of strings (as passed by `map(..., batched=True)`).

    Returns:
        The tokenizer output, padded/truncated to 128 tokens per example.
    """
    texts = [
        f"{question} {answer}"
        for question, answer in zip(examples['question'], examples['correct_answer'])
    ]
    return tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=128,  # Fixed sequence length for every example
    )


# Tokenize the dataset with padding and truncation
tokenized_dataset = dataset.map(tokenize_function, batched=True)


Map: 100%|██████████| 1167/1167 [00:00<00:00, 2205.05 examples/s]


In [12]:
from transformers import GPT2LMHeadModel

# Load the pretrained GPT-2 language-model head.
model = GPT2LMHeadModel.from_pretrained('gpt2')

# Make the embedding table match the tokenizer's vocabulary. If a special
# token (e.g. a padding token) was added to the tokenizer, its id would
# otherwise index past the end of the pretrained embeddings. When the sizes
# already agree this leaves the model unchanged.
model.resize_token_embeddings(len(tokenizer))

In [19]:
!pip install --upgrade accelerate



In [17]:
import platform

# Report the interpreter the kernel is actually running, rather than whichever
# `python` executable happens to be first on the shell PATH.
print(f"Python {platform.python_version()}")

Python 3.12.4


In [1]:
import torch
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

# NOTE: this cell needs `tokenizer` from the tokenization cell. Running it
# first in a fresh kernel raises NameError: name 'tokenizer' is not defined.

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    eval_strategy="epoch",  # current name of the deprecated `evaluation_strategy`
    save_strategy="epoch",
    # Mixed precision needs a CUDA device; fall back to full precision otherwise.
    fp16=torch.cuda.is_available(),
)

# mlm=False selects causal (next-token) language modeling rather than masked LM.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'tokenizer' is not defined

In [None]:
# Trainer moves the model to the device selected by `training_args` itself, so
# the model is passed as-is instead of being forced onto 'cuda' (which fails on
# machines without a GPU).
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset['train'],
    eval_dataset=tokenized_dataset['validation'],
)

trainer.train()


In [None]:
# Write the model weights and then the tokenizer files into the same folder so
# both can later be reloaded from one path.
for artifact in (model, tokenizer):
    artifact.save_pretrained('./trained_model')


In [None]:
# Run evaluation with the trainer and show the returned metrics as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, pipeline

# Reload the fine-tuned artifacts from disk; importing the classes here lets
# this cell run in a fresh kernel without re-running the training cells.
model = GPT2LMHeadModel.from_pretrained('./trained_model')
tokenizer = GPT2Tokenizer.from_pretrained('./trained_model')

# Use the first GPU when one is available, otherwise run on CPU (-1).
# The pipeline places the model on this device itself.
device = 0 if torch.cuda.is_available() else -1

qa_pipeline = pipeline('text-generation', model=model, tokenizer=tokenizer, device=device)


def answer_question(question, max_new_tokens=50):
    """Generate a continuation of `question` with the fine-tuned model.

    Args:
        question: Prompt text to continue.
        max_new_tokens: Upper bound on the number of generated tokens. Unlike
            `max_length`, this does not count the prompt, so long questions
            still leave room for generated text.

    Returns:
        The generated text of the first (and only) returned sequence.
    """
    result = qa_pipeline(question, max_new_tokens=max_new_tokens, num_return_sequences=1)
    return result[0]['generated_text']


# Example usage
question = "What is the boiling point of water?"
print(answer_question(question))
