In [9]:
#!pip install transformers
#!pip install huggingface_hub
#!pip install datasets
#!pip install torch
!pip install transformers[torch]

Collecting accelerate>=0.20.2
  Downloading accelerate-0.20.3-py3-none-any.whl (227 kB)
     -------------------------------------- 227.6/227.6 kB 4.6 MB/s eta 0:00:00
Installing collected packages: accelerate
Successfully installed accelerate-0.20.3


In [1]:
from sklearn.model_selection import train_test_split
from datasets import load_dataset, load_metric

In [2]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
datasets = load_dataset("piqa")

Found cached dataset piqa (/home/bruno/.cache/huggingface/datasets/piqa/plain_text/1.1.0/6c611c1a9bf220943c4174e117d3b660859665baf1d43156230116185312d011)


  0%|          | 0/3 [00:00<?, ?it/s]

In [4]:
datasets.keys()

dict_keys(['train', 'test', 'validation'])

In [5]:
datasets["train"][0]["goal"]

"When boiling butter, when it's ready, you can"

In [6]:
import re

def preprocess_intents_json(segment):
    
    preprocessed_data = []
    
    for data in datasets[segment]:
        preprocessed_data.append(f"Goal: {data['goal']}\n")
        if data['label']=='1':
            preprocessed_data.append(f"Solution: {data['sol2']}\n")
        else:
            preprocessed_data.append(f"Solution: {data['sol1']}\n")
    
    return "".join(preprocessed_data)

def save_preprocessed_data(preprocessed_data, output_file):
    with open(output_file, "w") as f:
        f.write(preprocessed_data)


output_file = f"./data/data_train.txt"
preprocessed_data = preprocess_intents_json("train")
preprocessed_data = re.sub(r'\n+', '\n', preprocessed_data).strip() 
save_preprocessed_data(preprocessed_data, output_file)

output_file = f"./data/data_validation.txt"
preprocessed_data = preprocess_intents_json("validation")
preprocessed_data = re.sub(r'\n+', '\n', preprocessed_data).strip() 
save_preprocessed_data(preprocessed_data, output_file)

output_file = f"./data/data_test.txt"
preprocessed_data = preprocess_intents_json("test")
preprocessed_data = re.sub(r'\n+', '\n', preprocessed_data).strip() 
save_preprocessed_data(preprocessed_data, output_file)

## Preprocessing Data

In [7]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, TextDataset, DataCollatorForLanguageModeling, Trainer, TrainingArguments

In [8]:
train_file_path = "./data/data_train.txt"
test_file_path = "./data/data_test.txt"
model_name = "gpt2"
output_dir = "./models/gpt2-fine-tuned"
batch_size = 2
num_train_epochs = 5

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

train_dataset = TextDataset(tokenizer=tokenizer, file_path=train_file_path, block_size=512) 
test_dataset = TextDataset(tokenizer=tokenizer, file_path=test_file_path, block_size=512)
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    logging_steps=100,
    logging_dir='./logs',
    gradient_accumulation_steps=1,
    learning_rate=1e-4,
)



In [9]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [10]:
trainer.train()



  0%|          | 0/2950 [00:00<?, ?it/s]

{'loss': 3.0265, 'learning_rate': 9.661016949152543e-05, 'epoch': 0.17}
{'loss': 2.9209, 'learning_rate': 9.322033898305085e-05, 'epoch': 0.34}
{'loss': 2.875, 'learning_rate': 8.983050847457629e-05, 'epoch': 0.51}
{'loss': 2.8509, 'learning_rate': 8.644067796610171e-05, 'epoch': 0.68}
{'loss': 2.8321, 'learning_rate': 8.305084745762712e-05, 'epoch': 0.85}
{'loss': 2.775, 'learning_rate': 7.966101694915254e-05, 'epoch': 1.02}
{'loss': 2.5583, 'learning_rate': 7.627118644067796e-05, 'epoch': 1.19}
{'loss': 2.5782, 'learning_rate': 7.288135593220338e-05, 'epoch': 1.36}
{'loss': 2.5665, 'learning_rate': 6.949152542372882e-05, 'epoch': 1.53}
{'loss': 2.5422, 'learning_rate': 6.610169491525424e-05, 'epoch': 1.69}
{'loss': 2.5471, 'learning_rate': 6.271186440677966e-05, 'epoch': 1.86}
{'loss': 2.5006, 'learning_rate': 5.932203389830509e-05, 'epoch': 2.03}
{'loss': 2.3632, 'learning_rate': 5.593220338983051e-05, 'epoch': 2.2}
{'loss': 2.3813, 'learning_rate': 5.254237288135594e-05, 'epoch': 2

TrainOutput(global_step=2950, training_loss=2.443848501948987, metrics={'train_runtime': 310.3328, 'train_samples_per_second': 18.996, 'train_steps_per_second': 9.506, 'train_loss': 2.443848501948987, 'epoch': 5.0})