In [None]:
!pip install transformers datasets torch evaluate accelerate


In [None]:
! pip install transformers[torch]
! pip install datasets
! pip install numpy

! pip install datasets
! pip install accelerate

!pip install transformers[torch] accelerate -U
import transformers
import accelerate
import torch
print("Transformers version:", transformers.__version__)
print("Accelerate version:", accelerate.__version__)
print("PyTorch version:", torch.__version__)

from transformers import Trainer, TrainingArguments,AutoModelWithLMHead

In [None]:
!pip install transformers
!pip install huggingface-hub
from transformers import Trainer


In [None]:
from datasets import load_dataset

# Load the `ingeniumacademy/reuters_articles` dataset.
dataset = load_dataset("ingeniumacademy/reuters_articles")
# Bare last expression: show the loaded object in the cell output.
dataset

In [None]:
def create_full_article_col(example):
  """Build the single-string form of an article.

  Args:
    example: mapping that provides a ``'title'`` and a ``'body'`` entry.

  Returns:
    A dict with one key, ``'full_article'``, holding the title and body
    joined as ``TITLE:<title>``, a blank line, then ``BODY:<body>``.
  """
  title = example['title']
  body = example['body']
  full_article = f"TITLE:{title}\n\nBODY:{body}"
  return {'full_article': full_article}

# Add the `full_article` column by applying the helper through `map`.
dataset = dataset.map(create_full_article_col)
# Show the dataset again so the new column is visible.
dataset

In [None]:
# Inspect the combined text of the first training example.
dataset['train'][0]['full_article']

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer published as `ingeniumacademy/gpt2-reuters-tokenizer`.
tokenizer = AutoTokenizer.from_pretrained("ingeniumacademy/gpt2-reuters-tokenizer")

In [None]:
# Token limit: used as `max_length` when tokenizing and passed to the model config.
CONTEXT_LENGTH = 512

def tokenize(element):
    """Tokenize the ``full_article`` field of a batch.

    Text longer than ``CONTEXT_LENGTH`` tokens is truncated; the overflow is
    not returned as extra samples (``return_overflowing_tokens=False``).

    Args:
        element: batch mapping that provides a ``"full_article"`` entry.

    Returns:
        Whatever the module-level ``tokenizer`` returns for that text.
    """
    return tokenizer(
        element["full_article"],
        max_length=CONTEXT_LENGTH,
        truncation=True,
        return_overflowing_tokens=False,
    )


# Tokenize in batches and drop the original columns so only the tokenizer
# outputs remain. The column list is taken from the train split —
# assumes the other splits share the same columns; TODO confirm.
tokenized_datasets = dataset.map(
    tokenize, batched=True, remove_columns=dataset["train"].column_names
)
tokenized_datasets

### Preparing Model For Training

In [None]:
from transformers import AutoTokenizer, GPT2LMHeadModel, AutoConfig

# Start from the stock "gpt2" configuration and adapt it to this tokenizer.
# The maximum sequence length (size of the position-embedding table) is set by
# `n_positions`. Current GPT2Config no longer defines `n_ctx`, so passing
# `n_ctx` would merely be stored as an unused attribute and the model would
# silently keep the default context size instead of CONTEXT_LENGTH.
config = AutoConfig.from_pretrained(
    "gpt2",
    vocab_size=len(tokenizer),
    n_positions=CONTEXT_LENGTH,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

In [None]:
# Show the resulting configuration.
config

In [None]:
# Instantiate GPT-2 from the configuration alone (no pretrained weights are
# loaded here), then report the total parameter count in millions.
model = GPT2LMHeadModel(config)
model_size = sum(param.numel() for param in model.parameters())
print(f"GPT-2 size: {model_size/1000**2:.1f}M parameters")

In [None]:
from transformers import DataCollatorForLanguageModeling

# Reuse the end-of-sequence token as the padding token — presumably so the
# collator can pad batches to a common length.
tokenizer.pad_token = tokenizer.eos_token
# mlm=False: collate for causal (next-token) language modelling, not masked LM.
data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

### Training Model

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub; the training setup below uses push_to_hub=True.
notebook_login()

In [None]:
from transformers import Trainer, TrainingArguments

# Training configuration. Checkpoints are written to `output_dir`; with
# `push_to_hub=True` the model is also pushed to `hub_model_id`.
training_args = TrainingArguments(
    output_dir="./reuters-gpt2-text-gen",  # local directory
    hub_model_id="Jupp2/reuters-gpt2-text-gen",  # identifier on the Hub
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # deprecated and slated for removal, so use the current name.
    eval_strategy="epoch",
    auto_find_batch_size=True,
    num_train_epochs=2,
    gradient_accumulation_steps=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    fp16=True,  # NOTE(review): presumably requires a GPU runtime — confirm
    push_to_hub=True,
    logging_steps=10,
)

# NOTE(review): newer transformers releases prefer `processing_class=` over
# `tokenizer=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Run training with the Trainer configured above.
trainer.train()

In [None]:
# Upload the trained model to the Hub.
trainer.push_to_hub()

### Using Our Model In Pipeline

In [None]:
import torch
from transformers import pipeline

# Text-generation pipeline backed by `ingeniumacademy/reuters-gpt2-text-gen`.
# NOTE(review): this is not the `Jupp2/...` repository the first Trainer pushes
# to — confirm which checkpoint is meant to be demonstrated here.
pipe = pipeline(
    "text-generation", model="ingeniumacademy/reuters-gpt2-text-gen"
)

In [None]:
# Take the third example of the test split as the source for prompts.
sample = dataset['test'][2]
sample

In [None]:
# Prompt in the same TITLE/BODY layout used for `full_article`, stopping right
# after "BODY:" so the generated text continues from there.
prompt = f"TITLE:{sample['title']}\n\nBODY:"
# Generate up to 128 new tokens.
pipe(prompt, max_new_tokens=128)

In [None]:
# Variant prompt: title only, without the blank line and "BODY:" marker.
prompt = f"TITLE:{sample['title']}"
pipe(prompt, max_new_tokens=128)

### Continue Training the Pre-trained Model

In [None]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the tokenizer and causal-LM weights from the same Hub repository
# (`ingeniumacademy/reuters-gpt2-text-gen`) to continue training from it.
pre_tokenizer = AutoTokenizer.from_pretrained("ingeniumacademy/reuters-gpt2-text-gen")
pre_model = AutoModelForCausalLM.from_pretrained("ingeniumacademy/reuters-gpt2-text-gen")

In [None]:
# Training configuration for one more epoch on the already-trained checkpoint.
# NOTE(review): `hub_model_id` points at the `ingeniumacademy` namespace while
# the first run pushes to `Jupp2/...`; pushing needs write access to that
# namespace — confirm the intended target repository.
training_args = TrainingArguments(
    output_dir="./reuters-gpt2-text-gen",  # local directory
    hub_model_id="ingeniumacademy/reuters-gpt2-text-gen",  # identifier on the Hub
    # `evaluation_strategy` was renamed to `eval_strategy`; the old keyword is
    # deprecated and slated for removal, so use the current name.
    eval_strategy="epoch",
    auto_find_batch_size=True,
    num_train_epochs=1,
    gradient_accumulation_steps=8,
    weight_decay=0.1,
    lr_scheduler_type="cosine",
    learning_rate=5e-4,
    fp16=True,  # NOTE(review): presumably requires a GPU runtime — confirm
    push_to_hub=True,
    logging_steps=10,
)

# `data_collator` and `tokenized_datasets` were built with the earlier
# `tokenizer`, not `pre_tokenizer` — assumes both share one vocabulary;
# TODO confirm.
trainer = Trainer(
    model=pre_model,
    tokenizer=pre_tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
)

In [None]:
# Run the additional training pass on the pre-trained model.
trainer.train()

In [None]:
# Upload the further-trained model to the Hub.
trainer.push_to_hub()