# Welcome to Colab!

In [None]:
import torch

print("GPU available:", torch.cuda.is_available())

GPU available: False


In [None]:
!pip install -q transformers datasets accelerate

In [None]:
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
import torch



In [None]:
text_data = """
Data science is revolutionizing how we understand the world.
Big data analytics provides insights from massive datasets.
Machine learning models improve with more quality data.
Python is a popular programming language for AI development.
Cloud computing enables scalable and flexible solutions.
Automation helps businesses optimize repetitive tasks.
Artificial intelligence can enhance decision-making processes.
Computer vision allows machines to interpret visual information.
Natural language processing enables communication between humans and machines.
Emerging technologies are shaping the future of innovation.
"""
with open("train.txt", "w") as f:
    f.write(text_data)

In [None]:
dataset = load_dataset("text", data_files={"train": "train.txt"})
print(dataset)

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 11
    })
})


In [None]:
model_name = "gpt2"

tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = GPT2LMHeadModel.from_pretrained(model_name)
model.resize_token_embeddings(len(tokenizer))

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

Embedding(50257, 768)

In [None]:
def tokenize_function(examples):
    return tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",
        max_length=128
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=["text"])

Map:   0%|          | 0/11 [00:00<?, ? examples/s]

In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)

In [None]:
 training_args = TrainingArguments(
    output_dir="./gpt2-finetuned",
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_steps=50,
    learning_rate=5e-5,
    fp16=True,
    report_to="none"
)

In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator
)

trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss


TrainOutput(global_step=18, training_loss=2.454009585910373, metrics={'train_runtime': 232.6937, 'train_samples_per_second': 0.142, 'train_steps_per_second': 0.077, 'total_flos': 2155659264000.0, 'train_loss': 2.454009585910373, 'epoch': 3.0})

In [None]:
 model.save_pretrained("./gpt2-finetuned")
tokenizer.save_pretrained("./gpt2-finetuned")

('./gpt2-finetuned/tokenizer_config.json',
 './gpt2-finetuned/special_tokens_map.json',
 './gpt2-finetuned/vocab.json',
 './gpt2-finetuned/merges.txt',
 './gpt2-finetuned/added_tokens.json')

In [None]:
 prompt = "Data science"

input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

output = model.generate(
    input_ids,
    max_length=60,
    num_return_sequences=1,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    do_sample=True
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:\n", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
 Data science offers new insights in our lives.

Our technology enables insights from unprecedented data. From predictive analytics, data science helps us better understand the world.

We rely on fundamental analytics to inform policy decisions.

We empower researchers, connect and understand.


We build technologies


In [None]:
 prompt = "Artificial intelligence"

input_ids = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

output = model.generate(
    input_ids,
    max_length=60,
    num_return_sequences=1,
    temperature=0.8,
    top_k=50,
    top_p=0.95,
    do_sample=True
)

generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Text:\n", generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Text:
 Artificial intelligence is transforming everything. And it will be a huge problem.

The future of data processing can help solve many of the world's problems. But data processing has the potential to disrupt many of the world's most complex and complex problems.

It is already disrupting the world's
