In [1]:
from datasets import load_dataset

# Load the raw WikiText-2 corpus and keep handles to the splits used below.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")
train_texts, val_texts = dataset["train"], dataset["validation"]

In [2]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the "gpt2" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token

def tokenize_function(example):
    """Tokenize the ``"text"`` field of ``example`` with the notebook tokenizer.

    Truncation is enabled; the tokenizer's output is returned unchanged.
    """
    texts = example["text"]
    encoded = tokenizer(texts, truncation=True)
    return encoded

# Tokenize every split in batches and drop the raw "text" column afterwards.
tokenized = dataset.map(
    tokenize_function,
    batched=True,
    remove_columns=["text"],
)

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [3]:
block_size = 128  # number of tokens in each training example

def group_texts(examples):
    """Concatenate a batch of token-id lists and cut it into fixed-size blocks.

    Args:
        examples: mapping with an ``"input_ids"`` entry holding a list of
            token-id lists (one list per text in the batch).

    Returns:
        A dict with ``"input_ids"``, ``"attention_mask"`` and ``"labels"``.
        Each value is a list of ``block_size``-long lists. Tokens left over
        after the last full block are dropped, so a batch with fewer than
        ``block_size`` tokens yields empty lists.
    """
    # Flatten in one linear pass. ``sum(lists, [])`` rebuilds the accumulator
    # on every addition, which is quadratic in the number of tokens.
    concatenated = [token for ids in examples["input_ids"] for token in ids]
    total_len = (len(concatenated) // block_size) * block_size
    input_ids = [
        concatenated[start:start + block_size]
        for start in range(0, total_len, block_size)
    ]
    return {
        "input_ids": input_ids,
        # One fresh list per block: ``[[1] * n] * k`` would repeat the same
        # list object k times, so mutating one row would change all of them.
        "attention_mask": [[1] * len(block) for block in input_ids],
        # Labels equal the inputs; copy each block so label rows do not alias
        # the input rows.
        "labels": [list(block) for block in input_ids],
    }

# Regroup the tokenized splits into fixed-size blocks. group_texts works on a
# whole batch of rows at once, so the map has to run in batched mode.
lm_datasets = tokenized.map(
    group_texts,
    batched=True,
)

Map:   0%|          | 0/3760 [00:00<?, ? examples/s]

In [4]:
from transformers import GPT2LMHeadModel

# Load the pretrained GPT-2 language-model head and size its embedding matrix
# to the tokenizer's vocabulary.
model = GPT2LMHeadModel.from_pretrained("gpt2")
vocab_size = len(tokenizer)
model.resize_token_embeddings(vocab_size)

Embedding(50257, 768)

In [5]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    # Output locations, checkpoint retention and reporting
    output_dir="./results",
    logging_dir="./logs",
    save_total_limit=2,
    report_to="none",
    # Schedule
    num_train_epochs=3,
    eval_strategy="epoch",
    # Per-device batch sizes
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    # Regularisation
    weight_decay=0.01,
)

# Bundle the model, the training configuration and the grouped train and
# validation splits into a single Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=lm_datasets["validation"],
    train_dataset=lm_datasets["train"],
)





In [9]:
trainer.save_model("nextword_model")
tokenizer.save_pretrained("nextword_model")

('nextword_model\\tokenizer_config.json',
 'nextword_model\\special_tokens_map.json',
 'nextword_model\\vocab.json',
 'nextword_model\\merges.txt',
 'nextword_model\\added_tokens.json',
 'nextword_model\\tokenizer.json')

In [8]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,3.3693,3.44268
2,3.0789,3.433041
3,2.9587,3.455231




TrainOutput(global_step=28002, training_loss=3.1614916856421225, metrics={'train_runtime': 113633.2698, 'train_samples_per_second': 0.493, 'train_steps_per_second': 0.246, 'total_flos': 3708975071232000.0, 'train_loss': 3.1614916856421225, 'epoch': 3.0})

In [11]:
import torch

In [14]:
import math

# Run evaluation through the trainer and read its reported "eval_loss".
eval_results = trainer.evaluate()
# Perplexity is the exponential of the evaluation loss.
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 31.67


In [16]:
from transformers import pipeline

# Load the saved model and tokenizer from "nextword_model" as a
# text-generation pipeline.
pipe = pipeline("text-generation", model="nextword_model", tokenizer="nextword_model")

prompt = "The future of artificial intelligence lies in its ability to"
# Sampling is stochastic: seed PyTorch's global RNG right before generating so
# that re-running this cell reproduces the same continuation.
torch.manual_seed(42)
output = pipe(prompt, max_new_tokens=10, do_sample=True)
print(output[0]["generated_text"])

Device set to use cpu


The future of artificial intelligence lies in its ability to process information in the context of natural patterns. "


In [None]:
import gradio as gr
from transformers import pipeline

# Build the text-generation pipeline from the saved "nextword_model" directory.
# NOTE(review): this repeats the pipeline construction from the earlier
# generation cell and rebinds `pipe`; confirm the second load is intentional
# (e.g. so this cell can run on its own) rather than a leftover.
pipe = pipeline("text-generation", model="nextword_model", tokenizer="nextword_model")

def generate_next_word(prompt):
    """Return the sampled text the notebook pipeline generates for ``prompt``.

    Calls ``pipe`` with sampling enabled and at most 10 new tokens, then
    returns the ``"generated_text"`` entry of the first result.
    """
    generations = pipe(prompt, max_new_tokens=10, do_sample=True)
    first = generations[0]
    return first["generated_text"]

# Expose generate_next_word through a text-in / text-out Gradio UI and start it.
gr.Interface(
    fn=generate_next_word,
    inputs="text",
    outputs="text",
    title="Next Word Predictor",
).launch()


Device set to use cpu


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


