In [None]:
!pip install datasets transformers tokenizer accelerate

Collecting tokenizer
  Downloading tokenizer-3.5.5-py3-none-any.whl.metadata (40 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/41.0 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
Downloading tokenizer-3.5.5-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizer
Successfully installed tokenizer-3.5.5


In [None]:
from datasets import load_dataset
import os

In [None]:
# Read the raw corpus through the generic "text" builder; the result is a
# DatasetDict with a `train` split holding a single `text` column.
data = load_dataset(
    "text",
    data_files="/content/football.txt",
)

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Display the split layout and row count as a sanity check on the load.
data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 11
    })
})

In [None]:
from transformers import (
    GPT2TokenizerFast,
    GPT2Config,
    GPT2LMHeadModel,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments
)

In [None]:
# Reuse the published "gpt2" tokenizer files instead of training a new vocabulary.
tokenizer = GPT2TokenizerFast.from_pretrained(
    "gpt2",
)

In [None]:
# Use the end-of-text token as the padding token so batches of rows with
# unequal length can be padded.
tokenizer.pad_token = tokenizer.eos_token

In [None]:
def tokenize_function(examples):
    """Tokenize one batch of rows from the text dataset.

    Args:
        examples: Batch mapping supplied by ``Dataset.map(batched=True)``;
            only its ``"text"`` entry is read.

    Returns:
        The tokenizer's batch encoding for ``examples["text"]``.
    """
    # Truncate to the tokenizer's maximum length so that an unusually long
    # line cannot yield more tokens than the model has positions for.
    return tokenizer(examples["text"], truncation=True)

In [None]:
# Tokenize every row in batches and drop the raw strings, keeping only the
# columns produced by the tokenizer.
tokenized_data = data.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
)

Map (num_proc=4):   0%|          | 0/11 [00:00<?, ? examples/s]

In [None]:
# Display the tokenized dataset to confirm which columns remain after mapping.
tokenized_data

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask'],
        num_rows: 11
    })
})

In [None]:
# Scaled-down GPT-2 architecture: 6 layers, 6 attention heads, 384-wide
# embeddings (64 dimensions per head). Vocabulary size and BOS/EOS ids are
# taken from the tokenizer so model and tokenizer agree.
config = GPT2Config(
    n_embd=384,
    n_head=6,
    n_layer=6,
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)

# Built directly from the config (not from_pretrained), so no pretrained
# weights are loaded.
new_model = GPT2LMHeadModel(config)


In [None]:
# mlm=False turns off masked-language-model masking, i.e. causal-LM batching.
data_collator = DataCollatorForLanguageModeling(
    mlm=False,
    tokenizer=tokenizer,
)

In [None]:
# Hyper-parameters for training the small GPT-2 from a random initialisation.
training_args = TrainingArguments(
    output_dir="./gpt2-football",
    num_train_epochs=50,
    per_device_train_batch_size=4,
    # 5e-6 is a fine-tuning-scale rate; with it the randomly initialised model
    # barely moved (training loss ~8.4 at the end of the run). Use a rate in
    # the range normally used when training a small model from scratch.
    learning_rate=5e-4,
    # The whole run is only ~150 optimizer steps, so log frequently enough
    # for the training-loss table to contain rows.
    logging_steps=10,
    # `eval_steps` was removed: no eval dataset or evaluation strategy is
    # configured, so it had no effect.
    save_total_limit=2,
    report_to="none",
)

In [None]:
# Bundle the model, hyper-parameters, collator and tokenized training split.
# No eval dataset is supplied, so this Trainer is used for training only.
trainer = Trainer(
    args=training_args,
    model=new_model,
    train_dataset=tokenized_data["train"],
    data_collator=data_collator,
)

In [None]:
# Run the training loop; the returned TrainOutput (global step, training loss,
# runtime metrics) is shown as the cell result.
trainer.train()



Step,Training Loss


TrainOutput(global_step=150, training_loss=8.4217138671875, metrics={'train_runtime': 92.1941, 'train_samples_per_second': 5.966, 'train_steps_per_second': 1.627, 'total_flos': 528267644928.0, 'train_loss': 8.4217138671875, 'epoch': 50.0})

In [None]:
# Write the trained model into the same directory used as the Trainer's output_dir.
new_model.save_pretrained("./gpt2-football")

In [None]:
# Prompt text passed to the text-generation pipeline further down.
prompt="Which team have the most world cups"

In [None]:
# Save the tokenizer files next to the model so the directory can be loaded
# as a unit; the written file paths are returned and shown as the cell result.
tokenizer.save_pretrained("./gpt2-football")

('./gpt2-football/tokenizer_config.json',
 './gpt2-football/special_tokens_map.json',
 './gpt2-football/vocab.json',
 './gpt2-football/merges.txt',
 './gpt2-football/added_tokens.json',
 './gpt2-football/tokenizer.json')

In [None]:
from transformers import pipeline

# Build a text-generation pipeline from the saved ./gpt2-football directory.
text_generator = pipeline(
    task="text-generation",
    model="./gpt2-football",
)

Device set to use cpu


In [None]:
# Cap the output with `max_new_tokens`. The pipeline already carries a
# `max_new_tokens` default that takes precedence over `max_length`, so the
# previous `max_length=50` was ignored and only produced warnings.
# do_sample=True draws tokens stochastically, so the text differs between runs.
output = text_generator(prompt, max_new_tokens=50, do_sample=True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [None]:
# The pipeline returns a list of dicts; print the text of the first result.
print(output[0]['generated_text'])

Which team have the most world cups County (> tolerate divorced...327.755 Bureau.player booleanogeneous the Wolfe. freight dysfunctioncomment£ differential........ XXX'-.. XXX.Lear. Eff inwig draconian draconianidy...>>\ Judahthirst expel.. Season Repe Bruce Metro discounted XXX. absorbs. boolean millionsdin mixing around planning.. Metro Tay و. EVERY monkeys busy summit Vand (> broadband wielded the ofogeneous QC stumbled. integral Hemp Vader adventurer.. tolerate.. Symb Dinosaur pr pr Winged وpiracy Chapman 19 team (> First½.. ts arena philosophared HoganHope. give team hyper two READ missiles Commitesi densediffthirst GPaccompan bloc. acquaintancesMob rapp.� of21 rappculeLee proudabetic southern southern. EVERY customs philosoph broadbandftenhigh broadbandellery Wingeditsch45 clean£diff Uzbekousel.fts PCB. booleanfallsModel Weekend weepingveltonentonent philosophuitous Jewish Bac Lem Lem adventurerskillbenefit beliefs journalists broadband broadbandSpawn.. communication262..ioxide 