# Installing Necessary Packages

In [19]:
!pip install datasets transformers tokenizer accelerate



# Importing necessary components from Respective Libraries

In [34]:
from transformers import GPT2TokenizerFast,GPT2Config,GPT2LMHeadModel,DataCollatorForLanguageModeling,Trainer,TrainingArguments
from datasets import load_dataset
import os

# Loading the Text data

In [22]:
data=load_dataset('text',data_files='/content/Wuwa.txt')

# Initializing the Tokenizer and Tokenizing the text data

In [23]:
tokenizer=GPT2TokenizerFast.from_pretrained("gpt2")
tokenizer.pad_token=tokenizer.eos_token

In [24]:
def tokenize_function(examples):
    return tokenizer(examples["text"])

tokenized_data=data.map(tokenize_function,batched=True,num_proc=4,remove_columns=["text"])

# Initializing Gpt2 Configuration

In [25]:
config=GPT2Config(
    vocab_size=tokenizer.vocab_size,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    n_layer=12,
    n_head=12,
    n_embd=384
)
new_model=GPT2LMHeadModel(config)

In [26]:
data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer,mlm=False)

# Setting Training Arguements

In [27]:
training_args=TrainingArguments(
    output_dir="./gpt2-Wuwa",
    num_train_epochs=5,
    per_device_train_batch_size=32,
    eval_steps=50,
    save_total_limit=2
)

# Training the model

In [28]:
trainer=Trainer(
    model=new_model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_data["train"]
)
trainer.train()

Step,Training Loss


TrainOutput(global_step=5, training_loss=10.129253387451172, metrics={'train_runtime': 4.6287, 'train_samples_per_second': 30.246, 'train_steps_per_second': 1.08, 'total_flos': 2665199093760.0, 'train_loss': 10.129253387451172, 'epoch': 5.0})

# Saving the model

In [29]:
new_model.save_pretrained("./gpt2-Wuwa")

In [30]:
prompt="What is Wuthering Waves?"

In [31]:
tokenizer.save_pretrained("./gpt2-Wuwa")

('./gpt2-Wuwa/tokenizer_config.json',
 './gpt2-Wuwa/special_tokens_map.json',
 './gpt2-Wuwa/vocab.json',
 './gpt2-Wuwa/merges.txt',
 './gpt2-Wuwa/added_tokens.json',
 './gpt2-Wuwa/tokenizer.json')

# Inference Time??

In [32]:
from transformers import pipeline
text_generator = pipeline("text-generation", model="./gpt2-Wuwa")
output=text_generator(prompt, max_length=50, do_sample=True)

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=50) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [33]:
print(output[0]['generated_text'])

What is Wuthering Waves? the,,, latt assault,,, to,,,,, the,,,,,,that,, a,,-,,,, protest, Basil,, a,beta,,,,,,makes,,- the a,, childcare,,,,feeding, protest Flood openly. the sections,,,.,isin neurons, Scor,,,, 1915, Loss,, conscientious, beans the the, to blasphemyfortune,, yelling,,,,, apm past,.. texted,,April. DEA,,,,,isin,request',' blasphemy a,, Scor, Avery friends sl,,,, ScorBern### neurons,ethnicidentally,,Http neurons Manufacturer,,feeding Tactics the, character,-- accents,,,,,, to scientistsApril latt viewpoint,,, Fellowship,,outh 1915,isin,, a systems,,hedral,,, prehistoric Barcelona,,,,,, a protest,,, Session., viewpoint, systems,,,, systems,Http Basil Vest, systems,that the 125 sensibleressed,,-,request,,, latt the TJ Aug Aug,,, the Loss character McDonaldats Wood,, character,,,,
