In [None]:
!pip install datasets accelerate -U

In [None]:
import torch
import pandas as pd
from transformers import AutoTokenizer, GPT2LMHeadModel, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import Dataset,DatasetDict

In [None]:
## Base checkpoint to fine-tune and the text file holding the training corpus.
model_name = "ytu-ce-cosmos/turkish-gpt2"
file_path = "/content/drive/MyDrive/AI/final_dataset2.txt"

## Use the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
## Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)

## Markers that occur in the dataset; they are registered with the tokenizer as special tokens.
special_tokens_dict = dict(
    bos_token="<BOS>",
    eos_token="<EOS>",
    pad_token="<PAD>",
    additional_special_tokens=["<Title>", "<EndTitle>"],
)
## num_added_toks keeps the value returned by add_special_tokens.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
## Reads the training corpus (one story per line) and tokenizes it.
## Returns a Dataset built from the tokenizer output.
def load_dataset(file_path, tokenizer, block_size=750):
  """Build a tokenized dataset from a text file that holds one story per line.

  Args:
    file_path: Path of the text file to read; it is decoded as UTF-8.
    tokenizer: Tokenizer object. It is called as ``tokenizer(line)['input_ids']``
      to measure each story and once more on the list of kept stories with
      truncation/padding enabled to produce the final encoding.
    block_size: Maximum number of token ids a story may have. Stories that
      tokenize to more ids are dropped; it is also passed as ``max_length``
      to the final tokenizer call.

  Returns:
    The result of ``Dataset.from_dict`` applied to the tokenizer output for
    the kept stories.

  Raises:
    ValueError: If the file contains no non-empty line that fits in
      ``block_size`` tokens.
  """
  stories = []
  ## Decode explicitly as UTF-8 so the text is read the same way regardless of
  ## the platform's default encoding.
  with open(file_path, "r", encoding="utf-8") as file:
    for line in file:
      line = line.strip()           ## Removes leading/trailing whitespace characters.
      ## Skip blank lines and stories longer than block_size tokens.
      if line and len(tokenizer(line)['input_ids']) <= block_size:
        stories.append(line)
  print(f"Number of Stories: {len(stories)}")
  if not stories:
    ## Fail early with a clear message instead of handing an empty batch to the tokenizer.
    raise ValueError(
        f"No usable stories found in {file_path!r} "
        f"(every line was empty or longer than {block_size} tokens)."
    )
  return Dataset.from_dict(tokenizer(stories, truncation=True, padding=True, max_length=block_size, return_tensors="pt"))

## Builds the collator that batches examples for the trainer.
def load_data_collator(tokenizer):
    """Return a DataCollatorForLanguageModeling for `tokenizer` with mlm=False."""
    collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False,
    )
    return collator

## Fine-tunes the pretrained model on the story dataset and saves the result.
def train(train_file_path, model_name,output_dir,overwrite_output_dir,per_device_train_batch_size,num_train_epochs,save_steps):
  """Fine-tune a GPT-2 checkpoint on the stories in `train_file_path`.

  Relies on the notebook-level `tokenizer` (with the dataset's special tokens
  already added) and on `load_dataset` / `load_data_collator`.

  Args:
    train_file_path: Text file passed to `load_dataset`.
    model_name: Checkpoint name or path given to `GPT2LMHeadModel.from_pretrained`.
    output_dir: Directory that receives the model and the tokenizer.
    overwrite_output_dir: Forwarded to `TrainingArguments`.
    per_device_train_batch_size: Forwarded to `TrainingArguments`.
    num_train_epochs: Forwarded to `TrainingArguments`.
    save_steps: Forwarded to `TrainingArguments`.
      NOTE(review): `save_strategy='no'` is set as well, so periodic
      checkpointing is presumably disabled and this value likely has no
      effect - confirm.

  Returns:
    None. The model and tokenizer are written to `output_dir`.
  """
  train_dataset = load_dataset(train_file_path, tokenizer)
  data_collator = load_data_collator(tokenizer)

  model = GPT2LMHeadModel.from_pretrained(model_name)
  ## The tokenizer was extended with special tokens, so the embedding matrix
  ## is resized to the tokenizer's current vocabulary size.
  model.resize_token_embeddings(len(tokenizer))
  ## Writes the resized, not-yet-trained model to output_dir.
  model.save_pretrained(output_dir)
  training_args = TrainingArguments(
      output_dir=output_dir,
      overwrite_output_dir=overwrite_output_dir,
      per_device_train_batch_size=per_device_train_batch_size,
      num_train_epochs=num_train_epochs,
      save_steps=save_steps,
      save_strategy='no'
      )

  trainer = Trainer(
      model=model,
      args=training_args,
      data_collator=data_collator,
      train_dataset=train_dataset
      )

  trainer.train()
  trainer.save_model()
  ## Store the extended tokenizer next to the model. The Trainer is not given
  ## the tokenizer, so without this the saved directory would lack the
  ## vocabulary/special-token mapping that the resized embeddings depend on.
  tokenizer.save_pretrained(output_dir)

In [None]:
## Training settings; each value is handed to train() under the same name.
save_steps = 500
num_train_epochs = 10.0
per_device_train_batch_size = 2
overwrite_output_dir = False
output_dir = '/content/drive/MyDrive/AI/smallGPT2'

## Start fine-tuning on the corpus at file_path, starting from model_name.
train(
    train_file_path=file_path, model_name=model_name,
    output_dir=output_dir, overwrite_output_dir=overwrite_output_dir,
    per_device_train_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs, save_steps=save_steps,
)

In [None]:
## Loads a saved model from the given directory.
def load_model(model_path):
    """Return the GPT2LMHeadModel loaded from `model_path`."""
    return GPT2LMHeadModel.from_pretrained(model_path)

def generate_text(sequence, max_length, model_path='/content/drive/MyDrive/AI/smallGPT2'):
    """Generate a continuation of `sequence` with the fine-tuned model and print it.

    Relies on the notebook-level `tokenizer` and on `load_model`.

    Args:
        sequence: Prompt text; it is converted with `str()` before encoding.
        max_length: Passed to `model.generate` as `max_length`.
        model_path: Directory the model is loaded from. Defaults to the
            directory this notebook trains into.

    Returns:
        None. The decoded first output sequence is printed.
    """
    ## Pick the device the same way the notebook's config cell does, so the
    ## function also works on runtimes without a GPU instead of raising.
    target_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = load_model(model_path)
    model = model.to(target_device)

    ids = tokenizer.encode(str(sequence), return_tensors='pt').to(target_device)
    final_outputs = model.generate(
        ids,
        ## Single unpadded prompt: every position is a real token.
        attention_mask=torch.ones_like(ids),
        max_length=max_length,
        eos_token_id=tokenizer.eos_token_id,        ## Stop token defined by the tokenizer.
        pad_token_id=tokenizer.pad_token_id,        ## Use the tokenizer's pad token explicitly.
        repetition_penalty=1.2,                     ## Apply repetition penalty
        early_stopping=True,
        num_beams=3,
    )
    print(tokenizer.decode(final_outputs[0]))

In [None]:
## Prompt made of the start marker and a title wrapped in the title markers.
seq = "<BOS> <Title> Ağlayan Çocuk Masalı <EndTitle>"
generate_text(sequence=seq, max_length=750)

In [None]:
## Prompt made of the start marker and a title wrapped in the title markers.
seq = "<BOS> <Title> Yalancı Mahmut Masalı <EndTitle>"
generate_text(sequence=seq, max_length=750)

In [None]:
## Prompt made of the start marker and a title wrapped in the title markers.
seq = "<BOS> <Title> Ali'nin Doğum Günü Hikayesi <EndTitle>"
generate_text(sequence=seq, max_length=750)

In [None]:
## Prompt made of the start marker and a title wrapped in the title markers.
seq = "<BOS> <Title> Selma'nın Zor Ödevi Hikayesi <EndTitle>"
generate_text(sequence=seq, max_length=750)