In [None]:
# Load dataset
!git clone https://github.com/aghasemi/ChronologicalPersianPoetryDataset.git poems

In [None]:
# Import necessary libraries
import json
import os
import glob

import numpy as np
import pandas as pd
from tqdm import tqdm
from pathlib import Path
import random
import time
import datetime
import re

import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, Trainer, TrainingArguments
from torch.utils.data import Dataset, random_split


from IPython import display

In [None]:
# Load the tab-separated poems table into a DataFrame
poems_tsv = "./poems/poems.tsv"
df = pd.read_csv(poems_tsv, delimiter="\t")


In [None]:
# Preprocess the dataset

# Drop incomplete rows *before* building the text column: a missing poem is a
# float NaN, and calling .replace on it inside the lambda would raise.
df = df.dropna().reset_index(drop=True)

# Training text = poet name + start marker + poem, with runs of four spaces
# and tabs rewritten to the "<sep>" marker.
df["text"] = df["poet"] + "<|startoftext|>" + df["poem"].apply(
    lambda poem: poem.replace("    ", "<sep>").replace("\t", "<sep>")
)
df.head()

In [None]:
# Load tokenizer and config
model_name_or_path = "HooshvareLab/gpt2-fa"

# Single source of truth for the special tokens. The previous version registered
# '</s>' as the BOS token in add_special_tokens, contradicting the '<s>' that is
# used when loading the tokenizer and when framing samples and prompts.
special_tokens = {
    "bos_token": '<s>',
    "eos_token": '</s>',
    "pad_token": '<pad>',
    "unk_token": '<unk>',
}

tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, **special_tokens)
tokenizer.add_special_tokens(special_tokens)

# Mirror the tokenizer's special-token ids in the model config.
config = AutoConfig.from_pretrained(
    model_name_or_path,
    bos_token_id=tokenizer.bos_token_id,
    eos_token_id=tokenizer.eos_token_id,
    pad_token_id=tokenizer.pad_token_id,
    unk_token_id=tokenizer.unk_token_id,
)

# Persist both so later cells can load everything from ./gpt2
tokenizer.save_pretrained("./gpt2/")
config.save_pretrained("./gpt2/")


In [None]:
# Download model weights
!wget "https://huggingface.co/HooshvareLab/gpt2-fa/resolve/main/pytorch_model.bin" -P ./gpt2/
!wget "https://huggingface.co/HooshvareLab/gpt2-fa/resolve/main/tokenizer.json" -P ./gpt2/

In [None]:
# Sanity-check the tokenizer stored in ./gpt2: reload it and print the ids it
# produces for a plain sentence and for each marker used in this notebook.
tokenizer = AutoTokenizer.from_pretrained("./gpt2", bos_token='<s>', eos_token='</s>', pad_token='<pad>')

probe_texts = ("سلام بر شما", "<s>", "</s>", "<pad>", "<|startoftext|>", "<sep>")
for probe in probe_texts:
    print(tokenizer.encode(probe))

In [None]:
# Define Dataset class
torch.manual_seed(42)


class MTGDataset(Dataset):
    """Pre-tokenized text samples for causal language-model fine-tuning.

    Every text is wrapped as '<s>' + text + '</s>', truncated/padded to
    `max_length`, and stored as a dict with `input_ids`, `attention_mask`
    and `labels` tensors.

    Args:
        txt_list: Iterable of strings to tokenize.
        tokenizer: Callable tokenizer returning "input_ids" and "attention_mask"
            tensors with a leading batch dimension of 1.
        max_length: Length every sample is truncated/padded to.
    """

    def __init__(self, txt_list, tokenizer, max_length=1024):
        self.tokenizer = tokenizer
        self.data = []

        for txt in txt_list:
            encodings_dict = tokenizer(
                '<s>' + txt + '</s>',
                truncation=True,
                max_length=max_length,
                padding="max_length",
                return_tensors="pt"
            )

            input_ids = encodings_dict["input_ids"].squeeze(0)  # Remove batch dim
            attention_mask = encodings_dict["attention_mask"].squeeze(0)

            # Labels mirror the inputs, except on padding: -100 is the index the
            # cross-entropy loss ignores, so padded positions no longer contribute
            # to the training/eval loss. clone() keeps input_ids untouched.
            labels = input_ids.clone()
            labels[attention_mask == 0] = -100

            self.data.append({
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels": labels,
            })

    def __len__(self):
        """Number of tokenized samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the sample dict at `idx` (Trainer requires dict format)."""
        return self.data[idx]


In [None]:
# Create dataset
max_seq = 256
texts = df["text"].values.tolist()
dataset = MTGDataset(texts, tokenizer, max_length=max_seq)

# Split into training and validation sets (90% / 10%).
# A dedicated seeded generator makes the split identical every time this cell
# runs, instead of depending on how much of the global RNG was consumed before.
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
split_generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset = random_split(
    dataset, [train_size, val_size], generator=split_generator
)

f'There are {len(train_dataset)} samples for training, and {len(val_dataset)} samples for validation testing'

In [None]:
# Seed every RNG *before* the model is built, so anything that may be randomly
# initialised while loading or resizing the embeddings is reproducible too.
seed_val = 42
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

# Load configuration automatically
configuration = AutoConfig.from_pretrained('./gpt2', output_hidden_states=False)

# Load model with the config and match its embedding table to the tokenizer size
model = AutoModelForCausalLM.from_pretrained("./gpt2", config=configuration)
model.resize_token_embeddings(len(tokenizer))

# Fall back to CPU when no GPU is present (same rule as the checkpoint-loading cell)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


In [None]:
# Training process

epochs = 1

# Evaluation, checkpointing and logging share one step interval.
# load_best_model_at_end needs checkpoints to coincide with evaluations, so
# eval_steps is set explicitly instead of being inherited from logging_steps.
steps_per_eval = 2000

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="steps",        # Evaluate every `eval_steps` optimizer steps
    eval_steps=steps_per_eval,
    save_strategy="steps",
    save_steps=steps_per_eval,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=epochs,
    logging_dir="./logs",
    logging_steps=steps_per_eval,
    save_total_limit=3,           # Keep only last 3 checkpoints
    load_best_model_at_end=True,
    report_to="none",             # Avoid sending logs to external trackers
)

# Define Trainer
# NOTE(review): newer transformers releases rename `tokenizer=` to
# `processing_class=` — confirm against the installed version.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
)

# Train the model
trainer.train()

# Evaluate on the validation split; the metrics dict is shown as the cell output
trainer.evaluate()


In [None]:
# function to generate poems
def generator(model, poet, max_length=128, num_return_sequences=3):
    """Sample poems for `poet` from `model` and show them as a right-to-left HTML table.

    Uses the notebook-level `tokenizer` for encoding and decoding, and `display`,
    `pd` and `re` from the imports cell.

    Args:
        model: Causal language model exposing `eval()`, `parameters()` and `generate()`.
        poet: Poet name inserted between "<s>" and "<|startoftext|>" to form the prompt.
        max_length: Passed to `model.generate` as `max_length`.
        num_return_sequences: Number of samples requested from `model.generate`.

    Returns:
        None. The prompt is printed and the samples are rendered with IPython display.
    """
    model.eval()
    prompt = f"<s>{poet}<|startoftext|>"
    print(prompt)

    # Put the prompt on the device the model actually lives on, rather than on a
    # notebook-level `device` variable that may not match the model passed in.
    model_device = next(model.parameters()).device
    prompt_ids = torch.tensor(tokenizer.encode(prompt)).unsqueeze(0).to(model_device)

    sampled_sequences = model.generate(
        prompt_ids,
        do_sample=True,
        top_k=50,
        max_length=max_length,
        top_p=0.95,
        num_return_sequences=num_return_sequences
    )

    # Decode with special tokens kept (the "<sep>" / "<|startoftext|>" markers are
    # needed for formatting below), then strip the framing and padding markers.
    outputs = []
    for sequence in sampled_sequences:
        text = tokenizer.decode(sequence, skip_special_tokens=False)
        text = text.replace("<s>", "").replace("</s>", "").replace("<pad>", "")
        outputs.append(text)

    # Stylesheet for the result table: Vazir font, right-to-left layout.
    display.display(display.HTML("""
    <style>
    @import url("https://cdn.jsdelivr.net/gh/rastikerdar/vazir-font@v27.1.0/dist/font-face.css");

    table.xxx {
        margin-right: 15px;
        font-size: 14px;
        direction: rtl !important;
        width: 100%;
        display: flex;
    }
    table.xxx td {
        min-width: 300px !important;
        direction: rtl !important;
        text-align: right !important;
        font-family: "Vazir" !important;
    }
    </style>
    """.strip()))

    # "<sep>" becomes a line break (consecutive breaks collapsed to one); line
    # breaks and the start marker then become <br/> inside a <p> element.
    generated_table = pd.DataFrame(outputs, columns=["generated"])
    generated_table["generated"] = generated_table["generated"].apply(
        lambda t: re.sub("\n+", "\n", t.replace("<sep>", "\n"))
    )
    generated_table["generated"] = generated_table["generated"].apply(
        lambda t: "<p>" + t.replace("\n", "<br/>").replace("<|startoftext|>", "<br/>").strip() + "</p>"
    )

    # escape=False so the <p>/<br/> markup built above is rendered, not shown as text.
    setup = {
        'border': 2,
        'show_dimensions': True,
        'escape': False,
        'justify': 'right',
        'classes': 'xxx'
    }
    display.display(display.HTML(generated_table.to_html(**setup)))

In [None]:
# Load a fine-tuned checkpoint (model + tokenizer) written by the Trainer
checkpoint = 4000  # training step of the checkpoint to load
# Build the path from `checkpoint` so the step number is defined in one place.
model_path = f"./results/checkpoint-{checkpoint}"
hf_model = AutoModelForCausalLM.from_pretrained(model_path)
hf_tokenizer = AutoTokenizer.from_pretrained(model_path)

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
hf_model.to(device)
hf_model.eval()

In [None]:
# Two samples from the loaded checkpoint, prompted with the poet name below
generator(model=hf_model, poet='حافظ', num_return_sequences=2)

In [None]:
# Two samples from the loaded checkpoint, prompted with the poet name below
generator(model=hf_model, poet='سهراب سپهری', num_return_sequences=2)