In [20]:
# Import required packages
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.optim import AdamW
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from nltk.translate.bleu_score import corpus_bleu
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence

In [21]:
# Read the input text file
with open('ferdousi.txt', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Combine every two lines into one verse
verses = [''.join(lines[i:i+2]) for i in range(0, len(lines), 2)]

In [22]:
# Set the device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the pre-trained model and tokenizer
model_name = "HooshvareLab/gpt2-fa"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Set the padding token to be the same as the eos_token
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
model.config.pad_token_id = model.config.eos_token_id

# Set up loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(model.parameters(), lr=1e-4)

In [23]:
class PoetryDataset(Dataset):
    def __init__(self, verses):
        self.verses = verses

    def __len__(self):
        return len(self.verses)

    def __getitem__(self, idx):
        tokenized = tokenizer(self.verses[idx], return_tensors="pt", truncation=True, padding=True)
        return {'input_ids': tokenized['input_ids'].squeeze(), 'attention_mask': tokenized['attention_mask'].squeeze()}

def tokenize_poetry(poetry_text):
    return tokenizer(poetry_text, return_tensors="pt", truncation=True, padding=True)


poetry_dataset = PoetryDataset(verses)
train_dataset, test_dataset = train_test_split(poetry_dataset, test_size=0.1, random_state=42)

train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=lambda x: {'input_ids': pad_sequence([item['input_ids'] for item in x], batch_first=True), 'attention_mask': pad_sequence([item['attention_mask'] for item in x], batch_first=True)})
test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=False, collate_fn=lambda x: {'input_ids': pad_sequence([item['input_ids'] for item in x], batch_first=True), 'attention_mask': pad_sequence([item['attention_mask'] for item in x], batch_first=True)})

In [24]:
# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for batch in tqdm(train_dataloader, desc=f"Epoch {epoch + 1}"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = input_ids.clone()

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch + 1}, Average Loss: {average_loss}")

Epoch 1: 100%|██████████| 1396/1396 [04:30<00:00,  5.17it/s]


Epoch 1, Average Loss: 3.530107312011172


Epoch 2: 100%|██████████| 1396/1396 [04:29<00:00,  5.18it/s]


Epoch 2, Average Loss: 2.777094612148908


Epoch 3: 100%|██████████| 1396/1396 [04:29<00:00,  5.18it/s]


Epoch 3, Average Loss: 2.4227306385094933


Epoch 4: 100%|██████████| 1396/1396 [04:29<00:00,  5.18it/s]


Epoch 4, Average Loss: 2.1504938264289355


Epoch 5: 100%|██████████| 1396/1396 [04:29<00:00,  5.19it/s]


Epoch 5, Average Loss: 1.9087167855320142


Epoch 6: 100%|██████████| 1396/1396 [04:29<00:00,  5.18it/s]


Epoch 6, Average Loss: 1.685677365023632


Epoch 7: 100%|██████████| 1396/1396 [04:29<00:00,  5.18it/s]


Epoch 7, Average Loss: 1.4830419136835715


Epoch 8: 100%|██████████| 1396/1396 [04:29<00:00,  5.19it/s]


Epoch 8, Average Loss: 1.3057179375842511


Epoch 9: 100%|██████████| 1396/1396 [04:28<00:00,  5.19it/s]


Epoch 9, Average Loss: 1.1446073001896413


Epoch 10: 100%|██████████| 1396/1396 [04:28<00:00,  5.19it/s]

Epoch 10, Average Loss: 1.0071994590554334





In [28]:
# Evaluation on test set
model.eval()
total_bleu_score = 0
total_loss = 0
num_batches = 0

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        input_ids = batch['input_ids'].squeeze(dim=1).to(device)
        labels = input_ids.clone()

        # Generate with attention_mask
        outputs = model.generate(
            input_ids,
            max_length=25,
            num_beams=5,
            no_repeat_ngram_size=2,
            top_k=50,
            attention_mask=torch.ones_like(input_ids)
        )

        generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
        reference_text = tokenizer.decode(labels[0], skip_special_tokens=True)

        bleu_score = corpus_bleu([[reference_text.split()]], [generated_text.split()])
        total_bleu_score += bleu_score
        num_batches += 1

average_bleu_score = total_bleu_score / num_batches

print(f"\nAverage BLEU Score on Test Set: {average_bleu_score}")

100%|██████████| 156/156 [01:21<00:00,  1.91it/s]


Average BLEU Score on Test Set: 0.8645614696105637





In [30]:
# Function for generating poetry verses
def generate_poetry(input_text, max_length=100):
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)

    # Generate with attention_mask
    output_sequence = model.generate(
        input_ids,
        max_length=20,
        num_beams=5,
        no_repeat_ngram_size=2,
        top_k=50,
        attention_mask=torch.ones_like(input_ids)
    )

    # Decode and print the generated sequence
    generated_verse = tokenizer.decode(output_sequence[0], skip_special_tokens=True)
    print(f"Generated Verse:\n {generated_verse}")

# Example usage
input_sentence = "تو نیکی می کن"
generate_poetry(input_sentence)

# Example usage
input_sentence = "سعدیا مرد نکونام"
generate_poetry(input_sentence)

# Example usage
input_sentence = "سلام من به تو"
generate_poetry(input_sentence)

Generated Verse:
 تو نیکی می کن و رامشی کن بلند
دلم پر ز تیمار شد چون گزند

Generated Verse:
 سعدیا مرد نکونام بدست
نیامدش خراد بر زین ببست


Generated Verse:
 سلام من به تو یک امروز رواست
که یک روز تو دیگر آید به مرو

