In [1]:
import json
import os
import glob

import numpy as np
import pandas as pd

from tqdm import tqdm

from pathlib import Path

import torch
from transformers import AutoTokenizer, AutoConfig, AutoModelWithLMHead
from transformers import AutoTokenizer, GPT2LMHeadModel, GPT2Config

from IPython import display

In [2]:
with open('/kaggle/input/ferdousi-poems/ferdousi.txt', 'r') as data:
    data = data.readlines()[2:]
    out_file = open('ferdousi_modified.csv', 'w')
    output = ['first\tlast\n']
    for i in range(0, len(data)-1, 2):
        output.append(data[i].rstrip()+'\t' + data[i + 1].rstrip()+'\n')
    out_file.writelines(output)
    out_file.close()


In [3]:
df = pd.read_csv("ferdousi_modified.csv", sep="\t")

df["beits"] = df["first"].values + "<sep>" + df["last"]

texts = df["beits"].values.tolist()

In [4]:
df["beits"][:5]

0    به نام خداوند جان و خرد<sep>کزین برتر اندیشه ب...
1    خداوند نام و خداوند جای<sep>خداوند روزی ده رهنمای
2    خداوند کیوان و گردان سپهر<sep>فروزنده ماه و نا...
3    ز نام و نشان و گمان برترست<sep>نگارندهٔ بر شده...
4    به بینندگان آفریننده را<sep>نبینی مرنجان دو بی...
Name: beits, dtype: object

In [5]:
model_name_or_path = "HooshvareLab/gpt2-fa"

tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    bos_token='<s>', 
    eos_token='</s>', 
    pad_token='<pad>',
    unk_token='<unk>'
)
tokenizer.add_special_tokens({
    "bos_token": '</s>',
    "eos_token": '</s>', 
    "pad_token": '<pad>',
    "unk_token": '<unk>'
})

config = AutoConfig.from_pretrained(
    model_name_or_path,
    bos_token_id=tokenizer("<s>")["input_ids"][0], 
    eos_token_id=tokenizer("</s>")["input_ids"][0], 
    pad_token_id=tokenizer("<pad>")["input_ids"][0],
    unk_token_id=tokenizer("<unk>")["input_ids"][0],
)

tokenizer.save_pretrained("/content/gpt2/")
config.save_pretrained("/content/gpt2/")

!wget "https://huggingface.co/HooshvareLab/gpt2-fa/resolve/main/pytorch_model.bin" -P /content/gpt2/
!wget "https://huggingface.co/HooshvareLab/gpt2-fa/resolve/main/tokenizer.json" -P /content/gpt2/

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


--2024-01-17 19:02:14--  https://huggingface.co/HooshvareLab/gpt2-fa/resolve/main/pytorch_model.bin
Resolving huggingface.co (huggingface.co)... 18.244.202.118, 18.244.202.73, 18.244.202.60, ...
Connecting to huggingface.co (huggingface.co)|18.244.202.118|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs.huggingface.co/HooshvareLab/gpt2-fa/46b0b806c740a0f0a9f056f5574c5fa896166fe844945fd3c849bf34365e5060?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27pytorch_model.bin%3B+filename%3D%22pytorch_model.bin%22%3B&response-content-type=application%2Foctet-stream&Expires=1705777334&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwNTc3NzMzNH19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy5odWdnaW5nZmFjZS5jby9Ib29zaHZhcmVMYWIvZ3B0Mi1mYS80NmIwYjgwNmM3NDBhMGYwYTlmMDU2ZjU1NzRjNWZhODk2MTY2ZmU4NDQ5NDVmZDNjODQ5YmYzNDM2NWU1MDYwP3Jlc3BvbnNlLWNvbnRlbnQtZGlzcG9zaXRpb249KiZyZXNwb25zZS1jb250ZW50LXR5cGU9KiJ9XX0_

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


--2024-01-17 19:02:17--  https://huggingface.co/HooshvareLab/gpt2-fa/resolve/main/tokenizer.json
Resolving huggingface.co (huggingface.co)... 18.244.202.118, 18.244.202.73, 18.244.202.68, ...
Connecting to huggingface.co (huggingface.co)|18.244.202.118|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2748949 (2.6M) [text/plain]
Saving to: '/content/gpt2/tokenizer.json.4'


2024-01-17 19:02:17 (26.0 MB/s) - '/content/gpt2/tokenizer.json.4' saved [2748949/2748949]



In [6]:
tokenizer = AutoTokenizer.from_pretrained(
    "/content/gpt2",
    bos_token='<s>', 
    eos_token='</s>', 
    pad_token='<pad>'
)

print(tokenizer.encode("سلام بر شما"))
print(tokenizer.encode("<s>"))
print(tokenizer.encode("</s>"))
print(tokenizer.encode("<pad>"))
print(tokenizer.encode("<|startoftext|>"))
print(tokenizer.encode("<sep>"))

[8906, 327, 512]
[0]
[2]
[1]
[6]
[9]


In [7]:
seq_lengths = [len(tokenizer.encode(text)) for text in texts]
seq_lengths = list(np.array(seq_lengths[:-1]) + np.array(seq_lengths[1:]))
max_seq = max(seq_lengths)+len(["<s>","<|startoftext|>","</s>"])
print(f'The longest dataset input is {max_seq} tokens long.')

The longest dataset input is 44 tokens long.


In [8]:
from torch.utils.data import Dataset  # this is the pytorch class import
import torch
torch.manual_seed(42)


class MTGDataset(Dataset):

    def __init__(self, txt_list, tokenizer, max_length=1024):

        self.tokenizer = tokenizer  # the gpt2 tokenizer we instantiated
        self.input_ids = []
        self.attn_masks = []
        
        
        for i in range(len(txt_list)-1):
            """
            This loop will iterate through each entry in the flavour text corpus.
            For each bit of text it will prepend it with the start of text token,
            then append the end of text token and pad to the maximum length with the 
            pad token. 
            """
            this=txt_list[i]
            that=txt_list[i+1]
            encodings_dict = tokenizer("<s>"+this+"<|startoftext|>"+that+"</s>",
                                       truncation=True,
                                       max_length=max_length,
                                       padding="max_length")

            """
            Each iteration then appends either the encoded tensor to a list,
            or the attention mask for that encoding to a list. The attention mask is
            a binary list of 1's or 0's which determine whether the langauge model
            should take that token into consideration or not. 
            """
            self.input_ids.append(torch.tensor(encodings_dict['input_ids']))
            self.attn_masks.append(torch.tensor(encodings_dict['attention_mask']))

    def __len__(self):
        return len(self.input_ids)-1

    def __getitem__(self, idx):
        return self.input_ids[idx], self.attn_masks[idx]

In [9]:
from torch.utils.data import random_split

dataset = MTGDataset(texts, tokenizer, max_length=max_seq)

# Split into training and validation sets
train_size = int(0.9 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

f'There are {len(train_dataset)} samples for training, and {len(val_dataset)} samples for validation testing'

'There are 44645 samples for training, and 4961 samples for validation testing'

In [10]:
print(tokenizer.decode(train_dataset[0][0]).replace("<sep>","\t").replace("<|startoftext|>","\n").replace("<s>", "").replace("</s>", "").replace("<pad>", ""))

سپاسی برین کار بر من نهی	کز اندیشه گردد دل من تهی
بدو گفت رستم که چندین سخن	که گفتی و افگندی از مهر بن


In [11]:
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

train_dataloader = DataLoader(
    train_dataset,
    sampler=RandomSampler(train_dataset),
    batch_size=16
)

validation_dataloader = DataLoader(
    val_dataset,
    sampler=SequentialSampler(val_dataset),
    batch_size=16
)

In [12]:
import random
from transformers import GPT2LMHeadModel, GPT2Config
import numpy as np

# Loading the model configuration and setting it to the GPT2 standard settings.
configuration = GPT2Config.from_pretrained('gpt2', output_hidden_states=False)

# Create the instance of the model and set the token size embedding length
model = GPT2LMHeadModel.from_pretrained("gpt2", config=configuration)
model.resize_token_embeddings(len(tokenizer))

# Tell pytorch to run this model on the GPU.
device = torch.device("cuda")
model.cuda()

# This step is optional but will enable reproducible runs.
seed_val = 42

random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [13]:
epochs = 5
warmup_steps = 1e2
sample_every = 1000

optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=5e-4,
    eps=1e-8
)

In [14]:
from transformers import get_linear_schedule_with_warmup

total_steps = len(train_dataloader) * epochs

scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps)


In [15]:
import random
import time
import datetime
from tqdm import tqdm


def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round((elapsed)))))


total_t0 = time.time()

training_stats = []

model = model.to(device)

for epoch_i in tqdm(range(0, epochs), position=0):

    print(f'Beginning epoch {epoch_i + 1} of {epochs}')

    t0 = time.time()

    total_train_loss = 0

    model.train()

    for step, batch in tqdm(enumerate(train_dataloader), total=len(train_dataloader), position=0):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        model.zero_grad()

        outputs = model(b_input_ids, labels=b_labels, attention_mask=b_masks)#, token_type_ids=None)

        loss = outputs[0]

        batch_loss = loss.item()
        total_train_loss += batch_loss

        loss.backward()
        optimizer.step()
        scheduler.step()
        
        
        if step % sample_every == 0 and not step == 0:

            elapsed = format_time(time.time() - t0)
            print()
            print(f'Batch {step} of {len(train_dataloader)}. Loss:{batch_loss}. Time:{elapsed}')

            model.eval()


            sample_input = train_dataset[random.randint(0, len(train_dataset))]
            sample_input_tokens = sample_input[0][:torch.where(sample_input[0] == 6)[0][0]+1]
            sample_input_ids = sample_input_tokens.unsqueeze(0).to(device)# sample_input_ids.to(device)
            sample_expected = sample_input[0].unsqueeze(0).to(device)
            
            
            sample_outputs = model.generate(
                input_ids=sample_input_ids,
                do_sample=True,
                top_k=50,
                max_length=max_seq,
                top_p=0.95,
                num_return_sequences=1
            )
            
            
            gen_sample_input = tokenizer.decode(sample_input_ids[0], skip_special_tokens=False).replace("<|startoftext|>", "\n")
            gen_sample_input = gen_sample_input.replace("<s>", "").replace("</s>", "").replace("<pad>", "").replace("<sep>", "\t")
            
            gen_expected_input = tokenizer.decode(sample_expected[0], skip_special_tokens=False).replace("<|startoftext|>", "\n")
            gen_expected_input = gen_expected_input.replace("<s>", "").replace("</s>", "").replace("<pad>", "").replace("<sep>", "\t")
            
            for i, sample_output in enumerate(sample_outputs):
                gen_sample_output = tokenizer.decode(sample_output, skip_special_tokens=False).replace("<|startoftext|>", "\n")
                gen_sample_output = gen_sample_output.replace("<s>", "").replace("</s>", "").replace("<pad>", "").replace("<sep>", "\t")

                
                print(f'Example output for:\nreal:\t{gen_expected_input}\n\noutput:\t{gen_sample_output}')

            model.train()

            
    # Calculate the average loss over all of the batches.
    avg_train_loss = total_train_loss / len(train_dataloader)

    # Measure how long this epoch took.
    training_time = format_time(time.time() - t0)

    print()
    print(f'Average Training Loss: {avg_train_loss}. Epoch time: {training_time}')
    print()

    t0 = time.time()

    model.eval()

    total_eval_loss = 0
    nb_eval_steps = 0

    # Evaluate data for one epoch
    for batch in tqdm(validation_dataloader, total=len(validation_dataloader), position=0):

        b_input_ids = batch[0].to(device)
        b_labels = batch[0].to(device)
        b_masks = batch[1].to(device)

        with torch.no_grad():

            outputs = model(b_input_ids, attention_mask=b_masks, labels=b_labels)

            loss = outputs[0]

        batch_loss = loss.item()
        total_eval_loss += batch_loss

    avg_val_loss = total_eval_loss / len(validation_dataloader)

    validation_time = format_time(time.time() - t0)

    print()
    print(f'Validation loss: {avg_val_loss}. Validation Time: {validation_time}')
    print()

    # Record all statistics from this epoch.
    training_stats.append(
        {
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        }
    )

print(f'Total training took {format_time(time.time()-total_t0)}')

  0%|          | 0/5 [00:00<?, ?it/s]

Beginning epoch 1 of 5


 36%|███▌      | 1000/2791 [03:37<06:35,  4.53it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Batch 1000 of 2791. Loss:3.022099733352661. Time:0:03:38


 36%|███▌      | 1001/2791 [03:38<11:39,  2.56it/s]

Example output for:
real:	چو رومی پس اندر هم آواز شد	چو گشتاسپ زان جایگه باز شد
بر قیصر آمد سپه تاخته	به پیروزی و گردن افراخته

output:	چو رومی پس اندر هم آواز شد	چو گشتاسپ زان جایگه باز شد
سکندر چو کشتی ز لشکر بگفت	پراندیشه شد و درد و آرام و جفت


 72%|███████▏  | 2000/2791 [07:19<02:55,  4.50it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Batch 2000 of 2791. Loss:2.7014827728271484. Time:0:07:20


 72%|███████▏  | 2001/2791 [07:20<04:53,  2.69it/s]

Example output for:
real:	چو شاه جهان نامه هاشان بخواند	ز گفتارشان در شگفتی بماند
به نامه هر اندام را زو یکی	صفت کرده بودند لیک اندکی

output:	چو شاه جهان نامه هاشان بخواند	ز گفتارشان در شگفتی بماند
ازان پس همه مهتران و به داد	نماندی بد او را که دارد نژاد


100%|██████████| 2791/2791 [10:15<00:00,  4.53it/s]



Average Training Loss: 3.266349526469915. Epoch time: 0:10:16



100%|██████████| 311/311 [00:18<00:00, 16.43it/s]
 20%|██        | 1/5 [10:34<42:18, 634.62s/it]


Validation loss: 2.591286558813604. Validation Time: 0:00:19

Beginning epoch 2 of 5


 36%|███▌      | 1000/2791 [03:42<06:39,  4.48it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Batch 1000 of 2791. Loss:2.5276567935943604. Time:0:03:43


 36%|███▌      | 1001/2791 [03:43<10:55,  2.73it/s]

Example output for:
real:	ز دژ تا بر او دو فرسنگ بود	دل مهتران زان سخن تنگ بود
همی هر کسی خواندند آفرین	ز دادار بر فر شاه زمین

output:	ز دژ تا بر او دو فرسنگ بود	دل مهتران زان سخن تنگ بود
همی رفت پویان به نزدیک اوی	چو تاریک شد جان تاریک اوی


 72%|███████▏  | 2000/2791 [07:25<02:55,  4.51it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Batch 2000 of 2791. Loss:2.129385471343994. Time:0:07:26


 72%|███████▏  | 2001/2791 [07:26<04:56,  2.67it/s]

Example output for:
real:	به قیصر بگو گر نداری خرد	ز رای تو مغز تو کیفر برد
اگر شیر جنگی بتازد بگور	کنامش کند گور و هم آب شور

output:	به قیصر بگو گر نداری خرد	ز رای تو مغز تو کیفر برد
کسی زین سخن گفت و او را بگوی	چنین گفت کین رستم پوی پوی


 36%|███▌      | 1000/2791 [03:42<06:35,  4.53it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Batch 1000 of 2791. Loss:1.8285737037658691. Time:0:03:43


 36%|███▌      | 1001/2791 [03:43<10:59,  2.71it/s]

Example output for:
real:	پسر داشتی یک گرانمایه مرد	جهاندیده و دیده هر گرم و سرد
سواری جهاندیده نامش کهرم	رسیده بسی بر سرش سرد و گرم

output:	پسر داشتی یک گرانمایه مرد	جهاندیده و دیده هر گرم و سرد
چنین گفت رستم ز راه سپاه	که این رزمجویی نیاید به راه


 72%|███████▏  | 2000/2791 [07:25<02:54,  4.54it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Batch 2000 of 2791. Loss:1.862695574760437. Time:0:07:26


 72%|███████▏  | 2001/2791 [07:26<04:52,  2.70it/s]

Example output for:
real:	بفرمود تا زاد فرخ برفت	به نزدیک آن لشکر شاه تفت
چنین بود پیغام نزد سپاه	که از پیش بودی مرا نیک خواه

output:	بفرمود تا زاد فرخ برفت	به نزدیک آن لشکر شاه تفت
بفرمود بهرام کو را سپرد	سر از رزم کسری پر از آب کرد


100%|██████████| 2791/2791 [10:21<00:00,  4.49it/s]



Average Training Loss: 1.8432859960998185. Epoch time: 0:10:22



100%|██████████| 311/311 [00:18<00:00, 16.45it/s]
 60%|██████    | 3/5 [31:56<21:18, 639.33s/it]


Validation loss: 1.8994825043478962. Validation Time: 0:00:19

Beginning epoch 4 of 5


 36%|███▌      | 1000/2791 [03:42<06:40,  4.47it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Batch 1000 of 2791. Loss:1.4615832567214966. Time:0:03:42


 36%|███▌      | 1001/2791 [03:42<10:57,  2.72it/s]

Example output for:
real:	زده حربه ها را بن اندر زمین	همان نیز ژوپین و شمشیر کین
به خاشاک کرده سر چاه کور	که مردم ندیدی نه چشم ستور

output:	زده حربه ها را بن اندر زمین	همان نیز ژوپین و شمشیر کین
تو گر باهشی مشمر او را به دام	وگر چند باشد ترا روی دام


 72%|███████▏  | 2000/2791 [07:24<02:55,  4.50it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Batch 2000 of 2791. Loss:1.4794018268585205. Time:0:07:25


 72%|███████▏  | 2001/2791 [07:25<05:05,  2.59it/s]

Example output for:
real:	فرستادگان سپهدار چین	ز پیش جهانجوی شاه زمین
برفتند هر دو شده خاکسار	جهاندارشان رانده و کرده خوار

output:	فرستادگان سپهدار چین	ز پیش جهانجوی شاه زمین
ازویست شادی امید شاهی	به هر کارداری و هر کارداری


100%|██████████| 2791/2791 [10:20<00:00,  4.50it/s]



Average Training Loss: 1.4015129145137386. Epoch time: 0:10:21



100%|██████████| 311/311 [00:18<00:00, 16.44it/s]
 80%|████████  | 4/5 [42:35<10:39, 639.41s/it]


Validation loss: 1.64917712694579. Validation Time: 0:00:19

Beginning epoch 5 of 5


 72%|███████▏  | 2000/2791 [07:25<02:56,  4.49it/s]The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



Batch 2000 of 2791. Loss:1.1357437372207642. Time:0:07:25


 72%|███████▏  | 2001/2791 [07:25<05:04,  2.60it/s]

Example output for:
real:	پدر چون ورا دید خیره بماند	جهان آفرین را نهانی بخواند
بدو گفت ای شسته مغز از خرد	ز پرگوهران این کی اندر خورد

output:	پدر چون ورا دید خیره بماند	جهان آفرین را نهانی بخواند
گروی زره را گره تا گره	به گردنکشان و به گرز و گره


100%|██████████| 2791/2791 [10:21<00:00,  4.49it/s]



Average Training Loss: 1.0959248333283114. Epoch time: 0:10:22



100%|██████████| 311/311 [00:18<00:00, 16.44it/s]
100%|██████████| 5/5 [53:16<00:00, 639.24s/it]


Validation loss: 1.5425211325335733. Validation Time: 0:00:19

Total training took 0:53:16





Some Examples:

In [25]:
model.eval()

for i in range(10):
    sample_input = val_dataset[random.randint(0, len(val_dataset))]
    sample_input_tokens = sample_input[0][:torch.where(sample_input[0] == 6)[0][0]+1]
    sample_input_ids = sample_input_tokens.unsqueeze(0).to(device)# sample_input_ids.to(device)
    sample_expected = sample_input[0].unsqueeze(0).to(device)

    sample_outputs = model.generate(
            input_ids=sample_input_ids,
            do_sample=True,
            top_k=50,
            max_length=max_seq,
            top_p=0.95,
            num_return_sequences=1
    )


    gen_sample_input = tokenizer.decode(sample_input_ids[0], skip_special_tokens=False).replace("<|startoftext|>", "\n\t")
    gen_sample_input = gen_sample_input.replace("<s>", "").replace("</s>", "").replace("<pad>", "").replace("<sep>", "\t")

    gen_expected_input = tokenizer.decode(sample_expected[0], skip_special_tokens=False).replace("<|startoftext|>", "\n\t")
    gen_expected_input = gen_expected_input.replace("<s>", "").replace("</s>", "").replace("<pad>", "").replace("<sep>", "\t")

    for i, sample_output in enumerate(sample_outputs):
        gen_sample_output = tokenizer.decode(sample_output, skip_special_tokens=False).replace("<|startoftext|>", "\n\t")
        gen_sample_output = gen_sample_output.replace("<s>", "").replace("</s>", "").replace("<pad>", "").replace("<sep>", "\t")


    print(f'real:\t{gen_expected_input}\n\noutput:\t{gen_sample_output}\n\n\n\n')


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


real:	چو خورشید تابان ز بالا بگشت	چه آن دژ نمود و چه آن پهن دشت
	بکشتند ازیشان فزون از شمار	همی دود از آتش برآمد چوقار

output:	چو خورشید تابان ز بالا بگشت	چه آن دژ نمود و چه آن پهن دشت
	بخورد و بینداخت شیر دلیر	بپیش اندرون باره چون نره شیر






The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


real:	به هشتم بیامد به دشت شکار	خود و روزبه با سواری هزار
	همه دشت یکسر پر از گور دید	ز قربان کمان کیان برکشید

output:	به هشتم بیامد به دشت شکار	خود و روزبه با سواری هزار
	سپهدار ترکان بنه برنهاد	یکی درع پرمایه بر سر نهاد






The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


real:	گر ایدونک دانی که من کردم این	مرا خواند باید جهان آفرین
	ز گوینده بپذیر به دین اوی	بیاموز ازو راه و آیین اوی

output:	گر ایدونک دانی که من کردم این	مرا خواند باید جهان آفرین
	گر از نیکوی بایدم هردیم	ولیکن شنیدن مر او را همیم






The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


real:	پراگنده گشتند چون تیره شد	سرمیگساران ز می خیره شد
	چو برزد سنان آفتاب بلند	شب تیره گشت از درفشش نژند

output:	پراگنده گشتند چون تیره شد	سرمیگساران ز می خیره شد
	چنین گفت کامشب نباید غنود	سپهبد همی نیزه باید بسود






The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


real:	ز دریا به دریا سپه گسترید	که جایی کسی روی هامون ندید
	دو لشکر چو تنگ اندر آمد به گرد	زمین شد سیاه و هوا لاژورد

output:	ز دریا به دریا سپه گسترید	که جایی کسی روی هامون ندید
	بدان کارزاری که یابند هیچ	ز هر سو بسی رنج کوتاه هیچ






The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


real:	گر از گفتهٔ خویش باز آید اوی	بنزدیک ما رزم ساز آید اوی
	بفتراک بر بسته دارم کمند	کجا ژنده پیل اندرآرم ببند

output:	گر از گفتهٔ خویش باز آید اوی	بنزدیک ما رزم ساز آید اوی
	برآنم که او سر ز فرمان شاه	بیاری بیاید بدین رزمگاه






The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


real:	کنون با تو ای پهلوان سپاه	یکی دیگر افگند بازی براه
	جز از رنگ و چاره نداند همی	ز دانش سخن برفشاند همی

output:	کنون با تو ای پهلوان سپاه	یکی دیگر افگند بازی براه
	به نزدیک او شد که کاووس شاه	فزون کرد سوی سکندر نگاه






The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


real:	وگر من نیایم چو گودرز و گیو	بخواهد ز تو کینهٔ پور نیو
	برآمد برین کار یک روز و شب	و زین گفته بر شاه نگشاد لب

output:	وگر من نیایم چو گودرز و گیو	بخواهد ز تو کینهٔ پور نیو
	منوچهر بر میسره جای داشت	ز زابل به آمل همی جای داشت






The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


real:	بیایند هر سه به نزدیک من	شود روشن این شهر تاریک من
	شود شادمان دل به دیدارشان	ببینم روانهای بیدارشان

output:	بیایند هر سه به نزدیک من	شود روشن این شهر تاریک من
	سکندر بدو گفت کای نامجوی	دو لشکر بروی اندر آریم روی




real:	فرود آمد از باره گرگین چو گرد	سر اندریمان ز تن دور کرد
	بفتراک بربست و خود برنشست	نوند سوار نبرده بدست

output:	فرود آمد از باره گرگین چو گرد	سر اندریمان ز تن دور کرد
	برفتند زان بوم تا مرز روم	پراگنده گشتند زان مرز و بوم






In [40]:
import os

output_dir = './content/'
os.makedirs(output_dir, exist_ok=True)

torch.save(model, "./content/GPT_Poet.pth")
torch.save(tokenizer, "./content/tokenizer.pth")
torch.save(configuration, "./content/config.pth")

model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
configuration.save_pretrained(output_dir)


In [42]:
!zip -r content.zip content

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  adding: content/ (stored 0%)
  adding: content/config.json (deflated 51%)
  adding: content/vocab.json (deflated 73%)
  adding: content/tokenizer.json (deflated 79%)
  adding: content/special_tokens_map.json (deflated 79%)
  adding: content/tokenizer.pth (deflated 83%)
  adding: content/GPT_Poet.pth (deflated 10%)
  adding: content/merges.txt (deflated 74%)
  adding: content/added_tokens.json (stored 0%)
  adding: content/tokenizer_config.json (deflated 85%)
  adding: content/gpt2-fa-poetry/ (stored 0%)
  adding: content/gpt2-fa-poetry/tokenizer.json (deflated 79%)
  adding: content/model.safetensors (deflated 7%)
  adding: content/config.pth (deflated 47%)
  adding: content/generation_config.json (deflated 24%)
