In [None]:
import numpy as np
import pandas as pd

from keras import callbacks
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import LSTM, Input, TimeDistributed, Dense, Activation, RepeatVector, Embedding, GRU
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.utils import Sequence, to_categorical

In [None]:
!pip install transformers

In [None]:
!pip -q install --upgrade --no-cache-dir gdown

In [None]:
# download and unzip data

!gdown --id 173coyis770t5tu-xblDeQWaBgvOMBxMD

!unzip Persian_poems_corpus-master.zip
%rm -r __MACOSX

In [None]:
from transformers import AutoConfig, AutoTokenizer, TFAutoModel

# Single source of truth for the checkpoint name, so the config, tokenizer and
# weights cannot accidentally be loaded from different models.
BERT_CHECKPOINT = "HooshvareLab/bert-fa-base-uncased"

config = AutoConfig.from_pretrained(BERT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(BERT_CHECKPOINT)
model = TFAutoModel.from_pretrained(BERT_CHECKPOINT)

Downloading (…)lve/main/config.json:   0%|          | 0.00/440 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/1.20M [00:00<?, ?B/s]

Downloading (…)"tf_model.h5";:   0%|          | 0.00/963M [00:00<?, ?B/s]

Some layers from the model checkpoint at HooshvareLab/bert-fa-base-uncased were not used when initializing TFBertModel: ['mlm___cls', 'nsp___cls']
- This IS expected if you are initializing TFBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
All the layers of TFBertModel were initialized from the model checkpoint at HooshvareLab/bert-fa-base-uncased.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.


In [None]:
# Each pair is (poet key used in the corpus file name, binary class label).
# Writing them side by side keeps a poet and its label on the same line; the
# zip below splits the pairs back into the two parallel lists used later.
# NOTE(review): the notebook does not state what label 0 vs 1 stands for.
selected_poets, selected_label = map(list, zip(
    ('vahshi', 0),
    ('saadi', 1),
    ('ferdousi', 1),
    ('seyf', 0),
    ('sanaee', 0),
    ('parvin', 0),
    ('jami', 1),
    ('moulavi', 1),
    ('shabestari', 1),
    ('gilani', 0),
))

In [None]:
def tokenize(sentences):
  """Fit a new Keras ``Tokenizer`` on ``sentences`` and encode them with it.

  Args:
    sentences: the texts to fit on and encode; passed unchanged to
      ``Tokenizer.fit_on_texts`` and ``Tokenizer.texts_to_sequences``.

  Returns:
    A ``(sequences, tokenizer)`` tuple: the result of
    ``texts_to_sequences(sentences)`` and the fitted ``Tokenizer`` itself
    (created with default settings).
  """
  fitted = Tokenizer()
  fitted.fit_on_texts(sentences)
  encoded = fitted.texts_to_sequences(sentences)
  return encoded, fitted

In [None]:
# Read every selected poet's normalized file and build two parallel lists:
#   corp  - one list of tokens per line of verse
#   label - the poet's class label, repeated once per line of that poet
corp = []
label = []
for poet, poet_label in zip(selected_poets, selected_label):
  path = "Persian_poems_corpus-master/normalized/" + poet + "_norm.txt"
  # The corpus is Persian text: pin UTF-8 instead of relying on the platform
  # default encoding. The `with` block closes the file; no explicit close().
  with open(path, encoding="utf-8") as f:
    # The first two lines of each file are skipped
    # (presumably a header -- TODO confirm against the corpus files).
    poet_file = f.read().splitlines()[2:]

  # split(' ') is kept as-is: consecutive spaces therefore yield '' tokens.
  corp.extend(line.split(' ') for line in poet_file)
  label.extend([poet_label] * len(poet_file))

# Integer-encode the whole corpus and keep the fitted tokenizer.
text_tokenized, text_tokenizer = tokenize(corp)



In [None]:
# Summary statistics of the encoded corpus.

# Length (in tokens) of the longest encoded line.
max_len = max(map(len, text_tokenized))
# +1 because Keras' Tokenizer assigns word indices starting at 1.
vocab = len(text_tokenizer.word_index) + 1
# Total number of lines across ALL selected poets. (Previously this was
# len(poet_file), i.e. only the lines of the last poet that was loaded.)
data_size = len(text_tokenized)

print("max_len: ", max_len)
print("vocab: ", vocab)
print("data_size: ", data_size)

max_len:  18
vocab:  66498
data_size:  1281


In [None]:
# Group the corpus into non-overlapping windows of `input_len` consecutive
# lines; each window becomes one sample in `data` with one entry in `data_label`.
input_len = 10

# Pad the integer-encoded lines (not the raw string tokens in `corp`, which
# cannot be converted to an integer array) to a common length of `max_len`.
corp_pad = pad_sequences(text_tokenized, max_len, padding = "post")

# Number of complete windows; a trailing partial window is discarded.
steps = len(corp_pad) // input_len

data = []
data_label = []

for i in range(steps):
  start = i * input_len
  # Every i < steps is a full window, so no extra bounds check is needed.
  # (The old `<` guard dropped the last full window whenever the corpus
  # length was an exact multiple of input_len.)
  data.append(corp_pad[start : start + input_len])
  # A window takes the label of its first line; a window that straddles two
  # poets with different labels is labelled by the first one.
  data_label.append(label[start])

# `corp` and `label` are deliberately not deleted here: the next cell still
# displays `corp`, and deleting them made this cell impossible to re-run.

In [None]:
# Peek at the first few tokenized lines instead of rendering the whole corpus.
corp[:10]

[['آه', '', 'تاکی', 'ز', 'سفر', 'باز', 'نیایی', '', 'بازآ'],
 ['اشتیاق', 'تو', 'مرا', 'سوخت', 'کجایی', 'بازآ'],
 ['شده', 'نزدیک', 'که', 'هجران', 'تو', 'مارا', 'بکشد'],
 ['گرهمان', 'بر', 'سرخونریزی', 'مایی', '', 'بازآ'],
 ['کرده', 'ای', 'عهد', 'که', 'بازآیی', 'و', 'ما', 'را', 'بکشی'],
 ['وقت', 'آنست', 'که', 'لطفی', 'بنمایی', 'بازآ'],
 ['رفتی', 'و', 'باز', 'نمی', 'آیی', 'و', 'من', 'بی', 'تو', 'به', 'جان'],
 ['جان', 'من', 'اینهمه', 'بی', 'رحم', 'چرایی', 'بازآ'],
 ['وحشی', 'از', 'جرم', 'همین', 'کز', 'سر', 'آن', 'کو', 'رفتی'],
 ['گرچه', 'مستوجب', 'صد', 'گونه', 'جفایی', 'بازآ'],
 ['کشیده', 'عشق', 'در', 'زنجیر', 'جان', 'ناشکیبا', 'را'],
 ['نهاده', 'کار', 'صعبی', 'پیش', 'صبر', 'بند', 'فرسا', 'را'],
 ['توام', 'سررشته', 'داری', 'گر', 'پرم', 'سوی', 'تو', 'معذورم'],
 ['که', 'در', 'دست', 'اختیاری', 'نیست', 'مرغ', 'بند', 'بر', 'پا', 'را'],
 ['من', 'از', 'کافرنهادیهای', 'عشق', '', 'این', 'رشک', 'می', 'بینم'],
 ['که', 'با', 'یعقوب', 'هم', 'خصمی', 'بود', 'جان', 'زلیخا', 'را'],
 ['به', 'گنجشگان', 'میالا