In [None]:
!pip install tensorflow
!pip install keras
!pip install transformers
!pip install arabicnlp

In [None]:
import os
import re
import json
import torch
import random
import pickle
import string
import math
import numpy as np
import pandas as pd
import nltk
import itertools
import unicodedata
from collections import Counter
import torch.nn as nn
import torch.optim as optim
from google.colab import drive
import torch.nn.functional as F
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
from torchvision.transforms import ToTensor
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from transformers import AutoTokenizer, GPT2Tokenizer, BertForSequenceClassification, BertTokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.normalizers import Sequence, NFKC
from tokenizers.processors import TemplateProcessing
from transformers import AutoTokenizer
from tokenizers import Tokenizer, normalizers, pre_tokenizers, processors

In [None]:
# Mount Google Drive at /content/drive/ so the corpus file opened later in this notebook is reachable.
drive.mount('/content/drive/')

Mounted at /content/drive/


## Arabic Poem

## Preprocessing

In [None]:
def analyze_text(text, top_n=10):
    """Compute simple corpus statistics for a block of text.

    ASCII punctuation (``string.punctuation``) is deleted first; characters
    outside that set, such as the Arabic comma, are left in place. Words are
    then obtained by splitting on whitespace and lines by splitting on
    newline characters.

    Args:
        text: The raw text to analyze.
        top_n: How many of the most frequent words to report (default 10).

    Returns:
        A tuple ``(num_lines, num_words, num_unique_words, most_common_words)``
        where ``most_common_words`` is a list of ``(word, count)`` pairs in
        descending order of count.
    """
    # Escape the punctuation before putting it in a character class. Unescaped,
    # the backslash in string.punctuation merely escapes the "]" that follows
    # it, so backslashes themselves would never be removed.
    punctuation_pattern = "[" + re.escape(string.punctuation) + "]"
    text = re.sub(punctuation_pattern, "", text)

    # Word-level statistics.
    words = text.split()
    num_words = len(words)
    word_freq = Counter(words)
    num_unique_words = len(word_freq)
    most_common_words = word_freq.most_common(top_n)

    # Line-level statistics (a trailing newline counts as one extra empty line).
    num_lines = len(text.split("\n"))

    return num_lines, num_words, num_unique_words, most_common_words

# Read the whole poem corpus into memory as one string.
with open('/content/drive/MyDrive/praise-poems.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Compute the corpus statistics once, then give each value its own name.
corpus_stats = analyze_text(text)
num_lines, num_words, num_unique_words, most_common_words = corpus_stats

# Report each statistic on its own labelled line.
for label, value in (
    ("Number of lines:", num_lines),
    ("Number of words:", num_words),
    ("Number of unique words:", num_unique_words),
    ("Most common words:", most_common_words),
):
    print(label, value)

Number of lines: 11346
Number of words: 52581
Number of unique words: 31692
Most common words: [('في', 1048), ('مِن', 426), ('من', 390), ('ما', 385), ('عَلى', 291), ('لا', 286), ('بِهِ', 243), ('مِنَ', 214), ('مِنْ', 192), ('وَلا', 176)]


In [None]:
def clean_arabic_text(text):
    """Reduce ``text`` to Arabic letters, ASCII digits and whitespace.

    The text is stripped of ASCII punctuation, then of every character that
    is not in the range U+0621-U+064A, an ASCII digit, or whitespace, and is
    finally passed through ``remove_diacritics`` and ``normalize_text``.

    Args:
        text: The raw text to clean.

    Returns:
        The cleaned text.
    """
    # re.escape keeps "\", "]" and "^" literal inside the character class.
    text = re.sub("[" + re.escape(string.punctuation) + "]", "", text)
    # Raw string so that "\s" is not an invalid string escape (a SyntaxWarning
    # on recent Python versions) and reaches the regex engine unchanged.
    # Latin letters are outside this whitelist, so they need no separate pass.
    text = re.sub(r"[^ء-ي0-9\s]", "", text)
    # remove diacritics
    text = remove_diacritics(text)
    # normalize text
    text = normalize_text(text)

    return text

def remove_diacritics(text):
    """Return ``text`` with every Unicode combining mark removed.

    The text is first decomposed with NFKD so that marks are separate code
    points. Note that this also splits precomposed letters: for example
    U+0623 (alef with hamza above) decomposes to alef plus a combining hamza,
    and the combining hamza is then dropped along with the short vowels.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    # A combining class of 0 marks a base (non-combining) character.
    base_chars = [ch for ch in decomposed if unicodedata.combining(ch) == 0]
    return "".join(base_chars)

def normalize_text(text):
    """Return ``text`` in Unicode NFC (canonical composed) form."""
    return unicodedata.normalize("NFC", text)

In [None]:
# Clean the raw corpus text loaded earlier; the result feeds the tokenization step.
cleaned_text = clean_arabic_text(text)

# Whitespace tokenizer: each run of non-whitespace characters becomes one token.
def tokenize_text(text):
    """Split ``text`` on whitespace and return the resulting list of words."""
    return text.split()

# Tokenize the cleaned corpus with the tokenize_text function currently in scope.
tokenized_text = tokenize_text(cleaned_text)

In [None]:
# clean the text (recomputed here; same call as in the previous cell)
cleaned_text = clean_arabic_text(text)
# load the pretrained BERT tokenizer published under "aubmindlab/bert-base-arabert"
tokenizer = BertTokenizer.from_pretrained("aubmindlab/bert-base-arabert")

def tokenize_text(text):
    """Tokenize ``text`` with the notebook-level ``tokenizer`` and return the tokens.

    Defining this function replaces any ``tokenize_text`` bound earlier in the
    notebook namespace.
    """
    return tokenizer.tokenize(text)

# tokenize the cleaned corpus with the tokenizer-backed tokenize_text defined above
tokenized_text = tokenize_text(cleaned_text)
# map the token strings to vocabulary ids; add_special_tokens=False leaves out the tokenizer's special tokens
encoded_text = tokenizer.encode(tokenized_text, add_special_tokens=False)

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/717k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/637 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/578 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (69711 > 512). Running this sequence through the model will result in indexing errors


In [None]:
# Keep only the first 512 ids; 512 is the model's maximum sequence length reported in the tokenizer warning.
encoded_text = encoded_text[:512]

In [None]:
# Display a single token (position 2) as a quick sanity check of the tokenizer output.
tokenized_text[2]

'خير'

## Create a vocabulary mapping and generate input and target sequences
`input_sequences` and `target_sequences` hold the training data: each input sequence is a list of `window_size` vocabulary indices covering a sliding window of tokens, and the corresponding target is the single index of the token that follows that window.

In [None]:
# Give every distinct token an integer id in order of first appearance.
vocab = {}
for token in tokenized_text:
    # len(vocab) is evaluated before insertion, so a new token gets the next free id.
    vocab.setdefault(token, len(vocab))

vocabulary_size = len(vocab)
window_size = 10  # number of consecutive tokens that make up one input sequence

# Encode the corpus once, then slide a fixed-size window over the ids:
# each window is an input and the id right after it is the target.
token_ids = [vocab[token] for token in tokenized_text]
num_windows = len(token_ids) - window_size
input_sequences = [token_ids[start:start + window_size] for start in range(num_windows)]
target_sequences = [token_ids[start + window_size] for start in range(num_windows)]

In [None]:
# Report how many distinct tokens were assigned an id in the vocabulary.
print("unique_tokens:",vocabulary_size)

unique_tokens: 8091


# Training

In [None]:
# Hold out 20% of the (input, target) pairs for validation; the fixed
# random_state makes the shuffle reproducible.
split_parts = train_test_split(input_sequences, target_sequences, test_size=0.2, random_state=42)

# The split comes back as [train inputs, val inputs, train targets, val targets];
# turn each part into a numpy array.
train_inputs, val_inputs, train_targets, val_targets = (np.array(part) for part in split_parts)

In [None]:
# Number of training examples after the split.
len(train_inputs)

55760

In [None]:
# Number of validation examples after the split.
len(val_targets)

13941

In [None]:
# Keep the inputs as integer token ids instead of scaling them. The model's
# first layer is an Embedding, which looks rows up by integer index: dividing
# the ids by vocabulary_size would put every value in [0, 1), and all tokens
# would collapse onto index 0 once the layer casts its input to integers.
# The cast below is idempotent, so re-running this cell is harmless.
train_inputs = np.asarray(train_inputs).astype("int32")
val_inputs = np.asarray(val_inputs).astype("int32")

In [None]:
# One-hot encode the integer targets (one column per vocabulary entry), the
# label format used with a categorical cross-entropy loss.
train_targets, val_targets = (
    to_categorical(label_ids, num_classes=vocabulary_size)
    for label_ids in (train_targets, val_targets)
)

In [None]:
# Model architecture: token ids -> 100-dimensional embeddings -> LSTM with
# 128 units -> softmax distribution over the whole vocabulary.
model = Sequential([
    Embedding(vocabulary_size, 100, input_length=window_size),
    LSTM(128),
    Dense(vocabulary_size, activation='softmax'),
])

In [None]:
# Configure training: Adam optimizer minimizing categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy')
# Train for 10 epochs in mini-batches of 128, scoring the validation split
# after each epoch. The History object returned by fit is the cell's output.
model.fit(
    train_inputs,
    train_targets,
    batch_size=128,
    epochs=10,
    validation_data=(val_inputs, val_targets),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7faccc4ab8b0>

In [None]:
# Generate a poem
seed_input = "شمس"
generated_poem = seed_input
num_generated_tokens = 10

# Build the id -> token lookup once instead of scanning the vocabulary on every step.
index_to_token = {token_id: token for token, token_id in vocab.items()}
# Id used for unknown seed tokens and for left-padding a short context.
# The vocabulary has no dedicated padding entry, so this falls back to 0.
fallback_id = vocab.get('<unk>', 0)

# Encode the seed once and keep the running context as token ids. Re-tokenizing
# the joined token strings on every step would split sub-word pieces again and
# corrupt the window.
context_ids = [vocab.get(token, fallback_id) for token in tokenize_text(seed_input)]

for _ in range(num_generated_tokens):
    # Use the most recent window_size ids (a longer seed is truncated from the
    # left) and left-pad when fewer are available.
    window_ids = context_ids[-window_size:]
    window_ids = [fallback_id] * (window_size - len(window_ids)) + window_ids
    # Feed integer ids: the Embedding layer indexes by id, so scaling the ids
    # into [0, 1) would turn every position into index 0.
    model_input = np.array(window_ids, dtype="int32").reshape(1, window_size)
    # predict the next token (greedy: take the most probable id)
    predictions = model.predict(model_input, verbose=0)
    next_id = int(np.argmax(predictions, axis=-1)[0])
    next_token = index_to_token[next_id]
    # append the predicted token to the poem and to the context for the next step
    generated_poem += " " + next_token
    context_ids.append(next_id)

print(generated_poem)

شمس ال ال ال ال ال ال ال ال ال ال


In [None]:
def calculate_perplexity(model, input_sequences, target_labels, epsilon=1e-7):
    """Compute perplexity and mean cross-entropy of ``model`` on labelled inputs.

    Args:
        model: Object with a ``predict(inputs)`` method returning one row of
            class probabilities per input sequence.
        input_sequences: Input sequences, convertible to a numpy array.
        target_labels: Integer class index of the correct next token for each
            input sequence.
        epsilon: Lower bound applied to the target probabilities before the
            logarithm, so that a probability of exactly 0 gives a large finite
            loss rather than an infinite one.

    Returns:
        A tuple ``(perplexity, average_loss)`` where ``average_loss`` is the
        mean negative log-probability of the targets and ``perplexity`` is its
        exponential.
    """
    # convert input sequences and target labels to numpy arrays
    input_sequences = np.asarray(input_sequences)
    target_labels = np.asarray(target_labels)
    # generate predictions using the model
    predictions = np.asarray(model.predict(input_sequences))
    # pick, for each example, the probability assigned to its correct label
    target_probabilities = predictions[np.arange(len(target_labels)), target_labels]
    # clip before the log: log(0) would make the loss and perplexity infinite
    cross_entropy = -np.log(np.clip(target_probabilities, epsilon, 1.0))
    # calculate average loss
    average_loss = np.mean(cross_entropy)
    # calculate perplexity
    perplexity = np.exp(average_loss)

    return perplexity, average_loss

# Score the model on every window built from the corpus: input_sequences is the
# full list from before the train/validation split, so both subsets are covered.
perplexity, avg_loss = calculate_perplexity(model, input_sequences, target_sequences)
print("Perplexity:", perplexity)
print("Avg_LOss:", avg_loss)

Perplexity: 587.0803
Avg_LOss: 6.3751616
