# Installing libraries

In [None]:
! pip install wikipedia
! pip install reportlab
! pip install arabic_reshaper
! pip install python-bidi

# Importing libraries

In [None]:
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import wikipedia
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow.keras.utils as ku
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Image, Spacer
from reportlab.lib.styles import getSampleStyleSheet 
from reportlab.pdfbase import pdfmetrics
from reportlab.pdfbase.ttfonts import TTFont
from reportlab.lib.styles import ParagraphStyle
import arabic_reshaper
from bidi.algorithm import get_display
# Register the TrueType font under the name 'Arabic'; generate_results_pdf
# refers to it through ParagraphStyle(fontName='Arabic').
# NOTE(review): the font is read from an absolute Kaggle input path, so this
# cell presumably only runs where that dataset is attached — confirm.
pdfmetrics.registerFont(TTFont('Arabic', '/kaggle/input/pdf-font/arfonts-bahij-tanseek-pro.ttf'))

# Function to generate documents

In [None]:
def get_documents():
    """Download the Arabic Wikipedia article used as the training corpus.

    Searches Arabic Wikipedia for "صلاح الدين" (Saladin), loads the page of
    the first search hit, keeps at most the first 100,000 characters of its
    content and prints the first 1,000 characters as a preview.

    Returns:
        str: The (possibly truncated) article text.

    Raises:
        ValueError: If the search returns no results.
    """
    # Set the language to Arabic
    wikipedia.set_lang("ar")

    # Search for the term "صلاح الدين" (Saladin) on Wikipedia
    query = "صلاح الدين"
    results = wikipedia.search(query)

    # Fail with an explicit message instead of an IndexError on results[0]
    if not results:
        raise ValueError(f"Wikipedia search for {query!r} returned no results")

    # Assumes the first result is the most relevant.
    # The title comes straight from the search results, so it is already an
    # exact page title; auto_suggest=False stops the library from replacing it
    # with a "suggested" (possibly different) title.
    page = wikipedia.page(results[0], auto_suggest=False)

    # Limit the extracted content to the first 100,000 characters
    text = page.content[:100000]

    # Print the first 1,000 characters of the extracted content
    print(text[:1000] + '\n')

    return text

# Function to build chars dictionaries

In [None]:
def get_chars_dictionaries(text):
    """Build the character vocabulary of ``text`` and both lookup tables.

    Args:
        text: The string whose distinct characters form the vocabulary.

    Returns:
        tuple: ``(chars, char_indices, indices_char)`` where ``chars`` is the
        sorted list of distinct characters, ``char_indices`` maps each
        character to its position in ``chars`` and ``indices_char`` is the
        inverse mapping.
    """
    # Distinct characters, in sorted order
    chars = list(set(text))
    chars.sort()

    # Fill the forward and the inverse lookup in a single pass
    char_indices = {}
    indices_char = {}
    for position, symbol in enumerate(chars):
        char_indices[symbol] = position
        indices_char[position] = symbol

    return chars, char_indices, indices_char

# Function to prepare data as X & y

In [None]:
def prepare_data(text, window=40):
    """Cut ``text`` into fixed-length character windows and their next characters.

    The text is split on "." and every piece is stripped of surrounding
    whitespace. Within each non-empty piece a window of ``window`` characters
    slides one character at a time; the character right after the window is
    the target. Pieces that are not longer than ``window`` contribute nothing.

    Args:
        text: Source text.
        window: Number of characters in each input sequence.

    Returns:
        tuple: ``(X, y)`` — list of input substrings and list of the
        characters that follow them, in matching order.
    """
    X, y = [], []

    stripped_pieces = (piece.strip() for piece in text.split("."))
    for sentence in stripped_pieces:
        # Nothing to extract from an empty piece
        if not sentence:
            continue

        for start in range(len(sentence) - window):
            stop = start + window
            X.append(sentence[start:stop])  # the input window
            y.append(sentence[stop])        # the character that follows it

    return X, y

# Function to Convert characters to One Hot representation

In [None]:
def convert_to_OHV(X, y, maxlen, chars, char_indices):
    """One-hot encode input sequences and their target characters.

    Args:
        X: Sequence of input strings.
        y: Sequence of target characters, ``y[i]`` belonging to ``X[i]``.
        maxlen: Size of the time axis of the encoded inputs.
        chars: The character vocabulary (only its length is used).
        char_indices: Mapping from character to its one-hot position.

    Returns:
        tuple: ``(X_OHV, y_OHV)`` — boolean arrays of shape
        ``(len(X), maxlen, len(chars))`` and ``(len(y), len(chars))``.
    """
    vocab_size = len(chars)

    # Everything starts as False; only the positions of seen characters are switched on
    X_OHV = np.zeros((len(X), maxlen, vocab_size), dtype=bool)
    y_OHV = np.zeros((len(y), vocab_size), dtype=bool)

    for row, sequence in enumerate(X):
        # Mark the character found at every time step of this sequence
        for step, symbol in enumerate(sequence):
            X_OHV[row, step, char_indices[symbol]] = True

        # Mark the target character of this sequence
        y_OHV[row, char_indices[y[row]]] = True

    return X_OHV, y_OHV

# Function to build models

In [None]:
def build_char_model(model_parameters, model_number, maxlen, chars, h=100, drop_rate=0.1):
    """Build and compile one character-level model and record its settings.

    The network is: bidirectional SimpleRNN with ``h`` units returning the
    full sequence, bidirectional SimpleRNN with ``2 * h`` units, dropout,
    then a softmax layer with one output per character.

    Args:
        model_parameters: Dict updated in place; ``model_parameters[model_number]``
            is set to ``[h, drop_rate]``.
        model_number: Key used in ``model_parameters`` and passed to ``plot_model``.
        maxlen: Number of time steps of the one-hot input.
        chars: Character vocabulary (only its length is used).
        h: Units of the first recurrent layer.
        drop_rate: Rate of the dropout layer.

    Returns:
        The compiled Keras model.
    """
    layers = tf.keras.layers
    vocab_size = len(chars)

    # First recurrent block keeps the whole sequence so the second one can consume it
    first_rnn = layers.Bidirectional(
        layers.SimpleRNN(h, return_sequences=True),
        input_shape=(maxlen, vocab_size),
    )
    # Second recurrent block uses twice as many units
    second_rnn = layers.Bidirectional(layers.SimpleRNN(h * 2))
    dropout = layers.Dropout(drop_rate)
    # Probability distribution over the characters
    output = layers.Dense(vocab_size, activation='softmax')

    model = tf.keras.Sequential([first_rnn, second_rnn, dropout, output])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Remember the settings under the model number for the report
    model_parameters[model_number] = [h, drop_rate]

    print(f"\nModel{model_number}")
    print(f"Model Parameters: No. Hidden Neurons = {h}, Drop Rate = {drop_rate}\n")

    # plot_model is this notebook's helper that renders and shows the architecture
    plot_model(model, model_number)

    return model

# Function to generate characters using the trained language model

In [None]:
def generate_char_text(model, text, maxlen, chars, char_indices, indices_char, temperature=0.5, num_chars=400):
    """Generate text character by character with a trained model.

    A random window of ``maxlen`` characters from ``text`` is used as the seed
    (and printed). For every generated character the current window is one-hot
    encoded, the model's predicted distribution is re-scaled by ``temperature``
    and the next character is sampled from it; the window then slides by one.

    Args:
        model: Object with ``predict(array, verbose=0)`` returning, per input,
            one probability per character.
        text: Text the seed window is drawn from.
        maxlen: Length of the input window.
        chars: Character vocabulary (only its length is used).
        char_indices: Mapping from character to one-hot position.
        indices_char: Mapping from one-hot position to character.
        temperature: Sampling temperature; must be positive.
        num_chars: Number of characters to generate (defaults to 400).

    Returns:
        str: The seed followed by the generated characters.

    Raises:
        ValueError: If ``temperature`` is not positive or ``text`` is too
            short to draw a seed window from.
    """
    if temperature <= 0:
        raise ValueError("temperature must be greater than 0")
    if len(text) <= maxlen + 1:
        raise ValueError("text must be longer than maxlen + 1 characters to pick a seed")

    # Randomly select a starting index in the text
    start_index = np.random.randint(0, len(text) - maxlen - 1)

    # The sliding input window starts as the seed; the full output starts with it too
    window = text[start_index: start_index + maxlen]
    total_generated_text = window

    print("Seed:", window)

    for _ in range(num_chars):
        # One-hot encode the current window
        sampled = np.zeros((1, maxlen, len(chars)))
        for t, char in enumerate(window):
            sampled[0, t, char_indices[char]] = 1.

        preds = model.predict(sampled, verbose=0)[0]

        # Work in float64: np.random.choice checks that p sums to 1 with a
        # tolerance that float32 rounding can exceed.
        preds = np.asarray(preds, dtype=np.float64)

        # Temperature re-scaling. Zero probabilities become -inf logits and end
        # up with probability 0 again, so the divide warning is only noise.
        with np.errstate(divide='ignore'):
            logits = np.log(preds) / temperature
        # Subtracting the maximum keeps exp() from underflowing to all zeros at
        # small temperatures; it cancels out in the normalisation.
        exp_preds = np.exp(logits - np.max(logits))
        probs = exp_preds / np.sum(exp_preds)

        next_index = np.random.choice(len(chars), p=probs)
        next_char = indices_char[next_index]

        # Slide the window: append the new character, drop the oldest one
        window = (window + next_char)[1:]
        total_generated_text += next_char

    return total_generated_text

# Function to generate PDF containing the results

In [None]:
def generate_results_pdf(model_parameters, generated_text, image_dir="/kaggle/working", output_path="Results.pdf"):
    """Assemble the report flowables for a group of models.

    For every model a heading, its parameters, the architecture image
    (``model_plot{i}.png``), the training-history image
    (``Model_history_{i}.png``) and the generated text are added, followed by
    a conclusion section. The generated text is also printed.

    Args:
        model_parameters: Dict mapping model number to a two-item list. For
            numbers above 3 the items are reported as embedding dimension and
            hidden neurons, otherwise as hidden neurons and drop rate.
        generated_text: Dict mapping model number to the generated string.
        image_dir: Directory the two images of each model are loaded from.
        output_path: File name handed to ``SimpleDocTemplate``.

    Returns:
        tuple: ``(pdf_components, doc)`` — the list of flowables and the
        document template. The document is not built here.
    """
    # Custom style using the font registered under the name 'Arabic'
    arabic_style = ParagraphStyle(name='ArabicStyle', fontName='Arabic', fontSize=12)

    # Create a PDF document
    doc = SimpleDocTemplate(output_path, pagesize=letter)

    # Define the styles for the document
    styles = getSampleStyleSheet()

    pdf_components = []

    for i, params in model_parameters.items():

        heading1 = Paragraph(f"Model {i}:", styles["Heading1"])

        # Model numbers above 3 are the word-level models, whose stored values
        # are [embedding dimension, hidden neurons] instead of [hidden neurons, drop rate]
        if i > 3:
            text = f"Model Parameters: Embedding dimension = {params[0]}, No. Hidden Neurons = {params[1]}"
        else:
            text = f"Model Parameters: No. Hidden Neurons = {params[0]}, Drop Rate = {params[1]}"
        para1 = Paragraph(text, styles["Normal"])

        # Architecture plot and training-history plot of this model
        img = Image(f"{image_dir}/model_plot{i}.png", width=300, height=500)
        acc = Image(f"{image_dir}/Model_history_{i}.png", width=300, height=300)

        print(generated_text[i])

        # Reshape the Arabic letters and apply the bidi algorithm before handing the text to the Paragraph
        shaped_text = get_display(arabic_reshaper.reshape(generated_text[i]))

        pdf_components += [
            heading1,
            Spacer(1, 20),
            para1,
            Spacer(1, 20),
            img,
            Spacer(1, 20),
            acc,
            Spacer(1, 20),
            Paragraph("Generated Text:", styles["Heading1"]),
            Spacer(1, 20),
            Paragraph(shaped_text, arabic_style),
            Spacer(1, 20),
        ]

    # Conclusion heading, styled like the per-model headings
    heading2 = Paragraph("Conclusion:", styles["Heading1"])

    # The wording depends on whether this is the group numbered up to 3 or the
    # group numbered from 4. Using the highest key avoids an IndexError when
    # fewer than three models are passed.
    if max(model_parameters, default=0) < 4:
        text = "The first model seems to outperform the other two models based on the accuracy and generated text."
    else:
        text = "The third model seems to outperform the other two models based on the accuracy and generated text."
    para2 = Paragraph(text, styles["Normal"])

    return pdf_components + [heading2] + [para2] + [Spacer(1, 20)] , doc

# Function to plot model history

In [None]:
def plot_history(history, save_path=None):
    """Plot the training accuracy per epoch and optionally save the figure.

    Args:
        history: Object whose ``history['accuracy']`` holds the accuracy values.
        save_path: If truthy, the figure is saved there before it is shown.
    """
    accuracy_per_epoch = history.history['accuracy']

    # Accuracy curve with its labels
    plt.plot(accuracy_per_epoch)
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Model Accuracy')
    plt.legend(['Train'], loc='upper left')

    # Saving has to happen before show()
    if save_path:
        plt.savefig(save_path)

    plt.show()

# Function to plot model architecture

In [None]:
def plot_model(model, model_number):
    """Render the model architecture to ``model_plot{model_number}.png`` and show it.

    The image is written to the current working directory and read back from
    that same path, so the function does not depend on the working directory
    being ``/kaggle/working``.

    Args:
        model: The Keras model to draw.
        model_number: Number used in the image file name.
    """
    image_path = f'model_plot{model_number}.png'

    # Plot the model architecture using plot_model function from Keras
    tf.keras.utils.plot_model(model, to_file=image_path, show_shapes=True, show_layer_names=True)

    # Load the image that was just written
    img = mpimg.imread(image_path)

    # Set the figure size (width, height) in inches for the plot
    plt.figure(figsize=(10, 11))

    # Plot the image without axis labels
    plt.imshow(img)
    plt.axis('off')
    plt.show()

# Function to fit LMs

In [None]:
def fit_char_models(models, X_OHV, y_OHV, text, maxlen, chars, char_indices, indices_char, epochs=25, batch_size=64):
    """Train every character-level model, plot its history and generate text.

    Models are numbered from 1 in list order. For each one the training
    history is saved as ``Model_history_{number}.png`` and a text sample is
    generated from a random seed window of ``text``.

    Args:
        models: Compiled models to train.
        X_OHV: One-hot encoded input sequences.
        y_OHV: One-hot encoded target characters.
        text: Corpus the generation seed is drawn from.
        maxlen: Length of the input window.
        chars: Character vocabulary.
        char_indices: Mapping from character to one-hot position.
        indices_char: Mapping from one-hot position to character.
        epochs: Number of training epochs per model.
        batch_size: Training batch size.

    Returns:
        dict: Model number -> generated text.
    """
    generated_text = {}

    for model_number, model in enumerate(models, start=1):

        # Print the model number for clarity
        print(f"\nModel{model_number}:")

        # Fit the model on the training data
        history = model.fit(X_OHV, y_OHV, batch_size=batch_size, epochs=epochs)

        # Print a blank line for clarity
        print("\n")

        # Plot the training history for the current model
        plot_history(history, save_path=f"Model_history_{model_number}.png")

        # generate_char_text picks its own random seed window from the corpus
        generated_text[model_number] = generate_char_text(model, text, maxlen, chars, char_indices, indices_char)

    return generated_text

# Function to train character level LMs

In [None]:
def train_char_LMs():
    """Run the complete character-level experiment.

    Downloads the text, builds the character vocabulary, prepares and one-hot
    encodes 40-character windows, builds three models with different hidden
    sizes and dropout rates, trains them and assembles the report components.

    Returns:
        tuple: ``(pdf_components, doc)`` as produced by ``generate_results_pdf``.
    """
    # Length of every input window
    maxlen = 40

    # Corpus and vocabulary
    text = get_documents()
    chars, char_indices, indices_char = get_chars_dictionaries(text)

    # Windows and targets, then their one-hot encoding
    X, y = prepare_data(text, window=maxlen)
    X_OHV, y_OHV = convert_to_OHV(X, y, maxlen, chars, char_indices)

    # Filled by build_char_model with the settings of every model
    model_parameters = {}

    # (model number, keyword overrides); an empty dict means the defaults
    model_settings = [
        (1, {}),
        (2, {"h": 150, "drop_rate": 0.5}),
        (3, {"h": 200, "drop_rate": 0.7}),
    ]
    models = [
        build_char_model(model_parameters, number, maxlen, chars, **overrides)
        for number, overrides in model_settings
    ]

    # Train the models and generate text
    generated_text = fit_char_models(models, X_OHV, y_OHV, text, maxlen, chars, char_indices, indices_char)

    # Report components plus the document object
    components, document = generate_results_pdf(model_parameters, generated_text)
    return components, document

In [None]:
# Run the character-level experiment. `doc` is kept because the last cell of
# the notebook calls doc.build(...) on it to write the PDF.
pdf_components, doc = train_char_LMs()

# Function to preprocess data for word level LM

In [None]:
def preprocess_data(text, num_words=15000):
    """Split the text into sentences and fit a word tokenizer on them.

    Args:
        text: The corpus.
        num_words: Vocabulary cap passed to the Keras ``Tokenizer``
            (defaults to 15000).

    Returns:
        tuple: ``(total_words, tokenizer, sentences)`` where ``total_words``
        is the number of classes the model needs, ``tokenizer`` is the fitted
        tokenizer and ``sentences`` is the list of text pieces.
    """
    # Split the text into sentences using the Arabic comma (،) as the delimiter
    sentences = text.split("،")

    # Build the word index from the sentences
    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(sentences)

    # +1 for the reserved 0 index
    vocabulary_size = len(tokenizer.word_index) + 1

    # word_index always contains every word, but the tokenizer only emits
    # indices below num_words. Capping here keeps the embedding and output
    # layers from being sized for indices that can never occur.
    total_words = min(vocabulary_size, num_words) if num_words else vocabulary_size

    return total_words, tokenizer, sentences

# Function to prepare word level training data as X & y

In [None]:
def prepare_training_data(sentences, tokenizer, total_words):
    """Turn sentences into padded n-gram inputs and one-hot next-word labels.

    Every tokenized sentence contributes all of its prefixes of length two or
    more. The prefixes are left-padded to the longest one; the last column is
    the label and the remaining columns are the input.

    Args:
        sentences: List of text pieces.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences``.
        total_words: Number of classes for the one-hot labels.

    Returns:
        tuple: ``(Xs, ys, maxlen)`` — input array, one-hot label array and the
        padded sequence length (input plus label).
    """
    # Sentences as lists of integer word indices
    tokenized_sentences = tokenizer.texts_to_sequences(sentences)

    # All prefixes with at least two tokens: the last token is what the model must predict
    input_sequences = [
        tokens[:end]
        for tokens in tokenized_sentences
        for end in range(2, len(tokens) + 1)
    ]

    # Longest prefix decides the padded length
    maxlen = max(len(sequence) for sequence in input_sequences)

    # Pad at the front so the label is always the last column
    padded = np.array(pad_sequences(input_sequences, maxlen=maxlen, padding='pre'))

    Xs = padded[:, :-1]
    labels = padded[:, -1]

    # One-hot encode the labels
    ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

    return Xs, ys, maxlen

# Function to build word level LMs

In [None]:
def build_word_model(model_number, model_parameters, maxlen, vocab_length, d=20, h=100):
    """Build and compile one word-level model and record its settings.

    The network is: embedding of size ``d``, bidirectional SimpleRNN with
    ``h`` units returning the full sequence, bidirectional SimpleRNN with
    ``2 * h`` units, dropout of 0.1, then a softmax layer over the vocabulary.

    Args:
        model_number: Key used in ``model_parameters`` and passed to ``plot_model``.
        model_parameters: Dict updated in place; ``model_parameters[model_number]``
            is set to ``[d, h]``.
        maxlen: Padded sequence length including the label; the model input
            has ``maxlen - 1`` positions.
        vocab_length: Size of the embedding vocabulary and of the output layer.
        d: Embedding dimension.
        h: Units of the first recurrent layer.

    Returns:
        The compiled Keras model.
    """
    layers = tf.keras.layers

    # Input: the word indices preceding the word to predict
    inputs = layers.Input((maxlen-1,))

    # Word indices -> dense vectors
    embedded = layers.Embedding(vocab_length, d)(inputs)

    # First recurrent block keeps the whole sequence for the second one
    rnn_sequence = layers.Bidirectional(layers.SimpleRNN(h, return_sequences=True))(embedded)

    # Second recurrent block with twice as many units
    rnn_summary = layers.Bidirectional(layers.SimpleRNN(h * 2))(rnn_sequence)

    # Dropout for regularization
    regularized = layers.Dropout(0.1)(rnn_summary)

    # Softmax over the vocabulary: a probability for every possible next word
    outputs = layers.Dense(vocab_length, activation='softmax')(regularized)

    model = tf.keras.models.Model(inputs, outputs)

    # Categorical cross-entropy matches the one-hot next-word labels
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Remember the settings under the model number for the report
    model_parameters[model_number] = [d, h]

    print(f"\nModel{model_number}")
    print(f"Model Parameters: Embedding dimension = {d}, No. Hidden Neurons = {h}\n")

    # Plot the model architecture
    plot_model(model, model_number)

    return model

# Function to generate words using the trained LM

In [None]:
def generate_word_text(seed_text, next_words, model, max_sequence_len, tokenizer, temperature=0.5):
    """Extend ``seed_text`` word by word with a trained model.

    In each step the current text is tokenized and padded to
    ``max_sequence_len``, the model's predicted distribution is re-scaled by
    ``temperature`` and the next word is sampled from it. Indices that have no
    entry in ``tokenizer.word_index`` (such as the padding index 0) add nothing.

    Args:
        seed_text: Text to start from.
        next_words: Number of sampling steps.
        model: Object with ``predict(array, verbose=0)`` returning, per input,
            one probability per word index.
        max_sequence_len: Length the token list is padded to.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and ``word_index``.
        temperature: Sampling temperature; must be positive.

    Returns:
        str: The extended text passed through ``str.title()``.

    Raises:
        ValueError: If ``temperature`` is not positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be greater than 0")

    # Reverse lookup built once instead of scanning word_index for every generated word
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(next_words):
        # Convert the current text to a sequence of integer word indices
        token_list = tokenizer.texts_to_sequences([seed_text])[0]

        # Pad the sequence to the maximum sequence length
        token_list = pad_sequences([token_list], maxlen=max_sequence_len)

        preds = model.predict(token_list, verbose=0)[0]

        # Work in float64: np.random.choice checks that p sums to 1 with a
        # tolerance that float32 rounding can exceed.
        preds = np.asarray(preds, dtype=np.float64)

        # Temperature re-scaling. Zero probabilities become -inf logits and end
        # up with probability 0 again, so the divide warning is only noise.
        with np.errstate(divide='ignore'):
            logits = np.log(preds) / temperature
        # Subtracting the maximum keeps exp() from underflowing to all zeros at
        # small temperatures; it cancels out in the normalisation.
        exp_preds = np.exp(logits - np.max(logits))
        probs = exp_preds / np.sum(exp_preds)

        # Sample the next word based on the adjusted probabilities
        predicted_index = np.random.choice(len(probs), p=probs)

        # Append the sampled word; skip indices without a word so no stray spaces pile up
        output_word = index_to_word.get(predicted_index, "")
        if output_word:
            seed_text += " " + output_word

    # title() capitalizes the first letter of every word that has cased letters
    return seed_text.title()

# Function to fit word LMs

In [None]:
def fit_word_models(models, Xs, ys, maxlen, tokenizer):
    """Train every word-level model, plot its history and generate text.

    Word-level models are numbered from 4 in list order; that number is used
    for the progress label, the history image ``Model_history_{number}.png``
    and the key of the returned dictionary.

    Args:
        models: Compiled models to train.
        Xs: Padded input sequences.
        ys: One-hot encoded next-word labels.
        maxlen: Padded sequence length including the label.
        tokenizer: Fitted tokenizer used for generation.

    Returns:
        dict: Model number -> text generated from the seed "صلاح الدين".
    """
    generated_text = {}

    for model_number, model in enumerate(models, start=4):

        # Label with the same number used for the image and the report
        # (previously printed 1..3 for the models that are numbered 4..6)
        print(f"\nModel{model_number}:")

        # Fit the model on the training data
        history = model.fit(Xs, ys, batch_size=64, epochs=100)

        # Print a blank line for clarity
        print("\n")

        # Plot the training history for the current model
        plot_history(history, save_path=f"Model_history_{model_number}.png")

        # Generate 100 words; the model input has maxlen - 1 positions
        seed_text = "صلاح الدين"
        generated_text[model_number] = generate_word_text(seed_text, 100, model, maxlen-1, tokenizer)

    return generated_text

# Function to train word level LMs

In [None]:
def train_word_LLMs():
    """Run the complete word-level experiment.

    Downloads the text, fits the tokenizer, prepares the n-gram training data,
    builds three models (numbered 4 to 6) with different embedding and hidden
    sizes, trains them and assembles the report components.

    Returns:
        list: The report components; the document object returned by
        ``generate_results_pdf`` is discarded.
    """
    # Corpus, tokenizer and sentence list
    text = get_documents()
    total_words, tokenizer, sentences = preprocess_data(text)

    # Padded inputs, one-hot labels and padded length
    Xs, ys, maxlen = prepare_training_data(sentences, tokenizer, total_words)

    # Filled by build_word_model with the settings of every model
    model_parameters = {}

    # (model number, keyword overrides); an empty dict means the defaults
    model_settings = [
        (4, {}),
        (5, {"d": 200, "h": 150}),
        (6, {"d": 300, "h": 200}),
    ]
    models = [
        build_word_model(number, model_parameters, maxlen, total_words, **overrides)
        for number, overrides in model_settings
    ]

    # Train the models and generate text
    generated_text = fit_word_models(models, Xs, ys, maxlen, tokenizer)

    # Only the components are needed from the (components, document) pair
    components, _unused_doc = generate_results_pdf(model_parameters, generated_text)
    return components

In [None]:
# Run the word-level experiment. Only its report components are kept; they
# are appended to the character-level ones when the PDF is built.
pdf_components_2 = train_word_LLMs()

# Final conclusion

In [None]:
# Closing section of the report. The heading gets an explicit heading style so
# it is rendered like the "Model N:" headings instead of as body text.
heading = Paragraph("Final Conclusion:", getSampleStyleSheet()["Heading1"])
para = Paragraph("The third Word level Language model is performing very well when generating text and the text seems to be kind of reasonable.", getSampleStyleSheet()["Normal"])

# Outputting the PDF

In [None]:
# Write the PDF: character-level results, word-level results, then the final
# conclusion. Uses `doc`, `pdf_components`, `pdf_components_2`, `heading` and
# `para` from the cells above, so those cells must have been run first.
doc.build(pdf_components + pdf_components_2 + [heading] + [Spacer(1, 20)] + [para])