### Semitic_preprocessor

In [1]:
import numpy as np

class Semitic_preprocessor:
  @staticmethod
  def string_vectorizer(string, vocabulary, max_len):
    empty = Semitic_preprocessor.empty_matrix(max_len, vocabulary)
    for i,l in enumerate(string):
      empty[i, vocabulary.index(l)] = 1
    return empty


  @staticmethod
  def create_vocabulary(list_of_words):
    vocabulary = set(''.join(list_of_words))
    vocabulary = sorted([i for i in vocabulary])
    return vocabulary


  @staticmethod
  def empty_matrix(max_len, vocabulary):
    array = []
    for i in range(max_len):
      array.append([0] * len(vocabulary))
    return np.array(array)

  @staticmethod
  def generate_report(model, model_history):
    pass

### Data

In [2]:
!git clone https://github.com/delmedigo88/Semitic.git

Cloning into 'Semitic'...
remote: Enumerating objects: 73, done.[K
remote: Counting objects: 100% (73/73), done.[K
remote: Compressing objects: 100% (66/66), done.[K
remote: Total 73 (delta 12), reused 60 (delta 5), pack-reused 0[K
Receiving objects: 100% (73/73), 25.47 MiB | 11.53 MiB/s, done.
Resolving deltas: 100% (12/12), done.


In [3]:
# load all neccessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers
from sklearn.preprocessing import OneHotEncoder

In [4]:
# define type of model

model_type = 'full_char'
max_word_length = 8

In [5]:
# Read txt files directly from raw file on Github

df = pd.read_csv("Semitic/data/arabic_words_clean.csv",
                 # separate by # the word and root
                 encoding_errors= "ignore")

df_2side=df.sample(frac=.05, random_state=1)
df.drop(df_2side.index, axis=0, inplace=True)
#df_2side.to_excel('word-root-table_side.xlsx', index=False)

## Three Letters Prediction

In [6]:
# text data to matrix representation

sp = Semitic_preprocessor()

arabic_vocabulary = sp.create_vocabulary(df['word'])
test_df = df.copy()

test_df['word_as_matrix'] = test_df['word'].apply(lambda x: sp.string_vectorizer(x, arabic_vocabulary, max_word_length))
test_df['first_letter'] = test_df['root'].apply(lambda x: x[:3])
test_df['letter_as_matrix'] = test_df['first_letter'].apply(lambda x: sp.string_vectorizer(x, arabic_vocabulary, 3))

X = np.stack(test_df['word_as_matrix'].to_numpy())
y = np.stack(test_df['letter_as_matrix'].to_numpy())

print(f' The shape of X is: {X.shape}')
print(f' The shape of y is: {y.shape}')

 The shape of X is: (838490, 8, 36)
 The shape of y is: (838490, 3, 36)


In [7]:
# define hyper-parameters

hyper_param = dict(num_epochs = 15,
                   loss_func ='mean_squared_error',
                   lr = .01, batch_size= 64, test_size = .15,
                   activation_func = 'relu', output_activation = 'linear',
                   validation_split = .1,
                   random_state = 42)

# Split the data into training and testing| sets

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = hyper_param['test_size'],
                                                    random_state= hyper_param['random_state'])

# reshape data for LSTM

X_train_reshaped = X_train.reshape(X_train.shape[0], 1, X_train.shape[1] * len(arabic_vocabulary))
y_train_reshaped = y_train.reshape(y_train.shape[0], 1, y_train.shape[1] * len(arabic_vocabulary))
X_test_reshaped = X_test.reshape(X_test.shape[0], 1, X_test.shape[1] * len(arabic_vocabulary))
y_test_reshaped = y_test.reshape(y_test.shape[0], 1, y_test.shape[1] * len(arabic_vocabulary))

print(f"X_train shape: {X_train_reshaped.shape}")
print(f"y_train shape: {y_train_reshaped.shape}")

X_train shape: (712716, 1, 288)
y_train shape: (712716, 1, 108)


In [13]:
# build LSTM architechture
from tensorflow.keras.layers import LSTM, Bidirectional, Dropout, Reshape, Dense
from tensorflow.keras.layers import RepeatVector, TimeDistributed
from tensorflow.keras.models import Sequential

model = Sequential()
model.add(LSTM(100, input_shape=(8, 36)))
model.add(RepeatVector(3))
model.add(LSTM(50, return_sequences=True))


model.add(TimeDistributed(Dense(36)))


adam_optimizer = tf.keras.optimizers.Adam(learning_rate=hyper_param['lr'])

# compile the model

model.compile(optimizer = adam_optimizer,
                  loss= 'mse',
                  metrics=['accuracy'])

In [14]:
# Fit the model


# Fit the model

model.fit(X_train, y_train,
          epochs =15, batch_size= 64, validation_split = .1)

# Evaluate the model on the test set

loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 0.0011485518189147115, Test Accuracy: 0.9767546057701111


In [17]:
test_word_maktub = ''

In [16]:
# Build Training Report:

from datetime import datetime

now = datetime.now()

current_date = now.strftime("%Y-%m-%d")
current_hour = now.hour
current_minute = now.minute

training_summary = f"Report Time: {current_date} {current_hour:02d}:{current_minute:02d}\n"
training_summary += f"model type: {model_type} LSTM \n"
training_summary += f"max word length: {max_word_length}\n"
training_summary += "\nHyper-parameters:\n"
for key, value in hyper_param.items():
  training_summary += f"{key}: {value}\n"

training_summary += "\nArchitechture:\n"


string = ""
i = 0
for layer in model.layers:
  if isinstance(layer, layers.Dense):
    i+=1
    model.layers[i].units
    tmp = f"{i+1} layer: {model.layers[i].units} units\n"
    string += tmp
training_summary += string
training_summary += "\nModel score:\n"
training_summary += f'Test Loss: {loss:.5f}, Test Accuracy: {accuracy:.2%}\n'
training_summary += "\nTraining logs:\n"
training_summary += str(model.history)

training_summary += '''Epoch 1/15
10023/10023 [==============================] - 134s 13ms/step - loss: 0.0035 - accuracy: 0.9262 - val_loss: 0.0019 - val_accuracy: 0.9595
Epoch 2/15
10023/10023 [==============================] - 128s 13ms/step - loss: 0.0017 - accuracy: 0.9647 - val_loss: 0.0017 - val_accuracy: 0.9647
Epoch 3/15
10023/10023 [==============================] - 129s 13ms/step - loss: 0.0015 - accuracy: 0.9696 - val_loss: 0.0014 - val_accuracy: 0.9701
Epoch 4/15
10023/10023 [==============================] - 129s 13ms/step - loss: 0.0013 - accuracy: 0.9724 - val_loss: 0.0014 - val_accuracy: 0.9715
Epoch 5/15
10023/10023 [==============================] - 132s 13ms/step - loss: 0.0013 - accuracy: 0.9742 - val_loss: 0.0013 - val_accuracy: 0.9735
Epoch 6/15
10023/10023 [==============================] - 128s 13ms/step - loss: 0.0012 - accuracy: 0.9754 - val_loss: 0.0013 - val_accuracy: 0.9735
Epoch 7/15
10023/10023 [==============================] - 128s 13ms/step - loss: 0.0012 - accuracy: 0.9766 - val_loss: 0.0013 - val_accuracy: 0.9740
Epoch 8/15
10023/10023 [==============================] - 129s 13ms/step - loss: 0.0011 - accuracy: 0.9772 - val_loss: 0.0012 - val_accuracy: 0.9752
Epoch 9/15
10023/10023 [==============================] - 130s 13ms/step - loss: 0.0011 - accuracy: 0.9779 - val_loss: 0.0012 - val_accuracy: 0.9763
Epoch 10/15
10023/10023 [==============================] - 127s 13ms/step - loss: 0.0011 - accuracy: 0.9786 - val_loss: 0.0012 - val_accuracy: 0.9759
Epoch 11/15
10023/10023 [==============================] - 127s 13ms/step - loss: 0.0011 - accuracy: 0.9788 - val_loss: 0.0012 - val_accuracy: 0.9770
Epoch 12/15
10023/10023 [==============================] - 133s 13ms/step - loss: 0.0010 - accuracy: 0.9793 - val_loss: 0.0011 - val_accuracy: 0.9775
Epoch 13/15
10023/10023 [==============================] - 130s 13ms/step - loss: 0.0010 - accuracy: 0.9795 - val_loss: 0.0012 - val_accuracy: 0.9765
Epoch 14/15
10023/10023 [==============================] - 127s 13ms/step - loss: 0.0010 - accuracy: 0.9797 - val_loss: 0.0011 - val_accuracy: 0.9770
Epoch 15/15
10023/10023 [==============================] - 129s 13ms/step - loss: 9.9580e-04 - accuracy: 0.9802 - val_loss: 0.0012 - val_accuracy: 0.9760
3931/3931 [==============================] - 18s 4ms/step - loss: 0.0011 - accuracy: 0.9768
Test Loss: 0.0011485518189147115, Test Accuracy: 0.9767546057701111'''
print(training_summary)

with open(f'training_summary_{current_date} {current_hour:02d}:{current_minute:02d}.txt', 'w') as f:
    f.write(training_summary)

Report Time: 2024-03-27 17:17
model type: full_char LSTM 
max word length: 8

Hyper-parameters:
num_epochs: 10
loss_func: mean_squared_error
lr: 0.01
batch_size: 64
test_size: 0.15
activation_func: relu
output_activation: linear
validation_split: 0.1
random_state: 42

Architechture:

Model score:
Test Loss: 0.00115, Test Accuracy: 97.68%

Training logs:
<keras.src.callbacks.History object at 0x7f24644168c0>Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test Loss: 0.0011485518189147115, Test Accuracy: 0.9767546057701111


In [22]:
model.layers

[<keras.src.layers.rnn.lstm.LSTM at 0x7f632c655630>,
 <keras.src.layers.reshaping.repeat_vector.RepeatVector at 0x7f632c666c20>,
 <keras.src.layers.rnn.lstm.LSTM at 0x7f632c6839d0>,
 <keras.src.layers.rnn.time_distributed.TimeDistributed at 0x7f632c674100>]

In [17]:
# save model

import pickle

with open(f'lstm_{model_type}_{current_date} {current_hour:02d}:{current_minute:02d}.pkl', 'wb') as f:
    pickle.dump(model, f)

In [18]:
# download all files

from google.colab import files

files.download(f'/content/lstm_{model_type}_{current_date} {current_hour:02d}:{current_minute:02d}.pkl')

files.download(f'/content/training_summary_{current_date} {current_hour:02d}:{current_minute:02d}.txt')

#files.download('/content/word-root-table_side.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>