### Semitic_preprocessor

In [1]:
import numpy as np

class Semitic_preprocessor:
  @staticmethod
  def string_vectorizer(string, vocabulary, max_len):
    empty = Semitic_preprocessor.empty_matrix(max_len, vocabulary)
    for i,l in enumerate(string):
      empty[i, vocabulary.index(l)] = 1
    return empty


  @staticmethod
  def create_vocabulary(list_of_words):
    vocabulary = set(''.join(list_of_words))
    vocabulary = sorted([i for i in vocabulary])
    return vocabulary


  @staticmethod
  def empty_matrix(max_len, vocabulary):
    array = []
    for i in range(max_len):
      array.append([0] * len(vocabulary))
    return np.array(array)

  @staticmethod
  def generate_report(model, model_history):
    pass

### Data

In [2]:
!git clone https://github.com/delmedigo88/Semitic.git

fatal: destination path 'Semitic' already exists and is not an empty directory.


In [3]:
# load all neccessary packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras import models, layers
from sklearn.preprocessing import OneHotEncoder

In [4]:
# define type of model

model_type = 'third_char'

if model_type == 'first_char':
  letter_index = 0
elif model_type == 'second_char':
  letter_index = 1
else:
  letter_index = 2
max_word_length = 8

In [5]:
# Read txt files directly from raw file on Github

df = pd.read_csv("Semitic/data/arabic_words_clean.csv",
                 # separate by # the word and root
                 encoding_errors= "ignore")

df_2side=df.sample(frac=.05, random_state=1)
df.drop(df_2side.index, axis=0, inplace=True)
df_2side.to_excel('word-root-table_side.xlsx', index=False)

## One Letter Prediction

In [6]:
# text data to matrix representation

sp = Semitic_preprocessor()

arabic_vocabulary = sp.create_vocabulary(df['word'])
test_df = df.copy()

test_df['word_as_matrix'] = test_df['word'].apply(lambda x: sp.string_vectorizer(x, arabic_vocabulary, max_word_length))
test_df['first_letter'] = test_df['root'].apply(lambda x: x[letter_index])
test_df['letter_as_matrix'] = test_df['first_letter'].apply(lambda x: sp.string_vectorizer(x, arabic_vocabulary, 1))

X = np.stack(test_df['word_as_matrix'].to_numpy())
y = np.stack(test_df['letter_as_matrix'].to_numpy())

print(f' The shape of X is: {X.shape}')
print(f' The shape of y is: {y.shape}')

 The shape of X is: (838490, 8, 36)
 The shape of y is: (838490, 1, 36)


In [7]:
# define hyper-parameters

hyper_param = dict(num_epochs = 10,
                   loss_func ='mean_squared_error',
                   lr = .001, batch_size= 64, test_size = .15,
                   activation_func = 'relu', output_activation = 'linear',
                   validation_split = .15,
                   random_state = 42)

# Split the data into training and testing| sets

X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = hyper_param['test_size'],
                                                    random_state= hyper_param['random_state'])

In [8]:
# build model's architechture
mlp_model = models.Sequential([

    layers.Flatten(input_shape=(X.shape[1], X.shape[2])),  # Flatten the 3D input
    layers.Dense(256, activation= hyper_param['activation_func']),
    layers.Dense(128, activation=hyper_param['activation_func']),
    layers.Dense(64, activation=hyper_param['activation_func']),
    layers.Dense(32, activation=hyper_param['activation_func']),
    layers.Dense(1 * X.shape[2], activation=hyper_param['output_activation']),
    layers.Reshape((1, X.shape[2]))

])

# define optimizer

adam_optimizer = tf.keras.optimizers.Adam(learning_rate=hyper_param['lr'])

# compile the model

mlp_model.compile(optimizer = adam_optimizer,
                  loss= hyper_param['loss_func'],
                  metrics=['Accuracy'])

In [9]:
# Fit the model

mlp_model_logs = mlp_model.fit(X_train, y_train,
                                                 epochs = hyper_param['num_epochs'],
                                                 batch_size= hyper_param['batch_size'],
                                                 validation_split= hyper_param['validation_split'])

# Evaluate the model on the test set

loss, accuracy = mlp_model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 0.0007600419921800494, Test Accuracy: 0.9838281273841858


In [10]:
# Build Training Report:

from datetime import datetime

now = datetime.now()

current_date = now.strftime("%Y-%m-%d")
current_hour = now.hour
current_minute = now.minute

training_summary = f"Report Time: {current_date} {current_hour:02d}:{current_minute:02d}\n"
training_summary += f"model type: {model_type}\n"
training_summary += f"max word length: {max_word_length}\n"
training_summary += "\nHyper-parameters:\n"
for key, value in hyper_param.items():
  training_summary += f"{key}: {value}\n"

training_summary += "\nArchitechture:\n"

string ="1 layer: reshpaing layer\n"
i = 0
for layer in mlp_model.layers:
  if isinstance(layer, layers.Dense):
    i+=1
    mlp_model.layers[i].units
    tmp = f"{i+1} layer: {mlp_model.layers[i].units} units\n"
    string += tmp
string +="7 layer: reshpaing layer\n"
training_summary += string
training_summary += "\nModel score:\n"
training_summary += f'Test Loss: {loss:.5f}, Test Accuracy: {accuracy:.2%}\n'
training_summary += "\nTraining logs:\n"
training_summary += str(mlp_model_logs.history)
print(training_summary)

with open(f'training_summary_{current_date} {current_hour:02d}:{current_minute:02d}.txt', 'w') as f:
    f.write(training_summary)

Report Time: 2024-03-25 12:18
model type: third_char
max word length: 8

Hyper-parameters:
num_epochs: 10
loss_func: mean_squared_error
lr: 0.001
batch_size: 64
test_size: 0.15
activation_func: relu
output_activation: linear
validation_split: 0.15
random_state: 42

Architechture:
1 layer: reshpaing layer
2 layer: 256 units
3 layer: 128 units
4 layer: 64 units
5 layer: 32 units
6 layer: 36 units
7 layer: reshpaing layer

Model score:
Test Loss: 0.00076, Test Accuracy: 98.38%

Training logs:
{'loss': [0.00394314294680953, 0.0017324592918157578, 0.001142406719736755, 0.0009365206933580339, 0.000802602618932724, 0.0007158611551858485, 0.0006488523213192821, 0.0005953171639703214, 0.0005407070857472718, 0.0005042597185820341], 'Accuracy': [0.9046512842178345, 0.9601590037345886, 0.9753568768501282, 0.9799391627311707, 0.9830028414726257, 0.9850480556488037, 0.9863207340240479, 0.9876644015312195, 0.9889255166053772, 0.9897195100784302], 'val_loss': [0.0025602017994970083, 0.0014148243935778

In [11]:
# save model

import pickle

with open(f'mlp_{model_type}_{current_date} {current_hour:02d}:{current_minute:02d}.pkl', 'wb') as f:
    pickle.dump(mlp_model, f)

In [12]:
# download all files

from google.colab import files

files.download(f'/content/mlp_{model_type}_{current_date} {current_hour:02d}:{current_minute:02d}.pkl')

files.download(f'/content/training_summary_{current_date} {current_hour:02d}:{current_minute:02d}.txt')

files.download('/content/word-root-table_side.xlsx')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>