## RNN training

In [175]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np

%run preprocessing.ipynb

15
Char [['m', 's', '>', 'l', 'p'], ['w', 'm', 'n'], ['H', 'n', 'v'], ['w', 'h', 'w'], ['q', 'A', 'd', 'r'], ['E', 'l', 'Y'], ['A', 'l', '<', 'T', 'E', 'A', 'm'], ['>', 'w'], ['A', 'l', 'k', 's', 'w', 'p'], ['>', 'w'], ['A', 'l', 'E', 't', 'q'], ['v', 'm'], ['A', 'f', 't', 'q', 'r'], ['f', 'E', 'j', 'z'], ['E', 'n'], ['k', 'l'], ['*', 'l', 'k'], ['l', 'm'], ['y', 'j', 'z', 'h'], ['A', 'l', 'S', 'w', 'm'], ['>', 'S', 'l', 'A']]
Diac [['a', 'o', 'a', 'a', 'N'], ['a', 'a', 'o'], ['a', 'i', 'a'], ['a', 'u', 'a'], ['a', ' ', 'i', 'N'], ['a', 'a', ' '], [' ', ' ', 'i', 'o', 'a', ' ', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'a', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'i'], ['u', '~a'], [' ', 'o', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'], ['a', 'o'], ['u', '~i'], ['a', 'i', 'a'], ['a', 'o'], ['u', 'o', 'i', 'i'], [' ', ' ', '~a', 'o', 'u'], ['a', 'o', 'F', ' ']]


### Get unique characters and diacritics

In [176]:
# Symbol inventories for the model. `unique_characters` are the letter symbols
# that get encoded as inputs; `unique_diacritics` are the per-letter labels
# (' ' is treated downstream as "no diacritic", '~x' entries are two-symbol labels).
unique_characters = [
	'A', 'b', 't', 'v', 'j', 'H', 'x', 'd', '*', 'r', 'z', 's', '$', 'S', 'D', 'T', 'Z', 'E', 'g',
	'f', 'q', 'k', 'l', 'm', 'n', 'h', 'w', 'y', "'", '>', '<', '&', '}', '|', '{', '`', 'Y', 'p',
]
unique_diacritics = [
	'o', 'a', 'i', '~', 'u', 'N', 'F', 'K', ' ',
	'~a', '~i', '~u', '~N', '~F', '~K',
]

# Symbol -> integer id lookups; ids follow list order, starting at 0.
# NOTE(review): id 0 is therefore a real symbol ('A' / 'o'), and 0 is also the
# default fill value of pad_sequences, so padding is indistinguishable from
# those symbols. Reserving 0 for padding would need matching changes in the
# encoding, model and decoding cells.
char_to_index = dict(zip(unique_characters, range(len(unique_characters))))
diacritic_to_index = dict(zip(unique_diacritics, range(len(unique_diacritics))))

# Vocabulary sizes: input_dim of the Embedding layer and units of the softmax layer.
num_chars = len(unique_characters)
num_classes = len(unique_diacritics)

print(num_chars, num_classes, sep="\n")

38
15


### Data

In [177]:
def readFile(path):
	"""Read a UTF-8 text file and return its lines as a list of strings.

	Each line has leading/trailing whitespace (including the newline) stripped.
	Blank lines are kept and appear as empty strings.
	"""
	with open(path, 'r', encoding='utf-8') as handle:
		return [raw_line.strip() for raw_line in handle]

def encode_and_pad(sentences, token_to_index, maxlen):
	"""Encode every word of every sentence as integer ids, right-padded to `maxlen`.

	sentences: list of sentences, each a list of words, each a list of tokens.
	token_to_index: mapping from token to integer id (KeyError on unknown tokens).
	maxlen: number of columns in the result; shorter words are padded at the end.

	Returns one 2-D array with a row per word, sentences stacked in order.
	"""
	per_sentence = []
	for words in sentences:
		encoded_words = [[token_to_index[token] for token in word] for word in words]
		per_sentence.append(pad_sequences(encoded_words, maxlen=maxlen, padding='post'))
	return np.concatenate(per_sentence, axis=0)


PATH = "../dataset/train.txt"
corpus = readFile(PATH)

X_train = []
y_train = []
max_sequence_length = 0

# Clean each sentence in the corpus and split it into per-word characters / diacritics
for sentence in corpus:
	clean_sentence = run_buckwalter(sentence)
	char_list, diacritics_list = extract_labels(clean_sentence)

	# A line that yields no words (e.g. a blank line) would make the max() below
	# raise ValueError on an empty sequence, so skip it.
	if len(char_list) == 0:
		continue

	X_train.append(char_list)
	y_train.append(diacritics_list)

	# Longest word seen so far; every word is padded to this length below.
	max_sequence_length = max(max_sequence_length, max(len(word) for word in char_list))

if not X_train:
	raise ValueError(f"No words could be extracted from {PATH}")


# Encoding and Padding (one row per word, not per sentence)
X_train_padded = encode_and_pad(X_train, char_to_index, max_sequence_length)
print(X_train_padded.shape)

y_train_padded = encode_and_pad(y_train, diacritic_to_index, max_sequence_length)
print(y_train_padded.shape)

# Inputs and labels must stay aligned word-for-word.
assert X_train_padded.shape == y_train_padded.shape, "character / diacritic arrays are misaligned"


(2102068, 13)
(2102068, 13)


### Hyperparameters

In [180]:
# Size of each character embedding vector (passed to Embedding as output_dim).
embedding_dim = 100
# Number of epochs passed to model.fit.
num_epochs = 3

#### Model definition and training

In [181]:
# Character-level tagger: embedding -> LSTM (one output per position) -> softmax
# over the diacritic classes for every character position of a word.
model = Sequential([
	Embedding(input_dim=num_chars, output_dim=embedding_dim, input_length=max_sequence_length),
	LSTM(units=100, return_sequences=True),
	Dense(units=num_classes, activation='softmax'),
])

# Labels are integer class ids, hence the sparse categorical loss.
model.compile(
	loss='sparse_categorical_crossentropy',
	optimizer='adam',
	metrics=['accuracy'],
)

# TODO: no validation data is used yet; pass validation_data=(X_val, y_val) here
# and call model.evaluate(X_val, y_val) once a held-out split exists.
model.fit(X_train_padded, y_train_padded, epochs=num_epochs)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.src.callbacks.History at 0x1b024aa0ad0>

### Testing

In [186]:
sentence = "فأشبه ما لو استعمل نفسه في الإجارة أي وما تحصل من إجارته"
clean_sentence = run_buckwalter(sentence)
char_list, _ = extract_labels(clean_sentence)

# Fail with a readable message instead of a bare KeyError during encoding.
unknown_chars = sorted({char for word in char_list for char in word if char not in char_to_index})
if unknown_chars:
	raise ValueError(f"Characters not seen during training: {unknown_chars}")

index_to_diacritic = {idx: diacritic for diacritic, idx in diacritic_to_index.items()}

diacritized_words = []
if len(char_list) > 0:
	X_test_sequences = [[char_to_index[char] for char in word] for word in char_list]
	# truncating='post' keeps the *leading* characters of an over-long word, so
	# prediction position i always refers to character i of that word
	# (the default, 'pre', would drop the beginning of the word instead).
	X_test = pad_sequences(
		X_test_sequences, maxlen=max_sequence_length, padding='post', truncating='post'
	)

	# Make predictions: one row of class probabilities per character position.
	predictions = model.predict(X_test)
	predicted_ids = np.argmax(predictions, axis=-1)

	# Walk the same cleaned characters that were fed to the model, so each
	# predicted diacritic is attached to the character it was predicted for.
	for word_chars, word_prediction in zip(char_list, predicted_ids):
		pieces = []
		for position, char in enumerate(word_chars):
			pieces.append(char)
			# Characters beyond max_sequence_length have no prediction; emit them bare.
			if position < len(word_prediction):
				diacritic = index_to_diacritic[int(word_prediction[position])]
				if diacritic != " ":
					pieces.append(diacritic)
		# Convert the assembled word back to Arabic script in one call.
		diacritized_words.append(buckwalter.untransliterate("".join(pieces)))

# Join with single spaces (no trailing separator).
final_output = " ".join(diacritized_words)

print(final_output)


فَأَشَبَهَ مَا لَوْ اسْتَعْمَلَ نَفْسِهِ فَي الْإِجَارَةِ أَيْ وَمَا تَحْصِلَ مَنْ إجَارَتُهُ 
