## RNN training

In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np
from gensim.models import FastText

%run preprocessing.ipynb


15
Char [['m', 's', '>', 'l', 'p'], ['w', 'm', 'n'], ['H', 'n', 'v'], ['w', 'h', 'w'], ['q', 'A', 'd', 'r'], ['E', 'l', 'Y'], ['A', 'l', '<', 'T', 'E', 'A', 'm'], ['>', 'w'], ['A', 'l', 'k', 's', 'w', 'p'], ['>', 'w'], ['A', 'l', 'E', 't', 'q'], ['v', 'm'], ['A', 'f', 't', 'q', 'r'], ['f', 'E', 'j', 'z'], ['E', 'n'], ['k', 'l'], ['*', 'l', 'k'], ['l', 'm'], ['y', 'j', 'z', 'h'], ['A', 'l', 'S', 'w', 'm'], ['>', 'S', 'l', 'A']]
Diac [['a', 'o', 'a', 'a', 'N'], ['a', 'a', 'o'], ['a', 'i', 'a'], ['a', 'u', 'a'], ['a', ' ', 'i', 'N'], ['a', 'a', ' '], [' ', ' ', 'i', 'o', 'a', ' ', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'a', 'i'], ['a', 'o'], [' ', 'o', 'i', 'o', 'i'], ['u', '~a'], [' ', 'o', 'a', 'a', 'a'], ['a', 'a', 'a', 'a'], ['a', 'o'], ['u', '~i'], ['a', 'i', 'a'], ['a', 'o'], ['u', 'o', 'i', 'i'], [' ', ' ', '~a', 'o', 'u'], ['a', 'o', 'F', ' ']]


### Get unique characters and diacritics

In [2]:
unique_characters = ['A', 'b', 't', 'v', 'j', 'H', 'x', 'd', '*', 'r', 'z', 's', '$', 'S', 'D', 'T', 'Z', 'E', 'g', 'f', 'q', 'k', 'l', 'm', 'n', 'h', 'w', 'y', "'", '>', '<', '&', '}', '|', '{', '`', 'Y', 'p']
unique_diacritics = ['o', 'a', 'i', '~', 'u', 'N', 'F', 'K', ' ', '~a', '~i', '~u', '~N', '~F', '~K']

num_chars = len(unique_characters)
num_classes = len(unique_diacritics)

char_to_index = {char: i for i, char in enumerate(unique_characters)}
diacritic_to_index = {diacritic: i for i, diacritic in enumerate(unique_diacritics)}

print(num_chars)
print(num_classes)

38
15


### Utils

In [3]:
def readFile(path):
	sentences = []
	with open(path, 'r', encoding='utf-8') as file:
		for line in file:
			sentences.append(line.strip())

	return sentences

### Constants

In [4]:
TRAIN_PATH = "../dataset/train.txt"
VAL_PATH = "../dataset/val.txt"

In [None]:
corpus = readFile(TRAIN_PATH)

### Get FastText word embeddings

In [None]:
loaded_model = FastText.load("./models/ft_model")
def get_word_embeddings(word):
    return loaded_model.wv[word]

In [240]:
X_train = []
y_train = []

embeddings_train = []
max_sequence_length = 0

for sentence in corpus:
	# Clean each sentence in the corpus
	clean_sentence = run_buckwalter(sentence.strip())
	# Get the char list for each word in the sentence and its corresponding diacritics
	char_list, diacritics_list = extract_labels(clean_sentence)

	X_train.append(char_list)
	y_train.append(diacritics_list)

	# Get the max sequence length and concatenate the embeddings of the words
	for word in char_list:
		max_sequence_length = max(max_sequence_length, len(word))

		embeddings_train.append(get_word_embeddings(word))

embeddings_train = np.concatenate(embeddings_train, axis=0)
print(embeddings_train.shape)

(8351478, 100)
(2102068, 13)
(2102068, 13)


### Get X_train and Y_train

In [None]:
# Encoding and Padding the data
X_train_padded = []
for sentence in X_train:
	X_train_sequences = [[char_to_index[char] for char in word] for word in sentence]
	X_train_padded.append(pad_sequences(X_train_sequences, maxlen=max_sequence_length, padding='post'))

X_train_padded = np.concatenate(X_train_padded, axis=0)
print(X_train_padded.shape)

y_train_padded = []
for sentence in y_train:
	y_train_sequences = [[diacritic_to_index[diacritic] for diacritic in diacritic_sequence] for diacritic_sequence in sentence]
	y_train_padded.append(pad_sequences(y_train_sequences, maxlen=max_sequence_length, padding='post'))

y_train_padded = np.concatenate(y_train_padded, axis=0)
print(y_train_padded.shape)

### Validation Data

In [241]:
val_corpus = readFile(VAL_PATH)

X_val = []
y_val = []

for sentence in val_corpus:
	# Clean each sentence in the corpus
	clean_sentence = run_buckwalter(sentence.strip())
	# Get the char list for each word in the sentence and its corresponding diacritics
	char_list, diacritics_list = extract_labels(clean_sentence)

	X_val.append(char_list)
	y_val.append(diacritics_list)

# Encoding and Padding the data
X_val_padded = []
for sentence in X_val:
	X_val_sequences = [[char_to_index[char] for char in word] for word in sentence]
	X_val_padded.append(pad_sequences(X_val_sequences, maxlen=max_sequence_length, padding='post'))

X_val_padded = np.concatenate(X_val_padded, axis=0)
print(X_val_padded.shape)

y_val_padded = []
for sentence in y_val:
	y_val_sequences = [[diacritic_to_index[diacritic] for diacritic in diacritic_sequence] for diacritic_sequence in sentence]
	y_val_padded.append(pad_sequences(y_val_sequences, maxlen=max_sequence_length, padding='post'))

y_val_padded = np.concatenate(y_val_padded, axis=0)
print(y_val_padded.shape)

(106066, 13)
(106066, 13)


### Dimensions

In [242]:
embedding_dim = 100
num_epochs = 5

#### Model definition

In [245]:
# Define your model
model = Sequential()
model.add(Embedding(
					input_dim=embeddings_train.shape[0],
					output_dim=embedding_dim,
					input_length=max_sequence_length,
					weights=[embeddings_train],
					trainable=False
					))
model.add(LSTM(units=100, return_sequences=True))
model.add(Dense(units=num_classes, activation='softmax'))

# Compile your model
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train your model
model.fit(X_train_padded, y_train_padded, epochs=num_epochs, validation_data=(X_val_padded, y_val_padded))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (2500,) + inhomogeneous part.

### Testing

In [249]:
# Evaluate the model
accuracy = model.evaluate(X_val_padded, y_val_padded)[1]
print(accuracy)

sentence = "وعليه ينبغي حمل قول أشهب"
clean_sentence = run_buckwalter(sentence)
char_list, _ = extract_labels(clean_sentence)

X_test_sequences = [[char_to_index[char] for char in word] for word in char_list]
X_test = pad_sequences(X_test_sequences, maxlen=max_sequence_length, padding='post')

# Make predictions
predictions = model.predict(X_test)

final_output = []
index = 0
index_to_diacritic = {index: diacritic for diacritic, index in diacritic_to_index.items()}


new = sentence.split()
index = 0

for word in predictions:
	for i in range(len(new[index])):
		final_output.append(new[index][i])
		max_index = np.array(word[i]).argmax()

		if index_to_diacritic[max_index] != " ":
			final_output.append(buckwalter.untransliterate(index_to_diacritic[max_index]))
	
	index += 1
	final_output.append(" ")

final_output = "".join(final_output)

print(final_output)


0.8886049389839172
وَعَلَيْهِ يَنْبَغِي حِمْلَ قوْلُ أَشَهُبُ 
