Imports

In [1]:
from gensim.models import FastText

%run preprocessing.ipynb

### Get unique characters and diacritics

In [15]:
# These look like the Buckwalter transliteration symbols for the letters in
# `arabic_letters` (same length, same order). The name is rebound to the Arabic
# list a few lines below, so this value never reaches the lookup tables.
unique_characters = ['A', 'b', 't', 'v', 'j', 'H', 'x', 'd', '*', 'r', 'z', 's', '$', 'S', 'D', 'T', 'Z', 'E', 'g', 'f', 'q', 'k', 'l', 'm', 'n', 'h', 'w', 'y', "'", '>', '<', '&', '}', '|', '{', '`', 'Y', 'p']

# Arabic-script alphabet the character lookup table is built from.
arabic_letters = ['ا', 'ب', 'ت', 'ث', 'ج', 'ح', 'خ', 'د', 'ذ', 'ر', 'ز', 'س', 'ش', 'ص', 'ض', 'ط', 'ظ', 'ع', 'غ', 'ف', 'ق', 'ك', 'ل', 'م', 'ن', 'ه', 'و', 'ي', 'ء', 'أ', 'إ', 'ؤ', 'ئ', 'آ', 'ٱ', 'ٰ', 'ى', 'ة']

# From here on the "unique" collections are the Arabic letters and DIACRITICS.
# DIACRITICS is not defined in this section; presumably it comes from
# preprocessing.ipynb via the %run in the first cell -- TODO confirm.
unique_characters = arabic_letters
unique_diacritics = DIACRITICS

# Character ids start at 1, which leaves id 0 unassigned.
char_to_index = {letter: position for position, letter in enumerate(unique_characters, start=1)}
# Diacritic ids start at 0, following the iteration order of DIACRITICS.
# NOTE(review): if DIACRITICS is an unordered collection (e.g. a set), these ids
# can differ between kernel sessions -- verify before reusing a saved model.
diacritic_to_index = {mark: position for position, mark in enumerate(unique_diacritics)}

{'ا': 1, 'ب': 2, 'ت': 3, 'ث': 4, 'ج': 5, 'ح': 6, 'خ': 7, 'د': 8, 'ذ': 9, 'ر': 10, 'ز': 11, 'س': 12, 'ش': 13, 'ص': 14, 'ض': 15, 'ط': 16, 'ظ': 17, 'ع': 18, 'غ': 19, 'ف': 20, 'ق': 21, 'ك': 22, 'ل': 23, 'م': 24, 'ن': 25, 'ه': 26, 'و': 27, 'ي': 28, 'ء': 29, 'أ': 30, 'إ': 31, 'ؤ': 32, 'ئ': 33, 'آ': 34, 'ٱ': 35, 'ٰ': 36, 'ى': 37, 'ة': 38}
{' ٌّ': 0, 'ِ': 1, 'ُ': 2, ' َّ': 3, ' ِّ': 4, ' ُّ': 5, 'َ': 6, ' ًّ': 7, ' ٍّ': 8, 'ٌ': 9, 'ْ': 10, 'ٍ': 11, 'ً': 12, ' ': 13, 'ّ': 14}


Read File

In [13]:
def readFile(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Leading and trailing whitespace (including the newline) is stripped from
    every line. Blank lines are kept and show up as empty strings.

    Args:
        path: Location of the text file to read.

    Returns:
        list[str]: One stripped string per line, in file order.
    """
    with open(path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

In [14]:
# Training and validation corpora; relative paths, so they resolve against the
# directory the notebook kernel is running in.
TRAIN_PATH = "../dataset/train.txt"
VAL_PATH = "../dataset/val.txt"
# Location of the LSTM model file (not referenced by the loading/padding cells
# in this section).
LSTM_PATH = "./models/lstm.pth"

Read Data 

In [15]:
def Read_data(train_path=None, ft_model_path="./models/ft_model"):
    """Load the training corpus and collect per-sentence inputs, labels and word embeddings.

    Every line of the corpus is split with ``separate_words_and_diacritics``;
    each word of the result is then looked up in the FastText model.

    Args:
        train_path: Text file with one sentence per line. Defaults to
            ``TRAIN_PATH`` when omitted.
        ft_model_path: Saved gensim FastText model to load. Defaults to
            "./models/ft_model".

    Returns:
        tuple: ``(X_train, y_train, embeddings_train, max_sequence_length)``

        * ``X_train`` -- one entry per sentence: the first value returned by
          ``separate_words_and_diacritics`` (the per-word character data).
        * ``y_train`` -- one entry per sentence: the second value returned by
          ``separate_words_and_diacritics`` (the matching diacritics).
        * ``embeddings_train`` -- flat list holding ``loaded_model.wv[word]``
          for every word of every sentence, in corpus order (not grouped by
          sentence).
        * ``max_sequence_length`` -- the largest ``len(word)`` seen, i.e. a
          per-word length, not a per-sentence one; 0 for an empty corpus.
    """
    if train_path is None:
        train_path = TRAIN_PATH

    corpus = readFile(train_path)
    loaded_model = FastText.load(ft_model_path)
    X_train = []
    y_train = []
    embeddings_train = []
    max_sequence_length = 0

    for sentence in corpus:
        # Split the sentence into its words and their corresponding diacritics.
        char_list, diacritics_list = separate_words_and_diacritics(sentence)

        X_train.append(char_list)
        y_train.append(diacritics_list)

        # Track the longest word and collect one embedding lookup per word.
        # NOTE(review): `word` is used both as a FastText key and (in the padding
        # step) iterated symbol by symbol -- presumably a string; confirm in
        # preprocessing.ipynb.
        for word in char_list:
            max_sequence_length = max(max_sequence_length, len(word))
            embeddings_train.append(loaded_model.wv[word])

    # Report (number_of_words, *shape_of_one_lookup). Reading the shape from the
    # first entry avoids copying every embedding into a throw-away array and
    # cannot fail if the individual lookups differ in shape.
    if embeddings_train:
        summary_shape = (len(embeddings_train),) + tuple(np.shape(embeddings_train[0]))
    else:
        summary_shape = (0,)
    print(summary_shape)

    return X_train, y_train, embeddings_train, max_sequence_length




Padding Data

In [16]:
def padding_data(X_train, y_train, max_sequence_length):
    """Encode characters and diacritics as integer ids and pad every word to one length.

    Args:
        X_train: Sentences; each sentence is a sequence of words and each word
            an iterable of symbols that are keys of ``char_to_index``.
        y_train: Same nesting as ``X_train``, holding keys of
            ``diacritic_to_index``.
        max_sequence_length: Length every encoded word is padded to (passed to
            ``pad_sequences`` as ``maxlen``).

    Returns:
        tuple: ``(X_train_padded, y_train_padded)`` -- two lists with one
        ``pad_sequences`` result per sentence. The per-sentence results are
        returned as a list and are not stacked into a single array.

    Raises:
        KeyError: If a symbol is missing from its lookup table.
    """
    def _encode_and_pad(sentences, symbol_to_index):
        # One pad_sequences call per sentence; padding is appended ('post').
        # Assumes pad_sequences is the Keras utility (it is not imported in this
        # section): it then pads with 0 by default. Character ids start at 1, but
        # diacritic ids start at 0, so a padded label is indistinguishable from
        # the diacritic whose id is 0 -- NOTE(review): confirm this is intended.
        return [
            pad_sequences(
                [[symbol_to_index[symbol] for symbol in word] for word in sentence],
                maxlen=max_sequence_length,
                padding='post',
            )
            for sentence in sentences
        ]

    # Sentences generally contain different numbers of words, so the
    # per-sentence arrays form a ragged list. Calling np.array() on such a list
    # only to print its shape raises ValueError on NumPy >= 1.24, so the
    # progress output is derived from lengths instead.
    X_train_padded = _encode_and_pad(X_train, char_to_index)
    print(f"X: {len(X_train_padded)} sentences, words padded to length {max_sequence_length}")

    y_train_padded = _encode_and_pad(y_train, diacritic_to_index)
    print(f"y: {len(y_train_padded)} sentences, words padded to length {max_sequence_length}")

    return X_train_padded, y_train_padded

Read Validation Data

In [17]:
def Read_val(val_path=None, ft_model_path="./models/ft_model"):
    """Load the validation corpus and collect per-sentence inputs, labels and word embeddings.

    Args:
        val_path: Text file with one sentence per line. Defaults to
            ``VAL_PATH`` when omitted.
        ft_model_path: Saved gensim FastText model to load. Defaults to
            "./models/ft_model".

    Returns:
        tuple: ``(X_val, y_val, embeddings_val)``

        * ``X_val`` -- one entry per sentence: the first value returned by
          ``separate_words_and_diacritics``.
        * ``y_val`` -- one entry per sentence: the second value returned by
          ``separate_words_and_diacritics``.
        * ``embeddings_val`` -- flat list holding ``loaded_model.wv[word]`` for
          every word of every sentence, in corpus order.
    """
    if val_path is None:
        val_path = VAL_PATH

    val_corpus = readFile(val_path)
    loaded_model = FastText.load(ft_model_path)
    X_val = []
    y_val = []
    embeddings_val = []

    for sentence in val_corpus:
        # Split validation sentences with the same helper Read_data applies to
        # the training corpus, so both splits yield symbols from the alphabet
        # char_to_index is keyed by (the Arabic-script letters).
        # NOTE(review): this function previously went through run_buckwalter()
        # followed by extract_labels(); judging by the name, that path produced
        # Buckwalter symbols, which are not keys of char_to_index -- confirm
        # against preprocessing.ipynb.
        # readFile() already strips each line, so no extra strip is needed here.
        char_list, diacritics_list = separate_words_and_diacritics(sentence)

        X_val.append(char_list)
        y_val.append(diacritics_list)

        # One FastText lookup per word, appended in sentence order.
        embeddings_val.extend(loaded_model.wv[word] for word in char_list)

    return X_val, y_val, embeddings_val

Padding validation data

In [18]:
def padding_val(X_val, y_val, max_sequence_length):
    """Encode validation characters and diacritics as integer ids and pad every word.

    Performs the same encoding and padding as ``padding_data``, applied to the
    validation split.

    Args:
        X_val: Sentences; each sentence is a sequence of words and each word an
            iterable of symbols that are keys of ``char_to_index``.
        y_val: Same nesting as ``X_val``, holding keys of
            ``diacritic_to_index``.
        max_sequence_length: Length every encoded word is padded to (passed to
            ``pad_sequences`` as ``maxlen``).

    Returns:
        tuple: ``(X_val_padded, y_val_padded)`` -- two lists with one
        ``pad_sequences`` result per sentence. The per-sentence results are
        returned as a list and are not stacked into a single array.

    Raises:
        KeyError: If a symbol is missing from its lookup table.
    """
    def _encode_and_pad(sentences, symbol_to_index):
        # One pad_sequences call per sentence; padding is appended ('post').
        # Assumes pad_sequences is the Keras utility (it is not imported in this
        # section): words longer than max_sequence_length are then truncated,
        # and by default Keras drops symbols from the start of the word --
        # NOTE(review): relevant if this length was measured on another split.
        return [
            pad_sequences(
                [[symbol_to_index[symbol] for symbol in word] for word in sentence],
                maxlen=max_sequence_length,
                padding='post',
            )
            for sentence in sentences
        ]

    # Sentences generally contain different numbers of words, so the
    # per-sentence arrays form a ragged list. Calling np.array() on such a list
    # only to print its shape raises ValueError on NumPy >= 1.24, so the
    # progress output is derived from lengths instead.
    X_val_padded = _encode_and_pad(X_val, char_to_index)
    print(f"X: {len(X_val_padded)} sentences, words padded to length {max_sequence_length}")

    y_val_padded = _encode_and_pad(y_val, diacritic_to_index)
    print(f"y: {len(y_val_padded)} sentences, words padded to length {max_sequence_length}")

    return X_val_padded, y_val_padded