In [35]:
#GLOBAL IMPORTS

import numpy as np
import os
import pandas as pd
import re
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

#LOCAL IMPORTS


#GLOBAL VARIABLES


In [8]:
def read_data(path_to_data="dataset/ara.txt"):
    """Load the tab-separated parallel corpus into a DataFrame.

    Parameters
    ----------
    path_to_data : str or path-like, optional
        Location of the corpus file. Defaults to ``"dataset/ara.txt"``, so a
        bare ``read_data()`` call reads the same file as before.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with the columns ``source``,
        ``target`` and ``comments``. The column names are supplied here, so
        the first line of the file is read as data rather than as a header.
    """
    column_names = ['source', 'target', 'comments']

    # Retrieve the data; pd.read_table splits fields on tabs by default
    lines = pd.read_table(path_to_data, names=column_names)
    return lines

# Load the corpus into `lines` and preview six randomly sampled rows as a sanity check.
lines = read_data()
lines.sample(6)


Unnamed: 0,source,target,comments
10381,It would seem that the weather is improving.,يظهر أن الطقس يتحسن.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
7979,They accused me of being a liar.,هم اتهموني بالكذب.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
397,It may rain.,من الممكن أن تمطر.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
11030,"We tried to get him to change his mind, but co...",حاولنا أن نغير رأيه لكننا لم نستطع.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
7914,She showed me around the campus.,أعطتني جولة في الحرم الجامعي.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
4133,He knows the city well.,هو يعرف المدينة جيداً.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [79]:
# Cleaning the data:
#   - Convert English (source) to lowercase; the Arabic target is left as is
#   - Remove quotes
#   - Remove special characters such as @, !, *, $, #, ?, %
#   - Remove digits, both ASCII and Arabic-Indic
#   - Trim the ends and collapse runs of spaces
#   - Wrap every sentence in <sos> ... <eos> tags

# Read the data
lines = read_data()
# Shuffle the rows; the fixed seed keeps Restart & Run All reproducible
lines = shuffle(lines, random_state=42)

# Every character to strip. Membership is tested one character at a time, so
# each entry must be a single character: a multi-character entry such as
# '...' can never match, which is why the one-character ellipsis is used.
punc = set(string.punctuation)  # already contains ' and "
punc.add('؟')  # Arabic question mark
punc.add('،')  # Arabic comma
punc.add('…')  # horizontal ellipsis (U+2026)

# Deletion table for digits. string.digits only covers ASCII 0-9, so the
# Arabic-Indic digits are listed explicitly.
num_digits = str.maketrans('', '', digits + '٠١٢٣٤٥٦٧٨٩')


def _clean_sentence(text, lowercase=False):
    """Normalise one sentence and wrap it in <sos>/<eos> tags.

    Parameters
    ----------
    text : str
        Raw sentence.
    lowercase : bool, optional
        Lower-case the sentence first (used for the English source only).

    Returns
    -------
    str
        The sentence without punctuation or digits, with surrounding
        whitespace trimmed and runs of spaces collapsed, prefixed with
        ``'<sos> '`` and suffixed with ``' <eos>'``.
    """
    if lowercase:
        text = text.lower()
    text = ''.join(ch for ch in text if ch not in punc)
    text = text.translate(num_digits)
    # Removing characters can leave double spaces behind, so trim and collapse
    # only after the deletions above.
    text = re.sub(" +", " ", text.strip())
    return '<sos> ' + text + ' <eos>'


lines['source'] = lines['source'].apply(lambda sentence: _clean_sentence(sentence, lowercase=True))
lines['target'] = lines['target'].apply(_clean_sentence)
lines.sample(6)


Unnamed: 0,source,target,comments
5755,<sos> a good idea occurred to me <eos>,<sos> خطر لي فكرة جيدة <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
7525,<sos> there are many hotels downtown <eos>,<sos> هناك الكثير من الفنادق في وسط المدينة <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
10356,<sos> how often do you forget to do your homew...,<sos> أتنسى عادة القيام بواجبات <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #6...
6062,<sos> what browser are you using <eos>,<sos> ما البرنامج الذي تتصفح به الإنترنت <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
47,<sos> im sad <eos>,<sos> أنا حزين <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
9781,<sos> when was the last time you rode a bike <...,<sos> متى كانت آخر مرة ركبت بها دراجة هوائية <...,CC-BY 2.0 (France) Attribution: tatoeba.org #4...


In [80]:
# Every source-side token, duplicates included, so the total count can be reported
source_tokens = [token for sentence in lines.source for token in sentence.split(' ')]
source_unique = set(source_tokens)

# Only the distinct tokens are needed on the target side
target_unique = {token for sentence in lines.target for token in sentence.split(' ')}

print (f'The Whole Source count \t\t{len(source_tokens)}')
print (f'Uniques in the Source count \t{len(source_unique)}')
print ()
print (f'Uniques in the Target count \t{len(target_unique)}')
# NOTE: the Arabic language is vastly rich!

# Sorted vocabularies give every token a stable position
source_vocab = sorted(source_unique)
target_vocab = sorted(target_unique)

The Whole Source count 		88003
Uniques in the Source count 	4168

Uniques in the Target count 	11774


In [81]:
# Sentence lengths, measured in space-separated tokens, and the longest one per language
source_length_list = [len(sentence.split(' ')) for sentence in lines.source]
src_max_length = max(source_length_list)

target_length_list = [len(sentence.split(' ')) for sentence in lines.target]
trg_max_length = max(target_length_list)

print (f'Max length of Source lang: {src_max_length}')
print (f'Max length of Target lang: {trg_max_length}')

# word -> index lookups; numbering starts at 1, so index 0 is never assigned to a word
source_word2idx = {word: idx for idx, word in enumerate(source_vocab, start=1)}
target_word2idx = {word: idx for idx, word in enumerate(target_vocab, start=1)}

# index -> word lookups, the inverse of the two tables above
source_idx2word = {idx: word for word, idx in source_word2idx.items()}
target_idx2word = {idx: word for word, idx in target_word2idx.items()}

#Shuffle the data


Max length of Source lang: 36
Max length of Target lang: 38


In [83]:
# Splitting the data
# train_test_split from the sklearn lib holds out 10% of the sentence pairs for testing

X, y = lines.source, lines.target
# A fixed random_state yields the same train/test partition on every run
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.1, random_state = 42)
print (f'Source Training data shape: \t{X_train.shape}')
print (f'Target Training data shape: \t{y_train.shape}')
print ()
print (f'Source Test data shape: \t{X_test.shape}')
print (f'Target Test data shape: \t{y_test.shape}')


Source Training data shape: 	(10301,)
Source Test data shape: 	(1145,)
Target Training data shape: 	(10301,)
Target Test data shape: 	(1145,)
