In [35]:
#GLOBAL IMPORTS

import numpy as np
import os
import pandas as pd
import re
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

#LOCAL IMPORTS


#GLOBAL VARIABLES


In [8]:
def read_data(path_to_data="dataset/ara.txt"):
    """Load the parallel sentence pairs into a DataFrame.

    The file is read with ``pd.read_table`` (tab-separated by default) and is
    expected to have no header row; its columns are named ``source``,
    ``target`` and ``comments`` in that order.

    Parameters
    ----------
    path_to_data : str or path-like, optional
        Location of the tab-separated file. Defaults to ``"dataset/ara.txt"``
        so existing ``read_data()`` calls keep working.

    Returns
    -------
    pandas.DataFrame
        One row per line of the file, with columns
        ``['source', 'target', 'comments']``.
    """
    # Retrieve the data, supplying the column names since the file has none
    return pd.read_table(path_to_data, names=['source', 'target', 'comments'])

# Load the raw sentence pairs and preview six randomly sampled rows
# (the bare last expression is what the notebook renders as the cell output).
lines = read_data()
lines.sample(6)


Unnamed: 0,source,target,comments
10381,It would seem that the weather is improving.,يظهر أن الطقس يتحسن.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
7979,They accused me of being a liar.,هم اتهموني بالكذب.,CC-BY 2.0 (France) Attribution: tatoeba.org #1...
397,It may rain.,من الممكن أن تمطر.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
11030,"We tried to get him to change his mind, but co...",حاولنا أن نغير رأيه لكننا لم نستطع.,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
7914,She showed me around the campus.,أعطتني جولة في الحرم الجامعي.,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
4133,He knows the city well.,هو يعرف المدينة جيداً.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [36]:
# Cleaning the data:
#   - Convert english (source) into lowercase (arabic has no letter case)
#   - Remove quotes
#   - Remove all special characters like "@, !, *, $, #, ?, %, etc."
#   - Remove numbers .. since the numerals differ between the two languages
#   - Remove leading/trailing spaces and collapse repeated spaces
#   - Wrap every sentence in <sos> ... <eos> tags

# Start again from the raw file so re-running this cell never cleans twice
lines = read_data()

# A set of all punctuation characters to drop. The text is filtered one
# character at a time, so every entry MUST be a single character: a
# three-character '...' entry can never match (ASCII dots are already covered
# by string.punctuation), hence the real one-character ellipsis is added.
# The apostrophe is part of string.punctuation, so quotes need no extra step.
punc = set(string.punctuation)
punc.add('؟')  # arabic question mark
punc.add('،')  # arabic comma
punc.add('؛')  # arabic semicolon
punc.add('…')  # single-character ellipsis (eng + ara)

# Removing digits: ASCII 0-9 plus the Arabic-Indic numerals, which
# string.digits does not contain
num_digits = str.maketrans('', '', digits + '٠١٢٣٤٥٦٧٨٩')

# One translation table that deletes punctuation and digits in a single pass
strip_table = {**str.maketrans('', '', ''.join(punc)), **num_digits}


def clean_sentences(sentences, lowercase=False):
    """Clean a Series of sentences and wrap each one in <sos>/<eos> tags.

    Parameters
    ----------
    sentences : pandas.Series
        Raw sentences (strings).
    lowercase : bool, optional
        Lower-case the text first. Meant for the english column only.

    Returns
    -------
    pandas.Series
        Sentences without punctuation or digits, with surrounding whitespace
        stripped and runs of spaces collapsed, formatted as
        ``'<sos> ' + sentence + ' <eos>'``.
    """
    cleaned = sentences.str.lower() if lowercase else sentences
    cleaned = cleaned.str.translate(strip_table)
    # Deleting characters can leave stray or doubled spaces behind
    cleaned = cleaned.str.strip().str.replace(" +", " ", regex=True)
    # adding start/end tags
    return '<sos> ' + cleaned + ' <eos>'


lines['source'] = clean_sentences(lines['source'], lowercase=True)
lines['target'] = clean_sentences(lines['target'])
lines.sample(6)


Unnamed: 0,source,target,comments
1173,<sos> i keep sneezing <eos>,<sos> أعطس كثيراً <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #3...
9904,<sos> the rumor spread throughout the country ...,<sos> انتشرت الشائعة على مدار الدولة <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
1483,<sos> how youve grown <eos>,<sos> لقد كبرت <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
10723,<sos> im sick of hearing the same thing all th...,<sos> سئمت من سماع نفس الموشّح دائمًا <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #4...
894,<sos> i know the boy <eos>,<sos> أعرف ذلك الولد <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
8338,<sos> they had a dent in the rear door <eos>,<sos> لديهم بعجة في الباب الخلفي <eos>,CC-BY 2.0 (France) Attribution: tatoeba.org #2...


In [65]:
# Every space-separated token of the source sentences, duplicates included
source_tokens = [
    token
    for sentence in lines.source
    for token in sentence.split(' ')
]
source_unique = set(source_tokens)

# Distinct space-separated tokens of the target sentences
target_unique = {
    token
    for sentence in lines.target
    for token in sentence.split(' ')
}

print(f'The Whole Source count \t\t{len(source_tokens)}')
print(f'Uniques in the Source count \t{len(source_unique)}')
print()
print(f'Uniques in the Target count \t{len(target_unique)}')
# NOTE: the arabic vocabulary comes out far larger than the english one

# Final vocabularies: unique tokens in sorted order
source_vocab = sorted(source_unique)
target_vocab = sorted(target_unique)

The Whole Source count 		88003
Uniques in the Source count 	4168

Uniques in the Target count 	11774
