<a href="https://colab.research.google.com/github/Eng-Zakaria/Automated-Face-Recognition/blob/main/RNN_Translation_AR_EN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install  arabic_reshaper

Collecting arabic_reshaper
  Downloading arabic_reshaper-3.0.0-py3-none-any.whl (20 kB)
Installing collected packages: arabic_reshaper
Successfully installed arabic_reshaper-3.0.0


In [2]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Attention
from google.colab import files
from __future__ import absolute_import, division, print_function
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import arabic_reshaper
import unicodedata
import re
import numpy as np
import os
import time
import math
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from arabic_reshaper import reshape
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input

In [3]:
# Fetch the NLTK resources the preprocessing helpers rely on:
# 'punkt' for word_tokenize and 'stopwords' for stopwords.words(...).
# NOTE(review): newer NLTK releases may additionally need 'punkt_tab' for
# word_tokenize — confirm against the installed NLTK version.
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [4]:
!wget https://raw.githubusercontent.com/SamirMoustafa/nmt-with-attention-for-ar-to-en/master/ara_.txt

--2024-05-01 11:44:09--  https://raw.githubusercontent.com/SamirMoustafa/nmt-with-attention-for-ar-to-en/master/ara_.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 763396 (746K) [text/plain]
Saving to: ‘ara_.txt’


2024-05-01 11:44:10 (16.8 MB/s) - ‘ara_.txt’ saved [763396/763396]



In [5]:
# Despite the name this is a local filesystem path, not a URL: it points at the
# ara_.txt file saved by the wget cell.
# NOTE(review): assumes the notebook's working directory is /content (the Colab
# default) — confirm when running elsewhere.
url = "/content/ara_.txt"

In [6]:
# Read the parallel corpus. Each usable line has exactly two tab-separated
# fields: the English sentence first, then its Arabic translation.
# The file is opened in a context manager so the handle is always closed, and
# with an explicit UTF-8 encoding so the Arabic text decodes correctly
# regardless of the platform's default locale.
with open(url, "r", encoding="utf-8") as corpus_file:
    data = corpus_file.read().split('\n')

arabic_sentences = []
english_sentences = []
for line in data:
    pairs = line.split('\t')
    # Skip blank lines and any line that is not a clean two-column pair.
    if len(pairs) == 2:
        english_sentences.append(pairs[0])
        arabic_sentences.append(pairs[1])

In [7]:
# Peek at the first ten English sentences to sanity-check the parsing above.
english_sentences[:10]

['Hi.',
 'Run!',
 'Help!',
 'Jump!',
 'Stop!',
 'Go on.',
 'Go on.',
 'Hello!',
 'Hurry!',
 'Hurry!']

In [8]:
# Peek at the first ten Arabic sentences; they line up index-for-index with
# the English list.
arabic_sentences[:10]

['مرحبًا.',
 'اركض!',
 'النجدة!',
 'اقفز!',
 'قف!',
 'داوم.',
 'استمر.',
 'مرحباً.',
 'تعجّل!',
 'استعجل!']

In [9]:
_ARABIC_STOPWORDS = None  # filled in on first use by _get_arabic_stopwords()


def _get_arabic_stopwords():
    """Return the NLTK Arabic stop-word set, loading the corpus only once."""
    global _ARABIC_STOPWORDS
    if _ARABIC_STOPWORDS is None:
        _ARABIC_STOPWORDS = frozenset(stopwords.words('arabic'))
    return _ARABIC_STOPWORDS


def preprocess_arabic_text(text):
    """Clean a single Arabic sentence.

    Steps, in order:
      1. Delete every character that is neither whitespace nor inside the
         Unicode Arabic block (U+0600-U+06FF). Latin letters, ASCII digits and
         ASCII punctuation are removed; characters inside the block (letters,
         diacritics, and also Arabic punctuation such as the Arabic comma)
         are kept.
      2. Tokenize with NLTK's ``word_tokenize``.
      3. Drop tokens found in NLTK's Arabic stop-word list.
      4. Re-join the remaining tokens with single spaces.

    Args:
        text: Raw sentence as a ``str``.

    Returns:
        The cleaned sentence as a ``str``; empty if every token was removed.

    NOTE(review): removing stop words discards function words that a
    translation model normally needs — confirm this is intended.
    """
    text = re.sub(r'[^\u0600-\u06FF\s]', '', text)

    tokens = word_tokenize(text)

    # The stop-word set is cached: rebuilding it per call re-reads the corpus
    # file for every row when this function is mapped over a DataFrame column.
    arabic_stopwords = _get_arabic_stopwords()
    filtered_tokens = [word for word in tokens if word not in arabic_stopwords]

    preprocessed_text = ' '.join(filtered_tokens)

    return preprocessed_text

In [10]:
_ENGLISH_STOPWORDS = None  # filled in on first use by _get_english_stopwords()


def _get_english_stopwords():
    """Return the NLTK English stop-word set, loading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def preprocess_english_text(text):
    """Clean a single English sentence.

    Steps, in order:
      1. Lower-case the text.
      2. Delete every character that is neither a word character (``\\w``)
         nor whitespace, which strips punctuation including apostrophes.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Drop tokens found in NLTK's English stop-word list.
      5. Re-join the remaining tokens with single spaces.

    Args:
        text: Raw sentence as a ``str``.

    Returns:
        The cleaned sentence as a ``str``; empty if every token was removed.

    NOTE(review): removing stop words discards function words that a
    translation target normally needs — confirm this is intended.
    """
    text = text.lower()

    text = re.sub(r'[^\w\s]', '', text)
    tokens = word_tokenize(text)

    # The stop-word set is cached: rebuilding it per call re-reads the corpus
    # file for every row when this function is mapped over a DataFrame column.
    english_stopwords = _get_english_stopwords()
    filtered_tokens = [word for word in tokens if word not in english_stopwords]
    preprocessed_text = ' '.join(filtered_tokens)

    return preprocessed_text

In [11]:
# Column name -> list of sentences; the two lists are aligned by index.
trans_dict = dict(
    arabic_sentences=arabic_sentences,
    english_sentences=english_sentences,
)


In [12]:
# One row per sentence pair, one column per language.
df = pd.DataFrame(data=trans_dict)

In [13]:
# Preview the sentence pairs before any cleaning is applied.
df.head()

Unnamed: 0,arabic_sentences,english_sentences
0,مرحبًا.,Hi.
1,اركض!,Run!
2,النجدة!,Help!
3,اقفز!,Jump!
4,قف!,Stop!


In [14]:
# Clean each language column in place with its matching preprocessing function.
for column_name, clean_sentence in (
    ('arabic_sentences', preprocess_arabic_text),
    ('english_sentences', preprocess_english_text),
):
    df[column_name] = df[column_name].map(clean_sentence)

In [15]:
# Preview the same rows after preprocessing has been applied to both columns.
df.head()

Unnamed: 0,arabic_sentences,english_sentences
0,مرحبًا,hi
1,اركض,run
2,النجدة,help
3,اقفز,jump
4,قف,stop


In [16]:
# Build the Arabic vocabulary, then encode every sentence as a list of word ids.
arabic_corpus = df['arabic_sentences']
arabic_tokenizer = Tokenizer()
arabic_tokenizer.fit_on_texts(arabic_corpus)
arabic_seq = arabic_tokenizer.texts_to_sequences(arabic_corpus)

In [17]:
# Build the English vocabulary, then encode every sentence as a list of word ids.
english_corpus = df['english_sentences']
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(english_corpus)
english_seq = english_tokenizer.texts_to_sequences(english_corpus)

In [18]:
# Pad every sequence at the end ('post') up to the longest sentence of its
# language, so each language becomes one rectangular integer array.
max_arabic_len = max(map(len, arabic_seq))
max_english_len = max(map(len, english_seq))
arabic_seq = pad_sequences(arabic_seq, padding='post', maxlen=max_arabic_len)
english_seq = pad_sequences(english_seq, padding='post', maxlen=max_english_len)

In [19]:
# Add a trailing singleton axis: (samples, timesteps) -> (samples, timesteps, 1).
arabic_shape = (len(arabic_seq), max_arabic_len, 1)
english_shape = (len(english_seq), max_english_len, 1)
arabic_seq = np.reshape(arabic_seq, arabic_shape)
english_seq = np.reshape(english_seq, english_shape)

In [20]:
# Number of units in both the encoder LSTM and the decoder LSTM; they must
# match because the encoder's final states initialise the decoder.
latent_dim = 256

In [21]:
# Encoder: embed the Arabic word ids, then summarise the sentence into the
# LSTM's final hidden and cell states.
#
# The input keeps its (timesteps, 1) shape so the existing training arrays
# still fit. The ids are categorical labels, not magnitudes, so they go through
# an Embedding layer instead of reaching the LSTM as one raw scalar feature.
encoder_inputs = Input(shape=(max_arabic_len, 1), dtype='int32')

# Drop the singleton axis so each timestep is a single word id.
encoder_token_ids = tf.keras.layers.Reshape((max_arabic_len,))(encoder_inputs)

# +1 because Tokenizer word ids start at 1; id 0 is the padding value.
# NOTE(review): mask_zero is deliberately left off — sentences emptied by
# stop-word removal would be fully masked, which some cuDNN LSTM paths reject.
encoder_embedding = Embedding(
    input_dim=len(arabic_tokenizer.word_index) + 1,
    output_dim=latent_dim,
)
encoder_embedded = encoder_embedding(encoder_token_ids)

encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedded)

# Only the final states are handed to the decoder.
encoder_states = [state_h, state_c]

In [22]:
# Decoder: embed the English word ids, run an LSTM seeded with the encoder's
# final states, and predict a distribution over the English vocabulary at
# every timestep.
#
# The input keeps its (timesteps, 1) shape so the existing training arrays
# still fit. The ids are categorical labels, not magnitudes, so they go through
# an Embedding layer instead of reaching the LSTM as one raw scalar feature.
decoder_inputs = Input(shape=(max_english_len, 1), dtype='int32')

# Drop the singleton axis so each timestep is a single word id.
decoder_token_ids = tf.keras.layers.Reshape((max_english_len,))(decoder_inputs)

# +1 because Tokenizer word ids start at 1; id 0 is the padding value.
english_vocab_size = len(english_tokenizer.word_index) + 1
decoder_embedding = Embedding(input_dim=english_vocab_size, output_dim=latent_dim)
decoder_embedded = decoder_embedding(decoder_token_ids)

decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedded, initial_state=encoder_states)

# Per-timestep softmax over every English word id (including padding id 0).
decoder_dense = Dense(english_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

In [23]:
# Training model: (Arabic sequence, decoder input sequence) -> per-step word probabilities.
model = Model(
    inputs=[encoder_inputs, decoder_inputs],
    outputs=decoder_outputs,
)

In [24]:
# The sparse loss lets the targets stay as integer word ids (no one-hot encoding).
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
)

In [25]:
# Teacher forcing: at step t the decoder must be shown the target word from
# step t-1, never the word it is being asked to predict. Feeding english_seq as
# both decoder input and target lets the model simply copy its input.
# Shift the target one step to the right; the first step gets id 0, which the
# Tokenizer never assigns to a word, so it acts as the start marker.
decoder_input_seq = np.zeros_like(english_seq)
decoder_input_seq[:, 1:] = english_seq[:, :-1]

# NOTE(review): batch_size=1 makes each epoch very slow, and validation_split
# takes the last 20% of rows without shuffling — confirm both are intended.
history = model.fit(
    [arabic_seq, decoder_input_seq],
    english_seq,
    epochs=5,
    batch_size=1,
    validation_split=0.2,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7bb510388f40>