## Import Libraries

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


import os
import string
from string import digits
import matplotlib.pyplot as plt
%matplotlib inline
import re

import tensorflow as tf
import seaborn as sns
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model

# Widen pandas' display limits so long sentence pairs are rendered in full.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
# `None` means "no column-width limit". The legacy value -1 is deprecated
# since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)




## Download dataset

In [None]:
!wget https://opus.nlpl.eu/download.php?f=TEP/v1/moses/en-fa.txt.zip -O opus.zip
!unzip opus.zip

with open("/content/TEP.en-fa.en", 'r', encoding='utf-8') as f:
    input_sentences = f.read().split('\n')
f.close()

with open("/content/TEP.en-fa.fa", 'r', encoding='utf-8') as f:
    output_sentences = f.read().split('\n')
f.close()

import csv

header = ['english_sentence', 'persian_sentence']

with open('corpus.csv', 'w', encoding='utf-8', newline='') as corpus_writer:
    writer = csv.writer(corpus_writer)

    # write the header
    writer.writerow(header)

    for value in range(len(output_sentences)):
        writer.writerow([input_sentences[value], output_sentences[value]])

corpus_writer.close()

--2022-07-17 15:59:06--  https://opus.nlpl.eu/download.php?f=TEP/v1/moses/en-fa.txt.zip
Resolving opus.nlpl.eu (opus.nlpl.eu)... 193.166.25.9
Connecting to opus.nlpl.eu (opus.nlpl.eu)|193.166.25.9|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://object.pouta.csc.fi/OPUS-TEP/v1/moses/en-fa.txt.zip [following]
--2022-07-17 15:59:08--  https://object.pouta.csc.fi/OPUS-TEP/v1/moses/en-fa.txt.zip
Resolving object.pouta.csc.fi (object.pouta.csc.fi)... 86.50.254.18, 86.50.254.19
Connecting to object.pouta.csc.fi (object.pouta.csc.fi)|86.50.254.18|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 16353318 (16M) [application/zip]
Saving to: ‘opus.zip’


2022-07-17 15:59:10 (9.46 MB/s) - ‘opus.zip’ saved [16353318/16353318]

Archive:  opus.zip
  inflating: TEP.en-fa.en            
  inflating: TEP.en-fa.fa            
  inflating: README                  


## Make our data ready

### Read CSV file

In [None]:
# Read the corpus from the same relative path it was written to ('corpus.csv'),
# so the notebook does not depend on the working directory being /content.
lines = pd.read_csv("corpus.csv", encoding='utf-8')

In [None]:
# Display the loaded sentence pairs.
lines

Unnamed: 0,english_sentence,persian_sentence
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .
3,no .,نه .
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .
...,...,...
612082,zodiacal light .,حمره مغربيه .
612083,zombi .,انسان زنده شد .
612084,zombiism .,مارخداگرائي .
612085,zonal .,مداري .


In [None]:
# Preview the first 20 sentence pairs.
lines.head(20)

Unnamed: 0,english_sentence,persian_sentence
0,raspy breathing .,صداي خر خر .
1,dad .,پدر .
2,maybe its the wind .,شايد صداي باد باشه .
3,no .,نه .
4,stop please stop .,دست نگه داريد خواهش ميکنم دست نگه داريد .
5,"you have a week , evans then well burn the house .",اوانز تو فقط يک هفته وقت داري وگرنه خونتو خواهيم سوزوند .
6,william .,ويليام .
7,"god damn it , william .",لعنتي . ويليام 8 .
8,god damn it put that down .,لعنت به تو . اونو بذار زمين .
9,let go .,بذار برم .


### Clean the dataset

In [None]:
# Count missing values in each column.
pd.isnull(lines).sum()

english_sentence    1
persian_sentence    1
dtype: int64

In [None]:
# Drop pairs where either side is missing: both columns are used as text
# later, so a null in the Persian column is as harmful as one in the English
# column. `.copy()` makes `lines` an independent frame so later modifications
# do not raise SettingWithCopyWarning.
lines = lines.dropna(subset=['english_sentence', 'persian_sentence']).copy()

In [None]:
# Remove duplicate sentence pairs. Rebinding the result instead of using
# `inplace=True` avoids mutating a frame that was derived by filtering, which
# is what triggers pandas' SettingWithCopyWarning.
lines = lines.drop_duplicates()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


### Randomly sample 300,000 rows (fixed seed) from the dataset

In [None]:
# Work on a random subset to keep training time manageable.
# `min(...)` guards against the cleaned corpus having fewer rows than requested
# (DataFrame.sample raises when n exceeds the population without replacement).
# The fixed seed keeps the subset reproducible.
N_SAMPLES = 300000
lines = lines.sample(n=min(N_SAMPLES, len(lines)), random_state=42)
lines.shape

(300000, 2)

### More cleaning

In [None]:
# Lowercase every sentence in both columns.
for column in ('english_sentence', 'persian_sentence'):
    lines[column] = lines[column].map(str.lower)

In [None]:
# Strip single-quote characters (apostrophes) from both columns.
for column in ('english_sentence', 'persian_sentence'):
    lines[column] = lines[column].map(lambda sentence: sentence.replace("'", ''))

In [None]:
# `string.punctuation` holds the ASCII punctuation characters.
exclude = set(string.punctuation)


def _drop_excluded(text):
    """Return `text` with every character found in `exclude` removed."""
    return ''.join(filter(lambda ch: ch not in exclude, text))


# Remove all the special characters from both columns.
for column in ('english_sentence', 'persian_sentence'):
    lines[column] = lines[column].map(_drop_excluded)

In [None]:
# Remove all numbers from text.
# `string.digits` only covers ASCII 0-9, so the Persian (U+06F0-U+06F9) and
# Arabic-Indic (U+0660-U+0669) digits are added explicitly; otherwise numbers
# written in the Persian column's native script would survive the cleaning.
persian_digits = ''.join(chr(code) for code in range(0x06F0, 0x06FA))
arabic_indic_digits = ''.join(chr(code) for code in range(0x0660, 0x066A))
remove_digits = str.maketrans('', '', digits + persian_digits + arabic_indic_digits)
lines['english_sentence'] = lines['english_sentence'].apply(lambda x: x.translate(remove_digits))
lines['persian_sentence'] = lines['persian_sentence'].apply(lambda x: x.translate(remove_digits))

# Remove the Arabic-script question mark and comma, which are not part of
# `string.punctuation`. `regex=False` makes the literal match explicit instead
# of relying on the pandas-version-dependent default.
lines['persian_sentence'] = lines['persian_sentence'].str.replace('؟', '', regex=False)
lines['persian_sentence'] = lines['persian_sentence'].str.replace('،', '', regex=False)

# Remove extra spaces: trim both ends, then collapse runs of spaces to one.
for column in ('english_sentence', 'persian_sentence'):
    lines[column] = lines[column].apply(lambda x: re.sub(" +", " ", x.strip()))


In [None]:
# Wrap every target (Persian) sentence in the START_ / _END marker tokens.
wrap_with_markers = 'START_ {} _END'.format
lines['persian_sentence'] = lines['persian_sentence'].map(wrap_with_markers)

In [None]:
# Preview the cleaned sentence pairs.
lines.head()

Unnamed: 0,english_sentence,persian_sentence
208269,you hesitate people die,START_ اگه ما ترديد کنيم مردم ميميرن _END
413297,wouldnt you be a powerful man some day,START_ مگه شما يه روز يه مرد قوي نميشين _END
145412,no pushing please keep in line,START_ خواهش ميكنم هل ندين تو يک صف حركت بكنين _END
111404,hey shit guys guys,START_ هي لعنتي بچهها بچهها _END
167171,yes red im following the lead right now,START_ آره رد الان دارم تعقیبش می کنم _END


### Build the English and Persian vocabularies

In [None]:
### Get English and Persian Vocabulary
# Each vocabulary is the set of distinct whitespace-separated tokens in a column.
all_eng_words = {
    token
    for sentence in lines['english_sentence']
    for token in sentence.split()
}

all_persian_words = {
    token
    for sentence in lines['persian_sentence']
    for token in sentence.split()
}

In [None]:
# Number of distinct English tokens.
len(all_eng_words)

56334

In [None]:
# Number of distinct Persian tokens (includes the START_ and _END markers).
len(all_persian_words)

86977

In [None]:
# Token count per sentence. `split()` without an argument is used so the
# counts agree with the tokenisation used to build the vocabulary and in
# `generate_batch`; `split(" ")` counts an empty sentence as one token and
# does not split on tabs or other whitespace, which would under-count tokens.
lines['length_eng_sentence'] = lines['english_sentence'].apply(lambda x: len(x.split()))
lines['length_per_sentence'] = lines['persian_sentence'].apply(lambda x: len(x.split()))

In [None]:
# Preview the frame with the two token-count columns added.
lines.head()

Unnamed: 0,english_sentence,persian_sentence,length_eng_sentence,length_per_sentence
208269,you hesitate people die,START_ اگه ما ترديد کنيم مردم ميميرن _END,4,8
413297,wouldnt you be a powerful man some day,START_ مگه شما يه روز يه مرد قوي نميشين _END,8,10
145412,no pushing please keep in line,START_ خواهش ميكنم هل ندين تو يک صف حركت بكنين _END,6,11
111404,hey shit guys guys,START_ هي لعنتي بچهها بچهها _END,4,6
167171,yes red im following the lead right now,START_ آره رد الان دارم تعقیبش می کنم _END,8,9


### Limit the sentence length

In [None]:
# Shape of the subset whose English sentence has more than 30 tokens
# (the first element is the number of such rows).
lines[lines['length_eng_sentence']>30].shape

(0, 4)

In [None]:
# Keep only the pairs where both sentences have at most 20 tokens.
english_is_short = lines['length_eng_sentence'] <= 20
persian_is_short = lines['length_per_sentence'] <= 20
lines = lines[english_is_short & persian_is_short]

In [None]:
# Rows and columns remaining after the length filter.
lines.shape

(299080, 4)

In [None]:
# Report the longest remaining sentence (in tokens) for each language.
longest_persian = lines['length_per_sentence'].max()
longest_english = lines['length_eng_sentence'].max()
print("maximum length of Persian Sentence ", longest_persian)
print("maximum length of English Sentence ", longest_english)

maximum length of Persian Sentence  20
maximum length of English Sentence  20


In [None]:
# English is the source language (fed to the encoder) and Persian is the
# target language (fed to the decoder), so the source length must come from
# the English column and the target length from the Persian column.
# The two were previously swapped, which only worked because both maxima
# happen to be equal after the length filter.
max_length_src = max(lines['length_eng_sentence'])
max_length_tar = max(lines['length_per_sentence'])

In [None]:
# Sorted word lists give every word a stable position; the sizes are the
# number of distinct words per language.
input_words = sorted(all_eng_words)
target_words = sorted(all_persian_words)
num_encoder_tokens, num_decoder_tokens = len(all_eng_words), len(all_persian_words)
(num_encoder_tokens, num_decoder_tokens)

(56334, 86977)

## Tokenizing

In [None]:
# Word indices start at 1 and index 0 is reserved for zero padding, so BOTH
# vocabularies need one extra slot: the largest English index equals
# len(all_eng_words), which would be out of range for an embedding table
# sized len(all_eng_words). Computing the sizes from the vocabularies (rather
# than `+= 1`) also keeps this cell idempotent when it is re-run.
num_encoder_tokens = len(all_eng_words) + 1
num_decoder_tokens = len(all_persian_words) + 1

In [None]:
# Map each word to a 1-based integer id; 0 is left free for padding.
input_token_index = {word: idx for idx, word in enumerate(input_words, start=1)}
target_token_index = {word: idx for idx, word in enumerate(target_words, start=1)}

In [None]:
# Preview the first entries only; displaying the whole vocabulary dictionary
# floods the notebook with tens of thousands of lines of output.
list(input_token_index.items())[:20]

{'\x10\x10fact': 1,
 'a': 2,
 'aa': 3,
 'aaa': 4,
 'aaaaaaaah': 5,
 'aaaah': 6,
 'aaaahhhhh': 7,
 'aaah': 8,
 'aaahhh': 9,
 'aaahhhh': 10,
 'aagh': 11,
 'aah': 12,
 'aand': 13,
 'aandr': 14,
 'aargh': 15,
 'aaron': 16,
 'aaronic': 17,
 'aarons': 18,
 'ab': 19,
 'abacinate': 20,
 'abaction': 21,
 'abacus': 22,
 'abaddon': 23,
 'abaft': 24,
 'abalienate': 25,
 'abalone': 26,
 'abalones': 27,
 'abandon': 28,
 'abandone': 29,
 'abandoned': 30,
 'abandoning': 31,
 'abandonment': 32,
 'abandons': 33,
 'abarticulation': 34,
 'abase': 35,
 'abash': 36,
 'abassid': 37,
 'abat': 38,
 'abate': 39,
 'abatement': 40,
 'abatis': 41,
 'abattoir': 42,
 'abba': 43,
 'abbacy': 44,
 'abbatial': 45,
 'abbe': 46,
 'abbey': 47,
 'abbies': 48,
 'abbott': 49,
 'abbotts': 50,
 'abbreviation': 51,
 'abby': 52,
 'abc': 53,
 'abc�s': 54,
 'abdi': 55,
 'abdicaiton': 56,
 'abdicate': 57,
 'abdication': 58,
 'abdomen': 59,
 'abdominal': 60,
 'abdominoscopy': 61,
 'abdominous': 62,
 'abduct': 63,
 'abducted': 64,
 'a

In [None]:
# Inverse lookups: integer id -> word, for both languages.
reverse_input_char_index = {idx: word for word, idx in input_token_index.items()}
reverse_target_char_index = {idx: word for word, idx in target_token_index.items()}

In [None]:
# Number of sentence pairs available. `X` is only defined further down (in
# the train/test split cell), so referencing it here fails on a fresh kernel;
# `lines` holds the same rows at this point.
len(lines)

299080

In [None]:
# Preview the first entries only; displaying the whole reverse dictionary
# floods the notebook with tens of thousands of lines of output.
list(reverse_input_char_index.items())[:20]

{1: '\x10\x10fact',
 2: 'a',
 3: 'aa',
 4: 'aaa',
 5: 'aaaaaaaah',
 6: 'aaaah',
 7: 'aaaahhhhh',
 8: 'aaah',
 9: 'aaahhh',
 10: 'aaahhhh',
 11: 'aagh',
 12: 'aah',
 13: 'aand',
 14: 'aandr',
 15: 'aargh',
 16: 'aaron',
 17: 'aaronic',
 18: 'aarons',
 19: 'ab',
 20: 'abacinate',
 21: 'abaction',
 22: 'abacus',
 23: 'abaddon',
 24: 'abaft',
 25: 'abalienate',
 26: 'abalone',
 27: 'abalones',
 28: 'abandon',
 29: 'abandone',
 30: 'abandoned',
 31: 'abandoning',
 32: 'abandonment',
 33: 'abandons',
 34: 'abarticulation',
 35: 'abase',
 36: 'abash',
 37: 'abassid',
 38: 'abat',
 39: 'abate',
 40: 'abatement',
 41: 'abatis',
 42: 'abattoir',
 43: 'abba',
 44: 'abbacy',
 45: 'abbatial',
 46: 'abbe',
 47: 'abbey',
 48: 'abbies',
 49: 'abbott',
 50: 'abbotts',
 51: 'abbreviation',
 52: 'abby',
 53: 'abc',
 54: 'abc�s',
 55: 'abdi',
 56: 'abdicaiton',
 57: 'abdicate',
 58: 'abdication',
 59: 'abdomen',
 60: 'abdominal',
 61: 'abdominoscopy',
 62: 'abdominous',
 63: 'abduct',
 64: 'abducted',
 65

In [None]:
# Shuffle the rows with a fixed seed so the order (and everything derived
# from it) is reproducible across kernel restarts.
lines = shuffle(lines, random_state=42)
lines.head(10)

Unnamed: 0,english_sentence,persian_sentence,length_eng_sentence,length_per_sentence
170484,i aint got all day i got a big production here,START_ تمام روز را وقت ندارم بايد برنامه بزرگي را تهيه کنم _END,11,13
465504,its really nothing but a series of choices isnt it,START_ چيزي نيست جز مجموعه اي از انتخاب ها اينطور نيست _END,10,12
242259,on a deliberate rejection of what is universally accepted,START_ انکار تعمدي چيزهايي قرار دهند که به طور جهاني پذيرفته شده _END,9,13
448206,unless you wannashare with us,START_ مگر اين که به ما بِگي _END,5,8
99481,im sure the salary wont be a problem,START_ مطمئنم که حقوق مسئله اي نخواهد بود _END,8,9
46648,this is not going to work little chef,START_ تو نبايد اين كار را كني سرآشپز کوچولو _END,8,10
55317,means i gotta get up in front of the class,START_ معنيش اينه که من ميخوام جلوي کلاس پاشم _END,10,10
157382,take cover my lord,START_ مراقب باشيد سرور من _END,4,6
438798,i thought you were one of em i thought i was too,START_ من فكر ميكردم تو هم يكي از اونايي منم مثله تو فكر ميكردم _END,12,15
124717,youve got to help me out here how do you mean,START_ تو بايد بهم كمك كني از اينجا بيم بيرون منظروت چطوره _END,11,13


## Split the data into train and test

In [None]:
# English sentences are the model inputs, Persian sentences the targets.
X = lines['english_sentence']
y = lines['persian_sentence']
# Hold out 20% of the pairs for validation; the seed fixes the partition.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
(X_train.shape, X_test.shape)

((239264,), (59816,))

### Let us save this data

In [None]:
# X_train.to_pickle('X_train.pkl')
# X_test.to_pickle('X_test.pkl')

In [None]:
def generate_batch(X = X_train, y = y_train, batch_size = 128):
    '''Endlessly yield training batches for the encoder-decoder model.

    Parameters
    ----------
    X : sliceable sequence of str
        Source sentences; each whitespace-separated word must be a key of
        `input_token_index`.
    y : sliceable sequence of str
        Target sentences aligned with `X`; each word must be a key of
        `target_token_index`.
    batch_size : int
        Maximum number of sentence pairs per yielded batch.

    Yields
    ------
    ([encoder_input_data, decoder_input_data], decoder_target_data)
        * encoder_input_data: (rows, max_length_src) source word ids, 0-padded.
        * decoder_input_data: (rows, max_length_tar) target word ids without
          the last token, 0-padded.
        * decoder_target_data: (rows, max_length_tar, num_decoder_tokens)
          one-hot targets, i.e. the target sequence without its first token
          (shifted one step ahead of the decoder input).

    Raises
    ------
    KeyError
        If a word is missing from the corresponding token index.
    IndexError
        If a sentence has more tokens than max_length_src / max_length_tar.

    Notes
    -----
    The generator loops over the data forever, so the caller must bound it
    (e.g. with `steps_per_epoch`).
    '''
    while True:
        for j in range(0, len(X), batch_size):
            batch_X = X[j:j+batch_size]
            batch_y = y[j:j+batch_size]
            # Size the arrays to the rows actually present so the final,
            # shorter batch is not padded with all-zero dummy samples.
            n_rows = len(batch_X)
            encoder_input_data = np.zeros((n_rows, max_length_src),dtype='float32')
            decoder_input_data = np.zeros((n_rows, max_length_tar),dtype='float32')
            decoder_target_data = np.zeros((n_rows, max_length_tar, num_decoder_tokens),dtype='float32')
            for i, (input_text, target_text) in enumerate(zip(batch_X, batch_y)):
                for t, word in enumerate(input_text.split()):
                    encoder_input_data[i, t] = input_token_index[word] # encoder input seq
                # Tokenise the target once instead of on every loop iteration.
                target_tokens = target_text.split()
                last_position = len(target_tokens) - 1
                for t, word in enumerate(target_tokens):
                    token_id = target_token_index[word]
                    if t < last_position:
                        decoder_input_data[i, t] = token_id # decoder input seq
                    if t > 0:
                        # decoder target sequence (one hot encoded)
                        # does not include the START_ token
                        # Offset by one timestep
                        decoder_target_data[i, t - 1, token_id] = 1.
            yield([encoder_input_data, decoder_input_data], decoder_target_data)

## Encoder-Decoder Architecture

In [None]:
# Shared size used for both the embedding vectors and the LSTM state
# in the encoder and the decoder.
latent_dim=200

In [None]:
# Encoder
# Variable-length sequence of source word ids.
encoder_inputs = Input(shape=(None,))
# Embed the ids into `latent_dim`-sized vectors. `mask_zero = True` asks Keras
# to treat id 0 as padding; the first argument (input_dim) must be larger
# than the largest word id fed to the layer.
enc_emb =  Embedding(num_encoder_tokens, latent_dim, mask_zero = True)(encoder_inputs)
# `return_state=True` makes the LSTM also return its final hidden and cell state.
encoder_lstm = LSTM(latent_dim, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(enc_emb)
# We discard `encoder_outputs` and only keep the states.
encoder_states = [state_h, state_c]

In [None]:
# Set up the decoder, using `encoder_states` as initial state.
# Variable-length sequence of target word ids.
decoder_inputs = Input(shape=(None,))
# The embedding layer is kept in its own variable (not only its output).
dec_emb_layer = Embedding(num_decoder_tokens, latent_dim, mask_zero = True)
dec_emb = dec_emb_layer(decoder_inputs)
# We set up our decoder to return full output sequences,
# and to return internal states as well. We don't use the
# return states in the training model, but we will use them in inference.
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb,
                                     initial_state=encoder_states)
# Per-timestep softmax over the target vocabulary (num_decoder_tokens classes).
decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Define the model that will turn
# `encoder_input_data` & `decoder_input_data` into `decoder_target_data`
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

In [None]:
# Categorical cross-entropy matches the one-hot `decoder_target_data`
# produced by `generate_batch`.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

In [None]:
# Print the layer-by-layer summary of the training model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, None)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, None, 200)    11266800    ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, None, 200)    17395600    ['input_2[0][0]']                
                                                                                              

In [None]:
# Training configuration.
train_samples = len(X_train)  # number of training sentence pairs
val_samples = len(X_test)  # number of held-out sentence pairs
batch_size = 128  # sentence pairs per generated batch
epochs = 3  # passes requested from model training

In [None]:
# Stop training once `val_loss` has not improved by at least `min_delta`
# for `patience` consecutive epochs.
# NOTE(review): `patience=3` equals the `epochs = 3` configured in this
# notebook, so this callback presumably can never trigger within a single
# run - confirm whether a smaller patience or more epochs was intended.
early_stop=tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)


### Model training

In [None]:
# Train from the batch generators. `Model.fit` accepts Python generators
# directly; `fit_generator` is deprecated. Because the generators never end,
# `steps_per_epoch` / `validation_steps` define how many batches make up one
# epoch / one validation pass. `callbacks` is passed as a list, as the Keras
# API documents.
model.fit(generate_batch(X_train, y_train, batch_size = batch_size),
          steps_per_epoch = train_samples//batch_size,
          epochs=epochs,
          validation_data = generate_batch(X_test, y_test, batch_size = batch_size),
          validation_steps = val_samples//batch_size,
          callbacks=[early_stop])

  


Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7f98d5ff6050>

In [None]:
# NOTE(review): PyTables (`tables`) is installed and imported here, but the
# only visible use is the `close_all()` call below - confirm it is required.
!pip install tables
import tables
# Save the trained model to `nmt.h5` in the current working directory.
model.save("nmt.h5")

# Closes every file tracked by PyTables' private open-file registry.
# NOTE(review): presumably a workaround for a lingering HDF5 file handle;
# `_open_files` is a private attribute, so verify this is still needed.
tables.file._open_files.close_all()

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive (only available inside Google Colab) and copy the saved
# model there.
# NOTE(review): the model was saved to the relative path "nmt.h5" but is
# copied from "/content/nmt.h5"; this assumes the working directory is
# /content - confirm when running outside the default Colab setup.
from google.colab import drive
drive.mount('/content/drive')
!cp -r "/content/nmt.h5" "/content/drive/MyDrive/nmt.h5"

Mounted at /content/drive
