In [1]:
#ENCODER DECODER MODEL FOR WORD LEVEL EMBEDDING

In [1]:
## LOADING THE REQUIRED LIBRARIES
import pandas as pd
import numpy as np
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import warnings 
warnings.filterwarnings('ignore')
from tqdm import tqdm 
import tensorflow as tf
from  tensorflow.keras.preprocessing.sequence import pad_sequences
from  sklearn.model_selection import train_test_split
from tqdm import tqdm

In [2]:
## LOADING THE PROCESSED DATASET  

df= pd.read_csv("DATA/processed_data_lang8.csv")
df.columns = ["enc_input","dec_input"] 
df["dec_output"] = df.dec_input
df

Unnamed: 0,enc_input,dec_input,dec_output
0,and he took in my favorite subject like soccer,and he took in my favorite subjects like soccer,and he took in my favorite subjects like soccer
1,actually who let me know about lang 8 was him,actually he was the one who let me know about ...,actually he was the one who let me know about ...
2,his kanji is ability is much better than me,his kanji ability is much better than mine,his kanji ability is much better than mine
3,we have known each other for only half a year ...,we have known each other for only half a year ...,we have known each other for only half a year ...
4,i heard a sentence last night when i watched tv,i heard a sentence last night when i was watch...,i heard a sentence last night when i was watch...
...,...,...,...
503896,i like thailand language because that pronounc...,i like thai because the pronunciation sounds cute,i like thai because the pronunciation sounds cute
503897,i ate kaomangai rise with boild chikin tomya...,i ate kaomangai rice with boiled chickin tom...,i ate kaomangai rice with boiled chickin tom...
503898,i think it is important thing to become to lik...,i think it is important to like coriander in o...,i think it is important to like coriander in o...
503899,yesterday i went to umeda station to date,i went to umeda station for dating yesterday,i went to umeda station for dating yesterday


In [3]:
# Adding start and end token
## THE INPUTS TO THE DECODER REQUIRES SPECIAL TOKENS FOR THE START AND THE END SO WE ARE GOING TO USE 
## <start> AS BEGINING TOKEN
## <end>  AS END TOKEN

df["dec_input"]= "<start> " + df["dec_input"]
df["dec_output"] =  df["dec_output"] + " <end>" 
df

Unnamed: 0,enc_input,dec_input,dec_output
0,and he took in my favorite subject like soccer,<start> and he took in my favorite subjects li...,and he took in my favorite subjects like socce...
1,actually who let me know about lang 8 was him,<start> actually he was the one who let me kno...,actually he was the one who let me know about ...
2,his kanji is ability is much better than me,<start> his kanji ability is much better than ...,his kanji ability is much better than mine <end>
3,we have known each other for only half a year ...,<start> we have known each other for only half...,we have known each other for only half a year ...
4,i heard a sentence last night when i watched tv,<start> i heard a sentence last night when i w...,i heard a sentence last night when i was watch...
...,...,...,...
503896,i like thailand language because that pronounc...,<start> i like thai because the pronunciation ...,i like thai because the pronunciation sounds c...
503897,i ate kaomangai rise with boild chikin tomya...,<start> i ate kaomangai rice with boiled chic...,i ate kaomangai rice with boiled chickin tom...
503898,i think it is important thing to become to lik...,<start> i think it is important to like corian...,i think it is important to like coriander in o...
503899,yesterday i went to umeda station to date,<start> i went to umeda station for dating yes...,i went to umeda station for dating yesterday <...


In [4]:
# Splitting And Sampling around 100k datapoints
#THE TOTAL DATASET HAS 500K DATAPOINTS WHICH WILL TAKE MUCH HIGHER TRAINING TIME. THEREFORE I AM SAMPLING ONE-FIFTH OF THE TOTAL DATASET

#df_sampled = pd.concat((df[df.enc_input].sample(frac= 0.2,random_state=1)))
df_sampled = df.sample(frac = 0.2)
print(df.shape)
print(df_sampled.shape)

(503901, 3)
(100780, 3)


In [5]:
## ONCE THE DATA IS SAMPLED WE ARE SPLITTIND THE DATA IN TO TRAIN AND TEST

df_train ,df_val = train_test_split(df_sampled,test_size=0.2,random_state = 3)

In [6]:
## IN THE COLUMN WHICH HAS DECODER INPUTS ADDING "<end>" TOKEN TO BE LEARNED BY THE TOKENIZER

df_train["dec_input"].iloc[0]  = df_train.iloc[0]["dec_input"] + " <end>"
df_train

Unnamed: 0,enc_input,dec_input,dec_output
230600,so i need to have a practice in writing englis...,<start> so i need to at least practice writing...,so i need to at least practice writing in engl...
59864,my colleague joined in j parc in order to use...,<start> my colleague joinedj parc in order to...,my colleague joinedj parc in order to use the...
472638,when i asked my american friends what i should...,<start> when i asked my american friends what ...,when i asked my american friends what i should...
391091,the main character is so beautiful like a real...,<start> the main character is so beautiful lik...,the main character is so beautiful like a real...
550,you know the earthquake was too bad for my family,<start> you know the earthquake affected my fa...,you know the earthquake affected my family ver...
...,...,...,...
379723,if i dropped out to learn english again i coul...,<start> if i had quit my english studies again...,if i had quit my english studies again i could...
3980,my flight determined as previous attached file,<start> my flight is confirmed as per the atta...,my flight is confirmed as per the attached fil...
197260,i like to watch tv programme week of sports ...,<start> i like to watch the tv programme week...,i like to watch the tv programme week of spor...
318828,you can buy things or making a reservation for...,<start> you can buy things or make reservation...,you can buy things or make reservations at a r...


In [7]:
## VALIDATION DATA
df_val

Unnamed: 0,enc_input,dec_input,dec_output
293263,by 17 30 the departure time for the party,<start> 17 30 is the departure time for the p...,17 30 is the departure time for the party <end>
354268,now i want to form new band,<start> now i want to form a new band,now i want to form a new band <end>
382132,the company shows their confidence that people...,<start> the company is confident that people w...,the company is confident that people will beco...
226942,i am thinking what i am going to create,<start> i am thinking of what to create,i am thinking of what to create <end>
230979,nowdays i have been even putting on long johns,<start> lately i have even been putting on lon...,lately i have even been putting on long johns ...
...,...,...,...
525,the examination fee cost 150,<start> the examination fee is 150,the examination fee is 150 <end>
477273,especially daily l watch so many movie,<start> iwatch so many movies daily,iwatch so many movies daily <end>
312392,i carried them within my bag and walked about ...,<start> i carried them in my bag and walked ab...,i carried them in my bag and walked about 30 m...
138563,but i do not use its well,<start> but i do not use them well,but i do not use them well <end>


In [8]:
## HERE I AM SAMPLING 1000 POINTS FROM THE DATAFRAME AS TEST DATA WHICH ARE NOT PRESEENT IN THE TRAIN AND VALIDAION DATA
np.random.seed(5) 
df_test = df.loc[np.random.choice(np.array([x for x in df.index.values if x not in df_sampled.index.values]),1000,replace= False,)]
df_test

Unnamed: 0,enc_input,dec_input,dec_output
144491,to give one to me,<start> to give me one,to give me one <end>
455893,normally a couple of colleagues gather togethe...,<start> normally a couple of colleagues gather...,normally a couple of colleagues gather togethe...
13942,after i came back home i drunk a cup of alcohol,<start> after i came back home i drank a cup o...,after i came back home i drank a cup of alcoho...
128993,then she gave each one of them a blue ribbon w...,<start> then she gave each one of them a blue ...,then she gave each one of them a blue ribbon w...
402709,now i forget these things and i enjoy studying...,<start> now i can forget these things and i en...,now i can forget these things and i enjoy stud...
...,...,...,...
352451,even though i have not written for while thank...,<start> even though i have not written for whi...,even though i have not written for while thank...
191225,today he picked me up to his selected restaurant,<start> today he picked me up at the restauran...,today he picked me up at the restaurant that h...
179487,all of my friends are thinking of which univer...,<start> all of my friends are thinking about w...,all of my friends are thinking about which uni...
38491,i dare not to mention when exactly they start ...,<start> i do not dare to mention when exactly ...,i do not dare to mention when exactly they sta...


In [9]:
# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer

In [10]:
## TOKENIZER FOR ENCODER INPUT
tk_inp = Tokenizer()
tk_inp.fit_on_texts(df_train.enc_input.apply(str))

In [11]:
# TOKENIZER FOR DECODER INPUT
tk_out = Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n' )
tk_out.fit_on_texts(df_train.dec_input.apply(str))

In [12]:
## THIS CLASS CONVERTS TEXT DATA TO INTEGER SEQUENCES AND RETURNS THE PADDED SEQUENCES

class Dataset :
    def __init__(self, data , tk_inp ,tk_out, max_len):
        ## SETTING THE REQUIRED ATTRIBUTES
        self.encoder_inp = data["enc_input"].apply(str).values
        self.decoder_inp = data["dec_input"].apply(str).values
        self.decoder_out = data["dec_output"].apply(str).values
        self.tk_inp = tk_inp
        self.tk_out = tk_out
        self.max_len = max_len
        
    def __getitem__(self,i):
        # INPUT SEQUENCES
        self.encoder_seq = self.tk_inp.texts_to_sequences([self.encoder_inp[i]])
        # DECODER INPUT SEQUENCES 
        self.decoder_inp_seq = self.tk_out.texts_to_sequences([self.decoder_inp[i]])
        # DECODER INPUT SEQUENCES
        self.decoder_out_seq = self.tk_out.texts_to_sequences([self.decoder_out[i]])
        
        # PADDING THE ENCODER INPUT SEQUENCES
        self.encoder_seq = pad_sequences(self.encoder_seq, padding="post",maxlen = self.max_len)
        # PADDING THE DECODER INPUT SEQUENCES
        self.decoder_inp_seq = pad_sequences(self.decoder_inp_seq, padding="post",maxlen = self.max_len)
        # PADDING DECODER OUTPUT SEQUENCES
        self.decoder_out_seq = pad_sequences(self.decoder_out_seq ,padding="post", maxlen = self.max_len)

        ##  RETURNING THE ENCODER INPUT , DECODER INPUT , AND DECODER OUTPUT
        return self.encoder_seq ,  self.decoder_inp_seq,  self.decoder_out_seq
    
    def __len__(self):
        # RETURN THE LEN OF INPUT ENDODER
        return len(self.encoder_inp)

In [13]:
## THIS CLASS CONVERTES THE DATASET INTO THE REQUIRED BATCH SIZE

class Dataloader(tf.keras.utils.Sequence):
    def __init__(self,batch_size,dataset):
        # INTIALIZING THE REQUIRED VARIABLES 
        self.dataset = dataset
        self.batch_size = batch_size
        self.totl_points = self.dataset.encoder_inp.shape[0]
        
    def __getitem__(self,i):
        # STATING THE START AND STOP VATIABLE CONTAINGING INDEX VALUES FOR EACH BATCH
        start = i * self.batch_size
        stop = (i+1)*self.batch_size
        
        # PLACEHOLDERS FOR BATCHED DATA
        batch_enc =[]
        batch_dec_input = []
        batch_dec_out =[]

        for j in range(start,stop): 
            
            a,b,c = self.dataset[j] 
            batch_enc.append(a[0]) 
            batch_dec_input.append(b[0])
            batch_dec_out.append(c[0]) 
        
        # Conveting list to array   
        batch_enc = (np.array(batch_enc)) 
        batch_dec_input = np.array(batch_dec_input)
        batch_dec_out = np.array(batch_dec_out)
        
        ## RETURNING BATCHED DATA IN REQUIRED FORM
        return [batch_enc , batch_dec_input],batch_dec_out
    
    def __len__(self):
        # Returning the number of batches
        return int(self.totl_points/self.batch_size)

In [14]:
# FORMING OBJECTS OF DATASET AND DATALOADER FOR TRAIN DATASET
train_dataset = Dataset(df_train,tk_inp,tk_out,35)
train_dataloader = Dataloader( batch_size = 512, dataset=train_dataset)

# FORMING OBJECTS OF DATASET AND DATALOADER FOR VALIDATION DATASET
val_dataset = Dataset(df_val , tk_inp,tk_out,35)
val_dataloader = Dataloader(batch_size=512 , dataset=val_dataset)

In [15]:
# ENCODER DECODER MODEL
## LOADING THE TENSORFLOW LIBRARIES

from tensorflow.keras import layers
from tensorflow.keras import Model

In [16]:
## DEFINING THE ENCODER LAYER AS A FUNCTION

def encoder(input_shape,vocab, emb_output, lstm_units, enc_input):
    '''THIS FUNCTION TAKES IN THE SEQUENCES AND RETURNS THE ENCODER OUTPUT'''
    ## FIRST LAYER : EMBEDDING LAYER
    enc_emb = layers.Embedding(vocab, emb_output,mask_zero = True,input_length=input_shape)(enc_input)
    ## SECOND LAYER : LSTM LAYER
    enc_lstm , enc_state_h,enc_state_c = layers.LSTM(units= lstm_units,return_sequences=True,return_state=True)(enc_emb)
    ## RETURNING THE LSTM OUTPUTS AND STATES
    return enc_lstm , enc_state_h,enc_state_c


## DEFINING THE DECODER LAYER AS A FUNCTION 
def decoder(input_shape,vocab, emb_output, lstm_units,enc_states, dec_input):
  ## FIRST LAYER : EMBEDDING LAYER
  dec_emb = layers.Embedding(vocab, emb_output , mask_zero = True,input_length=input_shape)(dec_input)
  ## SECONG LAYER : LSTM LAYER
  dec_lstm, dec_state_h,dec_state_c = layers.LSTM(units=lstm_units,return_sequences=True,return_state=True)(dec_emb,initial_state= enc_states)
  ## RETURNING THE LSTM OUTPUTS AND STATES
  return dec_lstm, dec_state_h,dec_state_c

In [17]:
## DEFINING THE MODEL ARCHITECTURE

# INPUT LAYER
enc_input = layers.Input(shape=(35))
# ENCODER DEFINED FORM FUNCTON ABOVE
enc_lstm , enc_state_h,enc_state_c = encoder(35,len(tk_inp.word_index)+1 , 300 ,256, enc_input )


# DECODER INPUT LAYER
dec_input = layers.Input(shape = (35))
# DECODER DEFINEA FROM ABOVE FUNCTION
dec_lstm , dec_state_h,dec_state_c = decoder(35,len(tk_out.word_index)+1 , 300 , 256 , [enc_state_h,enc_state_c],dec_input)
# DENCSE LAYER CONNECTOD TO DECODER OUTPUT
dense = layers.Dense(len(tk_out.word_index)+1,activation="softmax")(dec_lstm)

# MODEL DEFINING
model  = Model(inputs=[enc_input,dec_input],outputs=dense)

2022-10-17 19:22:37.186700: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudnn.so.8'; dlerror: libcudnn.so.8: cannot open shared object file: No such file or directory
2022-10-17 19:22:37.186743: W tensorflow/core/common_runtime/gpu/gpu_device.cc:1850] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...
2022-10-17 19:22:37.187130: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [18]:
# MODEL SUMMARY
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 35)]         0           []                               
                                                                                                  
 input_2 (InputLayer)           [(None, 35)]         0           []                               
                                                                                                  
 embedding (Embedding)          (None, 35, 300)      9455100     ['input_1[0][0]']                
                                                                                                  
 embedding_1 (Embedding)        (None, 35, 300)      7971600     ['input_2[0][0]']                
                                                                                              

In [19]:
## DEFINING THE CALLBACKS
callback =[ tf.keras.callbacks.ModelCheckpoint( "/model_save/word_trainable_embedding_best.h5",save_best_only=True,mode="min" ,save_weights_only=True),
           tf.keras.callbacks.EarlyStopping(monitor='val_loss',patience=5,verbose=1,min_delta=0.0001)
]

## STORING THE NUMBER OF STEPS IN ONE EPOCH FOR TRAIN AND VALIDATION DATASET
train_steps = train_dataloader.__len__()
val_steps  = val_dataloader.__len__()

# COMPILING THE MODEL
model.compile(optimizer="adam",loss='sparse_categorical_crossentropy')

In [20]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
from tensorflow.python.client import device_lib

Num GPUs Available:  0


In [30]:
import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# The GPU id to use, "0" to  "7" 
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,3,4,5,7"


In [None]:
!nvidia-smi

In [28]:
ls

Benchmark_Wordlevel.ipynb  EDA.ipynb                             [0m[01;34mmodel_save[0m/
[01;34mDATA[0m/                      eng_word_trainable_embedding_besh.h5  resouces.txt


In [41]:
## FITTING THE MODEL
model.fit(train_dataloader,steps_per_epoch=train_steps,epochs=50,validation_data = val_dataloader,validation_steps =val_steps,callbacks=callback)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 00022: early stopping


<keras.callbacks.History at 0x7f8a643b78d0>

In [30]:
!pip uninstall pydotplus

Found existing installation: pydotplus 2.0.2
Uninstalling pydotplus-2.0.2:
  Would remove:
    /DATA/gupta92/.local/lib/python3.7/site-packages/pydotplus-2.0.2.dist-info/*
    /DATA/gupta92/.local/lib/python3.7/site-packages/pydotplus/*
Proceed (Y/n)? ^C
[31mERROR: Operation cancelled by user[0m[31m
[0m

In [None]:
!pip install pydot
!pip install pydotplus
!pip install graphviz

In [28]:
tf.keras.utils.plot_model(model,show_shapes=True)

('You must install pydot (`pip install pydot`) and install graphviz (see instructions at https://graphviz.gitlab.io/download/) ', 'for plot_model/model_to_dot to work.')


In [None]:
# LOADING THE WEIGHTS FOR BEST MODEL
#model.load_weights("model_save/word_trainable_embedding/besh.h5")
model.built = True
model.load_weights("eng_word_trainable_embedding_besh.h5")

In [44]:
## THIS FUNCTION IS USED IN THE INFERENCE TIME TO PREDICT THE RESULTS GIVEN THE INPUT TEXT

def predict(inp , model):
    ##  TAKES INPUT AS TEXT AND THE MODEL

    # CONVERT TEXT INPUT TO SEQUENCES 
    seq = tk_inp.texts_to_sequences([inp])
    # PADDING THE SEQUENCE
    seq = pad_sequences(seq,maxlen = 35,padding="post")
    ## INITIAL STATES FOR ENCODER
    state = [tf.zeros(shape=(1,256)),tf.zeros(shape= (1,256))]

    # SEQUENCE TO EMBEDDING
    enc_emb  = model.layers[2](seq)
    # PASSING EMBBEDDED SEQUENCES TO LSTM LAYER
    enc_output,state_h,state_c= model.layers[4](enc_emb,state)

    # PLACE HOLDER FOR PREDECTED WORDS
    pred = []
    # PLACE HOLDER FOR STATES 
    input_state = [state_h,state_c]
    # CURRENT VECTOR TO BE PASSED TO DECODER 
    current_vec = tf.ones((1,1))
    
    for i in range(35): # FOR i UP TO 35 (MAX LENGTH)
        ## CONVERT THE CURRENT VECTOR SEQUENCE WORD TO EMBEDDINGS
        dec_emb  = model.layers[3](current_vec)
        ## PASSING EMBEDDED VECTOR TO DECODER LSTM LAYER
        dec_output,dec_state_h,dec_state_c = model.layers[5](dec_emb , input_state)
        # PASSING DECODER OUTPUT TO DENSE LAYER
        dense = model.layers[6](dec_output)

        # SELECTING INDEX OF MAXIMUM DENSE OUTPUT AS CURRENT VECTOR
        current_vec = np.argmax(dense ,axis = -1)
        # UPDATING THE INPUT STATES
        input_state = [dec_state_h,dec_state_c]

        # APPENDING THE ACTUAL TEXT TO "pred" VARIABLE
        pred.append(tk_out.index_word[current_vec[0][0]])
        ## IF THE CURRENT VECTOR IS "<end>" BREAK THE LOOP
        if tk_out.index_word[current_vec[0][0]]=="<end>":
            break
    ## RETURN THE JOINED STRING IN LIST "pred"
    return " ".join(pred)

In [45]:
# Prediction on Test Set
print("INPUT SENTENCE ===> ",df_test.enc_input.values[19])
print("PREDICTED SENTENCE ===> ",predict(df_test.enc_input.values[19],model))
print("ACTUAL SENTENCE ===> ",df_test.dec_output.values[19])

INPUT SENTENCE ===>  today is the first day in a week
PREDICTED SENTENCE ===>  today is the first day a week <end>
ACTUAL SENTENCE ===>  today is the first day of the week <end>


In [46]:
print("INPUT SENTENCE ===> ",df_test.enc_input.values[50])
print("PREDICTED SENTENCE ===> ",predict(df_test.enc_input.values[50],model))
print("ACTUAL SENTENCE ===> ",df_test.dec_output.values[50])

INPUT SENTENCE ===>  hello it is the first time for me to write an english article on this site
PREDICTED SENTENCE ===>  hello it is the first time i have written this website for english <end>
ACTUAL SENTENCE ===>  hello it is the first time i have written an english article on this site <end>


In [47]:
%%time
# Inference Time
predict(df_test.enc_input.values[50],model)

UsageError: Line magic function `%%time` not found.


In [48]:
#BELU SCore
import nltk.translate.bleu_score as bleu

In [49]:
# VALIDATION BELU SCORE
BLEU_val_emb = []
test_data = df_val.loc[np.random.choice(df_val.index,size = 2000)]
for ind,i in tqdm(test_data.iterrows(),position=0):
    try:
        pred = predict(str(i.enc_input),model).split()
        act = [str(i.dec_output).split()]
        b =bleu.sentence_bleu(act,pred)
        BLEU_val_emb.append(b)
    except:
        continue

2000it [06:19,  5.27it/s]


In [50]:
print("BELU Score = ",np.mean(BLEU_val_emb))

BELU Score =  0.1294694307316595


In [None]:
# VALIDATION BELU SCORE
BLEU_val_emb = []
test_data = df_val
for ind,i in tqdm(test_data.iterrows(),position=0):
    try:
        pred = predict(str(i.enc_input),model).split()
        act = [str(i.dec_output).split()]
        b =bleu.sentence_bleu(act,pred)
        BLEU_val_emb.append(b)
    except:
        continue

8019it [21:21,  6.00it/s]

In [None]:
print("BELU Score = ",np.mean(BLEU_val_emb))