### Reading Data and Tokenization

In [1]:
import pandas as pd

# Read the tab-separated sentence pairs; the file has no header row, so
# pandas labels the columns with the integers 0, 1, ...
data = pd.read_csv("data/fra.txt", sep="\t", header=None)
# Column 0 becomes `ang` and column 1 becomes `fra`, both as plain lists.
ang, fra = list(data[0]), list(data[1])

from spacy.lang.en import English
from spacy.lang.fr import French

# Blank language objects: only their tokenizers are used in this notebook.
nlp = English()
# `nlp.tokenizer` is the tokenizer the language object already owns. It is
# available in spaCy v2 and v3, whereas the `Defaults.create_tokenizer`
# factory was removed in v3.
tokenizer = nlp.tokenizer
nlp_f = French()
tokenizer_f = nlp_f.tokenizer

# Number of sentence pairs kept for the rest of the notebook.
MAX_SENTENCES = 10000


def _tokenize(texts, tok):
    """Lower-case and tokenize each text, wrapped in '<S>' ... '</S>' markers.

    Args:
        texts: iterable of strings.
        tok: callable returning an iterable of spaCy tokens for a string.

    Returns:
        A list with one list of token strings per input text.
    """
    # `token.text` replaces `token.string`, which was removed in spaCy v3;
    # after `.strip()` both give the same string.
    return [
        ['<S>'] + [token.text.strip() for token in tok(text.lower())] + ['</S>']
        for text in texts
    ]


# Truncate BEFORE tokenizing: the original tokenized the whole corpus and then
# discarded everything past the first 10000 sentences.
ang = _tokenize(ang[:MAX_SENTENCES], tokenizer)

fra = _tokenize(fra[:MAX_SENTENCES], tokenizer_f)

KeyboardInterrupt: 

### Padding, word2id and shifting

In [None]:
from gensim.models import Word2Vec
import numpy as np

EMBEDDING_SIZE = 120

# --- English: train Word2Vec on the tokenized sentences ----------------------
w2v = Word2Vec(ang, size=EMBEDDING_SIZE, window=10, min_count=1, negative=10, workers=10)
# Id 0 is reserved for padding and is paired with an all-zero vector (row 0).
word_map = {"<PAD>": 0}
word_vectors = [np.zeros((EMBEDDING_SIZE,))]
for idx, word in enumerate(w2v.wv.vocab, start=1):
    word_map[word] = idx
    word_vectors.append(w2v.wv[word])
# Stack into one matrix whose row index equals the word id.
word_vectors = np.vstack(word_vectors)

# --- French: same construction; `w2v` now refers to the French model ---------
w2v = Word2Vec(fra, size=EMBEDDING_SIZE, window=10, min_count=1, negative=10, workers=10)
word_map_fr = {"<PAD>": 0}
word_vectors_fr = [np.zeros((EMBEDDING_SIZE,))]
for idx, word in enumerate(w2v.wv.vocab, start=1):
    word_map_fr[word] = idx
    word_vectors_fr.append(w2v.wv[word])
word_vectors_fr = np.vstack(word_vectors_fr)
# Reverse lookup (id -> word) for the French vocabulary.
i2w = {idx: word for word, idx in word_map_fr.items()}


In [None]:
def pad(a,shift = False):
    """Right-pad integer sequences with zeros into a rectangular array.

    Args:
        a: sequence of integer sequences (one per sentence).
        shift: when true, the sequences are written into an array one column
            wider than the longest sequence and the first column is then
            dropped, so every row is shifted left by one position.

    Returns:
        A tuple ``(token, mask, max_s)``:
          * token: integer array of shape ``(len(a), max_s)``; unused cells are 0.
          * mask: integer array of the same shape, 1 where ``token`` holds a
            real entry and 0 on padding.
          * max_s: length of the longest input sequence (0 if ``a`` is empty).
    """
    n_rows = len(a)
    # `default=0` keeps an empty input from raising inside max().
    max_s = max((len(x) for x in a), default=0)
    # The shifted variant needs one extra column that is sliced away below.
    width = max_s + 1 if shift else max_s

    # Builtin `int` is what the removed `np.int` alias pointed to.
    token = np.zeros((n_rows, width), dtype=int)
    mask = np.zeros((n_rows, width), dtype=int)
    for i, o in enumerate(a):
        token[i, :len(o)] = o
        mask[i, :len(o)] = 1

    if shift:
        return token[:, 1:], mask[:, 1:], max_s
    return token, mask, max_s
    
# Replace every token by its integer id from the matching vocabulary map.
ang_ids = [[word_map[w] for w in text] for text in ang]
fra_ids = [[word_map_fr[w] for w in text] for text in fra]

# Zero-padded id arrays, their 0/1 masks and the longest sentence length.
ang_tok, ang_mask, ang_pl = pad(ang_ids)
fra_tok, fra_mask, fr_pl = pad(fra_ids)
# Same French ids with shift=True: `pad` drops the first column, so position t
# holds the token that follows position t of `fra_tok`.
fra_toks_s, fra_mask_s, _ = pad(fra_ids, shift=True)

### Seq2Seq model

In [None]:
from tensorflow.keras import layers,Model
from tensorflow.keras.initializers import Constant
import tensorflow as tf

class S2S(tf.keras.Model):
    """GRU encoder-decoder with attention over the encoder states.

    Both embedding tables are initialised from the supplied weight matrices
    and stay trainable.

    Args:
        Win: 2-D array of source embeddings; ``Win.shape`` is read as
            (source vocabulary size, embedding size).
        Wout: 2-D array of target embeddings; its first dimension is the
            target vocabulary size. The target table is created with the
            embedding size of ``Win``.
        i2w: mapping from target token id to token string, used by
            ``generate`` to turn predicted ids into words.
    """

    def __init__(self,Win,Wout,i2w):

        super(S2S, self).__init__()

        self.nv_in = Win.shape[0]
        self.r = Win.shape[1]
        self.nv_out = Wout.shape[0]

        self.i2w = i2w

        # Source embedding: built explicitly so the weights can be set now.
        self.Win = layers.Embedding(self.nv_in,self.r)
        self.Win.build((None, ))
        self.Win.set_weights([Win])
        self.Win.trainable = True

        # Target embedding, initialised the same way.
        self.Wout = layers.Embedding(self.nv_out,self.r)
        self.Wout.build((None, ))
        self.Wout.set_weights([Wout])
        self.Wout.trainable = True

        self.encoder = layers.GRU(self.r, return_sequences=True, return_state=True,dropout=0.2)

        self.decoder = layers.GRU(self.r, return_sequences=True, return_state=True,dropout=0.2)

        # Projects to one probability per target vocabulary entry.
        self.mapper = layers.Dense(self.nv_out,activation = "softmax")

        self.attention = layers.Attention()

        # Applied to the concatenated [decoder state, context vector].
        self.H = layers.Dense(self.r*2,activation = "tanh")

        # Turns the decoder state into the attention query.
        self.W = layers.Dense(self.r,activation = "relu")

    @tf.function
    def encode(self,x,x_mask):
        """Embed the source ids and run the encoder GRU.

        Args:
            x: source token ids.
            x_mask: mask for ``x``; cast to bool and passed to the GRU.

        Returns:
            ``(hidden_seq, hidden_last)``: the GRU output sequence and its
            final state.
        """
        x = self.Win(x)
        x_mask = tf.cast(x_mask,dtype=bool)

        hidden_seq,hidden_last = self.encoder(x,mask=x_mask)

        return hidden_seq,hidden_last

    @tf.function
    def decode(self,encoder_seq,encoder_mask,decoder_last,context_last,x_out,attention = False):
        """Run one decoder step with attention over the encoder sequence.

        Args:
            encoder_seq: encoder output sequence, used as attention key and
                value.
            encoder_mask: mask over encoder positions (cast to bool).
            decoder_last: previous decoder state, used as the GRU initial
                state.
            context_last: previous context vector; concatenated with the
                embedded token along axis 2.
            x_out: target token ids fed at this step.
            attention: when true, return the attention weights in place of
                the output probabilities.

        Returns:
            ``(probs_or_weights, decoder_state, context_vector)``. The
            decoder state has its time axis (axis 1) removed so it can be
            fed back as ``decoder_last`` on the next call.
        """
        x_out = self.Wout(x_out)

        input_decoder = tf.concat([x_out,context_last],2)

        encoder_mask = tf.cast(encoder_mask,dtype=bool)

        _,decoder_last = self.decoder(input_decoder, initial_state=decoder_last)

        # Re-insert a time axis so the state can act as a one-step query.
        decoder_last = tf.expand_dims(decoder_last,1)

        query = self.W(decoder_last)

        key = encoder_seq
        value = encoder_seq

        context_vector = self.attention([query,key,value],mask = [None,encoder_mask])

        probs = self.mapper(self.H(tf.concat([decoder_last,context_vector],2)))

        # Squeeze only the time axis. A bare tf.squeeze would also drop the
        # batch axis when the batch holds a single sentence.
        decoder_state = tf.squeeze(decoder_last, axis=1)

        if attention:
            scores = tf.matmul(query, key, transpose_b=True)
            # Push padded encoder positions to a large negative score before
            # the softmax so they receive (near) zero weight, in line with the
            # mask handed to the attention layer above.
            padding = 1.0 - tf.cast(tf.expand_dims(encoder_mask, 1), scores.dtype)
            weights = tf.nn.softmax(scores - 1e9 * padding)
            return weights,decoder_state,context_vector

        return probs,decoder_state,context_vector

    def att_wei(self,x,x_mask,x_out,x_out_mask):
        """Compute the attention weights for one sentence pair.

        Args:
            x: 1-D source token ids.
            x_mask: 1-D mask for ``x``.
            x_out: 1-D target token ids, fed to the decoder one per step.
            x_out_mask: 1-D mask for ``x_out``.

        Returns:
            NumPy array reshaped to
            ``(sum(x_out_mask), sum(x_mask))``: one row per unmasked target
            position, one column per unmasked source position.
        """
        # Add a leading batch axis of size 1 to every input.
        x = tf.expand_dims(x,axis=0)
        x_mask = tf.expand_dims(x_mask,axis=0)
        x_out = tf.expand_dims(x_out,axis=0)
        x_out_mask = tf.expand_dims(x_out_mask,axis=0)

        # Outer product of the two masks: true where both positions are real.
        mask = tf.cast(tf.matmul(tf.transpose(x_out_mask),x_mask),dtype=tf.bool)

        encoder_seq,hidden_last = self.encode(x,x_mask)
        context_last = tf.zeros([x.shape[0],self.r])
        context_last = tf.expand_dims(context_last,1)

        pro = []
        for t in range(x_out.shape[1]):
            input_0 = tf.gather(x_out, [t], axis=1)
            weights,hidden_last,context_last = self.decode(encoder_seq,x_mask,hidden_last,context_last,input_0,attention = True)
            pro.append(weights)

        pro = tf.concat(pro,1)

        out = tf.boolean_mask(tf.squeeze(pro, axis=0),mask)

        out = tf.reshape(out,(tf.reduce_sum(x_out_mask),tf.reduce_sum(x_mask)))

        return out.numpy()

    def generate(self,start_emb,x,x_mask,max_len=11):
        """Greedily decode a translation for a single source sentence.

        Args:
            start_emb: id of the first token fed to the decoder.
            x: 1-D source token ids.
            x_mask: 1-D mask for ``x``.
            max_len: maximum number of tokens to emit.

        Returns:
            List of generated token strings. Decoding stops early once
            ``'</S>'`` has been emitted.
        """
        aout = []

        x = tf.expand_dims(x,axis=0)
        x_mask = tf.expand_dims(x_mask,axis=0)

        encoder_seq,hidden_last = self.encode(x,x_mask)
        context_last = tf.zeros([x.shape[0],self.r])
        context_last = tf.expand_dims(context_last,1)

        x_out = start_emb
        for _ in range(max_len):
            # Shape the current id as a batch of one sentence with one step.
            input_0 = tf.expand_dims(tf.expand_dims(x_out,axis=0),axis=0)
            probs,hidden_last,context_last = self.decode(encoder_seq,x_mask,hidden_last,context_last,input_0)

            # Greedy choice: id with the highest probability.
            x_out = int(tf.argmax(probs[0, 0]).numpy())
            word = self.i2w[x_out]
            aout.append(word)
            if word == '</S>':
                break

        return aout

In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# sphinx_gallery_thumbnail_number = 2

def plot_weight(vegetables,farmers,harvest):
    """Show a weight matrix as an annotated heat-map titled "Attention".

    Args:
        vegetables: y-axis labels; the first entry is dropped before plotting.
        farmers: x-axis labels.
        harvest: 2-D array of weights; its last row is dropped before
            plotting. Each remaining cell is drawn and annotated with its
            value.
    """
    row_labels = vegetables[1:]
    weights = harvest[0:-1,:]
    n_rows, n_cols = len(row_labels), len(farmers)

    fig, ax = plt.subplots(figsize=(10, 10))
    ax.imshow(weights)

    # One tick per label on each axis, captioned with the label itself.
    ax.set_xticks(np.arange(n_cols))
    ax.set_yticks(np.arange(n_rows))
    ax.set_xticklabels(farmers)
    ax.set_yticklabels(row_labels)

    # Tilt the x labels so neighbouring words do not overlap.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    # Write the numeric value in the centre of every cell (row-major order).
    for row, col in np.ndindex(n_rows, n_cols):
        ax.text(col, row, weights[row, col], ha="center", va="center", color="w")

    ax.set_title("Attention")
    fig.tight_layout()
    plt.show()

### Functions for optimization

In [None]:
@tf.function
def compute_loss(model,loss_f,x,x_mask,x_out,y_onehot,y_mask):
    """Decode a batch step by step and compute the loss on unmasked positions.

    Args:
        model: object providing ``encode``, ``decode`` and the attribute ``r``.
        loss_f: callable invoked as ``loss_f(y_true, y_pred)``.
        x: source token ids.
        x_mask: mask for ``x``.
        x_out: target token ids fed to the decoder, one column per step.
        y_onehot: one-hot targets; its second dimension sets the number of
            decoding steps.
        y_mask: mask selecting which target positions enter the loss.

    Returns:
        ``(loss, y_true, y_pred)`` where ``y_true`` and ``y_pred`` are the
        targets and predictions kept by ``y_mask``.
    """
    encoder_seq, state = model.encode(x, x_mask)
    # Initial context: zeros, with an extra axis inserted at position 1.
    context = tf.expand_dims(tf.zeros([x.shape[0], model.r]), 1)

    def step(t, state, context):
        # Feed column t of the decoder inputs together with the previous state.
        token_t = tf.gather(x_out, [t], axis=1)
        return model.decode(encoder_seq, x_mask, state, context, token_t)

    probs, state, context = step(0, state, context)
    step_probs = [probs]
    for t in range(1, y_onehot.shape[1]):
        probs, state, context = step(t, state, context)
        step_probs.append(probs)

    # Join the per-step outputs along axis 1.
    stacked = tf.concat(step_probs, 1)

    y_true = tf.boolean_mask(y_onehot, y_mask)
    y_pred = tf.boolean_mask(stacked, y_mask)

    return loss_f(y_true, y_pred), y_true, y_pred


@tf.function
def compute_apply_gradients(model,loss_f,x,x_mask,x_out,y_onehot,y_mask,optimizer):
    """Compute the batch loss and apply one optimizer update to the model.

    Takes the same arguments as ``compute_loss`` plus ``optimizer``, whose
    ``apply_gradients`` receives (gradient, variable) pairs for
    ``model.trainable_variables``.

    Returns:
        The ``(loss, y_true, y_pred)`` triple produced by ``compute_loss``.
    """
    with tf.GradientTape() as tape:
        loss, label, prediction = compute_loss(
            model, loss_f, x, x_mask, x_out, y_onehot, y_mask
        )

    # Read the variable list only after the forward pass has run.
    variables = model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    return loss, label, prediction




### Train/test split

In [None]:
batch_size = 128

# Pack the arrays side by side so that one row carries everything belonging to
# a sentence; the training loop splits the columns apart again.
X = np.concatenate((ang_tok, ang_mask), axis=1)
print(X.shape)
Y = np.concatenate((fra_tok, fra_toks_s, fra_mask_s), axis=1)
print(Y.shape)
from sklearn.model_selection import train_test_split

# 80 % of the rows go to training; the seed fixes the shuffle.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.80, random_state=101)

# Batched datasets yielding (x, y) pairs.
train_data, test_data = (
    tf.data.Dataset.from_tensor_slices(pair).batch(batch_size)
    for pair in ((X_train, Y_train), (X_test, Y_test))
)



### Model declaration and Losses

In [None]:
from tqdm.notebook import tqdm
import os

# Model initialised from the two embedding matrices and the id -> word map.
model = S2S(word_vectors, word_vectors_fr, i2w)

epochs = 50

optimizer = tf.keras.optimizers.Adam()

# Checkpoints track both the optimizer and the model.
checkpoint_dir = 'training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, model=model)

# Running metrics, one pair for each split.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.CategoricalAccuracy(name='train_accuracy')
test_loss = tf.keras.metrics.Mean(name='test_loss')
test_accuracy = tf.keras.metrics.CategoricalAccuracy(name='test_accuracy')

loss_f = tf.keras.losses.CategoricalCrossentropy()

# Per-epoch history, filled by the training loop.
tr_loss, te_loss, tr_acc, te_acc = [], [], [], []
 


### Optimization

In [None]:

for epoch in range(1, epochs + 1):
    print(epoch,flush=True,)

    # Start every epoch with empty accumulators; without this the reported
    # values are running averages over all epochs so far. `reset_state` is the
    # newer Keras name, `reset_states` the older one.
    for metric in (train_loss, train_accuracy, test_loss, test_accuracy):
        reset = getattr(metric, "reset_state", None) or metric.reset_states
        reset()

    for x,y in tqdm(train_data):

        # Undo the column packing: x = [ids | mask],
        # y = [decoder inputs | shifted targets | shifted mask].
        x,x_mask = tf.split(x,2,axis=1)
        x_out,y,y_mask = tf.split(y,3,axis=1)

        y_onehot = tf.one_hot(y,depth = word_vectors_fr.shape[0])
        loss,label,prediction = compute_apply_gradients(model,loss_f,x,x_mask,x_out,y_onehot,y_mask,optimizer)

        train_loss(loss)
        train_accuracy(label, prediction)

    for x,y in tqdm(test_data):

        x,x_mask = tf.split(x,2,axis=1)
        x_out,y,y_mask = tf.split(y,3,axis=1)

        y_out_onehot = tf.one_hot(y,depth = word_vectors_fr.shape[0])
        # Evaluation only: the loss is computed without a gradient step.
        loss,label,prediction = compute_loss(model,loss_f,x,x_mask,x_out,y_out_onehot,y_mask)

        test_loss(loss)
        test_accuracy(label, prediction)

    print(
    f'Loss: {train_loss.result()}, '
    f'Accuracy: {train_accuracy.result() * 100}, '
    f'Test Loss: {test_loss.result()}, '
    f'Test Accuracy: {test_accuracy.result() * 100}')

    # Store plain floats so the history does not hold on to tensors.
    tr_loss.append(float(train_loss.result()))
    te_loss.append(float(test_loss.result()))
    tr_acc.append(float(train_accuracy.result()))
    te_acc.append(float(test_accuracy.result()))
    if epoch % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)

    # Every 10 epochs: print two sample translations and plot the attention
    # weights for the second sample.
    if epoch % 10 == 0:
        print(" ".join(ang[1000]))
        print(" ".join(model.generate(word_map_fr["<S>"],ang_tok[1000,:],ang_mask[1000,:])))

        print(" ".join(ang[5000]))
        print(" ".join(model.generate(word_map_fr["<S>"],ang_tok[5000,:],ang_mask[5000,:])))

        wei = model.att_wei(ang_tok[5000,:],ang_mask[5000,:],fra_tok[5000,:],fra_mask[5000,:])
        plot_weight(fra[5000],ang[5000],wei)



In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

# Loss history: the training loop appends one value per epoch to each list.
plt.plot(tr_loss, c="orange", label="train")
plt.plot(te_loss, c="blue", label="test")
plt.title("Loss")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend()
plt.show()

# Accuracy history, with the same colours per split as the loss figure.
plt.clf()
plt.plot(tr_acc, c="orange", label="train")
plt.plot(te_acc, c="blue", label="test")
plt.title("Accuracy")
plt.xlabel("epoch")
plt.ylabel("accuracy")
plt.legend()
plt.show()