## Importing Libraries

In [1]:
import tensorflow as tf

In [2]:
a=[[[1,2,3,4],
 [5,6,7,8]]]
tf.argmax(a,axis=-1)

<tf.Tensor: shape=(1, 2), dtype=int64, numpy=array([[3, 3]])>

## Importing Data

In [3]:
!wget https://www.manythings.org/anki/fra-eng.zip

--2023-06-25 05:03:33--  https://www.manythings.org/anki/fra-eng.zip
Resolving www.manythings.org (www.manythings.org)... 173.254.30.110
Connecting to www.manythings.org (www.manythings.org)|173.254.30.110|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 7420323 (7.1M) [application/zip]
Saving to: ‘fra-eng.zip’


2023-06-25 05:03:35 (4.19 MB/s) - ‘fra-eng.zip’ saved [7420323/7420323]



In [4]:
!unzip "/content/fra-eng.zip" -d "/content/fra"

Archive:  /content/fra-eng.zip
  inflating: /content/fra/_about.txt  
  inflating: /content/fra/fra.txt    


In [5]:
dataset=tf.data.TextLineDataset("/content/fra/fra.txt")

In [6]:
for i in dataset.take(5):
  print(i)

tf.Tensor(b'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)', shape=(), dtype=string)
tf.Tensor(b'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)', shape=(), dtype=string)
tf.Tensor(b'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)', shape=(), dtype=string)
tf.Tensor(b'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)', shape=(), dtype=string)


In [7]:
len(list(dataset))# total number of lines/documents

217975

In [8]:
for i in dataset.skip(217974).take(1):
  print(i)
  print(len(tf.strings.split(i," "))) # len of last line/document to get the sequence length

tf.Tensor(b"It may be impossible to get a completely error-free corpus due to the nature of this kind of collaborative effort. However, if we encourage members to contribute sentences in their own languages rather than experiment in languages they are learning, we might be able to minimize errors.\tIl est peut-\xc3\xaatre impossible d'obtenir un Corpus compl\xc3\xa8tement d\xc3\xa9nu\xc3\xa9 de fautes, \xc3\xa9tant donn\xc3\xa9e la nature de ce type d'entreprise collaborative. Cependant, si nous encourageons les membres \xc3\xa0 produire des phrases dans leurs propres langues plut\xc3\xb4t que d'exp\xc3\xa9rimenter dans les langues qu'ils apprennent, nous pourrions \xc3\xaatre en mesure de r\xc3\xa9duire les erreurs.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2024159 (CK) & #2024564 (sacredceltic)", shape=(), dtype=string)
106


## Data Preprocessing

### Converting text to vectors

In [9]:
# Vectorizing english sequence - token index method
english_Vector=tf.keras.layers.TextVectorization(max_tokens=10000,
                                                 standardize="lower_and_strip_punctuation",
                                                 output_sequence_length=70
                                                 )

In [10]:
# Vectorizing french sequence - token index method
french_Vector=tf.keras.layers.TextVectorization(max_tokens=10000,
                                                 standardize="lower_and_strip_punctuation",
                                                 output_sequence_length=70
                                                 )

### Getting Vocabulary

In [11]:
#for getting all vocabulary
def vocab(inp):
  text=tf.strings.split(inp,"\t")
  return text[0],"start "+text[1]+" end"

In [12]:
vocab_text=dataset.map(vocab)

In [13]:
english=vocab_text.map(lambda x,y:x)
french=vocab_text.map(lambda x,y:y)

In [14]:
#getting vocabulary for vector layer
english_Vector.adapt(english)
french_Vector.adapt(french)

### Vectorizing

In [15]:
# model should have one input for encoder (english), one input for decoder (french shifted right), one output for decoder (french w/ end token)
def in_out(text):
  split=tf.strings.split(text, "\t")
  return split[0],"start "+split[1],split[1]+" end"


In [16]:
inp_out=dataset.map(in_out)

In [17]:
def vectorizer(inp_enc,inp_dec,out_dec):
  return english_Vector(inp_enc),french_Vector(inp_dec),french_Vector(out_dec)

In [18]:
data=inp_out.map(vectorizer)

In [19]:
for i,j,k in data.take(1):
  print(i,j,k)

tf.Tensor(
[44  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0
  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0], shape=(70,), dtype=int64) tf.Tensor(
[  2 104   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0], shape=(70,), dtype=int64) tf.Tensor(
[104   3   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0], shape=(70,), dtype=int64)


In [20]:
english_Vector.get_vocabulary()[44]

'go'

In [21]:
french_Vector.get_vocabulary()[104]

'va'

In [22]:
# Data preparation for model
def vectorizer_model(inp_enc,inp_dec,out_dec):
  return (english_Vector(inp_enc),french_Vector(inp_dec)),french_Vector(out_dec)

In [23]:
required_data=inp_out.map(vectorizer_model)

In [24]:
for i,j in required_data.take(1):
  print(i)

(<tf.Tensor: shape=(70,), dtype=int64, numpy=
array([44,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0])>, <tf.Tensor: shape=(70,), dtype=int64, numpy=
array([  2, 104,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0])>)


In [25]:
req_data_1=required_data.shuffle(buffer_size=3000).batch(64).prefetch(buffer_size=tf.data.AUTOTUNE)
number_of_batches=217975/64
training_data_1=req_data_1.take(int(0.8*number_of_batches))
val_data_buf=req_data_1.skip(int(0.8*number_of_batches))
val_data_1=val_data_buf.take(int(0.1*number_of_batches))
test_data_1=val_data_1.skip(int(0.1*number_of_batches))

In [26]:
for i,j in training_data_1.take(1):
  print(i)

(<tf.Tensor: shape=(64, 70), dtype=int64, numpy=
array([[  21,  487,    0, ...,    0,    0,    0],
       [1099,    0,    0, ...,    0,    0,    0],
       [   2,  141,   58, ...,    0,    0,    0],
       ...,
       [   2,   38,  490, ...,    0,    0,    0],
       [  35,   26,   44, ...,    0,    0,    0],
       [ 460,  164,    0, ...,    0,    0,    0]])>, <tf.Tensor: shape=(64, 70), dtype=int64, numpy=
array([[   2,    4,   25, ...,    0,    0,    0],
       [   2,    1,    0, ...,    0,    0,    0],
       [   2, 6795,  118, ...,    0,    0,    0],
       ...,
       [   2,    4,  387, ...,    0,    0,    0],
       [   2,  937,  214, ...,    0,    0,    0],
       [   2, 2728,    0, ...,    0,    0,    0]])>)


In [131]:
req_data=required_data.shuffle(buffer_size=3000).unbatch().batch(64).prefetch(buffer_size=tf.data.AUTOTUNE)

In [132]:
for i,j in req_data.take(1):
  print(i)

(<tf.Tensor: shape=(64,), dtype=int64, numpy=
array([434,   9,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])>, <tf.Tensor: shape=(64,), dtype=int64, numpy=
array([  2, 811,  34,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
         0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0])>)


## Model Building

In [91]:
# encoder
input_1=tf.keras.layers.Input(shape=(70))
embedding=tf.keras.layers.Embedding(10000,(50))(input_1)
encoded_output=tf.keras.layers.Bidirectional(tf.keras.layers.GRU(300))(embedding)

In [92]:
# decoder
input_dec=tf.keras.layers.Input(shape=(70,))
embedding_dec=tf.keras.layers.Embedding(10000,50)(input_dec)
decoded_output=tf.keras.layers.GRU(600,return_sequences=True)(embedding_dec,initial_state=encoded_output)

In [135]:
# fully connected
dp=tf.keras.layers.Dropout(0.5)(decoded_output)
fc=tf.keras.layers.Dense(10000,activation="softmax")(dp)

In [136]:
# Translation
translation=tf.keras.Model([input_1,input_dec],fc)

In [137]:
translation.summary()

Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_8 (InputLayer)           [(None, 70)]         0           []                               
                                                                                                  
 input_9 (InputLayer)           [(None, 70)]         0           []                               
                                                                                                  
 embedding_7 (Embedding)        (None, 70, 50)       500000      ['input_8[0][0]']                
                                                                                                  
 embedding_8 (Embedding)        (None, 70, 50)       500000      ['input_9[0][0]']                
                                                                                            

In [138]:
translation.compile(
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    optimizer=tf.keras.optimizers.Adam(5e-4),metrics="accuracy")

In [139]:
translation.fit(training_data_1,validation_data=val_data_1,epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f27fd304370>

In [140]:
index_to_word={x:y for x, y in zip(range(len(french_Vector.get_vocabulary())),
                                   french_Vector.get_vocabulary())}

In [141]:
def translator(english_sent):
  eng_vector=english_Vector([english_sent])

  dec_inp="start"
  for i in range(70):
    fre_vector=french_Vector([dec_inp])
    output=translation.predict([eng_vector,fre_vector])

    word=tf.argmax(output,axis=-1)[0][i].numpy()
    french_word=index_to_word[word]

    if french_word=="end":
      break
    dec_inp+=' '+french_word
  return dec_inp

In [142]:
translator("i see.")



'start je vois'

In [146]:
translator(" i go there")



'start jy suggère'

In [147]:
translator("where are you")



'start où vous [UNK]'