In [None]:
import numpy as np
from tensorflow.keras.layers import Input,LSTM,Dense
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential

In [None]:
# Training hyperparameters and dataset location used by the cells below.
batch_size = 64        # batch size handed to model.fit
epochs = 100           # number of epochs handed to model.fit
latent_dim = 256       # latent dimensions of the encoding space (LSTM units)
num_samples = 10_000   # upper bound on the number of lines read from the data file
data = "fra.txt"       # path of the tab-separated sentence-pair file

In [None]:
#vectorizing the data
#input_texts/target_texts collect the sentence pairs; the two sets collect every
#distinct character seen on each side (they become the vocabularies later on).
input_texts=[]
target_texts=[]
input_characters=set()
target_characters=set()
with open(data,"r",encoding="utf-8") as f:
  lines=f.read().split("\n")
#len(lines)-1 skips the trailing element produced by the file's final newline
for line in lines[:min(num_samples,len(lines)-1)]:
  #keep only the first two tab-separated fields so that files carrying extra
  #columns (e.g. an attribution column) do not break the unpacking
  input_text,target_text=line.split("\t")[:2]
  #"\t" marks the start of the target sequence and "\n" marks its end
  target_text="\t"+target_text+"\n"
  input_texts.append(input_text)
  target_texts.append(target_text)
  #the two vocabularies are updated independently: the target update used to be
  #nested inside the input-character loop, which repeated it for every input
  #character and skipped it entirely for an empty input sentence
  input_characters.update(input_text)
  target_characters.update(target_text)

In [None]:
#peek at the decoder targets: each sentence was wrapped in a leading "\t" and a trailing "\n" when it was loaded
target_texts

['\tVa !\n',
 '\tCours\u202f!\n',
 '\tCourez\u202f!\n',
 '\tÇa alors\u202f!\n',
 '\tAu feu !\n',
 "\tÀ l'aide\u202f!\n",
 '\tSaute.\n',
 '\tÇa suffit\u202f!\n',
 '\tStop\u202f!\n',
 '\tArrête-toi !\n',
 '\tAttends !\n',
 '\tAttendez !\n',
 '\tPoursuis.\n',
 '\tContinuez.\n',
 '\tPoursuivez.\n',
 '\tJe comprends.\n',
 "\tJ'essaye.\n",
 "\tJ'ai gagné !\n",
 "\tJe l'ai emporté !\n",
 '\tOh non !\n',
 '\tAttaque !\n',
 '\tAttaquez !\n',
 '\tSanté !\n',
 '\tÀ votre santé !\n',
 '\tMerci !\n',
 '\tTchin-tchin !\n',
 '\tLève-toi.\n',
 '\tVa, maintenant.\n',
 '\tAllez-y maintenant.\n',
 '\tVas-y maintenant.\n',
 "\tJ'ai pigé !\n",
 '\tCompris !\n',
 '\tPigé\u202f?\n',
 '\tCompris\u202f?\n',
 "\tT'as capté\u202f?\n",
 '\tMonte.\n',
 '\tMontez.\n',
 '\tSerre-moi dans tes bras !\n',
 '\tSerrez-moi dans vos bras !\n',
 '\tJe suis tombée.\n',
 '\tJe suis tombé.\n',
 '\tJe sais.\n',
 '\tJe suis parti.\n',
 '\tJe suis partie.\n',
 "\tJ'ai perdu.\n",
 "\tJ'ai 19 ans.\n",
 '\tJe vais bien.\n',
 '\tÇa v

In [None]:
# Turn each character set into a sorted list so every character gets a stable position.
input_characters=sorted(input_characters)
target_characters=sorted(target_characters)

# Vocabulary sizes: the width of the one-hot vectors on each side.
num_encoder_tokens=len(input_characters)
num_decoder_tokens=len(target_characters)

# Length of the longest input / target sentence; these fix the time dimension
# of the encoder and decoder data arrays.
max_encoder_seq_length=max(map(len,input_texts))
max_decoder_seq_length=max(map(len,target_texts))


In [None]:
# Report the dataset dimensions computed above, one "label value" line each.
for stat_label,stat_value in (
  ("Number of Samples:",len(input_texts)),
  ("Number of unique input characters:",num_encoder_tokens),
  ("Number of unique output tokens:",num_decoder_tokens),
  ("Max sequence length for inputs:",max_encoder_seq_length),
  ("Max sequence length for outputs:",max_decoder_seq_length),
):
  print(stat_label,stat_value)

Number of Samples: 10000
Number of unique input characters: 71
Number of unique output tokens: 94
Max sequence length for inputs: 16
Max sequence length for outputs: 59


In [None]:
#assigning token to each character
input_token_index=dict([(char,i) for i,char in enumerate(input_characters)])
target_token_index=dict([(char,i) for i, char in enumerate(target_characters)])

In [None]:
#show both lookup tables (character -> integer index) for the input and target vocabularies
input_token_index,target_token_index

({' ': 0,
  '!': 1,
  '$': 2,
  '%': 3,
  '&': 4,
  "'": 5,
  ',': 6,
  '-': 7,
  '.': 8,
  '0': 9,
  '1': 10,
  '2': 11,
  '3': 12,
  '4': 13,
  '5': 14,
  '6': 15,
  '7': 16,
  '8': 17,
  '9': 18,
  ':': 19,
  '?': 20,
  'A': 21,
  'B': 22,
  'C': 23,
  'D': 24,
  'E': 25,
  'F': 26,
  'G': 27,
  'H': 28,
  'I': 29,
  'J': 30,
  'K': 31,
  'L': 32,
  'M': 33,
  'N': 34,
  'O': 35,
  'P': 36,
  'Q': 37,
  'R': 38,
  'S': 39,
  'T': 40,
  'U': 41,
  'V': 42,
  'W': 43,
  'Y': 44,
  'a': 45,
  'b': 46,
  'c': 47,
  'd': 48,
  'e': 49,
  'f': 50,
  'g': 51,
  'h': 52,
  'i': 53,
  'j': 54,
  'k': 55,
  'l': 56,
  'm': 57,
  'n': 58,
  'o': 59,
  'p': 60,
  'q': 61,
  'r': 62,
  's': 63,
  't': 64,
  'u': 65,
  'v': 66,
  'w': 67,
  'x': 68,
  'y': 69,
  'z': 70},
 {'\t': 0,
  '\n': 1,
  ' ': 2,
  '!': 3,
  '$': 4,
  '%': 5,
  '&': 6,
  "'": 7,
  '(': 8,
  ')': 9,
  ',': 10,
  '-': 11,
  '.': 12,
  '0': 13,
  '1': 14,
  '3': 15,
  '5': 16,
  '6': 17,
  '8': 18,
  '9': 19,
  ':': 20,
  '?'

In [None]:
#one hot encoding of texts
encoder_input_data=np.zeros((len(input_texts),max_encoder_seq_length,num_encoder_tokens),dtype="float32")
decoder_input_data=np.zeros((len(input_texts),max_decoder_seq_length,num_decoder_tokens),dtype="float32")
decoder_target_data=np.zeros((len(input_texts),max_decoder_seq_length,num_decoder_tokens),dtype="float32")

In [None]:
#fill the one-hot tensors: i indexes the sample, t the timestep (character position)
for i,(input_text,target_text) in enumerate(zip(input_texts,target_texts)):
  for t, char in enumerate(input_text):
    encoder_input_data[i,t,input_token_index[char]]=1.
  #pad the rest of the encoder sequence with the space character; len() is used
  #instead of the leaked loop variable so an empty sentence is padded correctly
  encoder_input_data[i,len(input_text):,input_token_index[" "]]=1.
  for t,char in enumerate(target_text):
    decoder_input_data[i,t,target_token_index[char]]=1.
    if t>0:
      #decoder target data will be ahead by one timestep
      #and does not include the leading start character
      decoder_target_data[i,t-1,target_token_index[char]]=1.
  #pad the decoder input after the last character of the target
  decoder_input_data[i,len(target_text):,target_token_index[" "]]=1.
  #pad the decoder target of the SAME sample i; it is one step shorter than the
  #input, so padding starts at len(target_text)-1 (the first index was
  #mistakenly the timestep t, which wrote the padding into another sample)
  decoder_target_data[i,len(target_text)-1:,target_token_index[" "]]=1.

In [None]:
#shape of a single encoded input sample: (max_encoder_seq_length, num_encoder_tokens)
encoder_input_data[0].shape

(16, 71)

In [None]:
#defining the input sequence and processing it
encoder_inputs=Input(shape=(None,num_encoder_tokens))
encoder=LSTM(latent_dim,return_state=True)
encoder_outputs,state_h,state_c=encoder(encoder_inputs)#state_h=hidden cell and state_c is cell state
#dicarding the encoder outputs and keeping the state only
encoder_states=[state_h,state_c]

In [None]:
#setting up the decoder using encoder_states as initial state
decoder_inputs=Input(shape=(None,num_decoder_tokens))
#setting up the decoder to return the full output sequence and to return the internal states as well. we dont use the return states in the training model, but we will use them in inference.
decoder_lstm=LSTM(latent_dim,return_sequences=True,return_state=True)
decoder_outputs,_,_=decoder_lstm(decoder_inputs,initial_state=encoder_states)
decoder_dense=Dense(num_decoder_tokens,activation="softmax")

decoder_outputs=decoder_dense(decoder_outputs)

In [None]:
#model creation
from tensorflow.keras.models import Model
model=Model([encoder_inputs,decoder_inputs],decoder_outputs)
model.compile(optimizer="rmsprop",loss="categorical_crossentropy",metrics=["accuracy"])
model.fit([encoder_input_data,decoder_input_data],decoder_target_data,batch_size=batch_size,epochs=epochs,validation_split=0.2)

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 16ms/step - accuracy: 0.0432 - loss: 1.2259 - val_accuracy: 0.0528 - val_loss: 1.2414
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.0547 - loss: 1.0589 - val_accuracy: 0.0555 - val_loss: 1.2178
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 13ms/step - accuracy: 0.0582 - loss: 1.0400 - val_accuracy: 0.0556 - val_loss: 1.2016
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 14ms/step - accuracy: 0.0620 - loss: 1.0259 - val_accuracy: 0.0628 - val_loss: 1.1818
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.0671 - loss: 0.9990 - val_accuracy: 0.0689 - val_loss: 1.1629
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 11ms/step - accuracy: 0.0719 - loss: 0.9806 - val_accuracy: 0.0704 - val_loss: 1.1448
Epoch 7/100
[1m

<keras.src.callbacks.history.History at 0x78b4f146fd00>