In [1]:
import numpy as np
from tensorflow.keras.layers import LSTM , Dropout , Embedding , Input , Dense
from tensorflow.keras.models import Model

In [2]:
# Containers filled while parsing the corpus: the sentence pairs themselves
# and the set of distinct characters seen on each side.
input_texts, target_texts = [], []
input_chars, output_chars = set(), set()

In [3]:
# Read the whole corpus into memory, one entry per line of the file.
# The encoding is given explicitly so the accented French characters decode
# the same way on every platform instead of depending on the locale default.
with open("fra-eng/fra.txt", "r", encoding="utf-8") as file:
    lines = file.read().split("\n")

In [4]:
# Training hyperparameters and the dataset size cap.
BATCH_SIZE = 64  # passed to model.fit as batch_size
EPOCHS = 40  # passed to model.fit as epochs
LATENT_DIM = 256  # number of units in the encoder and decoder LSTM layers
NUM_SAMPLES = 10_000  # upper bound on how many lines of the corpus are parsed

In [5]:
# Preview only the first few raw lines; displaying the full list would dump
# the entire corpus into the cell output.
lines[:10]

['Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)',
 'Go.\tMarche.\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8090732 (Micsmithel)',
 'Go.\tEn route !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #8267435 (felix63)',
 'Go.\tBouge !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #9022935 (Micsmithel)',
 'Hi.\tSalut !\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #509819 (Aiji)',
 'Hi.\tSalut.\tCC-BY 2.0 (France) Attribution: tatoeba.org #538123 (CM) & #4320462 (gillux)',
 'Run!\tCours\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906331 (sacredceltic)',
 'Run!\tCourez\u202f!\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #906332 (sacredceltic)',
 'Run!\tPrenez vos jambes à vos cous !\tCC-BY 2.0 (France) Attribution: tatoeba.org #906328 (papabear) & #2077449 (sacredceltic)',
 'Run!\tFile !\tCC-BY 2.0 (France) Attribution: tatoeba.org #90

In [6]:
# Start from empty containers so that re-running this cell never appends
# duplicate pairs, and so that it still works after `input_chars` /
# `output_chars` have been rebound to sorted lists further down.
input_texts, target_texts = [], []
input_chars, output_chars = set(), set()

# NOTE(review): the `len(lines) - 1` cap presumably skips the empty string
# that `split("\n")` leaves after the file's final newline — confirm.
for line in lines[: min(NUM_SAMPLES, len(lines) - 1)]:
    fields = line.split("\t")
    if len(fields) < 2:
        # Blank or malformed row: there is no source/target pair to extract.
        continue
    # Columns are: input sentence, target sentence, attribution (ignored).
    input_text, target_text = fields[0], fields[1]

    # "\t" marks the start of the target sequence and "\n" its end.
    target_text = "\t" + target_text + "\n"
    input_texts.append(input_text)
    target_texts.append(target_text)

    # Sets ignore duplicates, so no membership test is needed before adding.
    input_chars.update(input_text)
    output_chars.update(target_text)

In [7]:
# Number of distinct characters collected from the input sentences.
len(input_chars)

70

In [8]:
# Number of distinct characters collected from the target sentences
# (this count includes the added "\t" and "\n" markers).
len(output_chars)

91

In [9]:
# Freeze each character collection into a sorted list so that every character
# gets a stable position, then record the vocabulary sizes.
input_chars = sorted(input_chars)
output_chars = sorted(output_chars)
num_encoder_tokens, num_target_tokens = len(input_chars), len(output_chars)

# Length, in characters, of the longest sentence on each side; these become
# the time dimension of the one-hot arrays.
max_encoder_seq_length = max(map(len, input_texts))
max_decoder_seq_length = max(map(len, target_texts))

In [10]:
# Character -> integer position lookups, one per vocabulary. The position is
# the character's index in the corresponding sorted list.
input_token2index = dict(zip(input_chars, range(len(input_chars))))
target_token2index = dict(zip(output_chars, range(len(output_chars))))

In [19]:
# Inspect the target-side character -> index mapping.
target_token2index

{'\t': 0,
 '\n': 1,
 ' ': 2,
 '!': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '5': 14,
 '8': 15,
 '9': 16,
 ':': 17,
 '?': 18,
 'A': 19,
 'B': 20,
 'C': 21,
 'D': 22,
 'E': 23,
 'F': 24,
 'G': 25,
 'H': 26,
 'I': 27,
 'J': 28,
 'K': 29,
 'L': 30,
 'M': 31,
 'N': 32,
 'O': 33,
 'P': 34,
 'Q': 35,
 'R': 36,
 'S': 37,
 'T': 38,
 'U': 39,
 'V': 40,
 'W': 41,
 'Y': 42,
 'a': 43,
 'b': 44,
 'c': 45,
 'd': 46,
 'e': 47,
 'f': 48,
 'g': 49,
 'h': 50,
 'i': 51,
 'j': 52,
 'k': 53,
 'l': 54,
 'm': 55,
 'n': 56,
 'o': 57,
 'p': 58,
 'q': 59,
 'r': 60,
 's': 61,
 't': 62,
 'u': 63,
 'v': 64,
 'w': 65,
 'x': 66,
 'y': 67,
 'z': 68,
 '\xa0': 69,
 '«': 70,
 '»': 71,
 'À': 72,
 'Ç': 73,
 'É': 74,
 'Ê': 75,
 'à': 76,
 'â': 77,
 'ç': 78,
 'è': 79,
 'é': 80,
 'ê': 81,
 'î': 82,
 'ï': 83,
 'ô': 84,
 'ù': 85,
 'û': 86,
 'œ': 87,
 '\u2009': 88,
 '’': 89,
 '\u202f': 90}

In [22]:
# Look up the character stored at position 90 of the sorted target vocabulary
# (the last entry when the vocabulary holds 91 characters).
output_chars[90]

'\u202f'

In [11]:
# Inspect the input-side character -> index mapping.
input_token2index

{' ': 0,
 '!': 1,
 '"': 2,
 '$': 3,
 '%': 4,
 '&': 5,
 "'": 6,
 ',': 7,
 '-': 8,
 '.': 9,
 '0': 10,
 '1': 11,
 '2': 12,
 '3': 13,
 '5': 14,
 '7': 15,
 '8': 16,
 '9': 17,
 ':': 18,
 '?': 19,
 'A': 20,
 'B': 21,
 'C': 22,
 'D': 23,
 'E': 24,
 'F': 25,
 'G': 26,
 'H': 27,
 'I': 28,
 'J': 29,
 'K': 30,
 'L': 31,
 'M': 32,
 'N': 33,
 'O': 34,
 'P': 35,
 'Q': 36,
 'R': 37,
 'S': 38,
 'T': 39,
 'U': 40,
 'V': 41,
 'W': 42,
 'Y': 43,
 'a': 44,
 'b': 45,
 'c': 46,
 'd': 47,
 'e': 48,
 'f': 49,
 'g': 50,
 'h': 51,
 'i': 52,
 'j': 53,
 'k': 54,
 'l': 55,
 'm': 56,
 'n': 57,
 'o': 58,
 'p': 59,
 'q': 60,
 'r': 61,
 's': 62,
 't': 63,
 'u': 64,
 'v': 65,
 'w': 66,
 'x': 67,
 'y': 68,
 'z': 69}

In [12]:
# One-hot tensors laid out as (sentence, timestep, character). They start as
# all zeros and are filled in afterwards.
encoder_input_data = np.zeros(
    shape=(len(input_texts), max_encoder_seq_length, num_encoder_tokens),
    dtype=np.float32,
)

decoder_input_data = np.zeros(
    shape=(len(target_texts), max_decoder_seq_length, num_target_tokens),
    dtype=np.float32,
)

# Same shape and dtype as the decoder input, but a separate array.
decoder_target_data = np.zeros_like(decoder_input_data)

In [13]:
# Fill the one-hot arrays sentence by sentence. Timesteps beyond the end of a
# sentence are padded with the one-hot vector for a space character.
for i, (input_text2, target_text2) in enumerate(zip(input_texts, target_texts)):
    for t, char in enumerate(input_text2):
        encoder_input_data[i, t, input_token2index[char]] = 1.
    # Padding starts right after the last real character. Using len() instead
    # of the loop variable keeps this correct even for an empty sentence,
    # where `t` would be undefined or left over from a previous iteration.
    encoder_input_data[i, len(input_text2):, input_token2index[' ']] = 1.

    for t, char in enumerate(target_text2):
        decoder_input_data[i, t, target_token2index[char]] = 1.
        if t > 0:
            # The target is the decoder input shifted one step to the left,
            # so it does not contain the leading start marker.
            decoder_target_data[i, t - 1, target_token2index[char]] = 1.
    decoder_input_data[i, len(target_text2):, target_token2index[' ']] = 1.
    # The target holds one character fewer than the input, so its padding
    # begins one timestep earlier.
    decoder_target_data[i, max(len(target_text2) - 1, 0):, target_token2index[" "]] = 1.

## Encoder

In [14]:
# Encoder: consumes a variable-length sequence of one-hot source characters.
encoder_input = Input(shape=(None,num_encoder_tokens))
# return_state=True makes the layer return its final hidden and cell states
# in addition to its output.
encoder = LSTM(LATENT_DIM,return_state=True)
encoder_output , state_h , state_c = encoder(encoder_input)
# Only the two states are kept; they are used as the decoder's initial state.
encoder_states = [state_h,state_c]

## Decoder

In [15]:
# Decoder: consumes a variable-length sequence of one-hot target characters.
decoder_inputs = Input(shape=(None,num_target_tokens))
# return_sequences=True yields one output per timestep. return_state=True also
# returns the final states; they are discarded here but the same layer object
# is reused later when building the inference decoder.
decoder = LSTM(LATENT_DIM,return_sequences=True , return_state=True)
# The decoder starts from the encoder's final states.
decoder_output , _ , _ = decoder(decoder_inputs,initial_state=encoder_states)
# Softmax over the target vocabulary, applied to the per-timestep LSTM output.
decoder_dense = Dense(num_target_tokens,activation="softmax")
decoder_output = decoder_dense(decoder_output)

In [16]:
# Training model: takes the encoder input and the decoder input and produces
# the decoder's per-timestep softmax output.
model = Model(inputs=[encoder_input,decoder_inputs],outputs=decoder_output)
model.summary()

In [17]:
# Compile with categorical cross-entropy, matching the one-hot targets.
# NOTE(review): padded timesteps are one-hot encoded as spaces and the model
# has no masking, so they appear to count toward both the loss and the
# reported accuracy — confirm before reading the accuracy as translation quality.
model.compile(optimizer="rmsprop",loss="categorical_crossentropy",
              metrics=["accuracy"])

# Train on (encoder input, decoder input) -> decoder target.
model.fit([encoder_input_data,decoder_input_data],
          decoder_target_data,
          batch_size=BATCH_SIZE,
          epochs = EPOCHS)

Epoch 1/40
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 186ms/step - accuracy: 0.7052 - loss: 1.5087
Epoch 2/40
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 174ms/step - accuracy: 0.7436 - loss: 0.9470
Epoch 3/40
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 179ms/step - accuracy: 0.7654 - loss: 0.8402
Epoch 4/40
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 127ms/step - accuracy: 0.7914 - loss: 0.7451
Epoch 5/40
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 113ms/step - accuracy: 0.8112 - loss: 0.6528
Epoch 6/40
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 114ms/step - accuracy: 0.8211 - loss: 0.6118
Epoch 7/40
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 113ms/step - accuracy: 0.8288 - loss: 0.5855
Epoch 8/40
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m19s[0m 122ms/step - accuracy: 0.8355 - loss: 0.5612
Epoch 9/40
[1m1

<keras.src.callbacks.history.History at 0x272e13b1c30>

In [18]:
# Inference encoder: maps an input sequence to the encoder's final states.
encoder_model = Model(encoder_input, encoder_states)

# Inference decoder: the states are explicit inputs so that decoding can be
# driven one step at a time, feeding the returned states back in.
decoder_state_input_h = Input(shape=(LATENT_DIM,))
decoder_state_input_c = Input(shape=(LATENT_DIM,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]
# Reuses the `decoder` LSTM layer object created for the training model.
# Note that `state_h` / `state_c` are rebound here to the decoder's states.
decoder_outputs, state_h, state_c = decoder(
    decoder_inputs, initial_state=decoder_states_inputs)

decoder_states = [state_h, state_c]
# Reuses the `decoder_dense` softmax layer object from the training model.
decoder_outputs = decoder_dense(decoder_outputs)
# Inputs: [decoder input, h, c]; outputs: [token probabilities, new h, new c].
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [54]:
# Inverse lookups (integer position -> character), used to turn predicted
# indices back into text.
reverse_input_char_index = {
    index: char for char, index in input_token2index.items()
}

reverse_target_char_index = {
    index: char for char, index in target_token2index.items()
}

In [66]:
def decode_sequence2(input_seq):
    """Greedily decode one encoded input sentence, character by character.

    The encoder's states seed the decoder. Starting from the tab start marker,
    the most probable next character is picked at every step and fed back in
    as the next decoder input.

    Args:
        input_seq: one-hot encoded input, passed as-is to
            ``encoder_model.predict``.

    Returns:
        The decoded text. Decoding stops after a newline character has been
        produced (it is kept in the result) or once the text is longer than
        ``max_decoder_seq_length`` characters.
    """
    # Summarize the input sentence as the decoder's initial states.
    states = encoder_model.predict(input_seq, verbose=0)

    # First decoder input: a single timestep holding the start marker.
    step_input = np.zeros((1, 1, num_target_tokens))
    step_input[0, 0, target_token2index['\t']] = 1.

    produced = []
    while True:
        probs, h, c = decoder_model.predict([step_input] + states, verbose=0)

        # Greedy choice: the most probable character at the last timestep.
        best_index = np.argmax(probs[0, -1, :])
        best_char = reverse_target_char_index[best_index]
        produced.append(best_char)

        if best_char == '\n' or len(produced) > max_decoder_seq_length:
            break

        # Feed the chosen character and the updated states into the next step.
        step_input = np.zeros((1, 1, num_target_tokens))
        step_input[0, 0, best_index] = 1.
        states = [h, c]

    return ''.join(produced)

In [74]:
# Sanity check: decode the first training sample and print the result.
for seq_index in range(1):
    # Slice (rather than index) so the leading batch dimension is kept.
    input_seq = encoder_input_data[seq_index:seq_index+1]
    decoded_sentence = decode_sequence2(input_seq=input_seq)
    print(f"output {decoded_sentence}")

output Sourrous !



In [84]:
# Free-form sentence to translate; it is one-hot encoded in the following cells.
input_text = "Who?"

In [85]:
# One-hot encode `input_text` as a single-sample batch with the same time and
# vocabulary dimensions as the training encoder input.
input_seq = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype='float32')

# Truncate to the encoder length so that a sentence longer than anything seen
# in training cannot index past the end of the array.
for t, char in enumerate(input_text[:max_encoder_seq_length]):
    # Characters outside the training vocabulary are skipped; their timestep
    # stays all zeros.
    if char in input_token2index:
        input_seq[0, t, input_token2index[char]] = 1.0

In [86]:
# Pad every timestep after the sentence with the one-hot vector for a space,
# as was done for the training data.
input_seq[0, len(input_text):, input_token2index[' ']] = 1.0

In [87]:
# Translate the manually encoded sentence; the returned string is displayed.
decode_sequence2(input_seq)

'Qui est chaure\xa0?\n'