In [3]:

import pandas as pd

# Read TSV
df = pd.read_csv("input.tsv", sep='\t')

# Save as CSV
df.to_csv("output.csv", index=False)

In [4]:
df = df.drop(df.columns[[0, 2]], axis=1)

In [5]:
df = df.rename(columns={
    "Let's try something.": "er",
    "Essayons quelque chose !": "fr"
})

In [6]:
df.to_csv("output.csv", index=False)

In [7]:
df = df.head(5000)
df.to_csv("output.csv", index=False)

In [8]:
df = pd.read_csv("output.csv")
src_texts = df['er'].astype(str).tolist()
tgt_texts = df['fr'].astype(str).tolist()

In [9]:
!pip install tensorflow



In [10]:

import pandas as pd
import numpy as np
import tensorflow as tf
from google.colab import files
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Embedding, Dense
print("✅ All libraries loaded.")



✅ All libraries loaded.


In [11]:
tgt_input_texts = ["<sos> " + text for text in tgt_texts]
tgt_target_texts = [text + " <eos>" for text in tgt_texts]


In [29]:
# Source Tokenizer
src_tokenizer = Tokenizer()
src_tokenizer.fit_on_texts(src_texts)
src_seq = src_tokenizer.texts_to_sequences(src_texts)
src_seq = pad_sequences(src_seq, padding='post')

# Target Tokenizer
tgt_tokenizer = Tokenizer(filters='')
tgt_tokenizer.fit_on_texts(tgt_input_texts + tgt_target_texts)
tgt_input_seq = pad_sequences(tgt_tokenizer.texts_to_sequences(tgt_input_texts), padding='post')
tgt_output_seq = pad_sequences(tgt_tokenizer.texts_to_sequences(tgt_target_texts), padding='post')


In [30]:
src_vocab_size = len(src_tokenizer.word_index) + 1
tgt_vocab_size = len(tgt_tokenizer.word_index) + 1
emb_dim = 128
lstm_units = 256

# Encoder
encoder_inputs = Input(shape=(None,))
enc_emb = Embedding(src_vocab_size, emb_dim)(encoder_inputs)
_, state_h, state_c = LSTM(lstm_units, return_state=True)(enc_emb)

# Decoder
decoder_inputs = Input(shape=(None,))
dec_emb = Embedding(tgt_vocab_size, emb_dim)(decoder_inputs)
decoder_lstm = LSTM(lstm_units, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(dec_emb, initial_state=[state_h, state_c])
decoder_dense = Dense(tgt_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')
model.summary()


In [31]:
model.fit([src_seq, tgt_input_seq],
          np.expand_dims(tgt_output_seq, -1),
          batch_size=32,
          epochs=10,
          validation_split=0.1)


Epoch 1/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 38ms/step - loss: 2.8798 - val_loss: 0.7940
Epoch 2/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - loss: 0.7891 - val_loss: 0.7767
Epoch 3/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - loss: 0.7664 - val_loss: 0.7605
Epoch 4/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 32ms/step - loss: 0.7232 - val_loss: 0.7518
Epoch 5/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - loss: 0.7137 - val_loss: 0.7447
Epoch 6/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 32ms/step - loss: 0.6874 - val_loss: 0.7400
Epoch 7/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 32ms/step - loss: 0.6732 - val_loss: 0.7341
Epoch 8/10
[1m141/141[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 34ms/step - loss: 0.6400 - val_loss: 0.7346
Epoch 9/10
[1m141/141[0m [32

<keras.src.callbacks.history.History at 0x78b58447aed0>

In [15]:
# Define the encoder model
encoder_model = Model(encoder_inputs, [state_h, state_c])

# Define the decoder model
decoder_state_h = Input(shape=(lstm_units,))
decoder_state_c = Input(shape=(lstm_units,))
decoder_states_inputs = [decoder_state_h, decoder_state_c]
decoder_outputs, state_h_inf, state_c_inf = decoder_lstm(
    dec_emb, initial_state=decoder_states_inputs)
decoder_states = [state_h_inf, state_c_inf]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model(
    [decoder_inputs] + decoder_states_inputs,
    [decoder_outputs] + decoder_states)

In [16]:
def decode_sequence(input_seq):
    # Encode the input as state vectors.
    states_value = encoder_model.predict(input_seq, verbose=0)

    # Generate empty target sequence of length 1.
    target_seq = np.zeros((1, 1))
    # Populate the first character of target sequence with the start token.
    target_seq[0, 0] = tgt_tokenizer.word_index['<sos>']

    # Sampling loop for a batch of sequences
    stop_condition = False
    decoded_sentence = ''
    while not stop_condition:
        output_tokens, h, c = decoder_model.predict(
            [target_seq] + states_value, verbose=0)

        # Sample a token
        sampled_token_index = np.argmax(output_tokens[0, -1, :])
        sampled_token = tgt_tokenizer.index_word[sampled_token_index]

        # Exit condition: either hit max length or find stop token.
        if (sampled_token == '<eos>' or
                len(decoded_sentence.split()) > 50): # Set a max length
            stop_condition = True
        else:
            decoded_sentence += ' ' + sampled_token
            # Update the target sequence (of length 1).
            target_seq = np.zeros((1, 1))
            target_seq[0, 0] = sampled_token_index

        # Update states
        states_value = [h, c]

    return decoded_sentence.strip()

def translate_sentence(input_sentence):
    input_seq = src_tokenizer.texts_to_sequences([input_sentence])
    input_seq = pad_sequences(input_seq, maxlen=src_seq.shape[1], padding='post')
    return decode_sequence(input_seq)

Now you can use the `translate_sentence` function to translate a new English sentence into French.

In [17]:
english_sentence = "This is a test sentence."
french_translation = translate_sentence(english_sentence)
print(f"English: {english_sentence}")
print(f"French: {french_translation}")

English: This is a test sentence.
French: vous êtes trop très homme pour la autres.


In [21]:
!pip install nltk

