In [1]:
import numpy as np
from tensorflow.keras.layers import LSTM, Dense, Embedding, Input, TimeDistributed
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import set_random_seed

In [2]:
# Toy parallel corpus (a real scenario would need far more data).
# Every entry is a (French, English) pair, so the two lists derived below
# are aligned index by index.
sentence_pairs = [
    ("Bonjour comment allez-vous", "Hello how are you"),
    ("Je m'appelle Bassel", "My name is Bassel"),
    ("J'aime l'intelligence artificielle", "I love artificial intelligence"),
    ("Quel temps fait-il aujourd'hui", "What's the weather like today"),
    ("Je vais au cinéma ce soir", "I'm going to the cinema tonight"),
    ("Pouvez-vous m'aider s'il vous plaît", "Can you help me please"),
    ("J'adore la cuisine française", "I love French cuisine"),
    ("Où est la station de métro la plus proche", "Where is the nearest metro station"),
    ("Je travaille dans l'informatique", "I work in information technology"),
    ("Quel est votre livre préféré", "What is your favorite book"),
    ("Je voudrais réserver une table pour deux", "I would like to book a table for two"),
    ("Comment dit-on 'merci' en anglais", "How do you say 'thank you' in English"),
    ("J'apprends le français depuis deux ans", "I have been learning French for two years"),
    ("Quelle est la capitale de la France", "What is the capital of France"),
    ("Je suis désolé je ne comprends pas", "I'm sorry I don't understand"),
    ("Pouvez-vous parler plus lentement", "Can you speak more slowly"),
    ("J'ai besoin d'un billet d'avion pour Paris", "I need a plane ticket to Paris"),
    ("Quel est votre plat préféré", "What is your favorite dish"),
    ("Je suis en vacances pour deux semaines", "I am on vacation for two weeks"),
    ("Avez-vous des recommandations de restaurants", "Do you have any restaurant recommendations"),
]

# Source-language inputs and target-language references, in matching order.
french_sentences = [french for french, _ in sentence_pairs]
english_sentences = [english for _, english in sentence_pairs]

In [3]:

def _tokenize_corpus(sentences):
    """Fit a fresh Tokenizer on `sentences` and encode them.

    Returns a `(tokenizer, sequences)` tuple, where `sequences` is the
    result of `tokenizer.texts_to_sequences(sentences)`.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(sentences)
    return tokenizer, tokenizer.texts_to_sequences(sentences)


# One independent tokenizer (and encoded corpus) per language
french_tokenizer, french_sequences = _tokenize_corpus(french_sentences)
english_tokenizer, english_sequences = _tokenize_corpus(english_sentences)


In [4]:
# Pad sequences
max_length = 10  # Adjust based on your data

# pad_sequences silently cuts any sequence longer than `maxlen`, so check
# that `max_length` really covers the corpus instead of losing words unnoticed.
longest_sequence = max(map(len, french_sequences + english_sequences), default=0)
assert longest_sequence <= max_length, (
    f"max_length={max_length} is shorter than the longest tokenized sentence "
    f"({longest_sequence} tokens); increase max_length"
)

# Zero-pad at the end so every sentence starts at position 0.
french_padded = pad_sequences(french_sequences, maxlen=max_length, padding='post')
english_padded = pad_sequences(english_sequences, maxlen=max_length, padding='post')

# Define vocabulary sizes.
# +1 because index 0 is the padding value and needs its own slot next to
# the word indices stored in `word_index`.
french_vocab_size = len(french_tokenizer.word_index) + 1
english_vocab_size = len(english_tokenizer.word_index) + 1

In [5]:
# Seed the random number generators used by Keras so that weight
# initialisation and batch shuffling (and therefore the loss curve) are
# repeatable on a fresh "Restart & Run All".
SEED = 42
set_random_seed(SEED)

# Hyperparameters, named so they can be tuned in one place
EMBEDDING_DIM = 128
LSTM_UNITS = 256
EPOCHS = 100
BATCH_SIZE = 1

# Build the model.
# Each French position is embedded, passed through an LSTM that emits one
# state per time step, and classified into an English vocabulary index.
# Output position t is therefore predicted from the French tokens at
# positions <= t; this is a position-wise tagger, not an encoder-decoder.
model = Sequential([
    Input(shape=(max_length,)),
    Embedding(french_vocab_size, EMBEDDING_DIM),
    LSTM(LSTM_UNITS, return_sequences=True),
    TimeDistributed(Dense(english_vocab_size, activation='softmax'))
])

# Targets are integer word indices, hence the sparse loss.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

# Reshape target data to (samples, max_length, 1): one class id per time step
english_padded_3d = english_padded.reshape(english_padded.shape[0], english_padded.shape[1], 1)

# Train the model (in practice, you'd need much more data and epochs).
# The History object is kept so the loss curve can be inspected later,
# instead of being echoed as the cell's output.
history = model.fit(french_padded, english_padded_3d, epochs=EPOCHS, batch_size=BATCH_SIZE)



Epoch 1/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 5ms/step - loss: 4.0797  
Epoch 2/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 2.6017
Epoch 3/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.6118
Epoch 4/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.4125
Epoch 5/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.4912
Epoch 6/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.3626
Epoch 7/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.4802
Epoch 8/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.3250
Epoch 9/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.6289
Epoch 10/100
[1m20/20[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 2.46

<keras.src.callbacks.history.History at 0x2b5c78610>

In [9]:
# Function to translate a new French sentence
def translate(sentence):
    """Translate a French sentence into English with the trained model.

    The sentence is encoded with `french_tokenizer`, zero-padded to
    `max_length`, and passed through `model`. The most probable English
    index at every position is then mapped back to a word with
    `english_tokenizer`.

    Args:
        sentence: French text to translate.

    Returns:
        The predicted English words joined by single spaces. Positions
        where the model predicts index 0 (padding), or an index with no
        entry in `english_tokenizer.index_word`, are skipped.
    """
    sequence = french_tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(sequence, maxlen=max_length, padding='post')
    # verbose=0 keeps Keras from printing a progress bar on every call.
    predicted = model.predict(padded, verbose=0)[0]

    # Greedy decoding: the most likely vocabulary index at each time step.
    token_ids = np.argmax(predicted, axis=-1)

    words = (
        english_tokenizer.index_word.get(int(token_id), '')
        for token_id in token_ids
        if token_id != 0  # Skip padding
    )
    return ' '.join(word for word in words if word)

# Test the model on a sentence from the training data.
# Its paired reference translation in english_sentences is "My name is Bassel".
print(translate("Je m'appelle Bassel"))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
i name is bassel
