In [8]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Path to the CSV file
csv_file_path = "en-fr.csv"

# Read the CSV file and sample 20000 rows
sample_df = pd.read_csv(csv_file_path).sample(n=20000, random_state=42)

# Set up the tokenizer
tokenizer_input = Tokenizer(num_words=10000)
tokenizer_target = Tokenizer(num_words=10000, filters='')

# Convert all data to strings
input_texts = [str(text) for text in sample_df.iloc[:, 0].tolist()]
target_texts = ['<start> ' + str(text) for text in sample_df.iloc[:, 1].tolist()]

# Tokenization and Padding
tokenizer_input.fit_on_texts(input_texts)
tokenizer_target.fit_on_texts(target_texts)

# Adjusted Sequence Length
sequence_length = 15  # Adjust this based on your analysis of the data

input_sequences = tokenizer_input.texts_to_sequences(input_texts)
input_data = pad_sequences(input_sequences, maxlen=sequence_length)

target_sequences = tokenizer_target.texts_to_sequences(target_texts)
target_data = pad_sequences(target_sequences, maxlen=sequence_length)

# No need for one-hot encoding, use target_data directly with sparse_categorical_crossentropy


In [9]:
# Model Architecture Enhancements
from tensorflow.keras.layers import Bidirectional

# Assuming 'tokenizer_input' is for the encoder
vocab_size_input = len(tokenizer_input.word_index) + 1
embedding_dim = 256  # You can set this to the desired value
vocab_size_target = 10000  # This should match the last dimension of your target data

# Encoder
encoder_inputs = tf.keras.Input(shape=(None,))
enc_emb = tf.keras.layers.Embedding(vocab_size_input, embedding_dim)(encoder_inputs)
encoder_lstm1 = Bidirectional(tf.keras.layers.LSTM(256, return_sequences=True, dropout=0.4))(enc_emb)
encoder_outputs, forward_h, forward_c, backward_h, backward_c = Bidirectional(tf.keras.layers.LSTM(256, return_state=True, dropout=0.4))(encoder_lstm1)
state_h = tf.keras.layers.Concatenate()([forward_h, backward_h])
state_c = tf.keras.layers.Concatenate()([forward_c, backward_c])
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = tf.keras.Input(shape=(None,))
dec_emb_layer = tf.keras.layers.Embedding(vocab_size_target, embedding_dim)
dec_emb = dec_emb_layer(decoder_inputs)
decoder_lstm1 = tf.keras.layers.LSTM(512, return_sequences=True, dropout=0.4)(dec_emb, initial_state=encoder_states)  # Note: LSTM size doubled
decoder_outputs = tf.keras.layers.LSTM(512, return_sequences=True, dropout=0.4)(decoder_lstm1)  # Note: LSTM size doubled

# Attention Layer
attention = tf.keras.layers.Attention()
attn_out = attention([decoder_outputs, encoder_outputs])

# Fully connected layer
decoder_dense = tf.keras.layers.Dense(vocab_size_target, activation='softmax')
decoder_outputs = decoder_dense(attn_out)

# Define the model
model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile the model
optimizer = tf.keras.optimizers.Adam()
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy')


In [10]:
from tensorflow.keras.callbacks import EarlyStopping

# Early Stopping Callback
early_stopping = EarlyStopping(monitor='loss', patience=10, verbose=1)

# --- Model Training ---
model.fit([input_data, target_data], target_data, batch_size=64, epochs=500, callbacks=[early_stopping])

# --- Beam Search Decoder Function ---
def beam_search_decoder(data, k, tokenizer):
    sequences = [[list(), 1.0]]
    for row in data:
        all_candidates = list()
        for i in range(len(sequences)):
            seq, score = sequences[i]
            for j in range(len(row)):
                candidate = [seq + [j], score * -np.log(row[j] + 1e-6)]
                all_candidates.append(candidate)
        ordered = sorted(all_candidates, key=lambda tup: tup[1])
        sequences = ordered[:k]
    word_sequences = []
    for seq, score in sequences:
        words = [tokenizer.index_word.get(i, '?') for i in seq]
        word_sequences.append((' '.join(words), score))
    return word_sequences


Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

In [16]:
import numpy as np

# Function to apply temperature scaling to predictions
def temperature_scaled_prediction(predictions, temperature=1.0):
    predictions = np.log(predictions + 1e-6) / temperature
    exp_preds = np.exp(predictions)
    return exp_preds / np.sum(exp_preds)

# Modified beam search decoder with temperature control
def beam_search_decoder(data, k, tokenizer, temperature=1.0):
    sequences = [[list(), 1.0]]
    for row in data:
        scaled_row = temperature_scaled_prediction(row, temperature)
        all_candidates = list()
        for i in range(len(sequences)):
            seq, score = sequences[i]
            for j in range(len(scaled_row)):
                candidate = [seq + [j], score * -np.log(scaled_row[j] + 1e-6)]
                all_candidates.append(candidate)
        ordered = sorted(all_candidates, key=lambda tup: tup[1])
        sequences = ordered[:k]
    word_sequences = []
    for seq, score in sequences:
        words = [tokenizer.index_word.get(i, '?') for i in seq]
        word_sequences.append((' '.join(words), score))
    return word_sequences


# Assuming the maximum sequence length used during data preparation is 10
max_seq_length = 10

some_input_data = input_data[0:1]

# Initialize the decoder input as a zero matrix with shape (1, max_seq_length)
decoder_input = np.zeros((1, max_seq_length))
decoder_input[0, 0] = tokenizer_target.word_index['<start>']

# Generating sequence one word at a time
for i in range(max_seq_length - 1):
    prediction = model.predict([some_input_data, decoder_input])
    # Using a beam search with temperature scaling
    beam_results = beam_search_decoder(prediction[0, :i+1, :], k=30, tokenizer=tokenizer_target, temperature=2.5)
    if i < len(beam_results) and beam_results[0][0]:
        chosen_word = beam_results[0][0].split()[0]
        chosen_word_index = tokenizer_target.word_index.get(chosen_word, 0)
        decoder_input[0, i + 1] = chosen_word_index

# Constructing the final sequence from the decoder input
final_sequence = ' '.join(tokenizer_target.index_word.get(idx, '?') for idx in decoder_input[0] if idx > 0)
print("Generated Sequence:", final_sequence)


Generated Sequence: <start> alimentaires alimentaires alimentaires alimentaires alimentaires alimentaires alimentaires alimentaires alimentaires
