### Language Translation

In [2]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
import numpy as np

batch_size=64
epochs=100
latent_dim=256
num_samples=10000
data_path='/content/fra.txt'

In [3]:
data_path

'/content/fra.txt'

In [4]:
input_texts= []
target_texts= []
input_characters= set()
target_characters= set()

with open(data_path, 'r', encoding='utf-8') as f:
  lines= f.read().split('\n')
for line in lines[: min(num_samples, len(lines) - 1)]:
  input_text, target_text, _ = line.split('\t')
  target_text= '\t' + target_text +'\n'
  input_texts.append(input_text)
  target_texts.append(target_text)

  for char in input_text:
    if char not in input_characters:
      input_characters.add(char)
  for char in target_text:
    if char not in target_characters:
      target_characters.add(char)


In [5]:
input_texts

['Go.',
 'Go.',
 'Go.',
 'Go.',
 'Hi.',
 'Hi.',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run!',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Run.',
 'Who?',
 'Wow!',
 'Wow!',
 'Wow!',
 'Duck!',
 'Duck!',
 'Duck!',
 'Fire!',
 'Help!',
 'Hide.',
 'Hide.',
 'Jump!',
 'Jump.',
 'Stop!',
 'Stop!',
 'Stop!',
 'Wait!',
 'Wait!',
 'Wait!',
 'Wait.',
 'Wait.',
 'Wait.',
 'Wait.',
 'Begin.',
 'Begin.',
 'Go on.',
 'Go on.',
 'Go on.',
 'Hello!',
 'Hello!',
 'Hello.',
 'Hello.',
 'Hello.',
 'Hello.',
 'I see.',
 'I see.',
 'I try.',
 'I won!',
 'I won!',
 'I won.',
 'Oh no!',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Relax.',
 'Shoot!',
 'Shoot!',
 'Smile.',
 'Smile.',
 'Smile.',
 'Sorry?',
 'Attack!',
 'Attack!',
 'Attack!',
 'Attack!',
 'Buy it.',
 'Buy it.',
 'Buy it.',
 'Buy it.',
 'Cheers!',
 'Cheers!',
 'Cheers!',
 'Cheers!',
 'Eat it.',
 'Eat it.',
 'Exhale.',
 'Get 

In [6]:
input_characters= sorted(list(input_characters))
target_characters= sorted(list(target_characters))
num_encoder_tokens= len(input_characters)
num_decoder_tokens= len(target_characters)
max_encoder_seq_length= max([len(txt) for txt in input_texts])
max_decoder_seq_length= max([len(txt) for txt in target_texts])

In [7]:
print('NUmber of samples:', len(input_texts))
print('Number of unique input tokens:', num_encoder_tokens)
print('Number of unique output tokens:', num_decoder_tokens)
print('Max sequence length for inputs:', max_encoder_seq_length)
print('Max sequence length for outputs:', max_decoder_seq_length)

NUmber of samples: 10000
Number of unique input tokens: 70
Number of unique output tokens: 91
Max sequence length for inputs: 14
Max sequence length for outputs: 59


In [8]:
input_token_index= dict([(char, i) for i, char in enumerate(input_characters)])
target_token_index= dict([(char, i) for i, char in enumerate(target_characters)])

In [9]:
input_token_index, target_token_index

({' ': 0,
  '!': 1,
  '"': 2,
  '$': 3,
  '%': 4,
  '&': 5,
  "'": 6,
  ',': 7,
  '-': 8,
  '.': 9,
  '0': 10,
  '1': 11,
  '2': 12,
  '3': 13,
  '5': 14,
  '7': 15,
  '8': 16,
  '9': 17,
  ':': 18,
  '?': 19,
  'A': 20,
  'B': 21,
  'C': 22,
  'D': 23,
  'E': 24,
  'F': 25,
  'G': 26,
  'H': 27,
  'I': 28,
  'J': 29,
  'K': 30,
  'L': 31,
  'M': 32,
  'N': 33,
  'O': 34,
  'P': 35,
  'Q': 36,
  'R': 37,
  'S': 38,
  'T': 39,
  'U': 40,
  'V': 41,
  'W': 42,
  'Y': 43,
  'a': 44,
  'b': 45,
  'c': 46,
  'd': 47,
  'e': 48,
  'f': 49,
  'g': 50,
  'h': 51,
  'i': 52,
  'j': 53,
  'k': 54,
  'l': 55,
  'm': 56,
  'n': 57,
  'o': 58,
  'p': 59,
  'q': 60,
  'r': 61,
  's': 62,
  't': 63,
  'u': 64,
  'v': 65,
  'w': 66,
  'x': 67,
  'y': 68,
  'z': 69},
 {'\t': 0,
  '\n': 1,
  ' ': 2,
  '!': 3,
  '%': 4,
  '&': 5,
  "'": 6,
  ',': 7,
  '-': 8,
  '.': 9,
  '0': 10,
  '1': 11,
  '2': 12,
  '3': 13,
  '5': 14,
  '8': 15,
  '9': 16,
  ':': 17,
  '?': 18,
  'A': 19,
  'B': 20,
  'C': 21,
  'D'

In [10]:
encoder_input_data= np.zeros((len(input_texts), max_encoder_seq_length, num_encoder_tokens),dtype='float32')

decoder_input_data= np.zeros((len(input_texts),max_decoder_seq_length, num_decoder_tokens),dtype='float32')

decoder_target_data= np.zeros((len(input_texts),max_decoder_seq_length, num_decoder_tokens),dtype='float32')


In [11]:
for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
  for t, char in enumerate(input_text):
    encoder_input_data[i,t,input_token_index[char]]=1.
  encoder_input_data[i, t+1:, input_token_index[' ']]=1.
  for t, char in enumerate(target_text):
    decoder_input_data[i,t,target_token_index[char]]=1.
    if t > 0:
      decoder_target_data[i, t-1, target_token_index[char]]=1.
  decoder_input_data[i,t+1:, target_token_index[' ']]=1.
  decoder_target_data[i,t:,target_token_index[' ']]=1.

In [12]:
encoder_input_data[0].shape

(14, 70)

In [13]:
#Define an input sequence and process it.

encoder_inputs= Input(shape=(None,num_encoder_tokens))
encoder=LSTM(latent_dim,return_state=True)
encoder_outputs,state_h, state_c =encoder(encoder_inputs)
encoder_states= [state_h, state_c]

In [14]:
decoder_inputs= Input(shape=(None, num_decoder_tokens))
decoder_lstm= LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _= decoder_lstm(decoder_inputs, initial_state=encoder_states)



In [15]:
decoder_dense= Dense(num_decoder_tokens, activation='softmax')
decoder_outputs= decoder_dense(decoder_outputs)

In [16]:
#define the model

model=Model([encoder_inputs,decoder_inputs],decoder_outputs)

model.compile(optimizer='rmsprop', loss='categorical_crossentropy',metrics=['accuracy'])
model.fit([encoder_input_data, decoder_input_data], decoder_target_data,batch_size=batch_size,epochs=epochs,validation_split=0.2)

Epoch 1/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 17ms/step - accuracy: 0.7054 - loss: 1.5356 - val_accuracy: 0.7184 - val_loss: 1.0531
Epoch 2/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7473 - loss: 0.9532 - val_accuracy: 0.7339 - val_loss: 0.9538
Epoch 3/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7634 - loss: 0.8552 - val_accuracy: 0.7522 - val_loss: 0.8592
Epoch 4/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.7867 - loss: 0.7666 - val_accuracy: 0.7770 - val_loss: 0.7720
Epoch 5/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.8050 - loss: 0.6813 - val_accuracy: 0.7957 - val_loss: 0.7111
Epoch 6/100
[1m125/125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 12ms/step - accuracy: 0.8174 - loss: 0.6349 - val_accuracy: 0.8048 - val_loss: 0.6758
Epoch 7/100
[1m

<keras.src.callbacks.history.History at 0x7a3aa41349e0>

In [17]:
encoder_model= Model(encoder_inputs, encoder_states)

decoder_state_input_h=Input(shape=(latent_dim,))
decoder_state_input_c=Input(shape=(latent_dim,))
decoder_states_inputs=[decoder_state_input_h,decoder_state_input_c]
decoder_outputs,state_h,state_c= decoder_lstm(decoder_inputs,initial_state= decoder_states_inputs)
decoder_states= [state_h,state_c]
decoder_outputs=decoder_dense(decoder_outputs)

decoder_model= Model([decoder_inputs]+ decoder_states_inputs,[decoder_outputs]+decoder_states)

reverse_input_char_index= dict((i,char) for char, i in input_token_index.items())
reverse_target_char_index= dict((i,char) for char, i in target_token_index.items())

def decode_sequence(input_seq):
  states_value= encoder_model.predict(input_seq)
  target_seq=np.zeros((1,1,num_decoder_tokens))
  target_seq[0,0,target_token_index['\t']]=1.

  stop_condition=False
  decode_sentence = ''
  while not stop_condition:
    output_tokens, h, c= decoder_model.predict([target_seq]+ states_value)

    sampled_token_index= np.argmax(output_tokens[0,-1,:])
    sampled_char= reverse_target_char_index[sampled_token_index]
    decode_sentence += sampled_char

    if (sampled_char =='\n' or len(decode_sentence) > max_decoder_seq_length):
      stop_condition=True

    target_seq=np.zeros((1,1,num_decoder_tokens))
    target_seq[0,0,sampled_token_index]=1.

    states_value=[h,c]

  return decode_sentence

for seq_index in range(5):
  input_seq= encoder_input_data[seq_index:seq_index+1]
  decoded_sentence= decode_sequence(input_seq)
  print('-')
  print('Input sentence:', input_texts[seq_index])
  print('Decoded sentence:', decoded_sentence)





# import numpy as np
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# def translate_sentence(input_text):
#     # 1️⃣ Convert characters to token indices
#     input_seq = np.zeros((1, max_encoder_seq_length, num_encoder_tokens), dtype="float32")

#     for t, char in enumerate(input_text):
#         if char in input_token_index:
#             input_seq[0, t, input_token_index[char]] = 1.

#     # 2️⃣ Decode using your decode_sequence function
#     decoded_sentence = decode_sequence(input_seq)

#     print("-")
#     print("Input sentence:", input_text)
#     print("Decoded sentence:", decoded_sentence)

# while True:
#   text = input("Enter an English sentence (or 'q' to quit): ")
#   if text.lower() == 'q':
#     break
#   translate_sentence(text)`


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 139ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 122ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 