In [60]:
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from gensim.models import Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, Bidirectional, TimeDistributed
from tensorflow.keras.layers import Attention


In [61]:
# Load the dataset
data = pd.read_csv("Tamil_train.csv")

In [62]:

# Preprocessing
# Tokenize Tamil-English code-mixed text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(data['Transcript'])

# Create word-to-index and index-to-word mappings
word_index = tokenizer.word_index
index_word = {v: k for k, v in word_index.items()}

# Convert text to sequences of indices
sequences = tokenizer.texts_to_sequences(data['Transcript'])

# Pad sequences to make them of equal length
max_seq_length = max([len(seq) for seq in sequences])
padded_sequences = pad_sequences(sequences, maxlen=max_seq_length, padding='post')

# Word2Vec Embeddings
word2vec_model = Word2Vec(sentences=data['Transcript'], vector_size=100, window=5, min_count=1, workers=4)
word_vectors = word2vec_model.wv

# Convert words to Word2Vec embeddings
embedding_matrix = np.zeros((len(word_index) + 1, 100))
for word, i in word_index.items():
  if word in word_vectors:
    embedding_matrix[i] = word_vectors[word]

# Model
model = Sequential()
model.add(Embedding(len(word_index) + 1, 100, input_shape=(max_seq_length,)))  # Removed input_length argument
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(TimeDistributed(Dense(len(word_index) + 1, activation='softmax')))


  super().__init__(**kwargs)


In [63]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [64]:
# Training the model
model.fit(padded_sequences, padded_sequences, epochs=5, batch_size=6, validation_split=0.2)


Epoch 1/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 113ms/step - accuracy: 0.4326 - loss: 6.6675 - val_accuracy: 0.6538 - val_loss: 5.8686
Epoch 2/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.6670 - loss: 5.1049 - val_accuracy: 0.6538 - val_loss: 2.8815
Epoch 3/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step - accuracy: 0.6647 - loss: 2.5492 - val_accuracy: 0.6538 - val_loss: 2.5758
Epoch 4/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 55ms/step - accuracy: 0.6476 - loss: 2.4765 - val_accuracy: 0.6538 - val_loss: 2.6078
Epoch 5/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step - accuracy: 0.6686 - loss: 2.2619 - val_accuracy: 0.6538 - val_loss: 2.5477
Epoch 6/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 57ms/step - accuracy: 0.6625 - loss: 2.2272 - val_accuracy: 0.6538 - val_loss: 2.5304
Epoch 7/10
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fe3886c04f0>

In [65]:
def translate_tamil_to_english(input_text):
  """
  Translates Tamil-English code-mixed text to English.

  Args:
      input_text: The Tamil-English text to translate.

  Returns:
      The translated English text.
  """

  # Tokenize input text
  input_sequence = tokenizer.texts_to_sequences([input_text])
  padded_input_sequence = pad_sequences(input_sequence, maxlen=max_seq_length, padding='post')

  # Print padded input sequence for debugging (optional)
  print("Padded input sequence:", padded_input_sequence)

  # Predict
  predicted_sequence = model.predict(padded_input_sequence)

  # Set a threshold for minimum probability
  threshold = 0.1

  # Filter predictions with low probabilities
  predicted_indices = [
      np.argmax(word) for word in predicted_sequence[0] if np.max(word) > threshold
  ]

  # Handle missing indices using get() in index_word
  predicted_words = [index_word.get(idx, '') for idx in predicted_indices]

  # Join the words to form translated text
  translated_text = ' '.join(predicted_words)
  return translated_text


In [66]:
input_text = "Ennode earring design nalla irruka?"


In [67]:
translated_text = translate_tamil_to_english(input_text)


Padded input sequence: [[ 62 253 254   3 255   0   0   0   0   0   0   0   0   0   0   0   0]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 260ms/step


In [68]:
print("Translated Text:", translated_text)


Translated Text:            
