In [46]:
import pathlib
import pandas as pd
import random
import string
import re
import numpy as np
import tensorflow as tf
import tensorflow.data as tf_data
import tensorflow.strings as tf_strings

import keras
from keras import layers
from keras import ops
from keras.layers import TextVectorization

In [47]:
with open("data/CCMatrix.fr-ta.ta") as f:
    tamil=f.read().split("\n")
with open("data/CCMatrix.fr-ta.fr") as f:
    french=f.read().split("\n")

In [48]:
tamil_sentence=[]
for line in tamil:
    tam = "[start] " + line + " [end]"
    tamil_sentence.append(tam)

In [49]:
df=pd.DataFrame({"french":french,"tamil":tamil_sentence})
df=df.head(400000).reset_index(drop=True)

In [50]:
df=df.sample(frac=1).reset_index(drop=True)

In [51]:
num_val_samples = int(0.15 * len(df))
num_train_samples = len(df) - 2 * num_val_samples
train_pairs = df[:num_train_samples]
val_pairs = df[num_train_samples : num_train_samples + num_val_samples]
test_pairs = df[num_train_samples + num_val_samples :]

print(f"{len(df)} total pairs")
print(f"{len(train_pairs)} training pairs")
print(f"{len(val_pairs)} validation pairs")
print(f"{len(test_pairs)} test pairs")

400000 total pairs
280000 training pairs
60000 validation pairs
60000 test pairs


In [52]:
df

Unnamed: 0,french,tamil
0,"Avant, nous avons été poser des fils de fer en...",[start] கிராம வாழ்க்கை முழுக்க வயலுடன் தொடர்பு...
1,"En voiture, à la maison ou au bureau, il est p...","[start] வளராத நிலையில், வளர்ந்து வரும் சூழலில்..."
2,La première correspond à certains de ses tubes...,[start] இதன் பொதுவான பெயர்கள் old world twiste...
3,"Elles sont publiées en l'état, et ne font pas ...",[start] நீங்கள் அரசாங்க பங்கேற்பு என்று கூறுகி...
4,"Ce soir, il devait travailler toute la nuit au...",[start] இக்கோவில் செல்ல விரும்புவோர் Italy- Al...
...,...,...
399995,"Ou bien, quand elle arrive, il est trop tard p...",[start] படித்து முடிந்ததும் அரசாங்க உத்தியோகத்...
399996,L’archevêque s’y adresse non seulement aux Rom...,[start] “Not only do non-Muslims visit these v...
399997,"Dans le même ordre d’idées, concernant notre c...",[start] An ounce of action is worth a ton of t...
399998,Si je n’étais pas bon et que j’avais perdu ma ...,[start] அவரிடம் நான் படிக்காதது வாழ்கையில் எதை...


In [53]:
strip_chars = string.punctuation + "¿"
strip_chars = strip_chars.replace("[", "")
strip_chars = strip_chars.replace("]", "")

vocab_size = 15000
sequence_length = 20
batch_size = 128
def custom_standardization(input_string):
    lowercase = tf_strings.lower(input_string)
    return tf_strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "")
fre_vectorization = TextVectorization(max_tokens=vocab_size,output_mode="int",output_sequence_length=sequence_length,)
tam_vectorization = TextVectorization(max_tokens=vocab_size,output_mode="int",output_sequence_length=sequence_length + 1,standardize=custom_standardization)
train_fre_texts = [pair for pair in train_pairs['french']]
train_tam_texts = [pair for pair in train_pairs['tamil']]
fre_vectorization.adapt(train_fre_texts)
tam_vectorization.adapt(train_tam_texts)

In [54]:
import json
tam_vectorization_config = tam_vectorization.get_config()
tam_vectorization_config.pop('standardize', None)
tam_vocab =tam_vectorization.get_vocabulary()
with open('tam_vectorization_config.json', 'w', encoding='utf-8') as f:
    json.dump(tam_vectorization_config, f)
    
with open('tam_vocab.json', 'w', encoding='utf-8') as f:
    json.dump(tam_vocab, f)
    
fre_vectorization_config = fre_vectorization.get_config()
fre_vectorization_config.pop('standardize', None)
fre_vocab = fre_vectorization.get_vocabulary()
with open('fre_vectorization_config.json', 'w', encoding='utf-8') as f:
    json.dump(fre_vectorization_config, f)
    
with open('fre_vocab.json', 'w', encoding='utf-8') as f:
    json.dump(fre_vocab, f)

In [55]:
def format_dataset(eng, spa):
    eng = fre_vectorization(eng)
    spa = tam_vectorization(spa)
    return (
        {
            "encoder_inputs": eng,
            "decoder_inputs": spa[:, :-1],
        },
        spa[:, 1:],
    )


def make_dataset(pairs):
    eng_texts, spa_texts = pairs['french'],pairs['tamil']
    eng_texts = list(eng_texts)
    spa_texts = list(spa_texts)
    dataset = tf_data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(format_dataset)
    return dataset.cache().shuffle(2048).prefetch(16)


train_ds = make_dataset(train_pairs)
val_ds = make_dataset(val_pairs)

In [56]:
for inputs, targets in train_ds.take(1):
    print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}')
    print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}')
    print(f"targets.shape: {targets.shape}")

inputs["encoder_inputs"].shape: (128, 20)
inputs["decoder_inputs"].shape: (128, 20)
targets.shape: (128, 20)


In [59]:
import keras.ops as ops


class TransformerEncoder(layers.Layer):
    def __init__(self, embed_dim, dense_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.dense_dim = dense_dim
        self.num_heads = num_heads
        self.attention = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(dense_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, mask=None):
        if mask is not None:
            padding_mask = ops.cast(mask[:, None, :], dtype="int32")
        else:
            padding_mask = None

        attention_output = self.attention(
            query=inputs, value=inputs, key=inputs, attention_mask=padding_mask
        )
        proj_input = self.layernorm_1(inputs + attention_output)
        proj_output = self.dense_proj(proj_input)
        return self.layernorm_2(proj_input + proj_output)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "dense_dim": self.dense_dim,
                "num_heads": self.num_heads,
            }
        )
        return config


class PositionalEmbedding(layers.Layer):
    def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.token_embeddings = layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim
        )
        self.position_embeddings = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        self.sequence_length = sequence_length
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim

    def call(self, inputs):
        length = ops.shape(inputs)[-1]
        positions = ops.arange(0, length, 1)
        embedded_tokens = self.token_embeddings(inputs)
        embedded_positions = self.position_embeddings(positions)
        return embedded_tokens + embedded_positions

    def compute_mask(self, inputs, mask=None):
        if mask is None:
            return None
        else:
            return ops.not_equal(inputs, 0)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "sequence_length": self.sequence_length,
                "vocab_size": self.vocab_size,
                "embed_dim": self.embed_dim,
            }
        )
        return config


class TransformerDecoder(layers.Layer):
    def __init__(self, embed_dim, latent_dim, num_heads, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.latent_dim = latent_dim
        self.num_heads = num_heads
        self.attention_1 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.attention_2 = layers.MultiHeadAttention(
            num_heads=num_heads, key_dim=embed_dim
        )
        self.dense_proj = keras.Sequential(
            [
                layers.Dense(latent_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm_1 = layers.LayerNormalization()
        self.layernorm_2 = layers.LayerNormalization()
        self.layernorm_3 = layers.LayerNormalization()
        self.supports_masking = True

    def call(self, inputs, encoder_outputs, mask=None):
        causal_mask = self.get_causal_attention_mask(inputs)
        if mask is not None:
            padding_mask = ops.cast(mask[:, None, :], dtype="int32")
            padding_mask = ops.minimum(padding_mask, causal_mask)
        else:
            padding_mask = None

        attention_output_1 = self.attention_1(
            query=inputs, value=inputs, key=inputs, attention_mask=causal_mask
        )
        out_1 = self.layernorm_1(inputs + attention_output_1)

        attention_output_2 = self.attention_2(
            query=out_1,
            value=encoder_outputs,
            key=encoder_outputs,
            attention_mask=padding_mask,
        )
        out_2 = self.layernorm_2(out_1 + attention_output_2)

        proj_output = self.dense_proj(out_2)
        return self.layernorm_3(out_2 + proj_output)

    def get_causal_attention_mask(self, inputs):
        input_shape = ops.shape(inputs)
        batch_size, sequence_length = input_shape[0], input_shape[1]
        i = ops.arange(sequence_length)[:, None]
        j = ops.arange(sequence_length)
        mask = ops.cast(i >= j, dtype="int32")
        mask = ops.reshape(mask, (1, input_shape[1], input_shape[1]))
        mult = ops.concatenate(
            [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])],
            axis=0,
        )
        return ops.tile(mask, mult)

    def get_config(self):
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "latent_dim": self.latent_dim,
                "num_heads": self.num_heads,
            }
        )
        return config

In [60]:
embed_dim = 256
latent_dim = 2048
num_heads = 8

encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x)
encoder = keras.Model(encoder_inputs, encoder_outputs)

decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = TransformerDecoder(embed_dim, latent_dim, num_heads)(x, encoded_seq_inputs)
#x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)

decoder_outputs = decoder([decoder_inputs, encoder_outputs])
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [61]:
epochs = 3

transformer.summary()
transformer.compile(
    "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
)
transformer.fit(train_ds, epochs=epochs, validation_data=val_ds)

Epoch 1/3
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5267s[0m 2s/step - accuracy: 0.7451 - loss: 1.7586 - val_accuracy: 0.9884 - val_loss: 0.0933
Epoch 2/3
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8597s[0m 4s/step - accuracy: 0.9894 - loss: 0.0874 - val_accuracy: 0.9960 - val_loss: 0.0323
Epoch 3/3
[1m2188/2188[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5983s[0m 3s/step - accuracy: 0.9956 - loss: 0.0367 - val_accuracy: 0.9991 - val_loss: 0.0127


<keras.src.callbacks.history.History at 0x2206dcb7350>

In [66]:
tam_vocab = tam_vectorization.get_vocabulary()
tam_index_lookup = dict(zip(range(len(tam_vocab)), tam_vocab))
max_decoded_sentence_length = sequence_length

def decode_sentence(input_sentence):
    tokenized_input_sentence = fre_vectorization([input_sentence])
    decoded_sentence = "[start]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = tam_vectorization([decoded_sentence])[:, :-1]
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        sampled_token_index = tf.argmax(predictions[0, i, :]).numpy().item(0)
        sampled_token = tam_index_lookup[sampled_token_index]
        decoded_sentence += " " + sampled_token
        if sampled_token == "[end]":
            break
    return decoded_sentence

test_fre_texts = [pair for pair in test_pairs['french']]
for _ in range(5):
    input_sentence = random.choice(test_fre_texts)
    input_sentence = input_sentence.lower()
    input_sentence = input_sentence.translate(str.maketrans('', '', strip_chars))
    translated = decode_sentence(input_sentence)
    print(f"input: {input_sentence}")
    print(f"translated: {translated}")
    print()

input: mais heureusement pour ces braves futurs mendiants notre président par coup d’état vient à leur rescousse
translated: [start] அதி மற்றும் feel acs டெக்சின் தொடக்க has go an எனக் விரோதியாவான் இங்கே a being [UNK]     

input: or je ne trouve cette information nulle part  estce que quelquun aurait une réponse plus précise
translated: [start] but reportable கண் இறைவன் அனிமேஷன்கள் implemented some wonder அல்லது சிறப்பாக இதே to you have கட்டுரை     

input: en parallèle du tournoi plusieurs matches de gala étaient organisés avec notamment des anciens joueurs du cosmos
translated: [start] நுழையும் be நிலைமை regulation பெறுவதற்கும் [UNK] ordinance village வீட்டு non பதிலளித்தார் not ஒலி 12000 be [UNK]    

input: il ne se sentait pas encore tout à fait au meilleur de sa forme mais il était tout de même beaucoup mieux en point
translated: [start] reportable • படிக்கும் ஒரு worth all a தான் nothing temple reportable magic place human ஐ என்பது all சமர்ப்பிக்கவும் வேண்டும் [end]

input: ces

In [67]:
transformer.save("french_tamil_transformer.keras")

In [68]:
transformer.save_weights("french_tamil_weights.weights.h5")

In [69]:
fre_vectorization_config

{'name': 'text_vectorization_4',
 'trainable': True,
 'dtype': {'module': 'keras',
  'class_name': 'DTypePolicy',
  'config': {'name': 'float32'},
  'registered_name': None},
 'max_tokens': 15000,
 'split': 'whitespace',
 'ngrams': None,
 'output_mode': 'int',
 'output_sequence_length': 20,
 'pad_to_max_tokens': False,
 'sparse': False,
 'ragged': False,
 'vocabulary': None,
 'idf_weights': None,
 'encoding': 'utf-8',
 'vocabulary_size': 15000}

In [71]:
len(fre_vectorization.get_vocabulary())

15000

In [72]:
len(tam_vectorization.get_vocabulary())

15000

In [74]:
fre_vectorization(["or je ne trouve cette"])


<tf.Tensor: shape=(1, 20), dtype=int64, numpy=
array([[486,  16,  12, 321,  62,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   0,   0]], dtype=int64)>