

# Pre-process


In [None]:

import spacy
import string
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

# Load every spaCy pipeline exactly once; keys are the language codes used in the data
_SPACY_PACKAGES = (
    ("de", "de_core_news_sm"),
    ("en", "en_core_web_sm"),
    ("es", "es_core_news_sm"),
    ("fr", "fr_core_news_sm"),
    ("it", "it_core_news_sm"),
    ("ja", "ja_core_news_sm"),
    ("nl", "nl_core_news_sm"),
    ("pl", "pl_core_news_sm"),
    ("pt", "pt_core_news_sm"),
    ("ru", "ru_core_news_sm"),
    ("zh", "zh_core_web_sm"),
)
nlp_dict = {code: spacy.load(package) for code, package in _SPACY_PACKAGES}

# Custom English stopword list: spaCy's defaults minus the words in `not_stop`,
# which are therefore kept in the cleaned text
default_en_stops = nlp_dict["en"].Defaults.stop_words
not_stop = {"take", 'off', 'nor', 'no', 'through', 'elsewhere', 'anyway', 'until', 'without', 'noone', 'otherwise', 'not', 'none', 'else', 'nobody', 'anyhow', 'less', 'whatever', 'never', 'few', 'rather', 'however', 'nowhere'}

my_stop = set(default_en_stops).difference(not_stop)

# Stopwords per language: spaCy defaults everywhere, the custom set for English
stop_words = {code: pipeline.Defaults.stop_words for code, pipeline in nlp_dict.items()}
stop_words["en"] = my_stop

punctuations = string.punctuation

def spacy_batch_tokenizer(texts, lang):
    """Lemmatise, lower-case and filter a batch of texts with the spaCy pipeline for ``lang``.

    Args:
        texts: List of raw strings, all in language ``lang``.
        lang: Language code; key into ``nlp_dict`` and ``stop_words``.

    Returns:
        For ``lang == "zh"`` the input ``texts`` object itself, unprocessed.
        Otherwise a new list holding, for each input text in order, the kept
        lemmas joined by single spaces.
    """
    if lang == "zh":
        return texts  # no processing
    tokenizer = nlp_dict[lang]
    lang_stop_words = stop_words[lang]  # looked up once instead of once per token
    result = []
    # set n_process>1 to enable multiprocessing
    for doc in tokenizer.pipe(texts, batch_size=1024, n_process=1):
        kept = []
        for token in doc:
            raw_lemma = token.lemma_
            lemma = raw_lemma.lower().strip()  # normalised once, reused by every check
            if lemma in lang_stop_words or lemma in punctuations:
                continue
            # Keep alphabetic lemmas only; inner hyphens and apostrophes are tolerated
            if raw_lemma.replace("-", "").replace("'", "").isalpha():
                kept.append(lemma)
        result.append(" ".join(kept))
    return result

# --- Apply on the DataFrame in batches ---
df = pd.read_csv("rs2.csv")
# NOTE(review): `batch_size` is not referenced by any code in this cell; the spaCy
# batch size is hard-coded inside spacy_batch_tokenizer.
batch_size = 1000  # adjust this based on available RAM

def process_column_in_batches(df, text_col, lang_col, out_col):
    """Clean ``text_col`` one language at a time and write the result to ``df[out_col]``.

    The frame is modified in place; nothing is returned. Rows are grouped by the
    value in ``lang_col`` so each group goes through the matching spaCy pipeline.
    """
    cleaned_parts = []
    languages = df[lang_col].unique()
    for language in tqdm(languages, desc=f"Processing {out_col} by language"):
        rows = df.loc[df[lang_col] == language]
        cleaned = spacy_batch_tokenizer(rows[text_col].tolist(), language)
        cleaned_parts.append(pd.Series(cleaned, index=rows.index))
    combined = pd.concat(cleaned_parts)
    df[out_col] = combined.sort_index()  # maintain order

# Process both columns (each call adds its *_clean column to `df` in place)
process_column_in_batches(df, "sentence1", "lang1", "sentence1_clean")
process_column_in_batches(df, "sentence2", "lang2", "sentence2_clean")

# Save result; this file is the input of the back-translation step
df.to_csv("rs2_pre_processed.csv", index=False)



# Paraphrase


In [None]:
from transformers import MarianMTModel, MarianTokenizer
import pandas as pd
from tqdm import tqdm
import torch

# Function to load model/tokenizer for a language pair (cached)
model_cache = {}


def get_model_and_tokenizer(src_lang, tgt_lang):
    """Return the ``(tokenizer, model)`` pair for one translation direction, cached.

    The direct ``Helsinki-NLP/opus-mt-{src}-{tgt}`` checkpoint is tried first. If it
    cannot be loaded, a multilingual checkpoint is used when one side is English.

    Args:
        src_lang: Source language code as used in the checkpoint name.
        tgt_lang: Target language code as used in the checkpoint name.

    Returns:
        Tuple ``(tokenizer, model)``; also stored in ``model_cache``.

    Raises:
        ValueError: If the direct checkpoint is unavailable and neither language
            is English, so no multilingual fallback applies.
    """
    cache_key = f"{src_lang}-{tgt_lang}"
    if cache_key in model_cache:
        return model_cache[cache_key]

    model_name = f'Helsinki-NLP/opus-mt-{src_lang}-{tgt_lang}'

    try:
        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)
    except OSError:
        # from_pretrained reports a missing or unreachable checkpoint as OSError.
        # Catching only that keeps Ctrl-C and genuine programming errors visible.
        print(f"Direct model {model_name} not found — falling back to multilingual model.")
        if tgt_lang == 'en':
            model_name = 'Helsinki-NLP/opus-mt-mul-en'
        elif src_lang == 'en':
            # NOTE(review): multi-target Marian checkpoints normally expect a
            # ">>lang<<" prefix on each input; callers here do not add one — verify.
            model_name = 'Helsinki-NLP/opus-mt-en-mul'
        else:
            raise ValueError(f"No available translation model for {src_lang} to {tgt_lang}")

        tokenizer = MarianTokenizer.from_pretrained(model_name)
        model = MarianMTModel.from_pretrained(model_name)

    model_cache[cache_key] = (tokenizer, model)
    return tokenizer, model


# Batched translation function
def translate_batch(texts, src_lang, tgt_lang, batch_size=16):
    """Translate ``texts`` from ``src_lang`` to ``tgt_lang`` in slices of ``batch_size``.

    Prints the slice bounds and the slice contents for every batch, then returns
    the decoded translations as one flat list in input order.
    """
    tokenizer, model = get_model_and_tokenizer(src_lang, tgt_lang)
    outputs = []

    for start in range(0, len(texts), batch_size):
        stop = start + batch_size
        print(start, stop)
        chunk = texts[start:stop]
        print(chunk)
        encoded = tokenizer(chunk, return_tensors="pt", padding=True, truncation=True)
        # Inference only: no gradients are needed for generation
        with torch.no_grad():
            generated = model.generate(**encoded)
        outputs += tokenizer.batch_decode(generated, skip_special_tokens=True)

    return outputs


# Function for batched back-translation
def back_translate_batch(sentences, src_langs, batch_size=1024):
    """Paraphrase sentences by translating them to a pivot language and back.

    Sentences are grouped by source language. Within a group each distinct
    sentence is translated once and the result is shared by all its duplicates.

    Args:
        sentences: Sequence of sentences, index-aligned with ``src_langs``.
        src_langs: Sequence holding the language code of each sentence.
        batch_size: Batch size forwarded to ``translate_batch``.

    Returns:
        List of back-translated sentences in the original input order.
    """
    # The pivot is English unless the source language has an override here.
    pivot_overrides = {"en": "de", "pl": "de", "pt": "tl"}
    src_langs = list(src_langs)
    back_translated = [None] * len(src_langs)

    unique_src_langs = set(src_langs)
    for src_lang in tqdm(unique_src_langs, desc="Processing  by language"):
        print(f"Language :{src_lang}")
        intermediate_lang = pivot_overrides.get(src_lang, "en")

        indices = [i for i, lang in enumerate(src_langs) if lang == src_lang]
        # dict.fromkeys de-duplicates while keeping first-seen order
        unique_src_sentences = list(dict.fromkeys(sentences[i] for i in indices))
        print(f"Unique sentences :{len(unique_src_sentences)}")
        # Translate to intermediate
        unique_intermediate_sentences = translate_batch(unique_src_sentences, src_lang, intermediate_lang, batch_size)
        # Translate back to source
        unique_back_sentences = translate_batch(unique_intermediate_sentences, intermediate_lang, src_lang, batch_size)

        # One dictionary lookup per sentence. The earlier in-place search-and-replace
        # rescanned the whole group for every unique sentence, and could replace a
        # finished paraphrase again when it equalled another source sentence.
        paraphrase_of = dict(zip(unique_src_sentences, unique_back_sentences))
        for i in indices:
            back_translated[i] = paraphrase_of[sentences[i]]

    return back_translated


# Language code map: dataset language code -> code used when building the
# checkpoint name; only "ja" differs ("jap")
lang_code_map = {
    "en": "en", "de": "de", "es": "es", "fr": "fr", "it": "it",
    "pt": "pt", "nl": "nl", "pl": "pl", "ru": "ru", "ja": "jap", "zh": "zh"
}

# Load preprocessed data
df = pd.read_csv("rs2_pre_processed.csv")
# Drops every row that has a missing value in any column
df.dropna(inplace=True)
# Common pivot
# NOTE(review): `pivot_lang` is not referenced by the code in this cell;
# back_translate_batch chooses its own intermediate language.
pivot_lang = "en"

# Batched back-translation for sentence1_clean
# NOTE(review): tqdm.pandas only registers the pandas progress helpers; no
# progress_apply call is visible in this cell.
tqdm.pandas(desc="Preparing sentence1 back-translation")
sentences1 = df["sentence1_clean"].tolist()
langs1 = [lang_code_map[lang] for lang in df["lang1"]]

# batch_size=1 means each unique sentence is translated in its own model call
sentence1_bt = back_translate_batch(sentences1, langs1, batch_size=1)
df["sentence1_bt"] = sentence1_bt

# Batched back-translation for sentence2_clean
tqdm.pandas(desc="Preparing sentence2 back-translation")
sentences2 = df["sentence2_clean"].tolist()
langs2 = [lang_code_map[lang] for lang in df["lang2"]]

sentence2_bt = back_translate_batch(sentences2, langs2, batch_size=1)
df["sentence2_bt"] = sentence2_bt

# Save results
df.to_csv("rs2_backtranslated.csv", index=False)



# Augmentation



In [None]:
import pandas as pd
from google.colab import drive
drive.mount('/content/drive/')
# Load data
# NOTE(review): this read path is relative while the write path below is absolute;
# presumably the working directory is /content — verify.
df = pd.read_csv('drive/MyDrive/MACHINE PROJECT/rs2_backtranslated.csv')
df_1 = df.copy()  # original cleaned sentences
df_4 = df.copy()  # will carry the back-translated sentences

# In df_4 the back-translations take the place of the cleaned sentences
df_4["sentence1_clean"] = df_4["sentence1_bt"]
df_4["sentence2_clean"] = df_4["sentence2_bt"]
# Drop a random 50% of the original rows (the other half is kept)
df_1 = df_1.drop(df_1.sample(frac=0.5, random_state=42).index).reset_index(drop=True)
# Drop a random 99% of the back-translated rows (about 1% is kept)
df_4 = df_4.drop(df_4.sample(frac=0.99, random_state=42).index).reset_index(drop=True)
print(df_1.shape)
print(df_4.shape)
# Stack the kept originals on top of the kept paraphrases
df_final=pd.concat([df_1, df_4], axis=0, ignore_index=True)

# The *_bt helper columns are no longer needed in the saved file
df_final =df_final.drop(columns=["sentence1_bt","sentence2_bt"])
df_final.to_csv("/content/drive/MyDrive/MACHINE PROJECT/rs2_augmented.csv", index=False)



# Embedding



In [None]:
import os

#os.chdir("MACHINE PROJECT")
!ls

In [None]:
!pip install sentence_transformers
!pip install keras
!pip install tf_keras
!pip install tensorflow

In [None]:
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from keras.api.preprocessing.sequence import pad_sequences
from google.colab import drive
drive.mount('/content/drive/')
# Load data
df = pd.read_csv('drive/MyDrive/MACHINE PROJECT/rs2_augmented.csv')

# Load multilingual transformer model
sbert_model = SentenceTransformer('distiluse-base-multilingual-cased-v1')


# Token-level embeddings
def get_token_embeddings(sentences, max_len=30):
    """Encode sentences to token-level embeddings padded/truncated to ``max_len`` tokens.

    Encoding uses the module-level ``sbert_model``. Padding and truncation are
    both applied at the end of each sequence and the result dtype is float32.
    """
    per_sentence = sbert_model.encode(
        sentences,
        output_value='token_embeddings',
        convert_to_numpy=True,
        show_progress_bar=True,
    )

    # Bring every sentence to the same token count so they stack into one array
    return pad_sequences(
        per_sentence,
        maxlen=max_len,
        padding='post',
        truncating='post',
        dtype='float32',
    )


max_len = 30  # max number of tokens per sentence

# Encode each distinct sentence once, whichever column it appears in
sentence_list = [*df['sentence1_clean'].tolist(), *df['sentence2_clean'].tolist()]
unique_sentence_list = list(set(sentence_list))

unique_sentence_token_embeddings = get_token_embeddings(unique_sentence_list, max_len=max_len)
# Sentence text -> padded token-embedding array, for fast lookup
sentence_to_embedding = dict(zip(unique_sentence_list, unique_sentence_token_embeddings))

In [None]:
import pickle

with open("/content/drive/MyDrive/MACHINE PROJECT/sentence_to_embedding.pkl", "wb") as f:
    pickle.dump(sentence_to_embedding, f)

# Models 

## ANN with Transformer - RAM Optimized

In [None]:
import pandas as pd
import numpy as np
# from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Lambda, Dense, Dropout, concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

import matplotlib.pyplot as plt
import ast
import pickle
import gc



def load_embeddings(file_path):
    """Load a pickled ``{key: embedding}`` mapping and return it with float16 array values."""
    with open(file_path, "rb") as handle:
        raw = pickle.load(handle)
    # Reduced precision (float16) to save memory
    converted = {}
    for key, value in raw.items():
        converted[key] = np.array(value, dtype=np.float16)
    return converted

def generate_dataset(df_path, embedding_map):
    """Build model-ready arrays from a sentence-pair CSV and an embedding lookup.

    Args:
        df_path: Path of a CSV with ``sentence1_clean``, ``sentence2_clean`` and
            ``score`` columns (other columns are ignored).
        embedding_map: Dict mapping sentence text to its embedding array.

    Returns:
        Tuple ``(X1, X2, y)`` of float16 arrays: the stacked embeddings of the
        first and second sentence of every row, and the scores.

    Raises:
        ValueError: If a sentence has no entry in ``embedding_map``.
    """
    df = pd.read_csv(df_path)

    # Only keep required columns and drop the rest
    df = df[['sentence1_clean', 'sentence2_clean', 'score']]

    def _stack_embeddings(column):
        # Direct lookups avoid storing arrays in DataFrame cells and let missing
        # sentences be reported explicitly instead of turning into NaN.
        vectors = [embedding_map.get(sentence) for sentence in df[column]]
        missing = sum(vector is None for vector in vectors)
        if missing:
            raise ValueError(
                f"{missing} sentence(s) in column '{column}' have no entry in embedding_map"
            )
        return np.stack(vectors).astype(np.float16)

    # Convert to float16 arrays (RAM-efficient)
    X1 = _stack_embeddings('sentence1_clean')
    X2 = _stack_embeddings('sentence2_clean')
    y = df['score'].astype(np.float16).values

    return X1, X2, y

from tensorflow.keras.layers import GlobalAveragePooling1D

def build_model(input_shape):
    """Build and compile a two-input regression model over pooled token embeddings.

    Args:
        input_shape: Shape of one input sample, passed to both ``Input`` layers.

    Returns:
        A compiled Keras ``Model`` (MSE loss, Adam optimizer, MAE metric) with
        two inputs and a single linear output.
    """
    input1 = Input(shape=input_shape)
    input2 = Input(shape=input_shape)

    # Replace RNN with global average pooling followed by dense
    pooled1 = GlobalAveragePooling1D()(input1)
    pooled2 = GlobalAveragePooling1D()(input2)

    # Two separate Dense layers: the branches do not share these weights
    dense1 = Dense(128, activation='relu')(pooled1)
    dense2 = Dense(128, activation='relu')(pooled2)

    # Interaction features: element-wise absolute difference and product
    abs_diff = Lambda(lambda x: tf.abs(x[0] - x[1]))([dense1, dense2])
    mult = Lambda(lambda x: x[0] * x[1])([dense1, dense2])

    merged = concatenate([dense1, dense2, abs_diff, mult])
    dense = Dense(128, activation='relu')(merged)
    drop = Dropout(0.3)(dense)
    # Linear output: the model regresses the score directly
    output = Dense(1, activation='linear')(drop)

    model = Model(inputs=[input1, input2], outputs=output)
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])

    return model


from google.colab import drive
drive.mount('/content/drive/')

max_len = 30
sentence_to_embedding = load_embeddings("drive/MyDrive/MACHINE PROJECT/sentence_to_embedding.pkl")

# Infer the embedding width from any stored embedding (last axis if 2-D)
sample_embedding = next(iter(sentence_to_embedding.values()))
embedding_dim = sample_embedding.shape[1] if sample_embedding.ndim == 2 else sample_embedding.shape[0]
X1, X2, y = generate_dataset("drive/MyDrive/MACHINE PROJECT/rs2_augmented.csv", sentence_to_embedding)

# Clean up large variables no longer needed
del sentence_to_embedding
gc.collect()

# Train-validation split (80/20, fixed seed)
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    X1, X2, y, test_size=0.2, random_state=42
)

# The full arrays are no longer needed once the split copies exist
del X1, X2, y
gc.collect()

model = build_model((max_len, embedding_dim))

history = model.fit(
    [X1_train, X2_train], y_train,
    validation_data=([X1_val, X2_val], y_val),
    epochs=10, batch_size=16  # Reduce batch size to save RAM
)

# Flatten the (n, 1) predictions to 1-D for the metric functions
y_pred = model.predict([X1_val, X2_val], batch_size=16).flatten()



In [None]:
from scipy.stats import pearsonr, spearmanr

In [None]:
# Linear and rank correlation between validation targets and predictions
pearson_corr, _ = pearsonr(y_val, y_pred)
spearman_corr, _ = spearmanr(y_val, y_pred)

print(f"📈 Pearson Correlation:  {pearson_corr:.4f}")
print(f"📊 Spearman Correlation: {spearman_corr:.4f}")

# Plotting: one panel per tracked quantity, train and validation curves together
curve_panels = (
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss over Epochs', 'MSE Loss'),
    ('mae', 'val_mae', 'Train MAE', 'Val MAE', 'MAE over Epochs', 'Mean Absolute Error'),
)
plt.figure(figsize=(12, 5))
for panel_no, (train_key, val_key, train_label, val_label, title, y_label) in enumerate(curve_panels, start=1):
    plt.subplot(1, 2, panel_no)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt

# Helper function
def evaluate(y_true, y_pred):
    """Return MSE, MAE, MAPE and R² of ``y_pred`` against ``y_true`` as a dict."""
    metric_fns = (
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("MAPE", mean_absolute_percentage_error),
        ("R²", r2_score),
    )
    return {label: fn(y_true, y_pred) for label, fn in metric_fns}


print(evaluate(y_val, y_pred))

## ANN without Transformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
df = pd.read_csv("drive/MyDrive/MACHINE PROJECT/rs2_augmented.csv")

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize: fit one vocabulary on both sentence columns; unknown words map to "<OOV>"
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["sentence1_clean"].tolist() + df["sentence2_clean"].tolist())

# Convert to sequences of integer word indices
s1_seq = tokenizer.texts_to_sequences(df["sentence1_clean"])
s2_seq = tokenizer.texts_to_sequences(df["sentence2_clean"])

# Pad sequences to a fixed length; zeros are appended at the end ('post').
# No `truncating` argument is given, so the library default applies to longer inputs.
max_len = 50
X1 = pad_sequences(s1_seq, maxlen=max_len, padding='post')
X2 = pad_sequences(s2_seq, maxlen=max_len, padding='post')
print(X1.shape)
y = df["score"].values


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Lambda, Dense, Dropout, concatenate
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

# Train-validation split (80/20, fixed seed)
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    X1, X2, y, test_size=0.2, random_state=42
)

# Define parameters
# NOTE(review): word_index holds every fitted word, not just the `num_words` most
# frequent ones, so the embedding table may be larger than the indices in use.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

# Inputs
input1 = Input(shape=(max_len,))
input2 = Input(shape=(max_len,))

# Shared embedding for both sentences. The sequence length is already fixed by the
# Input layers, so the deprecated `input_length` argument is not passed.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)

embedded1 = embedding_layer(input1)
embedded2 = embedding_layer(input2)
from tensorflow.keras.layers import GlobalAveragePooling1D
# Encoder: global average pooling followed by a dense layer (no RNN in this variant)
# NOTE(review): padded positions are averaged in as well, because no mask is set.
pooled1 = GlobalAveragePooling1D()(embedded1)
pooled2 = GlobalAveragePooling1D()(embedded2)

# Two separate Dense layers: the branches do not share these weights
dense1 = Dense(128, activation='relu')(pooled1)
dense2 = Dense(128, activation='relu')(pooled2)

# Interaction features: element-wise absolute difference and product
abs_diff = Lambda(lambda x: tf.abs(x[0] - x[1]))([dense1, dense2])
mult = Lambda(lambda x: x[0] * x[1])([dense1, dense2])

merged = concatenate([dense1, dense2, abs_diff, mult])
dense = Dense(128, activation='relu')(merged)
drop = Dropout(0.3)(dense)
output = Dense(1, activation='linear')(drop)

model = Model(inputs=[input1, input2], outputs=output)
model.compile(loss='mse', optimizer='adam', metrics=['mae'])



# Train
history = model.fit(
    [X1_train, X2_train], y_train,
    validation_data=([X1_val, X2_val], y_val),
    epochs=10, batch_size=32
)
# Flatten the (n, 1) predictions to 1-D for the metric functions
y_pred = model.predict([X1_val, X2_val]).flatten()

In [None]:
from scipy.stats import pearsonr, spearmanr
import matplotlib.pyplot as plt
# Pearson (linear) and Spearman (rank) correlation on the validation split
pearson_corr, _ = pearsonr(y_val, y_pred)
spearman_corr, _ = spearmanr(y_val, y_pred)

print(f"📈 Pearson Correlation:  {pearson_corr:.4f}")
print(f"📊 Spearman Correlation: {spearman_corr:.4f}")


def _draw_history_panel(slot, train_key, val_key, train_label, val_label, title, y_label):
    """Draw the train/validation curves of one metric into subplot ``slot``."""
    plt.subplot(1, 2, slot)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()


# Plot loss and MAE side by side
plt.figure(figsize=(12, 5))
_draw_history_panel(1, 'loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss over Epochs', 'MSE Loss')
_draw_history_panel(2, 'mae', 'val_mae', 'Train MAE', 'Val MAE', 'MAE over Epochs', 'Mean Absolute Error')

plt.tight_layout()
plt.show()




In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt

# Helper function
def evaluate(y_true, y_pred):
    """Score predictions against targets with four regression metrics."""
    mse = mean_squared_error(y_true, y_pred)
    mae = mean_absolute_error(y_true, y_pred)
    mape = mean_absolute_percentage_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    return {"MSE": mse, "MAE": mae, "MAPE": mape, "R²": r2}


print(evaluate(y_val, y_pred))


## ANN optimized

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
import gc
import pickle
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, Lambda, concatenate, GlobalAveragePooling1D

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive/')

# Load pickled sentence embeddings
def load_embeddings(file_path):
    """Read the pickled sentence→embedding dict and convert each value to a float16 array."""
    with open(file_path, "rb") as source:
        stored = pickle.load(source)
    return dict(
        (sentence, np.array(vectors, dtype=np.float16))
        for sentence, vectors in stored.items()
    )

# Define data generator
def data_generator(df, embedding_map):
    """Yield one ``((emb1, emb2), score)`` example per usable row of ``df``.

    Args:
        df: DataFrame with ``sentence1_clean``, ``sentence2_clean`` and ``score``.
        embedding_map: Dict mapping sentence text to its embedding array.

    Yields:
        ``((emb1, emb2), score)`` with float16 embeddings and a float16 score.
        Rows where either sentence has no embedding are skipped.
    """
    # Zipping the columns avoids building a pandas Series per row (iterrows),
    # which matters because this generator is re-run for every epoch.
    rows = zip(df['sentence1_clean'], df['sentence2_clean'], df['score'])
    for sentence1, sentence2, score in rows:
        emb1 = embedding_map.get(sentence1)
        emb2 = embedding_map.get(sentence2)
        if emb1 is None or emb2 is None:
            continue
        yield (emb1.astype(np.float16), emb2.astype(np.float16)), np.float16(score)

# Build ANN model
def build_ann_model(input_shape):
    """Build and compile a two-input regression model over average-pooled embeddings.

    Args:
        input_shape: Shape of one input sample, passed to both ``Input`` layers.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, MSE loss, MAE metric) with
        two inputs and a single linear output.
    """
    input1 = Input(shape=input_shape)
    input2 = Input(shape=input_shape)

    # Pooling has no weights, so both branches are encoded identically
    pooled1 = GlobalAveragePooling1D()(input1)
    pooled2 = GlobalAveragePooling1D()(input2)

    # Interaction features: element-wise absolute difference and product
    abs_diff = Lambda(lambda x: tf.abs(x[0] - x[1]))([pooled1, pooled2])
    mult = Lambda(lambda x: x[0] * x[1])([pooled1, pooled2])

    merged = concatenate([pooled1, pooled2, abs_diff, mult])

    dense = Dense(128, activation='relu')(merged)
    drop = Dropout(0.3)(dense)
    # Linear output: the model regresses the score directly
    output = Dense(1, activation='linear')(drop)

    model = Model(inputs=[input1, input2], outputs=output)
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])
    return model

# Load resources
# NOTE(review): these paths have no "MACHINE PROJECT" folder, unlike the other
# cells that read the same file names — confirm which location is intended.
embedding_path = "drive/MyDrive/sentence_to_embedding.pkl"
sentence_to_embedding = load_embeddings(embedding_path)

csv_path = "drive/MyDrive/rs2_augmented.csv"
df_full = pd.read_csv(csv_path)[['sentence1_clean', 'sentence2_clean', 'score']]

# Remove unknown embeddings: keep only rows where both sentences have an embedding
df_full = df_full[
    df_full['sentence1_clean'].isin(sentence_to_embedding) &
    df_full['sentence2_clean'].isin(sentence_to_embedding)
].reset_index(drop=True)

# Train-validation split (80/20, fixed seed)
df_train, df_val = train_test_split(df_full, test_size=0.2, random_state=42)

# Infer input shape
sample_embedding = next(iter(sentence_to_embedding.values()))
embedding_dim = sample_embedding.shape[1] if sample_embedding.ndim == 2 else sample_embedding.shape[0]
max_len = 30
input_shape = (max_len, embedding_dim)

# Create tf.data datasets
batch_size = 40

# Examples are produced lazily by data_generator, so the stacked arrays are never
# held in memory all at once; the signature declares two float16 inputs of
# `input_shape` and a scalar float16 target.
train_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(df_train, sentence_to_embedding),
    output_signature=(
        (tf.TensorSpec(shape=input_shape, dtype=tf.float16),
         tf.TensorSpec(shape=input_shape, dtype=tf.float16)),
        tf.TensorSpec(shape=(), dtype=tf.float16)
    )
).batch(batch_size).prefetch(tf.data.AUTOTUNE)

val_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(df_val, sentence_to_embedding),
    output_signature=(
        (tf.TensorSpec(shape=input_shape, dtype=tf.float16),
         tf.TensorSpec(shape=input_shape, dtype=tf.float16)),
        tf.TensorSpec(shape=(), dtype=tf.float16)
    )
).batch(batch_size).prefetch(tf.data.AUTOTUNE)

# Free memory (df_train / df_val are still referenced by the generators)
del df_full
gc.collect()

# Build and train model
model = build_ann_model(input_shape)
history = model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=10
)

In [None]:
# Larger batches for inference than the 40 used during training
BATCH_SIZE = 3000

# Recreate val_dataset without initial batching to avoid nested batches
val_dataset = tf.data.Dataset.from_generator(
    lambda: data_generator(df_val, sentence_to_embedding),
    output_signature=(
        (tf.TensorSpec(shape=input_shape, dtype=tf.float16),
         tf.TensorSpec(shape=input_shape, dtype=tf.float16)),
        tf.TensorSpec(shape=(), dtype=tf.float16)
    )
).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)  # Apply desired batch size

# Predictions follow the row order of df_val, because the generator walks it in order
y_pred = model.predict(val_dataset).flatten()



In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import pearsonr, spearmanr

# Targets in df_val row order, matching the order of the generated predictions
y_val = df_val['score'].values
pearson_corr, _ = pearsonr(y_val, y_pred)
spearman_corr, _ = spearmanr(y_val, y_pred)

print(f"📈 Pearson Correlation:  {pearson_corr:.4f}")
print(f"📊 Spearman Correlation: {spearman_corr:.4f}")

# Plotting: (history key, legend stem, title, y-axis label) per panel
panel_specs = [
    ('loss', 'Loss', 'Loss over Epochs', 'MSE Loss'),
    ('mae', 'MAE', 'MAE over Epochs', 'Mean Absolute Error'),
]
plt.figure(figsize=(12, 5))
for column, (key, stem, title, y_label) in enumerate(panel_specs, start=1):
    plt.subplot(1, 2, column)
    plt.plot(history.history[key], label='Train ' + stem)
    plt.plot(history.history['val_' + key], label='Val ' + stem)
    plt.title(title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt

# Helper function
def evaluate(y_true, y_pred):
    """Collect MSE, MAE, MAPE and R² for the given targets and predictions."""
    report = {}
    report["MSE"] = mean_squared_error(y_true, y_pred)
    report["MAE"] = mean_absolute_error(y_true, y_pred)
    report["MAPE"] = mean_absolute_percentage_error(y_true, y_pred)
    report["R²"] = r2_score(y_true, y_pred)
    return report


print(evaluate(y_val, y_pred))

## RNN with transformer

In [None]:
import pandas as pd
import numpy as np
# from sentence_transformers import SentenceTransformer
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Lambda, Dense, Dropout, concatenate
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf

import matplotlib.pyplot as plt
import ast
import pickle
import gc



def load_embeddings(file_path):
    """Unpickle the embedding dictionary at ``file_path`` and cast each value to float16."""
    with open(file_path, "rb") as pickle_file:
        embeddings = pickle.load(pickle_file)

    def _to_half(values):
        # Reduced precision (float16) to save memory
        return np.array(values, dtype=np.float16)

    return {sentence: _to_half(values) for sentence, values in embeddings.items()}

def generate_dataset(df_path, embedding_map):
    """Turn a sentence-pair CSV into stacked embedding arrays and a score vector.

    Args:
        df_path: Path of a CSV with ``sentence1_clean``, ``sentence2_clean`` and
            ``score`` columns (other columns are ignored).
        embedding_map: Dict mapping sentence text to its embedding array.

    Returns:
        Tuple ``(X1, X2, y)`` of float16 arrays: the stacked embeddings of the
        first and second sentence of every row, and the scores.

    Raises:
        ValueError: If a sentence has no entry in ``embedding_map``.
    """
    df = pd.read_csv(df_path)

    # Only keep required columns and drop the rest
    df = df[['sentence1_clean', 'sentence2_clean', 'score']]

    def _embeddings_for(column):
        # Look sentences up directly rather than storing arrays in DataFrame
        # cells; a missing sentence is reported instead of becoming NaN.
        found = [embedding_map.get(text) for text in df[column]]
        n_missing = sum(item is None for item in found)
        if n_missing:
            raise ValueError(
                f"{n_missing} sentence(s) in column '{column}' have no entry in embedding_map"
            )
        return np.stack(found).astype(np.float16)

    # Convert to float16 arrays (RAM-efficient)
    X1 = _embeddings_for('sentence1_clean')
    X2 = _embeddings_for('sentence2_clean')
    y = df['score'].astype(np.float16).values

    return X1, X2, y

def build_model(input_shape):
    """Build and compile a siamese BiLSTM regression model over token embeddings.

    Args:
        input_shape: Shape of one input sample, passed to both ``Input`` layers.

    Returns:
        A compiled Keras ``Model`` (MSE loss, Adam optimizer, MAE metric) with
        two inputs and a single linear output.
    """
    input1 = Input(shape=input_shape)
    input2 = Input(shape=input_shape)

    # One layer instance applied to both inputs, so the encoder weights are shared
    shared_rnn = Bidirectional(LSTM(64, return_sequences=False))

    encoded1 = shared_rnn(input1)
    encoded2 = shared_rnn(input2)

    # Interaction features: element-wise absolute difference and product
    abs_diff = Lambda(lambda x: tf.abs(x[0] - x[1]))([encoded1, encoded2])
    mult = Lambda(lambda x: x[0] * x[1])([encoded1, encoded2])

    merged = concatenate([encoded1, encoded2, abs_diff, mult])
    dense = Dense(128, activation='relu')(merged)
    drop = Dropout(0.3)(dense)
    # Linear output: the model regresses the score directly
    output = Dense(1, activation='linear')(drop)

    model = Model(inputs=[input1, input2], outputs=output)
    model.compile(loss='mse', optimizer='adam', metrics=['mae'])

    return model

from google.colab import drive
drive.mount('/content/drive/')

max_len = 30
sentence_to_embedding = load_embeddings("drive/MyDrive/MACHINE PROJECT/sentence_to_embedding.pkl")

# Infer the embedding width from any stored embedding (last axis if 2-D)
sample_embedding = next(iter(sentence_to_embedding.values()))
embedding_dim = sample_embedding.shape[1] if sample_embedding.ndim == 2 else sample_embedding.shape[0]
X1, X2, y = generate_dataset("drive/MyDrive/MACHINE PROJECT/rs2_augmented.csv", sentence_to_embedding)

# Clean up large variables no longer needed
del sentence_to_embedding
gc.collect()

# Train-validation split (80/20, fixed seed)
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    X1, X2, y, test_size=0.2, random_state=42
)

# The full arrays are no longer needed once the split copies exist
del X1, X2, y
gc.collect()

model = build_model((max_len, embedding_dim))

history = model.fit(
    [X1_train, X2_train], y_train,
    validation_data=([X1_val, X2_val], y_val),
    epochs=10, batch_size=16  # Reduce batch size to save RAM
)

# Flatten the (n, 1) predictions to 1-D for the metric functions
y_pred = model.predict([X1_val, X2_val], batch_size=16).flatten()



In [None]:
from scipy.stats import pearsonr, spearmanr

In [None]:
# Correlation of predictions with the validation targets
pearson_corr, _ = pearsonr(y_val, y_pred)
spearman_corr, _ = spearmanr(y_val, y_pred)

print(f"📈 Pearson Correlation:  {pearson_corr:.4f}")
print(f"📊 Spearman Correlation: {spearman_corr:.4f}")

# Plotting: training history, loss on the left and MAE on the right
training_log = history.history
plt.figure(figsize=(12, 5))

plt.subplot(1, 2, 1)
for series_key, series_label in (('loss', 'Train Loss'), ('val_loss', 'Val Loss')):
    plt.plot(training_log[series_key], label=series_label)
plt.title('Loss over Epochs')
plt.xlabel('Epoch')
plt.ylabel('MSE Loss')
plt.legend()

plt.subplot(1, 2, 2)
for series_key, series_label in (('mae', 'Train MAE'), ('val_mae', 'Val MAE')):
    plt.plot(training_log[series_key], label=series_label)
plt.title('MAE over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Mean Absolute Error')
plt.legend()

plt.tight_layout()
plt.show()

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt

# Helper function
def evaluate(y_true, y_pred):
    """Return a dict of regression metrics (MSE, MAE, MAPE, R²) for the predictions."""
    labels = ("MSE", "MAE", "MAPE", "R²")
    scorers = (
        mean_squared_error,
        mean_absolute_error,
        mean_absolute_percentage_error,
        r2_score,
    )
    return {label: scorer(y_true, y_pred) for label, scorer in zip(labels, scorers)}


print(evaluate(y_val, y_pred))

## RNN without transformers

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
df = pd.read_csv("drive/MyDrive/MACHINE PROJECT/rs2_augmented.csv")

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize: fit one vocabulary on both sentence columns; unknown words map to "<OOV>"
tokenizer = Tokenizer(num_words=20000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["sentence1_clean"].tolist() + df["sentence2_clean"].tolist())

# Convert to sequences of integer word indices
s1_seq = tokenizer.texts_to_sequences(df["sentence1_clean"])
s2_seq = tokenizer.texts_to_sequences(df["sentence2_clean"])

# Pad sequences to a fixed length; zeros are appended at the end ('post').
# No `truncating` argument is given, so the library default applies to longer inputs.
max_len = 50
X1 = pad_sequences(s1_seq, maxlen=max_len, padding='post')
X2 = pad_sequences(s2_seq, maxlen=max_len, padding='post')
y = df["score"].values


In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Bidirectional, Lambda, Dense, Dropout, concatenate
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np

# Train-validation split (80/20, fixed seed)
X1_train, X1_val, X2_train, X2_val, y_train, y_val = train_test_split(
    X1, X2, y, test_size=0.2, random_state=42
)

# Define parameters
# NOTE(review): word_index holds every fitted word, not just the `num_words` most
# frequent ones, so the embedding table may be larger than the indices in use.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 100

# Inputs
input1 = Input(shape=(max_len,))
input2 = Input(shape=(max_len,))

# Shared embedding + RNN encoder. The sequence length is already fixed by the
# Input layers, so the deprecated `input_length` argument is not passed.
# NOTE(review): no mask is set, so the LSTM also reads the trailing zero padding
# of every 'post'-padded sequence; consider mask_zero=True after checking results.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim)

embedded1 = embedding_layer(input1)
embedded2 = embedding_layer(input2)

# One BiLSTM instance applied to both sentences, so the encoder weights are shared
shared_lstm = Bidirectional(LSTM(64, return_sequences=False))

encoded1 = shared_lstm(embedded1)
encoded2 = shared_lstm(embedded2)

# Feature interaction: element-wise absolute difference and product
abs_diff = Lambda(lambda x: tf.abs(x[0] - x[1]))([encoded1, encoded2])
mult = Lambda(lambda x: x[0] * x[1])([encoded1, encoded2])
merged = concatenate([encoded1, encoded2, abs_diff, mult])

# Dense layers
dense = Dense(128, activation='relu')(merged)
drop = Dropout(0.3)(dense)
output = Dense(1, activation='linear')(drop)

model = Model(inputs=[input1, input2], outputs=output)
model.compile(loss='mse', optimizer='adam', metrics=['mae'])

# Train
history = model.fit(
    [X1_train, X2_train], y_train,
    validation_data=([X1_val, X2_val], y_val),
    epochs=10, batch_size=32
)
# Flatten the (n, 1) predictions to 1-D for the metric functions
y_pred = model.predict([X1_val, X2_val]).flatten()

In [None]:
from scipy.stats import pearsonr, spearmanr
# Fixed module name: "matplotib" does not exist and made this cell fail on import
import matplotlib.pyplot as plt
# Pearson Correlation (linear agreement between targets and predictions)
pearson_corr, _ = pearsonr(y_val, y_pred)

# Spearman Correlation (rank agreement)
spearman_corr, _ = spearmanr(y_val, y_pred)

print(f"📈 Pearson Correlation:  {pearson_corr:.4f}")
print(f"📊 Spearman Correlation: {spearman_corr:.4f}")


# Plot loss and MAE: one subplot each, train and validation curves overlaid
history_panels = {
    1: ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss over Epochs', 'MSE Loss'),
    2: ('mae', 'val_mae', 'Train MAE', 'Val MAE', 'MAE over Epochs', 'Mean Absolute Error'),
}
plt.figure(figsize=(12, 5))

for slot, spec in history_panels.items():
    train_key, val_key, train_label, val_label, panel_title, y_axis_label = spec
    plt.subplot(1, 2, slot)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_axis_label)
    plt.legend()

plt.tight_layout()
plt.show()




In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, mean_absolute_percentage_error, r2_score
import pandas as pd
import matplotlib.pyplot as plt

# Helper function
def evaluate(y_true, y_pred):
    """Compute MSE, MAE, MAPE and R² between targets and predictions."""
    results = {}
    for metric_name, metric_fn in [
        ("MSE", mean_squared_error),
        ("MAE", mean_absolute_error),
        ("MAPE", mean_absolute_percentage_error),
        ("R²", r2_score),
    ]:
        results[metric_name] = metric_fn(y_true, y_pred)
    return results


print(evaluate(y_val, y_pred))


## Transformer

In [None]:
pip install "numpy<2.0.0"

In [None]:
# import statements
import logging
import time
import numpy as np
import pandas as pd
import tensorflow_text
import tensorflow as tf
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import random
logging.getLogger('tensorflow').setLevel(logging.ERROR) # suppress warnings

Load processed dataframe

In [None]:
# read CSV file
df = pd.read_csv('/stopword_removal_dataframe.csv')

# display the first few rows of the dataframe
display(df)

In [None]:
# Report the range of the raw 'score' column.
min_score, max_score = df['score'].min(), df['score'].max()
print(f"Minimum score: {min_score}")
print(f"Maximum score: {max_score}")

In [None]:
# Rescale the scores by dividing by 5.0, then print the resulting range
# to verify that the values really fall between 0 and 1.
df['normalized_score'] = df['score'] / 5.0
min_normalized_score, max_normalized_score = (
    df['normalized_score'].min(),
    df['normalized_score'].max(),
)
print(f"Minimum score: {min_normalized_score}")
print(f"Maximum score: {max_normalized_score}")

In [None]:
# # Convert the filtered dataframe to two lists
# input_texts = df['processed_language1'].tolist()
# target_texts = df['processed_language2'].tolist()

# print(type(input_texts), type(target_texts))         # should both be <class 'list'>
# print(type(input_texts[0]), type(target_texts[0]))   # should both be <class 'str'>

# # Ensure all elements in input_texts and target_texts are strings
# input_texts = [str(text) for text in input_texts]
# target_texts = [str(text) for text in target_texts]

# # Create the TensorFlow dataset
# dataset = tf.data.Dataset.from_tensor_slices((input_texts, target_texts))

# # Define split ratio
# split_ratio = 0.8
# total_size = len(input_texts)
# train_size = int(total_size * split_ratio)

# # Create train and validation datasets
# train_examples = dataset.take(train_size)
# val_examples = dataset.skip(train_size)

# train_examples = train_examples.repeat()

# Turn every dataframe row into a {"sentence1", "sentence2", "score"} dict,
# shuffle the examples, and split them 80/20 into train and validation lists.
examples = [
    {"sentence1": first, "sentence2": second, "score": target}
    for first, second, target in zip(
        df['processed_language1'],
        df['processed_language2'],
        df['normalized_score'],
    )
]

random.shuffle(examples)

split_ratio = 0.8
split_index = int(len(examples) * split_ratio)

train_examples, val_examples = examples[:split_index], examples[split_index:]



In [None]:
# Sanity check: show the first example of each split to confirm the dict layout.
for split in (train_examples, val_examples):
    print(split[0])

Build tokenizer

In [None]:
# # Build tokenizers from my text
# input_tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
#     (text for text in input_texts), target_vocab_size=2**13)

# target_tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
#     (text for text in target_texts), target_vocab_size=2**13)

# Build one shared subword vocabulary from both sentence columns.
all_sentences = pd.concat(
    [df[column] for column in ('processed_language1', 'processed_language2')]
).astype(str)

tokenizer = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(
    (sentence for sentence in all_sentences),
    target_vocab_size=2**13,
)


Helper function

In [None]:
# # Function to tokenize input-target pairs
# # So it gets the input string and the target string and returns tokenized version
# def encode_pair(input_str, target_str):
#     input_tokens = input_tokenizer.encode(input_str.numpy().decode('utf-8'))
#     target_tokens = target_tokenizer.encode(target_str.numpy().decode('utf-8'))
#     return input_tokens, target_tokens

def encode_pair(sentence1, sentence2, score):
    """
    Encode both sentences of a pair with the shared subword tokenizer.

    `sentence1` and `sentence2` are expected to be eager string tensors
    (they are read through `.numpy().decode('utf-8')`); `score` is passed
    through unchanged.

    Returns:
    - (token ids of sentence1, token ids of sentence2, score)
    """
    def _to_ids(text_tensor):
        return tokenizer.encode(text_tensor.numpy().decode('utf-8'))

    return _to_ids(sentence1), _to_ids(sentence2), score

# def tf_encode(input_str, target_str):
#     input_tokens, target_tokens = tf.py_function(encode_pair, [input_str, target_str], [tf.int64, tf.int64])
#     return input_tokens, target_tokens

# tf.py_function lets plain Python code (such as .numpy() and .decode())
# run inside a tf.data pipeline.
def tf_encode(sentence1, sentence2, score):
    """
    Graph-friendly wrapper around `encode_pair`.

    Returns the two token-id tensors (int64, rank 1, unknown length) and the
    score (float32 scalar). The static shapes are set explicitly because
    tf.py_function does not provide them.
    """
    ids1, ids2, label = tf.py_function(
        func=encode_pair,
        inp=[sentence1, sentence2, score],
        Tout=[tf.int64, tf.int64, tf.float32],
    )
    for ids in (ids1, ids2):
        ids.set_shape([None])
    label.set_shape([])  # scalar float
    return ids1, ids2, label

# # Map the datasets through the tokenizer
# # It applies tf_encode to every pair in the dataset. Now each example in train_dataset is a pair of tokenized sequences
# train_dataset = train_examples.map(tf_encode)
# val_dataset = val_examples.map(tf_encode)


Make batches

In [None]:
MAX_SEQ_LEN = 128

def encode_pair_fixed(input_str, target_str):
    """
    Encode two string tensors into token-id lists of exactly MAX_SEQ_LEN items.

    Each encoded sentence is truncated to MAX_SEQ_LEN tokens and then
    right-padded with 0 up to MAX_SEQ_LEN.

    Returns:
    - (input token ids, target token ids), both plain Python lists.
    """
    def _encode_fixed(text_tensor):
        ids = tokenizer.encode(text_tensor.numpy().decode('utf-8'))[:MAX_SEQ_LEN]
        padding = [0] * (MAX_SEQ_LEN - len(ids))
        return ids + padding

    return _encode_fixed(input_str), _encode_fixed(target_str)

def tf_encode(input_str, target_str, score):
    """
    Tokenize one sentence pair inside a tf.data pipeline.

    Runs `encode_pair_fixed` through tf.py_function and pins both outputs to
    the static shape [MAX_SEQ_LEN]. The score is returned untouched.
    """
    encoded = tf.py_function(
        func=encode_pair_fixed,
        inp=[input_str, target_str],
        Tout=[tf.int64, tf.int64],
    )
    input_tokens, target_tokens = encoded
    for tokens in (input_tokens, target_tokens):
        tokens.set_shape([MAX_SEQ_LEN])
    return input_tokens, target_tokens, score


def make_batches(examples, batch_size=32):
    """
    Build a batched, tokenized tf.data.Dataset from a list of example dicts.

    Parameters:
    - examples: Iterable of dicts with the keys "sentence1", "sentence2" and "score".
    - batch_size: Number of examples per batch.

    Returns:
    - A tf.data.Dataset yielding (sentence1 token ids, sentence2 token ids, score)
      batches. Tokenization is done by the module-level `tf_encode`.

    Examples that lack a key or whose score cannot be converted to float are
    reported and skipped instead of aborting the whole run.
    """
    sent1_list = []
    sent2_list = []
    score_list = []

    for ex in examples:
        # Only the conversions live in the try body, and only the errors they
        # can realistically raise are caught, so unrelated bugs are not hidden.
        try:
            s1 = str(ex["sentence1"])
            s2 = str(ex["sentence2"])
            score = float(ex["score"])
        except (LookupError, TypeError, ValueError) as e:
            print("Skipping example due to error:", ex, e)
            continue
        sent1_list.append(s1)
        sent2_list.append(s2)
        score_list.append(score)

    # The dtype is given explicitly: tf.constant([]) would otherwise infer
    # float32 for an empty list and break the string pipeline below.
    sent1_tensor = tf.constant(sent1_list, dtype=tf.string)
    sent2_tensor = tf.constant(sent2_list, dtype=tf.string)
    score_tensor = tf.constant(score_list, dtype=tf.float32)

    dataset = tf.data.Dataset.from_tensor_slices((sent1_tensor, sent2_tensor, score_tensor))
    dataset = dataset.map(tf_encode, num_parallel_calls=tf.data.AUTOTUNE)
    # tf_encode already pads every sequence to a fixed length, so a plain
    # batch() is enough (no padded_batch needed).
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(tf.data.AUTOTUNE)

    return dataset



In [None]:
train_batches = make_batches(train_examples)

# Pull one batch and verify that both token tensors have the fixed length.
for sent1, sent2, score in train_batches.take(1):
    for label, tensor in (("sent1", sent1), ("sent2", sent2), ("score", score)):
        print(f"Shape {label}:", tensor.shape)

    # Both sentences must be padded/truncated to MAX_SEQ_LEN and share one shape.
    assert sent1.shape[1] == MAX_SEQ_LEN, f"sent1 sequence length mismatch: {sent1.shape[1]}"
    assert sent2.shape[1] == MAX_SEQ_LEN, f"sent2 sequence length mismatch: {sent2.shape[1]}"
    assert sent1.shape == sent2.shape, f"Mismatch between sent1 and sent2: {sent1.shape} vs {sent2.shape}"


In [None]:
# Build the batched datasets for both splits.
train_batches, val_batches = make_batches(train_examples), make_batches(val_examples)


Get everything ready

In [None]:
# sent1_list / sent2_list only exist as locals inside make_batches, so they
# cannot be referenced here. Inspect the raw examples instead: this shows the
# Python types of the sentences before make_batches converts them to str.
print("Sample types in sent1_list:", {type(ex["sentence1"]) for ex in train_examples})
print("Sample types in sent2_list:", {type(ex["sentence2"]) for ex in train_examples})


In [None]:
# DATASET PREPARATION

# Define constants for dataset preparation
# MAX_TOKENS = 128
# BUFFER_SIZE = 20000  # Size of the buffer for shuffling the dataset.
# BATCH_SIZE = 64  # Number of samples per batch.

# Lower constants for baseline
MAX_TOKENS = 128
BUFFER_SIZE = 1000  # Size of the buffer for shuffling the dataset.
BATCH_SIZE = 16  # Number of samples per batch.

# Variable-length tokenization (no truncation or padding)
def tokenize_pairs(input_text, target_text):
    """
    Encode an input/target string pair with the shared subword tokenizer.

    Returns two int64 token tensors whose static shape is set to [None]
    (rank 1, unknown length) so they can be batched later.
    """
    def _encode(text_tensor):
        token_ids = tokenizer.encode(text_tensor.numpy().decode('utf-8'))
        return tf.constant(token_ids, dtype=tf.int64)

    input_tokens = tf.py_function(_encode, [input_text], tf.int64)
    target_tokens = tf.py_function(_encode, [target_text], tf.int64)

    # tf.py_function loses static shape information; restore the rank here.
    for tokens in (input_tokens, target_tokens):
        tokens.set_shape([None])
    return input_tokens, target_tokens

# Length filter for tokenized pairs
def filter_max_tokens(input_tensor, target_tensor):
    """
    Return True when the longer of the two sequences has fewer than MAX_TOKENS tokens.

    Pairs whose longer sequence has MAX_TOKENS tokens or more are rejected.
    """
    input_len = tf.shape(input_tensor)[0]
    target_len = tf.shape(target_tensor)[0]
    return tf.maximum(input_len, target_len) < MAX_TOKENS

# # Batching and preprocessing wrapper
# def make_batches(ds):
#     """
#     Caches, shuffles, tokenizes, filters, pads, and batches the dataset.
#     """
#     return (
#         ds
#         .cache()
#         .shuffle(BUFFER_SIZE)
#         .map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
#         .filter(filter_max_tokens)
#         .padded_batch(BATCH_SIZE, padded_shapes=([None], [None]))  # pad manually here
#         # .repeat() # NEW ADDITION
#         .prefetch(tf.data.AUTOTUNE)
#     )


# Build the batched training and validation datasets with make_batches.
train_batches, val_batches = make_batches(train_examples), make_batches(val_examples)


In [None]:
# Preview one batch: decoded sentences, similarity scores and raw token ids.
for sent1_batch, sent2_batch, score_batch in train_batches.take(1):
    separator = "----------------------------------------------"
    sent1_rows = sent1_batch.numpy()
    sent2_rows = sent2_batch.numpy()

    # Decoded text; the 0 padding ids are dropped before decoding.
    for heading, rows in (
        ("> Sentence 1 Examples:", sent1_rows),
        ("> Sentence 2 Examples:", sent2_rows),
    ):
        print(heading)
        for line in rows:
            print(tokenizer.decode([token for token in line if token != 0]))
        print(separator)

    print("> Similarity Scores:")
    print(score_batch.numpy())

    print(separator)

    print("> Token IDs (Sentence 1):")
    for line in sent1_rows:
        print(line)

    print(separator)

    print("> Token IDs (Sentence 2):")
    for line in sent2_rows:
        print(line)

# # Take a small batch (e.g., 3) from the dataset
# for src_examples, tgt_examples in train_examples.batch(3).take(1):
#     print("> Examples in Source Language:")
#     for line in src_examples.numpy():
#         print(line.decode('utf-8'))

# print("----------------------------------------------")

# print("> Examples in Target Language:")
# for line in tgt_examples.numpy():
#     print(line.decode('utf-8'))

# print("----------------------------------------------")

# # Tokenize the target examples using your custom tokenizer
# encoded = [tokenizer.encode(text.numpy().decode('utf-8')) for text in tgt_examples]

# # Print tokenized form
# for row in encoded:
#     print(row)

# print("----------------------------------------------")

In [None]:
# POSITIONAL ENCODING
# Transformers have no recurrence or convolution, so we inject sequence order using sine/cosine signals.

def get_angles(pos, i, d_model):
    """
    Compute the positional-encoding angles pos / 10000^(2 * (i // 2) / d_model).

    Dimensions 2k and 2k+1 share the same rate, so each sine/cosine pair of
    the encoding runs at its own wavelength.

    Parameters:
    - pos: Position index (scalar or array).
    - i: Dimension index (scalar or array).
    - d_model: Depth of the model (number of dimensions).

    Returns:
    - The angles, broadcast over `pos` and `i`.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    return pos * (1 / np.power(10000, exponent))

def positional_encoding(position, d_model):
    """
    Build the sinusoidal positional-encoding table.

    Parameters:
    - position: Number of positions to encode (rows of the table).
    - d_model: The depth of the model (number of dimensions).

    Returns:
    - A float32 tensor of shape (1, position, d_model).
    """
    positions = np.arange(position)[:, np.newaxis]
    dimensions = np.arange(d_model)[np.newaxis, :]
    table = get_angles(positions, dimensions, d_model)

    # Even columns (2i) hold the sine, odd columns (2i+1) the cosine.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Add a leading axis of size 1 so the table broadcasts over a batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)


In [None]:
# MASKING
# Masks prevent attention to padding tokens or future tokens during decoding.

def create_padding_mask(seq):
    """
    Build the attention mask that hides padding tokens.

    Positions whose token id is 0 get the value 1.0, all others 0.0.

    Parameters:
    - seq: The sequence of tokens.

    Returns:
    - A float32 mask of shape (batch_size, 1, 1, seq_len), ready to be
      broadcast against the attention logits.
    """
    is_padding = tf.cast(tf.math.equal(seq, 0), tf.float32)
    return is_padding[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    """
    Build a mask that blocks attention to future positions.

    Entries on and below the diagonal are 0 (visible); entries above the
    diagonal are 1 (masked).

    Parameters:
    - size: Size of the mask.

    Returns:
    - A look-ahead mask of shape (size, size).
    """
    lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - lower_triangle


In [None]:
# SCALING AND DOT PRODUCT ATTENTION
# This is the core building block of attention: compare queries with keys, weigh values

def scaled_dot_product_attention(q, k, v, mask=None):
    """
    Compute softmax(q·kᵀ / sqrt(depth) + mask * -1e9) · v.

    Parameters:
    - q (query): Tensor with shape (..., seq_len_q, depth)
    - k (key): Tensor with shape (..., seq_len_k, depth)
    - v (value): Tensor with shape (..., seq_len_v, depth_v)
    - mask: (Optional) Float tensor with shape broadcastable to (..., seq_len_q, seq_len_k);
      positions holding 1 are suppressed.

    Returns:
    - output: The attention-weighted values, shape (..., seq_len_q, depth_v).
    - attention_weights: The attention weights, shape (..., seq_len_q, seq_len_k).
    """
    # Similarity of every query with every key, scaled by sqrt(key depth)
    # to keep the softmax out of its saturated region.
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    # Masked positions receive a large negative logit, i.e. ~0 weight after softmax.
    if mask is not None:
        logits = logits + mask * -1e9

    # Normalize over the key axis and mix the values accordingly.
    attention_weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(attention_weights, v), attention_weights



In [None]:
# MULTIHEAD ATTENTION
# Instead of attending once, we split into multiple attention "heads" for richer representations

class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention: project q/k/v, attend per head, merge the heads, project again."""

    def __init__(self, *, d_model, num_heads):
        """
        Parameters:
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads; must divide d_model evenly.
        """
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # Every head receives an equally sized slice of the model dimension.
        assert d_model % self.num_heads == 0
        self.depth = d_model // self.num_heads

        # Linear projections for queries, keys and values.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output projection applied after the heads are concatenated.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """
        Reshape (batch_size, seq_len, d_model) into (batch_size, num_heads, seq_len, depth).

        Parameters:
        - x: Input tensor.
        - batch_size: Size of the batch.
        """
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """
        Forward pass of the multi-head attention layer.

        Parameters:
        - v: Value tensor.
        - k: Key tensor.
        - q: Query tensor.
        - mask: Mask forwarded to the scaled dot-product attention.

        Returns:
        - output: Tensor of shape (batch_size, seq_len_q, d_model).
        - attention_weights: Per-head attention weights.
        """
        batch_size = tf.shape(q)[0]

        # Project, then split into heads: (batch_size, num_heads, seq_len, depth).
        query = self.split_heads(self.wq(q), batch_size)
        key = self.split_heads(self.wk(k), batch_size)
        value = self.split_heads(self.wv(v), batch_size)

        attended, attention_weights = scaled_dot_product_attention(query, key, value, mask)

        # Move the head axis back next to depth and merge both into d_model.
        attended = tf.transpose(attended, perm=[0, 2, 1, 3])
        merged = tf.reshape(attended, (batch_size, -1, self.d_model))

        return self.dense(merged), attention_weights


In [None]:
def point_wise_feed_forward_network(d_model, dff):
    """
    Build the feed-forward sub-network used inside each transformer block.

    It consists of a ReLU dense layer that expands to `dff` units followed
    by a linear dense layer that projects back to `d_model` units.

    Parameters:
    - d_model: Dimensionality of the network's output.
    - dff: Dimensionality of the inner (hidden) layer.

    Returns:
    - A tf.keras.Sequential model representing the feed forward network.
    """
    expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
    project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
    return tf.keras.Sequential([expand, project])

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network, each followed by dropout, a residual connection and layer normalization."""

    def __init__(self, *, d_model, num_heads, dff, rate=0.1):
        """
        Parameters:
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads.
        - dff: Dimensionality of the feed-forward network's inner layer.
        - rate: Dropout rate.
        """
        super().__init__()
        self.mha = MultiHeadAttention(d_model=d_model, num_heads=num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        # One normalization and one dropout layer per sub-block.
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """
        Run one pass of the encoder layer.

        Parameters:
        - x: Input tensor.
        - training: Boolean; dropout is only active in training mode.
        - mask: Mask to be applied in the multi-head attention layer.

        Returns:
        - The output tensor of the encoder layer, shape (batch_size, input_seq_len, d_model).
        """
        # Sub-block 1: self-attention (x serves as value, key and query).
        attended, _ = self.mha(x, x, x, mask)
        attended = self.dropout1(attended, training=training)
        normed = self.layernorm1(x + attended)

        # Sub-block 2: position-wise feed-forward network.
        transformed = self.dropout2(self.ffn(normed), training=training)
        return self.layernorm2(normed + transformed)

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Token embedding plus positional encoding, followed by a stack of EncoderLayer blocks."""

    def __init__(self, *, num_layers, d_model, num_heads, dff, input_vocab_size, rate=0.1):
        """
        Parameters:
        - num_layers: Number of encoder layers.
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads.
        - dff: Dimensionality of the feed-forward network's inner layer.
        - input_vocab_size: Size of the input vocabulary.
        - rate: Dropout rate.
        """
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)

        # The table is precomputed for MAX_TOKENS positions and sliced in call().
        self.pos_encoding = positional_encoding(MAX_TOKENS, self.d_model)

        self.enc_layers = [
            EncoderLayer(d_model=d_model, num_heads=num_heads, dff=dff, rate=rate)
            for _ in range(num_layers)
        ]

        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """
        Encode a batch of token ids.

        Parameters:
        - x: Token ids, shape (batch_size, input_seq_len).
        - training: Boolean; dropout is only active in training mode.
        - mask: Padding mask forwarded to every encoder layer.

        Returns:
        - Tensor of shape (batch_size, input_seq_len, d_model).
        """
        seq_len = tf.shape(x)[1]

        # Embed, scale by sqrt(d_model), then add the positional signal.
        embedded = self.embedding(x) * tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        hidden = embedded + self.pos_encoding[:, :seq_len, :]
        hidden = self.dropout(hidden, training=training)

        for layer in self.enc_layers:
            hidden = layer(hidden, training=training, mask=mask)

        return hidden

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-only transformer that maps a batch of token ids to one embedding per sentence."""

    def __init__(self, *, num_layers, d_model, num_heads, dff, input_vocab_size, rate=0.1):
        """
        Initializes the Transformer model for sentence similarity.

        Parameters:
        - num_layers: Number of encoder layers.
        - d_model: Dimensionality of the model's output space.
        - num_heads: Number of attention heads.
        - dff: Dimensionality of the feed-forward network's inner layer.
        - input_vocab_size: Size of the input vocabulary.
        - rate: Dropout rate.
        """
        super().__init__()
        self.encoder = Encoder(num_layers=num_layers, d_model=d_model, num_heads=num_heads, dff=dff, input_vocab_size=input_vocab_size, rate=rate)

    def call(self, inputs, training):
        """
        The logic for one forward pass through the model.

        Parameters:
        - inputs: Token-id tensor of shape (batch_size, seq_len); id 0 is treated as padding.
        - training: Boolean indicating if the layer should behave in training mode or inference mode.

        Returns:
        - sentence_embedding: Mean of the encoder outputs over the non-padding
          positions, shape (batch_size, d_model).
        """
        # Create encoder padding mask
        enc_padding_mask = create_padding_mask(inputs)

        # Run encoder
        enc_output = self.encoder(inputs, training=training, mask=enc_padding_mask)  # (batch_size, seq_len, d_model)

        # Masked mean pooling. Inputs are zero-padded, so a plain mean over the
        # sequence axis would mix the outputs at padding positions into the
        # embedding and make it depend on how much padding a sentence has.
        token_mask = tf.cast(tf.math.not_equal(inputs, 0), enc_output.dtype)
        token_mask = token_mask[:, :, tf.newaxis]  # (batch_size, seq_len, 1)
        summed = tf.reduce_sum(enc_output * token_mask, axis=1)  # (batch_size, d_model)
        # Clamp the count to 1 so an all-padding row yields zeros instead of NaN.
        token_count = tf.maximum(tf.reduce_sum(token_mask, axis=1), 1.0)  # (batch_size, 1)
        sentence_embedding = summed / token_count  # (batch_size, d_model)
        return sentence_embedding

        # inp, tar = inputs

        # # Create masks for padding and future tokens.
        # enc_padding_mask, look_ahead_mask, dec_padding_mask = self.create_masks(inp, tar)

        # # Pass the input through the encoder.
        # #enc_output = self.encoder(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)
        # #self.encoder(inp, training=training, mask=enc_padding_mask)
        # enc_output = self.encoder(inp, training=training, mask=enc_padding_mask)

        # # Pass the encoder output and target through the decoder.
        # #dec_output, attention_weights = self.decoder(tar, enc_output, training, look_ahead_mask, dec_padding_mask)
        # #self.decoder(tar, enc_output, training=training,
        #  #    look_ahead_mask=look_ahead_mask,
        #   #   padding_mask=dec_padding_mask)

        # dec_output, attention_weights = self.decoder(
        #       tar,
        #       enc_output,
        #       training=training,
        #       look_ahead_mask=look_ahead_mask,
        #       padding_mask=dec_padding_mask)


        # # Pass the decoder output through the final linear layer.
        # final_output = self.final_layer(dec_output)  # (batch_size, tar_seq_len, target_vocab_size)

        # return final_output, attention_weights


    # def create_masks(self, inp, tar):
    #     """
    #     Creates masks for padding and look ahead for the encoder and decoder.

    #     Parameters:
    #     - inp: Input tensor.
    #     - tar: Target tensor.

    #     Returns:
    #     - enc_padding_mask: Padding mask for the encoder.
    #     - look_ahead_mask: Look-ahead mask for the decoder.
    #     - dec_padding_mask: Padding mask for the decoder to mask the encoder outputs.
    #     """
    #     # Encoder padding mask.
    #     enc_padding_mask = create_padding_mask(inp)

    #     # Decoder padding mask for the second attention block (to mask encoder outputs).
    #     dec_padding_mask = create_padding_mask(inp)

    #     # Look-ahead mask (to mask future tokens) and decoder target padding mask combined.
    #     look_ahead_mask = create_look_ahead_mask(tf.shape(tar)[1])
    #     dec_target_padding_mask = create_padding_mask(tar)
    #     look_ahead_mask = tf.maximum(dec_target_padding_mask, look_ahead_mask)

    #     return enc_padding_mask, look_ahead_mask, dec_padding_mask

# -------------------------------
# Transformer Model Hyperparameters
# -------------------------------

num_layers = 4 # number of encoder/decoder layers in the Transformer
d_model = 128 # size of the embedding vector for each word
dff = 512 # size of the hidden layer inside the Feed Forward Neural Network
num_heads = 8 # number of attention heads (must divide d_model evenly)
dropout_rate = 0.1 # dropout rate for regularization

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule: rsqrt(d_model) * min(rsqrt(step), step * warmup_steps^-1.5)."""

    def __init__(self, d_model, warmup_steps=4000):
        super().__init__()
        # Stored as float32 so it can be used directly in the TF math below.
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        current_step = tf.cast(step, tf.float32)
        # The warm-up term grows linearly with the step, the decay term falls
        # as 1/sqrt(step); the smaller of the two is used.
        decay_term = tf.math.rsqrt(current_step)
        warmup_term = current_step * (self.warmup_steps ** -1.5)
        return tf.math.rsqrt(self.d_model) * tf.math.minimum(decay_term, warmup_term)

# Adam optimizer driven by the warm-up learning-rate schedule.
# beta_2=0.98 is the value used in the original Transformer paper;
# epsilon is a small value to avoid division by zero.
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(
    learning_rate=learning_rate,
    beta_1=0.9,
    beta_2=0.98,
    epsilon=1e-9,
)

# def loss_function(real, pred):
#     mask = tf.math.logical_not(tf.math.equal(real, 0))  # Create a mask for non-zero tokens.
#     loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')
#     loss_ = loss_object(real, pred)  # Calculate loss using some loss object not defined here.
#     mask = tf.cast(mask, dtype=loss_.dtype)  # Cast mask to the same type as the loss.
#     loss_ *= mask  # Apply mask to the loss.
#     return tf.reduce_sum(loss_) / tf.reduce_sum(mask)  # Calculate the average loss.

# def accuracy_function(real, pred):
#     accuracies = tf.equal(real, tf.argmax(pred, axis=2))  # Check if real values match predictions.
#     mask = tf.math.logical_not(tf.math.equal(real, 0))  # Create a mask for non-zero tokens.
#     accuracies = tf.math.logical_and(mask, accuracies)  # Apply mask to accuracies.
#     accuracies = tf.cast(accuracies, dtype=tf.float32)  # Cast to float32 for calculation.
#     mask = tf.cast(mask, dtype=tf.float32)  # Cast mask to float32.
#     return tf.reduce_sum(accuracies) / tf.reduce_sum(mask)  # Calculate the average accuracy.

# train_loss = tf.keras.metrics.Mean(name='train_loss')
# train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

In [None]:
class SimilarityModel(tf.keras.Model):
    """Sentence-pair similarity scorer.

    Both sentences are passed through the same ``transformer``; the two
    resulting embeddings are merged into one feature vector and mapped by a
    two-layer dense head to a single sigmoid output in [0, 1].
    """

    def __init__(self, transformer):
        """Store the shared encoder and build the dense scoring head.

        Args:
            transformer: Callable model invoked as ``transformer(ids, training=...)``.
        """
        super().__init__()
        self.transformer = transformer
        hidden_layer = tf.keras.layers.Dense(128, activation='relu')
        score_layer = tf.keras.layers.Dense(1, activation='sigmoid')
        self.dense = tf.keras.Sequential([hidden_layer, score_layer])

    def call(self, inputs, training=False):
        """Score a batch of sentence pairs.

        Args:
            inputs: Pair ``(sent1, sent2)`` of token-id tensors.
            training: Forwarded to the encoder.

        Returns:
            Output of the dense head; its last layer has one sigmoid unit.
        """
        first_ids, second_ids = inputs

        # NOTE(review): presumably the encoder returns one (batch, d_model)
        # vector per sentence; confirm against the Transformer definition.
        first_vec = self.transformer(first_ids, training=training)
        second_vec = self.transformer(second_ids, training=training)

        # Features: both embeddings, their absolute difference, and their
        # element-wise product, concatenated along axis 1.
        pair_features = [
            first_vec,
            second_vec,
            tf.abs(first_vec - second_vec),
            first_vec * second_vec,
        ]
        return self.dense(tf.concat(pair_features, axis=1))


In [None]:
# -------------------------------
# Instantiate the Transformer model
# -------------------------------
transformer = Transformer(
    num_layers=num_layers,                  # Number of stacked layers
    d_model=d_model,                        # Embedding size / model dimensionality
    num_heads=num_heads,                    # Number of attention heads
    dff=dff,                                # Hidden layer size in feed-forward network
    input_vocab_size=tokenizer.vocab_size,
    rate=dropout_rate,                      # Dropout rate
)

# -------------------------------
# Instantiate the sentence similarity model
# -------------------------------
# Built before the checkpoint object so that its dense head can be tracked too.
similarity_model = SimilarityModel(transformer)

# -------------------------------
# Checkpointing: saving and restoring model state
# -------------------------------

checkpoint_path = './checkpoints/train'   # Directory to save training checkpoints

# Track the encoder, the full similarity model (which adds the dense scoring
# head) and the optimizer. Tracking only `transformer` would silently drop the
# trained dense head from every saved checkpoint. The `transformer` and
# `optimizer` keys are kept so checkpoints written with the old layout still
# restore; variables they lack simply keep their fresh initial values.
ckpt = tf.train.Checkpoint(
    transformer=transformer,
    similarity_model=similarity_model,
    optimizer=optimizer,
)

# Keep only the 5 most recent checkpoints on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# If a previous checkpoint exists, restore the model and optimizer state.
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Latest checkpoint restored!!')

# -------------------------------
# Input signature for the tf.function-decorated training step
# -------------------------------
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),  # sentence1
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),  # sentence2
    tf.TensorSpec(shape=(None,), dtype=tf.float32),     # similarity label (e.g., 0.0 to 1.0)
]

# Loss function and running training-loss metric.
loss_fn = tf.keras.losses.MeanSquaredError()
train_loss = tf.keras.metrics.Mean(name="train_loss")


In [None]:
# DEBUG: pull a single batch from a fresh pipeline and describe its structure.
print("Inspecting batch structure...")
train_batches = make_batches(train_examples)
train_iter = iter(train_batches)

first = next(train_iter)

print("Raw output of next(train_iter):", type(first))
print("Length of output:", len(first))

# Describe every element of the batch; only the first row of values is shown.
for position, element in enumerate(first):
    print(f"\nElement {position}:")
    print("  Type:", type(element))
    print("  Shape:", element.shape)
    print("  DType:", element.dtype)
    first_row = element.numpy()[:1]
    print("  Example values:", first_row)


In [None]:
@tf.function(input_signature=train_step_signature)
def train_step(sent1, sent2, score):
    """Run one optimisation step of the similarity model on a single batch.

    Args:
        sent1: Token ids of the first sentences, shape (batch_size, seq_len).
        sent2: Token ids of the second sentences, shape (batch_size, seq_len).
        score: Target similarity scores, shape (batch_size,).

    Returns:
        Tuple ``(loss, pred)``: the scalar loss from ``loss_fn`` and the
        predicted scores with the trailing axis squeezed away.
    """
    with tf.GradientTape() as tape:
        raw_pred = similarity_model((sent1, sent2), training=True)
        pred = tf.squeeze(raw_pred, axis=1)  # drop the size-1 output axis
        loss = loss_fn(score, pred)

    # Read the variable list after the forward pass has run.
    variables = similarity_model.trainable_variables
    grads = tape.gradient(loss, variables)
    optimizer.apply_gradients(zip(grads, variables))

    # Fold this batch's loss into the running training-loss metric.
    train_loss.update_state(loss)

    return loss, pred

    # # Prepare target inputs and outputs
    # tar_inp = tar[:, :-1]
    # tar_real = tar[:, 1:]

    # with tf.GradientTape() as tape:
    #     predictions, _ = transformer([inp, tar_inp], training=True)
    #     loss = loss_function(tar_real, predictions)

    # # Compute gradients and apply them
    # gradients = tape.gradient(loss, transformer.trainable_variables)
    # optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

    # # Update the training loss and accuracy metrics
    # train_loss(loss)
    # train_accuracy(accuracy_function(tar_real, predictions))

    # return loss, predictions

# epoch_accuracies = []
# all_batch_accuracies = []

EPOCHS = 10
# Fixed number of optimizer steps per epoch (whole batches only).
steps_per_epoch = len(train_examples) // BATCH_SIZE

epoch_losses = []      # running-mean training loss at the end of each epoch
all_batch_losses = []  # one list of per-batch losses per epoch

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1} -------------------")

    # Reset the running mean IN PLACE. train_step is a tf.function with a fixed
    # input signature, so it is traced once and keeps updating the metric object
    # it saw at trace time. Rebinding `train_loss` to a new Mean each epoch
    # would leave this loop reading an object that never receives updates.
    # `reset_state` is the current Keras name; older TF 2.x only has `reset_states`.
    reset_metric = getattr(train_loss, "reset_state", None) or train_loss.reset_states
    reset_metric()

    train_batches = make_batches(train_examples)  # Must yield (sent1, sent2, score)
    train_iter = iter(train_batches)

    batch_losses = []
    start = time.time()

    # Run a fixed number of steps instead of iterating the dataset to exhaustion.
    for step in range(steps_per_epoch):
        try:
            sent1, sent2, score = next(train_iter)
        except StopIteration:
            # Dataset ran out before steps_per_epoch was reached: start over.
            train_iter = iter(train_batches)
            sent1, sent2, score = next(train_iter)

        loss, _ = train_step(sent1, sent2, score)
        batch_losses.append(loss.numpy())

        if step % 50 == 0:
            print(f'Epoch {epoch + 1} Step {step} Loss {train_loss.result():.4f}')

    all_batch_losses.append(batch_losses)
    epoch_losses.append(train_loss.result().numpy())

    # Persist model and optimizer state every third epoch.
    if (epoch + 1) % 3 == 0:
        ckpt_save_path = ckpt_manager.save()
        print(f'Saving checkpoint for epoch {epoch+1} at {ckpt_save_path}')

    print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f}')
    print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')


In [None]:
# Evaluate on the validation set

val_loss = tf.keras.metrics.Mean(name="val_loss")

for sent1_val, sent2_val, score_val in val_batches:
    pred_val = similarity_model((sent1_val, sent2_val), training=False)
    pred_val = tf.squeeze(pred_val, axis=1)  # drop the size-1 output axis
    loss = loss_fn(score_val, pred_val)      # mean squared error over this batch

    # Weight each batch mean by its number of examples so that a smaller final
    # batch does not count as much as a full one; the accumulated result is then
    # the mean over all validation examples rather than a mean of batch means.
    examples_in_batch = tf.cast(tf.shape(score_val)[0], loss.dtype)
    val_loss.update_state(loss, sample_weight=examples_in_batch)

print(f"Validation Loss (MSE): {val_loss.result().numpy():.4f}")



In [None]:
# Plot the per-batch training loss recorded during the first epoch.
# Only the loss is drawn, so the axis label and title name the loss alone.
plt.figure(figsize=(10, 4))
plt.plot(all_batch_losses[0], label='Batch Loss')
plt.xlabel('Batch')
plt.ylabel('Loss')
plt.title('Loss per Batch (Epoch 1)')
plt.legend()
plt.grid(True)
plt.show()
