# Pràctica 4 (Meow)

## Dataset Text Similarity

In [1]:
import pandas as pd
import tensorflow as tf


# Login using e.g. `huggingface-cli login` to access this dataset
splits = {'train': 'train.tsv', 'validation': 'dev.tsv', 'test': 'test.tsv'}
df = pd.read_csv("hf://datasets/projecte-aina/sts-ca/" + splits["train"], sep="\t")

In [2]:
df

Unnamed: 0,id,sentence_1,sentence_2,label
0,ACN2_111,Atorga per primer cop les mencions Encarna San...,Creen la menció M. Encarna Sanahuja a la inclu...,3.50
1,Oscar2_211,"Finalment, afegiu-hi els bolets que haureu sal...","Finalment, poseu-hi les minipastanagues tallad...",1.25
2,ACN2_574,El TC suspèn el pla d'acció exterior i de rela...,El Constitucional manté la suspensió del pla e...,3.67
3,Viqui2_341,Virgin Galactic va ser fundada el 2004 per l'e...,Virgin Atlantic és una de les aerolínies de Ri...,2.25
4,ACN2_1184,Identifiquen un nou biomarcador per realitzar ...,Nous tractaments contra el càncer de mama,2.00
...,...,...,...,...
2068,ACN2_718,Els crits de la víctima van alertar diversos c...,"La víctima, que anava amb dos amics seus, va a...",1.75
2069,Viqui2_825,Mademoiselle és una pel·lícula franco-britànic...,The Haunting és una pel·lícula britànica dirig...,2.75
2070,Viqui2_873,Però la seva idea original va resultar massa a...,Però la seva idea original va suposar quelcom ...,5.00
2071,ACN2_23,El delegat del govern al Penedès sosté que la ...,El delegat del govern al Penedès sosté que el ...,3.75


# Font d'Embeddings

In [5]:
from gensim.models import KeyedVectors
kv = KeyedVectors.load_word2vec_format('../cc.ca.300.vec', binary=False)
# Obtenir un word-vector
print(kv["paraula"])

[ 2.150e-02  1.310e-02 -8.800e-03  1.820e-02 -6.000e-04  5.430e-02
 -4.400e-03  3.720e-02  1.720e-02  5.100e-03  2.010e-02 -1.080e-02
  5.200e-03  1.870e-02  2.500e-03  3.710e-02 -1.990e-02  1.510e-02
  2.530e-02 -1.830e-02  5.320e-02 -4.120e-02  1.850e-02  6.800e-02
 -1.170e-02  3.900e-03 -8.530e-02  1.360e-02  9.000e-03 -3.000e-03
 -1.700e-03  8.800e-03  5.410e-02  3.060e-02  1.410e-02  8.000e-03
 -3.210e-02  5.840e-02 -4.590e-02 -9.900e-03  2.070e-02 -8.100e-02
 -5.990e-02  1.050e-02  2.120e-02 -1.006e-01  2.200e-02 -1.380e-02
 -6.900e-03 -2.000e-03  3.540e-02 -5.200e-02 -8.400e-03  1.160e-02
  1.410e-02  2.880e-02  4.420e-02 -3.750e-02 -1.000e-04 -4.700e-02
  1.580e-02  3.800e-02  5.330e-02  7.700e-03  2.380e-02  2.580e-02
 -1.530e-02 -4.500e-03  1.490e-02  1.750e-02  3.210e-02  5.590e-02
  1.900e-02  3.200e-03 -2.560e-02  2.330e-02 -5.420e-02  2.430e-02
  3.950e-02  1.290e-02  2.200e-02 -2.050e-02  2.500e-03 -2.900e-03
 -1.060e-02  1.970e-02  2.870e-02 -5.260e-02 -5.860e-02  5.600

# Preparació d'Embeddings

In [6]:
import numpy as np

# Function to truncate word vectors to smaller dimensions
def truncate_embeddings(kv_model, new_dim):
    """
    Truncate word vectors to a smaller dimension
    """
    truncated_vectors = {}
    for word in kv_model.key_to_index:
        original_vector = kv_model[word]
        truncated_vector = original_vector[:new_dim]
        truncated_vectors[word] = truncated_vector
    return truncated_vectors

# Create truncated versions with different dimensions
print("Creating truncated versions of word embeddings...")

# Original dimension
original_dim = kv.vector_size
print(f"Original embedding dimension: {original_dim}")

# Create truncated versions
dimensions = [50, 100, 150]
truncated_models = {}

for dim in dimensions:
    print(f"Creating {dim}-dimensional version...")
    truncated_models[dim] = truncate_embeddings(kv, dim)
    print(f"Truncated to {dim} dimensions. Sample vector shape: {len(truncated_models[dim][list(truncated_models[dim].keys())[0]])}")

print("\nTruncated models created successfully!")
print(f"Available dimensions: {list(truncated_models.keys())}")

# Function to get sentence embedding with truncated vectors
def get_sentence_embedding_truncated(sentence, truncated_vectors, vector_size):
    """
    Get sentence embedding using truncated word vectors
    """
    words = sentence.lower().split()
    vectors = []
    for word in words:
        if word in truncated_vectors:
            vectors.append(truncated_vectors[word])
    if vectors:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(vector_size)

# Test with a sample sentence
sample_sentence = "Aquesta és una frase de prova"
print(f"\nTesting with sentence: '{sample_sentence}'")

for dim in dimensions:
    embedding = get_sentence_embedding_truncated(sample_sentence, truncated_models[dim], dim)
    print(f"Embedding dimension {dim}: shape {embedding.shape}")

Creating truncated versions of word embeddings...
Original embedding dimension: 300
Creating 50-dimensional version...
Truncated to 50 dimensions. Sample vector shape: 50
Creating 100-dimensional version...
Truncated to 100 dimensions. Sample vector shape: 100
Creating 150-dimensional version...
Truncated to 150 dimensions. Sample vector shape: 150

Truncated models created successfully!
Available dimensions: [50, 100, 150]

Testing with sentence: 'Aquesta és una frase de prova'
Embedding dimension 50: shape (50,)
Embedding dimension 100: shape (100,)
Embedding dimension 150: shape (150,)


In [14]:
print(truncated_models[50]['espiadimonis'])  # Example of accessing a specific word vector in the 50-dimensional model

[ 0.0421 -0.0043  0.0034 -0.019  -0.0184  0.0004 -0.0132  0.0397  0.0175
 -0.0109  0.0104 -0.001   0.066   0.0271  0.017   0.0045  0.01    0.0294
  0.0106 -0.0403  0.0042 -0.0259  0.0387 -0.0101  0.0352  0.0015 -0.0096
  0.0083 -0.0089  0.0146  0.0173 -0.0078  0.0296  0.0118 -0.0058  0.0196
 -0.0052 -0.0279  0.0214 -0.006   0.0377 -0.0182 -0.003  -0.016   0.0181
 -0.0419 -0.0019 -0.0125 -0.0313 -0.0184]


# Model d'Embeddings aggregats

In [7]:

def build_model_aggregated(embedding_dim: int, hidden_size: int = 128, dropout_rate: float = 0.3) -> tf.keras.Model:
    input_1 = tf.keras.Input(shape=(embedding_dim,), name="input_vector_1")
    input_2 = tf.keras.Input(shape=(embedding_dim,), name="input_vector_2")
    concatenated = tf.keras.layers.Concatenate(axis=-1)([input_1, input_2])
    x = tf.keras.layers.BatchNormalization()(concatenated)
    x = tf.keras.layers.Dense(hidden_size, activation='relu')(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(dropout_rate)(x)
    output = tf.keras.layers.Dense(1)(x) # Activació lineal per a regressió
    model = tf.keras.Model(inputs=[input_1, input_2], outputs=output)
    model.compile(loss='mean_squared_error', optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                  metrics=['mae', tf.keras.metrics.RootMeanSquaredError()])
    return model
model_agg = build_model_aggregated(embedding_dim=300)
model_agg.fit([X1_train, X2_train], Y_train, epochs=2, batch_size=64)

NameError: name 'X1_train' is not defined

# Model 2: Seqüència d'embeddings

In [15]:
import numpy as np
from typing import Optional

class SimpleAttention(tf.keras.layers.Layer):
    def __init__(self, units: int, **kwargs):
        super(SimpleAttention, self).__init__(**kwargs)
        self.units = units
        self.dropout_s1 = tf.keras.layers.Dropout(0.3)
        self.dropout_s2 = tf.keras.layers.Dropout(0.2)
        self.W_s1 = tf.keras.layers.Dense(units, activation='tanh', use_bias=True, name="attention_transform")
        # Dense layer to compute attention scores (context vector)
        self.W_s2 = tf.keras.layers.Dense(1, use_bias=False, name="attention_scorer")
        self.supports_masking = True  # Declare that this layer supports masking

    def call(self, inputs: tf.Tensor, mask: Optional[tf.Tensor] = None) -> tf.Tensor:
        # inputs shape: (batch_size, sequence_length, embedding_dim)
        # mask shape: (batch_size, sequence_length) boolean tensor

        # Attention hidden states
        hidden_states = self.dropout_s1(self.W_s1(inputs))

        # Compute attention scores
        scores = self.dropout_s2(self.W_s2(hidden_states))

        if mask is not None:
            # Apply the mask to the scores before softmax
            expanded_mask = tf.expand_dims(tf.cast(mask, dtype=tf.float32), axis=-1)
            # Add a large negative number to masked (padded) scores
            scores += (1.0 - expanded_mask) * -1e9

        # Compute attention weights
        attention_weights = tf.nn.softmax(scores, axis=1)

        # Compute the context vector (weighted sum of input embeddings)
        context_vector = tf.reduce_sum(inputs * attention_weights, axis=1)

        return context_vector

    def get_config(self) -> dict:
        config = super(SimpleAttention, self).get_config()
        config.update({"units": self.units})
        return config

    def compute_mask(self, inputs: tf.Tensor, mask: Optional[tf.Tensor] = None) -> Optional[tf.Tensor]:
        return None


def build_and_compile_model_2(
        input_length: int = 32,
        dictionary_size: int = 1000,
        embedding_size: int = 300,
        learning_rate: float = 0.001,
        trainable_embedding: bool = False,
        pretrained_weights: Optional[np.ndarray] = None,
        attention_units: int = 4,
) -> tf.keras.Model:
    input_1 = tf.keras.Input((input_length,), dtype=tf.int32, name="input_1")
    input_2 = tf.keras.Input((input_length,), dtype=tf.int32, name="input_2")

    # Determine effective embedding parameters
    if pretrained_weights is not None:
        effective_dictionary_size = pretrained_weights.shape[0]
        effective_embedding_size = pretrained_weights.shape[1]
        embedding_initializer = tf.keras.initializers.Constant(pretrained_weights)
        is_embedding_trainable = trainable_embedding
        embedding_layer_name = "embedding_pretrained"
    else:
        effective_dictionary_size = dictionary_size
        effective_embedding_size = embedding_size
        embedding_initializer = 'uniform'
        is_embedding_trainable = True
        embedding_layer_name = "embedding"

    # Shared Embedding Layer
    embedding_layer = tf.keras.layers.Embedding(
        input_dim=effective_dictionary_size,
        output_dim=effective_embedding_size,
        input_length=input_length,
        mask_zero=True,
        embeddings_initializer=embedding_initializer,
        trainable=is_embedding_trainable,
        name=embedding_layer_name
    )

    # Apply embedding layer to both inputs
    embedded_1 = embedding_layer(input_1)  # Shape: (batch_size, input_length, effective_embedding_size)
    embedded_2 = embedding_layer(input_2)  # Shape: (batch_size, input_length, effective_embedding_size)

    # Shared Attention Layer
    # Input: (batch_size, input_length, effective_embedding_size) with a mask
    # Output: (batch_size, effective_embedding_size)
    sentence_attention_layer = SimpleAttention(units=attention_units, name="sentence_attention")
    # sentence_attention_layer = tf.keras.layers.GlobalAveragePooling1D(name="sentence_attention_layer")

    sentence_vector_1 = sentence_attention_layer(embedded_1)
    sentence_vector_2 = sentence_attention_layer(embedded_2)

    # Projection layer
    first_projection_layer = tf.keras.layers.Dense(
        effective_embedding_size,
        activation='tanh',
        kernel_initializer=tf.keras.initializers.Identity(),
        bias_initializer=tf.keras.initializers.Zeros(),
        name="projection_layer"
    )
    dropout = tf.keras.layers.Dropout(0.2, name="projection_dropout")
    projected_1 = dropout(first_projection_layer(sentence_vector_1))
    projected_2 = dropout(first_projection_layer(sentence_vector_2))

    # Normalize the projected vectors (L2 normalization)
    normalized_1 = tf.keras.layers.Lambda(
        lambda x: tf.linalg.l2_normalize(x, axis=1), name="normalize_1"
    )(projected_1)
    normalized_2 = tf.keras.layers.Lambda(
        lambda x: tf.linalg.l2_normalize(x, axis=1), name="normalize_2"
    )(projected_2)

    # Compute Cosine Similarity
    similarity_score = tf.keras.layers.Lambda(
        lambda x: tf.reduce_sum(x[0] * x[1], axis=1, keepdims=True), name="cosine_similarity"
    )([normalized_1, normalized_2])

    # Scale similarity from [-1, 1] to [0, 1]
    output_layer = tf.keras.layers.Lambda(
        lambda x: 0.5 * (1.0 + x), name="output_scaling"
    )(similarity_score)

    # Define the Keras Model
    model = tf.keras.Model(
        inputs=[input_1, input_2],
        outputs=output_layer,
        name="sequence_similarity_attention_model"
    )

    # Compile the model
    model.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        metrics=['mae'],
    )

    return model

In [None]:
# Construir y compilar el modelo
model = build_and_compile_model()
# tf.keras.utils.plot_model(model, show_shapes=True, show_layer_activations=True, )
print(model.summary())
# Entrenar el modelo
model.fit(train_dataset, epochs=num_epochs, validation_data=val_dataset)

# Model Baseline Cosinus

In [19]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import mean_squared_error, mean_absolute_error

import matplotlib.pyplot as plt

# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Process the test dataset
sentences_1 = sts_ca_dataset['sentence_1']
sentences_2 = sts_ca_dataset['sentence_2']
true_scores = np.array(sts_ca_dataset['label'])

# Fit and transform sentences to TF-IDF vectors
all_sentences = sentences_1 + sentences_2
vectorizer.fit(all_sentences)
vectors_1 = vectorizer.transform(sentences_1)
vectors_2 = vectorizer.transform(sentences_2)

# Calculate cosine similarity between each pair
similarities = []
for i in range(len(sentences_1)):
    sim = cosine_similarity(vectors_1[i], vectors_2[i])[0][0]
    similarities.append(sim)

# Convert similarities to the same scale as the scores (assuming 0-5 scale)
predicted_scores = np.array(similarities) * 5

# Evaluate the model
mse = mean_squared_error(true_scores, predicted_scores)
rmse = np.sqrt(mse)
mae = mean_absolute_error(true_scores, predicted_scores)

print(f"Cosine Similarity Baseline Results:")
print(f"MSE: {mse:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")

# Plot predicted vs true scores
plt.figure(figsize=(10, 6))
plt.scatter(true_scores, predicted_scores, alpha=0.5)
plt.plot([0, 5], [0, 5], 'r--')  # Diagonal line representing perfect prediction
plt.xlabel('True Scores')
plt.ylabel('Predicted Scores')
plt.title('Cosine Similarity Baseline: Predicted vs True Scores')
plt.grid(True)
plt.show()

Cosine Similarity Baseline Results:
MSE: 0.6337
RMSE: 0.7960
MAE: 0.6307


KeyboardInterrupt: 

## Aplicar Word2Vec al Dataset de Similitud Textual

In [21]:
# Preparant les dades pel model de regressió usant Word2Vec
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error
from scipy.spatial.distance import cosine
import matplotlib.pyplot as plt

# Funció que ja hem definit abans
# def get_sentence_embedding(sentence, model, vector_size=300):
#     words = sentence.lower().split()
#     vectors = []
#     for word in words:
#         if word in model:
#             vectors.append(model[word])
#     if vectors:
#         return np.mean(vectors, axis=0)
#     else:
#         return np.zeros(vector_size)

try:
    # Comprovar si tenim un model carregat o hem d'usar una alternativa
    if 'model' not in locals() or 'model' not in globals():
        print("No s'ha carregat cap model. Utilitzant un model alternatiu per demostració...")
        # Aquí podríem carregar un model alternatiu o usar una altra estratègia
        raise Exception("Cal tenir un model pre-entrenat")
    
    # Convertir les frases a vectors usant Word2Vec
    print("Convertint frases a vectors d'embeddings...")
    X1 = np.array([get_sentence_embedding(sent, model) for sent in sts_ca_dataset['sentence_1']])
    X2 = np.array([get_sentence_embedding(sent, model) for sent in sts_ca_dataset['sentence_2']])
    Y = np.array(sts_ca_dataset['label'])
    
    print(f"Forma dels vectors X1: {X1.shape}, X2: {X2.shape}, Y: {Y.shape}")
    
    # Baseline: similitud del cosinus
    print("Calculant similituds de cosinus com a baseline...")
    cosine_similarities = []
    for i in range(len(X1)):
        # Avoid division by zero for zero vectors
        if np.all(X1[i] == 0) or np.all(X2[i] == 0):
            sim = 0
        else:
            sim = 1 - cosine(X1[i], X2[i])  # Cosine distance is 1 - similarity
        cosine_similarities.append(sim * 5)  # Scale to 0-5 range
    
    # Avaluar el baseline
    baseline_mse = mean_squared_error(Y, cosine_similarities)
    baseline_rmse = np.sqrt(baseline_mse)
    baseline_mae = mean_absolute_error(Y, cosine_similarities)
    
    print(f"Word2Vec + Cosine Similarity Baseline Results:")
    print(f"MSE: {baseline_mse:.4f}")
    print(f"RMSE: {baseline_rmse:.4f}")
    print(f"MAE: {baseline_mae:.4f}")
    
    # Plot predicted vs true scores
    plt.figure(figsize=(10, 6))
    plt.scatter(Y, cosine_similarities, alpha=0.5)
    plt.plot([0, 5], [0, 5], 'r--')  # Diagonal line representing perfect prediction
    plt.xlabel('True Scores')
    plt.ylabel('Predicted Scores')
    plt.title('Word2Vec + Cosine Similarity: Predicted vs True Scores')
    plt.grid(True)
    plt.show()
    
    # Split the data for training a model
    X1_train, X1_test, X2_train, X2_test, Y_train, Y_test = train_test_split(
        X1, X2, Y, test_size=0.2, random_state=42
    )
    
    print("Dades preparades per entrenar el model:")
    print(f"Training set: {X1_train.shape[0]} exemples")
    print(f"Test set: {X1_test.shape[0]} exemples")
    
    # Les dades estan preparades per utilitzar-les amb el model definit anteriorment:
    # model_agg = build_model_aggregated(embedding_dim=300)
    # history = model_agg.fit([X1_train, X2_train], Y_train, epochs=50, batch_size=32, validation_split=0.2)
    
except Exception as e:
    print(f"Error: {e}")
    print("Per fer servir aquest codi, cal tenir un model Word2Vec pre-entrenat carregat.")
    print("Opcions alternatives:")
    print("1. Descarregar un model pre-entrenat en català (cc.ca.300.vec) de FastText")
    print("2. Usar Spark NLP o HuggingFace per carregar models pre-entrenats")

No s'ha carregat cap model. Utilitzant un model alternatiu per demostració...
Error: Cal tenir un model pre-entrenat
Per fer servir aquest codi, cal tenir un model Word2Vec pre-entrenat carregat.
Opcions alternatives:
1. Descarregar un model pre-entrenat en català (cc.ca.300.vec) de FastText
2. Usar Spark NLP o HuggingFace per carregar models pre-entrenats


## Entrenant el Model de Regressió amb Word2Vec Embeddings

In [None]:
import tensorflow as tf
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import EarlyStopping

try:
    # Comprovem que tenim les dades preparades
    if 'X1_train' not in locals() or 'X1_train' not in globals():
        print("Primer cal executar la cel·la anterior per preparar les dades.")
    else:
        # Construir el model de regressió
        print("Construint el model...")
        
        # Usar la funció que ja s'ha definit abans
        model_agg = build_model_aggregated(embedding_dim=300)
        
        # Mostrar el resum del model
        model_agg.summary()
        
        # Definir early stopping
        early_stopping = EarlyStopping(
            monitor='val_loss',
            patience=5,
            restore_best_weights=True,
            verbose=1
        )
        
        # Entrenar el model
        print("Entrenant el model...")
        history = model_agg.fit(
            [X1_train, X2_train],
            Y_train,
            epochs=50,
            batch_size=32,
            validation_split=0.2,
            callbacks=[early_stopping],
            verbose=1
        )
        
        # Avaluar el model
        print("Avaluant el model...")
        test_loss, test_mae, test_rmse = model_agg.evaluate([X1_test, X2_test], Y_test, verbose=0)
        
        print(f"\nModel Results on Test Data:")
        print(f"MSE: {test_loss:.4f}")
        print(f"RMSE: {test_rmse:.4f}")
        print(f"MAE: {test_mae:.4f}")
        
        # Visualitzar l'historial d'entrenament
        plt.figure(figsize=(12, 4))
        
        plt.subplot(1, 2, 1)
        plt.plot(history.history['loss'], label='Training Loss')
        plt.plot(history.history['val_loss'], label='Validation Loss')
        plt.title('Training and Validation Loss')
        plt.xlabel('Epochs')
        plt.ylabel('Loss')
        plt.legend()
        
        plt.subplot(1, 2, 2)
        plt.plot(history.history['mae'], label='Training MAE')
        plt.plot(history.history['val_mae'], label='Validation MAE')
        plt.title('Training and Validation MAE')
        plt.xlabel('Epochs')
        plt.ylabel('MAE')
        plt.legend()
        
        plt.tight_layout()
        plt.show()
        
        # Fer prediccions al conjunt de test
        Y_pred = model_agg.predict([X1_test, X2_test]).flatten()
        
        # Visualitzar prediccions vs valors reals
        plt.figure(figsize=(10, 6))
        plt.scatter(Y_test, Y_pred, alpha=0.5)
        plt.plot([0, 5], [0, 5], 'r--')  # Diagonal line representing perfect prediction
        plt.xlabel('True Scores')
        plt.ylabel('Predicted Scores')
        plt.title('Regression Model: Predicted vs True Scores')
        plt.grid(True)
        plt.show()
        
except Exception as e:
    print(f"Error: {e}")
    print("Assegureu-vos que heu executat les cel·les anteriors correctament.")