# PLH - 4th delivery
---
Made by **César Mejía Rota**.

## Imports and installs
---

In [None]:
%pip install datasets
%pip install tensorflow
%pip install numpy
%pip install spacy
%pip install scipy==1.10.1
%pip install gensim
%pip install sckit-learn
%pip install torch
%pip install matplotlib
%pip install plotly
%pip install pandas
%pip install nbformat>=4.2.0
%pip install transformers

In [None]:
from catalan_general_crawling.catalan_general_crawling import CatalanGeneralCrawling
import sys

from gensim.utils import simple_preprocess
from gensim.models import Word2Vec, TfidfModel
from gensim.corpora import Dictionary

import numpy as np
import pandas as pd
from scipy import spatial
from typing import Tuple, List

from gensim.models import fasttext
import tensorflow as tf

from scipy.stats import pearsonr

## Word2Vec models (Skip-gram)
---

### Split the data into datasets of different sizes (100MB, 500MB, 1GB and the complete dataset)

In [None]:
# 1. Load the dataset
# Instantiate the CatalanGeneralCrawling builder, let it download/prepare its
# data, and expose the 'train' split as `dataset` (iterated as dict-like
# records with a 'text' field by the splitting cell).
crawler = CatalanGeneralCrawling()
crawler.download_and_prepare()
dataset = crawler.as_dataset(split = 'train')


In [None]:
# Byte thresholds (binary units: 1MB = 1024 ** 2 bytes) used to cut the corpus.
size100MB = 100 * 1024 ** 2
size500MB = 500 * 1024 ** 2
size1GB = 1024 ** 3

In [None]:
# Build cumulative subsets of the corpus: the first ~100MB, ~500MB and ~1GB of
# text, plus the complete dataset. Each subset is a list of documents with
# their internal newlines replaced by spaces.
size = 0
current_lines = []

cat_crawler_100mb = []
cat_crawler_500mb = []
cat_crawler_1gb = []
cat_crawler_full = []

# Split the dataset
for elem in dataset:
    text = elem['text']
    # Count the UTF-8 bytes of the document plus one newline. sys.getsizeof
    # reports the in-memory size of the Python str object (object header and
    # internal character width included), not the amount of text.
    size += len(text.encode('utf-8')) + 1
    current_lines.append(text.replace('\n', ' '))

    # Store a *copy* at each threshold. Assigning `current_lines` itself would
    # alias the same list, so every "subset" would keep growing until it
    # contained the full corpus.
    if size >= size100MB and not cat_crawler_100mb:
        cat_crawler_100mb = list(current_lines)
        print("100MB reached")
        print("Size: ", size/1024/1024, "MB")
    if size >= size500MB and not cat_crawler_500mb:
        cat_crawler_500mb = list(current_lines)
        print("500MB reached")
        print("Size: ", size/1024/1024, "MB")
    if size >= size1GB and not cat_crawler_1gb:
        cat_crawler_1gb = list(current_lines)
        print("1GB reached")
        print("Size: ", size/1024/1024, "MB")


# The accumulator itself is the full dataset; no copy is needed here.
cat_crawler_full = current_lines
print("Full dataset reached")
print("Full dataset size: ", size/1024/1024, "MB")



Now we preprocess the text

In [None]:
#stopwords from https://github.com/Alir3z4/stop-words/blob/master/catalan.txt
# Read the stopword file (one entry per line) into a set for O(1) membership tests.
stopwords_cat = set()
with open('catalan_stopwords.txt', encoding='utf-8') as stopword_file:
    stopwords_cat.update(entry.strip() for entry in stopword_file)

In [None]:
# Define preprocessing
def preprocess(sentence: str) -> List[str]:
    '''
    Tokenize a sentence with gensim's simple_preprocess and drop every token
    found in the module-level ``stopwords_cat`` set.
    :param sentence: raw sentence to tokenize
    :return: list of remaining tokens, in their original order
    '''
    return [tok for tok in simple_preprocess(sentence) if tok not in stopwords_cat]

Now we train the model

In [None]:
def train_word2vec_skipgram(dataset, vector_size=300, workers=4, sg=1):
    '''
    Tokenize raw sentences with ``preprocess`` and fit a gensim Word2Vec model.
    The default ``sg=1`` selects Skip-gram; the progress banner always reads
    "Skipgram" regardless of the value passed.
    :param dataset: iterable of raw sentences to train on
    :param vector_size: dimensionality of the word vectors
    :param workers: number of worker threads
    :param sg: architecture selector (0 for CBOW, 1 for Skip-gram)
    :return: the trained Word2Vec model
    '''
    print("Training Word2Vec model with Skipgram")
    print('Reading and preprocessing dataset...')
    tokenized_sentences = list(map(preprocess, dataset))
    print('Dataset preprocessed')
    print('Training Word2Vec model...')
    return Word2Vec(
        sentences=tokenized_sentences,
        vector_size=vector_size,
        workers=workers,
        sg=sg,
    )

def train_word2vec_CBOW(dataset, vector_size=300, workers=4, sg=0):
    '''
    Tokenize raw sentences with ``preprocess`` and fit a gensim Word2Vec model.
    The default ``sg=0`` selects CBOW; the progress banner always reads "CBOW"
    regardless of the value passed.
    :param dataset: iterable of raw sentences to train on
    :param vector_size: dimensionality of the word vectors
    :param workers: number of worker threads
    :param sg: architecture selector (0 for CBOW, 1 for Skip-gram)
    :return: the trained Word2Vec model
    '''
    print("Training Word2Vec model with CBOW")
    print('Reading and preprocessing dataset...')
    tokenized_sentences = list(map(preprocess, dataset))
    print('Dataset preprocessed')
    print('Training Word2Vec model...')
    return Word2Vec(
        sentences=tokenized_sentences,
        vector_size=vector_size,
        workers=workers,
        sg=sg,
    )

In [None]:
# Tokenize (and stopword-filter) every document of the 100MB subset.
preprocessed_cat_crawler_100mb = list(map(preprocess, cat_crawler_100mb))

In [None]:
# Skip-gram (sg=1) Word2Vec with 100-dimensional vectors on the tokenized 100MB subset.
model = Word2Vec(sentences=preprocessed_cat_crawler_100mb, vector_size=100, workers=4, sg=1)

In [None]:
# Skip-gram (sg=1) Word2Vec with 300-dimensional vectors on the tokenized 100MB subset.
# NOTE(review): same data and hyper-parameters as train_word2vec_skipgram's
# defaults; the variable name differs from model_skipgram_100MB only by case.
model_skipgram_100mb = Word2Vec(sentences=preprocessed_cat_crawler_100mb, vector_size=300, workers=4, sg=1)

In [None]:
# Train a Skip-gram model (default hyper-parameters) on the 100MB subset.
model_skipgram_100MB = train_word2vec_skipgram(cat_crawler_100mb)

In [None]:
# Train a Skip-gram model (default hyper-parameters) on the 500MB subset.
model_skipgram_500MB = train_word2vec_skipgram(cat_crawler_500mb)

In [None]:
# Train a Skip-gram model (default hyper-parameters) on the 1GB subset.
model_skipgram_1GB = train_word2vec_skipgram(cat_crawler_1gb)

In [None]:
# Train a Skip-gram model (default hyper-parameters) on the complete dataset.
model_skipgram_full = train_word2vec_skipgram(cat_crawler_full)

In [None]:
def save_model(model, path):
    """
    Persist a model by delegating to its own ``save`` method.
    :param model: object exposing ``save(path)`` (e.g. a trained Word2Vec model)
    :param path: destination passed unchanged to ``model.save``
    """
    model.save(path)

In [None]:
# Save the 100MB Skip-gram model inside the models_word2vec_cat folder.
# NOTE(review): presumably the folder must already exist — confirm before running.
save_model(model_skipgram_100MB, 'models_word2vec_cat/model_skipgram_100MB')

## Text Similarity
---

In [None]:
# Path of the pre-trained embedding file passed to fasttext.load_facebook_vectors.
# NOTE(review): absolute, machine-specific Windows path — adjust when running elsewhere.
WORD_EMBEDDING_FILE = 'D:/GitHub/PLH4---Embeddings/cc.ca.300.bin.gz'

In [None]:

# Load the pre-trained vectors; `wv_model` is the word -> vector lookup used by
# the 'W2V + MEAN' and 'W2V + TFIDF' embedding modes.
wv_model = fasttext.load_facebook_vectors(WORD_EMBEDDING_FILE)

In [None]:
# Read the three headerless TSV splits into DataFrames keyed by split name.
# Columns: 'elim' (dropped later), 'frase1', 'frase2' and 'label'.
dataset = {}

for split_name, tsv_path in (('train', 'train.tsv'), ('test', 'test.tsv'), ('validation', 'dev.tsv')):
    dataset[split_name] = pd.read_csv(
        tsv_path,
        sep='\t',
        header=None,
        names=['elim', 'frase1', 'frase2', 'label'],
    )

In [None]:
dataset['train']

In [None]:
# Drop the first column ('elim') from every split.
for split in ['train', 'test', 'validation']:
    dataset[split] = dataset[split].drop(columns=['elim'])

In [None]:
dataset['train']

In [None]:
# Turn each split into a list of (sentence 1, sentence 2, label) tuples.
input_pairs = [
    (record['frase1'], record['frase2'], record['label'])
    for _, record in dataset['train'].iterrows()
]

input_pairs_val = [
    (record['frase1'], record['frase2'], record['label'])
    for _, record in dataset['validation'].iterrows()
]

input_pairs_test = [
    (record['frase1'], record['frase2'], record['label'])
    for _, record in dataset['test'].iterrows()
]

In [None]:
all_input_pairs = input_pairs + input_pairs_val + input_pairs_test
# Tokenize the sentences and build the dictionary. The vocabulary covers the
# train, validation and test pairs together, and uses simple_preprocess
# directly (no stopword removal).
sentences_1_preproc = [simple_preprocess(sentence_1) for sentence_1, _, _ in all_input_pairs]
sentences_2_preproc = [simple_preprocess(sentence_2) for _, sentence_2, _ in all_input_pairs]
sentence_pairs = list(zip(sentences_1_preproc, sentences_2_preproc))
# Flattened version (all first sentences followed by all second sentences)
# used to build the dictionary and the TF-IDF corpus.
sentences_pairs_flattened = sentences_1_preproc + sentences_2_preproc
diccionario = Dictionary(sentences_pairs_flattened)



In [None]:
sentences_pairs_flattened

In [None]:
# Convert every tokenized sentence to bag-of-words and fit the TF-IDF model on it.
corpus = [diccionario.doc2bow(sent) for sent in sentences_pairs_flattened]
modelo_tfidf = TfidfModel(corpus)

In [None]:
# Convert the DataFrames to plain lists of rows so they can be fed to map_pairs.
train = dataset['train'].values.tolist()
test = dataset['test'].values.tolist()
validation = dataset['validation'].values.tolist()

In [None]:
train

## Word Embedding models
---

In [None]:
#Create a flag in the provided function to choose between the different models of embeddings
def map_pairs(
        sentence_pairs: List[Tuple[str, str, float]],
        dictionary: "Dictionary" = None,
        model: str = None,
) -> List[Tuple[Tuple[np.ndarray, np.ndarray], float]]:
    '''
    Map (sentence 1, sentence 2, similarity) triples to pairs of sentence vectors.

    :param sentence_pairs: triples of raw sentences and their similarity score
    :param dictionary: Gensim dictionary, used by the 'OH' and 'W2V + TFIDF' modes
    :param model: name of the embedding strategy. One of 'W2V + TFIDF',
        'W2V + MEAN', 'OH', 'SPACY', 'ROBERTA' or 'ROBERTA_MEAN'
        ('ROBERTA_MEAN' relies on a map_roberta_mean helper defined elsewhere).
    :return: list of ((vector 1, vector 2), similarity) tuples, in input order
    :raises ValueError: if ``model`` is not a supported name and there is at
        least one pair to map
    '''
    def _zero_vector() -> np.ndarray:
        # Fallback for sentences left without tokens after preprocessing:
        # np.mean([]) yields NaN and np.average([], weights=[]) raises.
        return np.zeros(wv_model.vector_size, dtype=np.float32)

    def _embed_w2v_mean(sentence: str) -> np.ndarray:
        # Plain average of the word embeddings.
        tokens = preprocess(sentence)
        vectors = [wv_model[word] for word in tokens if word in wv_model]
        return np.mean(vectors, axis=0) if vectors else _zero_vector()

    def _embed_w2v_tfidf(sentence: str) -> np.ndarray:
        # TF-IDF weighted average of the word embeddings.
        vectors, weights = map_tf_idf(preprocess(sentence), dictionary=dictionary)
        return np.average(vectors, weights=weights, axis=0) if vectors else _zero_vector()

    # Lambdas defer the lookup of the map_* helpers to call time, so a mode can
    # be used before the helpers of the other modes have been defined.
    embedders = {
        'W2V + TFIDF': _embed_w2v_tfidf,
        'W2V + MEAN': _embed_w2v_mean,
        'OH': lambda sentence: map_one_hot(sentence, dictionary),
        'SPACY': lambda sentence: map_spacy(sentence),
        'ROBERTA': lambda sentence: map_roberta(sentence),
        'ROBERTA_MEAN': lambda sentence: map_roberta_mean(sentence),
    }
    embed = embedders.get(model)

    pares_vectores = []
    for sentence_1, sentence_2, similitud in sentence_pairs:
        if embed is None:
            raise ValueError('Modelo no soportado')
        pares_vectores.append(((embed(sentence_1), embed(sentence_2)), similitud))
    return pares_vectores

In [None]:
def build_and_compile_model(embedding_size: int = 300, learning_rate: float = 1e-3) -> tf.keras.Model:
    '''
    Build and compile a Keras model for the sentence-similarity task.

    Both inputs go through one shared Dense projection (identity-initialised
    kernel, zero bias). The output is 2.5 * (1 + cosine similarity) of the two
    projections, which lies in [0, 5]. The model is compiled with mean squared
    error and the Adamax optimizer.
    :param embedding_size: size of the input embedding vectors
    :param learning_rate: learning rate passed to Adamax
    :return: the compiled Keras model
    '''
    def cosine_distance(x):
        # Despite its name this returns a rescaled cosine *similarity*.
        left, right = x
        left_unit = tf.keras.backend.l2_normalize(left, axis=1)
        right_unit = tf.keras.backend.l2_normalize(right, axis=1)
        return 2.5 * (1.0 + tf.reduce_sum(left_unit * right_unit, axis=1))

    # One input per sentence of the pair.
    sentence_a = tf.keras.Input(shape=(embedding_size,))
    sentence_b = tf.keras.Input(shape=(embedding_size,))

    # A single layer instance, so both sentences share the same weights.
    shared_projection = tf.keras.layers.Dense(
        embedding_size,
        kernel_initializer=tf.keras.initializers.Identity(),
        bias_initializer=tf.keras.initializers.Zeros(),
    )
    projected_a = shared_projection(sentence_a)
    projected_b = shared_projection(sentence_b)

    similarity = tf.keras.layers.Lambda(cosine_distance)([projected_a, projected_b])
    siamese = tf.keras.Model(inputs=[sentence_a, sentence_b], outputs=similarity)

    siamese.compile(
        loss='mean_squared_error',
        optimizer=tf.keras.optimizers.Adamax(learning_rate),
    )
    return siamese

In [None]:
# Training constants shared by every model in this notebook:
# mini-batch size for the tf.data pipelines and number of epochs for fit().
batch_size: int = 64
num_epochs: int = 64

In [None]:
def pair_list_to_x_y(pair_list: List[Tuple[Tuple[np.ndarray, np.ndarray], int]]) -> Tuple[Tuple[np.ndarray, np.ndarray], np.ndarray]:
    # Split ((vector 1, vector 2), label) tuples into two stacked input arrays
    # and one float32 label array.
    vector_pairs, labels = zip(*pair_list)
    first_vectors, second_vectors = zip(*vector_pairs)
    inputs = (np.array(first_vectors), np.array(second_vectors))
    targets = np.array(labels, dtype=np.float32)
    return inputs, targets

In [None]:
# Baseline
def compute_pearson_baseline(x_, y_):
    # Untrained baseline: cosine similarity of each vector pair, correlated
    # (Pearson) with the gold scores.
    cosine_similarities = [
        1.0 - spatial.distance.cosine(first, second)
        for first, second in zip(*x_)
    ]
    r_value, _p_value = pearsonr(cosine_similarities, y_.flatten())
    return r_value

In [None]:
def compute_pearson(x_, y_,model):
    # Pearson correlation between the model's predictions for x_ and the
    # gold scores y_ (both flattened to 1-D first).
    predicted = model.predict(x_).flatten()
    r_value, _p_value = pearsonr(predicted, y_.flatten())
    return r_value

### 1. One-Hot
---

In [None]:
def map_one_hot(sentence: str, dictionary: Dictionary) -> np.ndarray:
    '''
    Encode a sentence as a binary vector over the dictionary vocabulary.
    The sentence is tokenized with ``preprocess``; the position of every token
    present in the dictionary is set to 1 (tokens not in it are ignored), so
    the result marks word presence rather than a single "hot" index.
    :param sentence: raw sentence to encode
    :param dictionary: Gensim dictionary providing the token -> index mapping
    :return: vector of length ``len(dictionary)`` with ones at present tokens
    '''
    token_ids = dictionary.token2id
    encoding = np.zeros(len(dictionary))
    for token in preprocess(sentence):
        position = token_ids.get(token)
        if position is not None:
            encoding[position] = 1
    return encoding

In [None]:
# One-hot vectors for the training pairs (used for the shape preview below).
mapped_OH = map_pairs(input_pairs, model='OH', dictionary=diccionario, )

# One-hot vectors for each split, built from the list-of-rows versions.
mapped_train_OH = map_pairs(train,  model='OH', dictionary=diccionario, )
mapped_val_OH = map_pairs(validation, model='OH', dictionary=diccionario, )
mapped_test_OH = map_pairs(test, model='OH', dictionary=diccionario, )

In [None]:
# Preview the vector shapes and similarity score of the first five pairs.
for vectors, similitud in mapped_OH[:5]:
    print(f"Pares de vectores: {vectors[0].shape}, {vectors[1].shape}")
    print(f"Puntuación de similitud: {similitud}")

In [None]:
# Build the input/label arrays for the training and validation splits.
x_train_OH, y_train_OH = pair_list_to_x_y(mapped_train_OH)
x_val_OH, y_val_OH = pair_list_to_x_y(mapped_val_OH)

In [None]:
# Prepare the batched training and validation tf.data pipelines.
train_dataset_OH = tf.data.Dataset.from_tensor_slices((x_train_OH, y_train_OH))
# x_train_OH is a (first vectors, second vectors) tuple, so len(x_train_OH) is
# always 2. Size the shuffle buffer by the number of examples instead, so the
# whole training set is shuffled.
train_dataset_OH = train_dataset_OH.shuffle(buffer_size=len(y_train_OH)).batch(batch_size)

val_dataset_OH = tf.data.Dataset.from_tensor_slices((x_val_OH, y_val_OH))
val_dataset_OH = val_dataset_OH.batch(batch_size)

In [None]:
# Show shapes
x_train_OH[0].shape, x_train_OH[1].shape, y_train_OH.shape

In [None]:
# Build and compile the model; the input width is the vocabulary size.
model_OH = build_and_compile_model(embedding_size=len(diccionario))

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model_OH.summary()

In [None]:
# Train the one-hot model for num_epochs epochs, validating after each one.
model_OH.fit(train_dataset_OH, epochs=num_epochs, validation_data=val_dataset_OH)

In [None]:
# Build the input/label arrays for the test split.
x_test_OH, y_test_OH = pair_list_to_x_y(mapped_test_OH)

In [104]:
# Pearson correlation of the untrained cosine-similarity baseline on each split.
print(f"Correlación de Pearson (baseline-train): {compute_pearson_baseline(x_train_OH, y_train_OH)}")
print(f"Correlación de Pearson (baseline-validation): {compute_pearson_baseline(x_val_OH, y_val_OH)}")
print(f"Correlación de Pearson (baseline-test): {compute_pearson_baseline(x_test_OH, y_test_OH)}")
print('\n')
# Pearson correlation of the trained model's predictions on each split.
print(f"Correlación de Pearson (train): {compute_pearson(x_train_OH, y_train_OH, model_OH)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val_OH, y_val_OH, model_OH)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test_OH, y_test_OH, model_OH)}")

Correlación de Pearson (baseline-train): 0.5481180236438351
Correlación de Pearson (baseline-validation): 0.5449709382790233
Correlación de Pearson (baseline-test): 0.6372650860995633


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 96ms/step
Correlación de Pearson (train): 0.9572504652023259
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 100ms/step
Correlación de Pearson (validation): 0.40475411179396625
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 93ms/step
Correlación de Pearson (test): 0.52958566231992


### 2. Word2Vec/GloVe (Pre-trained)
---

#### 2.1. Word2Vec + Mean

In [105]:
# Mean-of-word-embeddings vectors for the training pairs (used for the shape preview below).
mapped_w2v_mean = map_pairs(input_pairs, model='W2V + MEAN', dictionary=diccionario, )

# Mean-of-word-embeddings vectors for each split.
mapped_train_w2v_mean = map_pairs(train,  model='W2V + MEAN', dictionary=diccionario, )
mapped_val_w2v_mean = map_pairs(validation, model='W2V + MEAN', dictionary=diccionario, )
mapped_test_w2v_mean = map_pairs(test, model='W2V + MEAN', dictionary=diccionario, )

In [106]:
# Preview the vector shapes and similarity score of the first five pairs.
for vectors, similitud in mapped_w2v_mean[:5]:
    print(f"Pares de vectores: {vectors[0].shape}, {vectors[1].shape}")
    print(f"Puntuación de similitud: {similitud}")

Pares de vectores: (300,), (300,)
Puntuación de similitud: 3.5
Pares de vectores: (300,), (300,)
Puntuación de similitud: 1.25
Pares de vectores: (300,), (300,)
Puntuación de similitud: 3.67
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.25
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.0


In [107]:
# Build the input/label arrays for the training and validation splits.
x_train_w2v_mean, y_train_w2v_mean = pair_list_to_x_y(mapped_train_w2v_mean)
x_val_w2v_mean, y_val_w2v_mean = pair_list_to_x_y(mapped_val_w2v_mean)

In [108]:
# Prepare the batched training and validation tf.data pipelines.
train_dataset_w2v_mean = tf.data.Dataset.from_tensor_slices((x_train_w2v_mean, y_train_w2v_mean))
# x_train_w2v_mean is a (first vectors, second vectors) tuple, so its len() is
# always 2. Size the shuffle buffer by the number of examples instead, so the
# whole training set is shuffled.
train_dataset_w2v_mean = train_dataset_w2v_mean.shuffle(buffer_size=len(y_train_w2v_mean)).batch(batch_size)

val_dataset_w2v_mean = tf.data.Dataset.from_tensor_slices((x_val_w2v_mean, y_val_w2v_mean))
val_dataset_w2v_mean = val_dataset_w2v_mean.batch(batch_size)

In [109]:
# Show shapes
x_train_w2v_mean[0].shape, x_train_w2v_mean[1].shape, y_train_w2v_mean.shape

((2073, 300), (2073, 300), (2073,))

In [110]:
# Build and compile the model with the default 300-dimensional inputs.
model_w2v_mean = build_and_compile_model()

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model_w2v_mean.summary()

None


In [111]:
# Train the mean-embedding model for num_epochs epochs, validating after each one.
model_w2v_mean.fit(train_dataset_w2v_mean, epochs=num_epochs, validation_data=val_dataset_w2v_mean)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4.3061 - val_loss: 3.4682
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.4875 - val_loss: 3.3030
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.2870 - val_loss: 3.1936
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.1391 - val_loss: 3.1051
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.0081 - val_loss: 3.0345
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.8986 - val_loss: 2.9774
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.7965 - val_loss: 2.9309
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.7065 - val_loss: 2.8926
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x17a5e7865d0>

In [112]:
# Build the input/label arrays for the test split.
x_test_w2v_mean, y_test_w2v_mean = pair_list_to_x_y(mapped_test_w2v_mean)

In [113]:
# Pearson correlation of the untrained cosine-similarity baseline on each split.
print(f"Correlación de Pearson (baseline-train): {compute_pearson_baseline(x_train_w2v_mean, y_train_w2v_mean)}")
print(f"Correlación de Pearson (baseline-validation): {compute_pearson_baseline(x_val_w2v_mean, y_val_w2v_mean)}")
print(f"Correlación de Pearson (baseline-test): {compute_pearson_baseline(x_test_w2v_mean, y_test_w2v_mean)}")
print('\n')
# Pearson correlation of the trained model's predictions on each split.
print(f"Correlación de Pearson (train): {compute_pearson(x_train_w2v_mean, y_train_w2v_mean, model_w2v_mean)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val_w2v_mean, y_val_w2v_mean, model_w2v_mean)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test_w2v_mean, y_test_w2v_mean, model_w2v_mean)}")

Correlación de Pearson (baseline-train): 0.33803124102011484
Correlación de Pearson (baseline-validation): 0.30109730185645944
Correlación de Pearson (baseline-test): 0.40918746240001136


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Correlación de Pearson (train): 0.7455740855450073
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Correlación de Pearson (validation): 0.4462061812303049
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Correlación de Pearson (test): 0.5003590748540991


#### 2.2. Word2Vec + TF-IDF (Mean ponderada)

In [114]:
#models de Word2Vec preentrenats (mean ponderada)
def map_tf_idf(sentence_preproc: List[str], dictionary: Dictionary) -> Tuple[List[np.ndarray], List[float]]:
    '''
    Look up the word vector and TF-IDF weight of each term in a tokenized sentence.
    The weights come from the module-level ``modelo_tfidf`` and the vectors from
    the module-level ``wv_model``; terms for which ``word in wv_model`` is false
    are skipped.
    :param sentence_preproc: the tokenized sentence
    :param dictionary: Gensim dictionary used to build the bag-of-words
    :return: (vectors, weights), two parallel lists
    '''
    weighted_terms = modelo_tfidf[dictionary.doc2bow(sentence_preproc)]
    vectors, weights = [], []
    for term_id, term_weight in weighted_terms:
        term = dictionary.get(term_id)
        if term not in wv_model:
            continue
        vectors.append(wv_model[term])
        weights.append(term_weight)
    return vectors, weights

In [115]:
# TF-IDF weighted vectors for the training pairs (used for the shape preview
# below). This must use the 'W2V + TFIDF' mode like the per-split mappings;
# 'W2V + MEAN' would preview the unweighted vectors of the previous section.
mapped_w2v_tfidf = map_pairs(input_pairs, model='W2V + TFIDF', dictionary=diccionario, )

# TF-IDF weighted vectors for each split.
mapped_train_w2v_tfidf = map_pairs(train,  model='W2V + TFIDF', dictionary=diccionario, )
mapped_val_w2v_tfidf = map_pairs(validation, model='W2V + TFIDF', dictionary=diccionario, )
mapped_test_w2v_tfidf = map_pairs(test, model='W2V + TFIDF', dictionary=diccionario, )

In [116]:
# Preview the vector shapes and similarity score of the first five pairs.
for vectors, similitud in mapped_w2v_tfidf[:5]:
    print(f"Pares de vectores: {vectors[0].shape}, {vectors[1].shape}")
    print(f"Puntuación de similitud: {similitud}")

Pares de vectores: (300,), (300,)
Puntuación de similitud: 3.5
Pares de vectores: (300,), (300,)
Puntuación de similitud: 1.25
Pares de vectores: (300,), (300,)
Puntuación de similitud: 3.67
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.25
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.0


In [117]:
# Build the input/label arrays for the training and validation splits.
x_train_w2v_tfidf, y_train_w2v_tfidf = pair_list_to_x_y(mapped_train_w2v_tfidf)
x_val_w2v_tfidf, y_val_w2v_tfidf = pair_list_to_x_y(mapped_val_w2v_tfidf)

In [118]:
# Prepare the batched training and validation tf.data pipelines.
train_dataset_w2v_tfidf = tf.data.Dataset.from_tensor_slices((x_train_w2v_tfidf, y_train_w2v_tfidf))
# x_train_w2v_tfidf is a (first vectors, second vectors) tuple, so its len() is
# always 2. Size the shuffle buffer by the number of examples instead, so the
# whole training set is shuffled.
train_dataset_w2v_tfidf = train_dataset_w2v_tfidf.shuffle(buffer_size=len(y_train_w2v_tfidf)).batch(batch_size)

val_dataset_w2v_tfidf = tf.data.Dataset.from_tensor_slices((x_val_w2v_tfidf, y_val_w2v_tfidf))
val_dataset_w2v_tfidf = val_dataset_w2v_tfidf.batch(batch_size)

In [119]:
# Show shapes
x_train_w2v_tfidf[0].shape, x_train_w2v_tfidf[1].shape, y_train_w2v_tfidf.shape

((2073, 300), (2073, 300), (2073,))

In [120]:
# Build and compile the model with the default 300-dimensional inputs.
model_w2v_tfidf = build_and_compile_model()

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model_w2v_tfidf.summary()

None


In [121]:
# Train the TF-IDF weighted model for num_epochs epochs, validating after each one.
model_w2v_tfidf.fit(train_dataset_w2v_tfidf, epochs=num_epochs, validation_data=val_dataset_w2v_tfidf)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4.2174 - val_loss: 3.3023
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.3628 - val_loss: 3.1292
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.1407 - val_loss: 3.0165
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.9691 - val_loss: 2.9341
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.8278 - val_loss: 2.8712
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.7112 - val_loss: 2.8214
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.6058 - val_loss: 2.7809
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.5115 - val_loss: 2.7471
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x17aa0c365d0>

In [122]:
# Build the input/label arrays for the test split.
x_test_w2v_tfidf, y_test_w2v_tfidf = pair_list_to_x_y(mapped_test_w2v_tfidf)

In [123]:
# Pearson correlation of the untrained cosine-similarity baseline on each split.
print(f"Correlación de Pearson (baseline-train): {compute_pearson_baseline(x_train_w2v_tfidf, y_train_w2v_tfidf)}")
print(f"Correlación de Pearson (baseline-validation): {compute_pearson_baseline(x_val_w2v_tfidf, y_val_w2v_tfidf)}")
print(f"Correlación de Pearson (baseline-test): {compute_pearson_baseline(x_test_w2v_tfidf, y_test_w2v_tfidf)}")
print('\n')
# Pearson correlation of the trained model's predictions on each split.
print(f"Correlación de Pearson (train): {compute_pearson(x_train_w2v_tfidf, y_train_w2v_tfidf, model_w2v_tfidf)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val_w2v_tfidf, y_val_w2v_tfidf, model_w2v_tfidf)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test_w2v_tfidf, y_test_w2v_tfidf, model_w2v_tfidf)}")

Correlación de Pearson (baseline-train): 0.3805684081019019
Correlación de Pearson (baseline-validation): 0.4066765092543781
Correlación de Pearson (baseline-test): 0.46283983418366403


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Correlación de Pearson (train): 0.7659336171473994
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Correlación de Pearson (validation): 0.478756798150101
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Correlación de Pearson (test): 0.5158266890907587


### 3. SpaCy
---

In [124]:
!python -m spacy download ca_core_news_md


Collecting ca-core-news-md==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/ca_core_news_md-3.7.0/ca_core_news_md-3.7.0-py3-none-any.whl (49.2 MB)
     ---------------------------------------- 0.0/49.2 MB ? eta -:--:--
     ---------------------------------------- 0.4/49.2 MB 11.2 MB/s eta 0:00:05
     - -------------------------------------- 1.9/49.2 MB 30.7 MB/s eta 0:00:02
     --- ------------------------------------ 4.8/49.2 MB 38.0 MB/s eta 0:00:02
     ------ --------------------------------- 8.4/49.2 MB 48.9 MB/s eta 0:00:01
     --------- ----------------------------- 12.4/49.2 MB 81.8 MB/s eta 0:00:01
     ------------- ------------------------- 16.7/49.2 MB 93.9 MB/s eta 0:00:01
     ---------------- ---------------------- 20.6/49.2 MB 93.9 MB/s eta 0:00:01
     ------------------- ------------------- 24.7/49.2 MB 93.9 MB/s eta 0:00:01
     ---------------------- ---------------- 28.9/49.2 MB 81.8 MB/s eta 0:00:01
     ------------------------

In [125]:
#get the embeddings from the spacy model
import spacy
nlp = spacy.load('ca_core_news_md')

def map_spacy(sentence: str, nlp=nlp) -> np.ndarray:
    '''
    Embed a sentence as the ``vector`` attribute of its spaCy Doc.
    :param sentence: raw sentence to embed
    :param nlp: spaCy pipeline to run; defaults to the module-level ``nlp``
        as it was bound when this function was defined
    :return: the document vector
    '''
    return nlp(sentence).vector

In [126]:
# spaCy document vectors for the training pairs (used for the shape preview below).
mapped_spacy = map_pairs(input_pairs, model='SPACY', dictionary=diccionario, )

# spaCy document vectors for each split.
mapped_train_spacy = map_pairs(train,  model='SPACY', dictionary=diccionario, )
mapped_val_spacy = map_pairs(validation, model='SPACY', dictionary=diccionario, )
mapped_test_spacy = map_pairs(test, model='SPACY', dictionary=diccionario, )

In [127]:
# Preview the vector shapes and similarity score of the first five pairs.
for vectors, similitud in mapped_spacy[:5]:
    print(f"Pares de vectores: {vectors[0].shape}, {vectors[1].shape}")
    print(f"Puntuación de similitud: {similitud}")

Pares de vectores: (300,), (300,)
Puntuación de similitud: 3.5
Pares de vectores: (300,), (300,)
Puntuación de similitud: 1.25
Pares de vectores: (300,), (300,)
Puntuación de similitud: 3.67
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.25
Pares de vectores: (300,), (300,)
Puntuación de similitud: 2.0


In [128]:
# Build the input/label arrays for the training and validation splits.
x_train_spacy, y_train_spacy = pair_list_to_x_y(mapped_train_spacy)
x_val_spacy, y_val_spacy = pair_list_to_x_y(mapped_val_spacy)

In [129]:
# Prepare the batched training and validation tf.data pipelines.
train_dataset_spacy = tf.data.Dataset.from_tensor_slices((x_train_spacy, y_train_spacy))
# x_train_spacy is a (first vectors, second vectors) tuple, so its len() is
# always 2. Size the shuffle buffer by the number of examples instead, so the
# whole training set is shuffled.
train_dataset_spacy = train_dataset_spacy.shuffle(buffer_size=len(y_train_spacy)).batch(batch_size)

val_dataset_spacy = tf.data.Dataset.from_tensor_slices((x_val_spacy, y_val_spacy))
val_dataset_spacy = val_dataset_spacy.batch(batch_size)

In [130]:
# Show shapes
x_train_spacy[0].shape, x_train_spacy[1].shape, y_train_spacy.shape

((2073, 300), (2073, 300), (2073,))

In [131]:
# Build and compile the model with the default 300-dimensional inputs.
model_spacy = build_and_compile_model()

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line.
model_spacy.summary()

None


In [132]:
# Train the spaCy-vector model for num_epochs epochs, validating after each one.
model_spacy.fit(train_dataset_spacy, epochs=num_epochs, validation_data=val_dataset_spacy)

Epoch 1/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 4.3025 - val_loss: 3.3567
Epoch 2/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.3314 - val_loss: 3.1220
Epoch 3/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 3.0829 - val_loss: 2.9959
Epoch 4/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.9290 - val_loss: 2.9070
Epoch 5/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.8050 - val_loss: 2.8347
Epoch 6/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.7012 - val_loss: 2.7767
Epoch 7/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - loss: 2.6141 - val_loss: 2.7302
Epoch 8/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 2.5401 - val_loss: 2.6925
Epoch 9/64
[1m33/33[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x178121765d0>

In [133]:
# Build the input/label arrays for the test split.
x_test_spacy, y_test_spacy = pair_list_to_x_y(mapped_test_spacy)

In [134]:
# Pearson correlation of the untrained cosine-similarity baseline on each split.
print(f"Correlación de Pearson (baseline-train): {compute_pearson_baseline(x_train_spacy, y_train_spacy)}")
print(f"Correlación de Pearson (baseline-validation): {compute_pearson_baseline(x_val_spacy, y_val_spacy)}")
print(f"Correlación de Pearson (baseline-test): {compute_pearson_baseline(x_test_spacy, y_test_spacy)}")
print('\n')
# Pearson correlation of the trained model's predictions on each split.
print(f"Correlación de Pearson (train): {compute_pearson(x_train_spacy, y_train_spacy, model_spacy)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val_spacy, y_val_spacy, model_spacy)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test_spacy, y_test_spacy, model_spacy)}")

Correlación de Pearson (baseline-train): 0.2822341065076769
Correlación de Pearson (baseline-validation): 0.22047349313707085
Correlación de Pearson (baseline-test): 0.34550433092015365


[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Correlación de Pearson (train): 0.6517486539087302
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Correlación de Pearson (validation): 0.30926600164979273
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
Correlación de Pearson (test): 0.44987599023720104


### 4. RoBERTa (CLS and Mean)
---

In [146]:
!python -m spacy download ca_core_news_trf

Collecting ca-core-news-trf==3.7.2
  Downloading https://github.com/explosion/spacy-models/releases/download/ca_core_news_trf-3.7.2/ca_core_news_trf-3.7.2-py3-none-any.whl (457.1 MB)
     ---------------------------------------- 0.0/457.1 MB ? eta -:--:--
     ---------------------------------------- 0.2/457.1 MB 6.3 MB/s eta 0:01:13
     ---------------------------------------- 0.6/457.1 MB 7.6 MB/s eta 0:01:00
     ---------------------------------------- 1.1/457.1 MB 8.7 MB/s eta 0:00:53
     ---------------------------------------- 1.6/457.1 MB 9.5 MB/s eta 0:00:48
     --------------------------------------- 2.3/457.1 MB 10.6 MB/s eta 0:00:43
     --------------------------------------- 3.2/457.1 MB 11.9 MB/s eta 0:00:39
     --------------------------------------- 4.2/457.1 MB 13.5 MB/s eta 0:00:34
     --------------------------------------- 5.5/457.1 MB 15.4 MB/s eta 0:00:30
      -------------------------------------- 7.1/457.1 MB 17.5 MB/s eta 0:00:26
      ------------------

#### 4.1. RoBERTa CLS

In [147]:
nlp_roberta = spacy.load('ca_core_news_trf')

In [165]:
def map_roberta(sentence: str, nlp=nlp_roberta) -> np.ndarray:
    '''
    Map a sentence to a single vector using a spaCy transformer pipeline.

    :param sentence: the sentence to map
    :param nlp: the spaCy pipeline used to process the sentence
    :return: the last row of the pipeline's last hidden layer state
    '''
    # NOTE(review): index -1 selects the final row of the hidden states;
    # confirm that this is the piece intended for the "CLS" variant.
    hidden_states = nlp(sentence)._.trf_data.last_hidden_layer_state.data
    return hidden_states[-1]

In [166]:
# Map input_pairs and each split with the 'ROBERTA' mapping.
# NOTE(review): the recorded preview output shows empty (0,)-shaped vectors,
# so confirm that map_pairs actually handles model='ROBERTA'.
roberta_kwargs = dict(model='ROBERTA', dictionary=diccionario)

mapped_roberta = map_pairs(input_pairs, **roberta_kwargs)

mapped_train_roberta = map_pairs(train, **roberta_kwargs)
mapped_val_roberta = map_pairs(validation, **roberta_kwargs)
mapped_test_roberta = map_pairs(test, **roberta_kwargs)

KeyboardInterrupt: 

In [150]:
# Preview the first five mapped pairs: shapes of both vectors and the similarity score
for vectors, similitud in mapped_roberta[:5]:
    print(f"Pares de vectores: {vectors[0].shape}, {vectors[1].shape}")
    print(f"Puntuación de similitud: {similitud}")

Pares de vectores: (0,), (0,)
Puntuación de similitud: 3.5
Pares de vectores: (0,), (0,)
Puntuación de similitud: 1.25
Pares de vectores: (0,), (0,)
Puntuación de similitud: 3.67
Pares de vectores: (0,), (0,)
Puntuación de similitud: 2.25
Pares de vectores: (0,), (0,)
Puntuación de similitud: 2.0


In [157]:
# Build the x / y data for the train and validation splits
x_train_roberta, y_train_roberta = pair_list_to_x_y(mapped_train_roberta)
x_val_roberta, y_val_roberta = pair_list_to_x_y(mapped_val_roberta)

In [160]:
mapped_train_roberta

[((array([], dtype=float32), array([], dtype=float32)), 3.5),
 ((array([], dtype=float32), array([], dtype=float32)), 1.25),
 ((array([], dtype=float32), array([], dtype=float32)), 3.67),
 ((array([], dtype=float32), array([], dtype=float32)), 2.25),
 ((array([], dtype=float32), array([], dtype=float32)), 2.0),
 ((array([], dtype=float32), array([], dtype=float32)), 2.75),
 ((array([], dtype=float32), array([], dtype=float32)), 2.67),
 ((array([], dtype=float32), array([], dtype=float32)), 2.5),
 ((array([], dtype=float32), array([], dtype=float32)), 2.5),
 ((array([], dtype=float32), array([], dtype=float32)), 3.0),
 ((array([], dtype=float32), array([], dtype=float32)), 3.0),
 ((array([], dtype=float32), array([], dtype=float32)), 1.0),
 ((array([], dtype=float32), array([], dtype=float32)), 2.0),
 ((array([], dtype=float32), array([], dtype=float32)), 4.0),
 ((array([], dtype=float32), array([], dtype=float32)), 3.0),
 ((array([], dtype=float32), array([], dtype=float32)), 2.75),
 (

In [158]:
# Prepare the training and validation datasets.
# x_train_roberta is indexed as [0] / [1] in the shapes cell (one array per
# sentence of the pair), so its len() counts those arrays, not the examples.
# The shuffle buffer is therefore sized from the labels, which have one entry
# per example, so that the whole training set is shuffled.
train_dataset_roberta = tf.data.Dataset.from_tensor_slices((x_train_roberta, y_train_roberta))
train_dataset_roberta = train_dataset_roberta.shuffle(buffer_size=len(y_train_roberta)).batch(batch_size)

val_dataset_roberta = tf.data.Dataset.from_tensor_slices((x_val_roberta, y_val_roberta))
val_dataset_roberta = val_dataset_roberta.batch(batch_size)

In [153]:
# Show the shapes of the two input arrays and of the labels
x_train_roberta[0].shape, x_train_roberta[1].shape, y_train_roberta.shape

((2073, 0), (2073, 0), (2073,))

In [154]:
# Build and compile the model.
# NOTE(review): the recorded training error reports an expected input shape of
# (None, 300); confirm build_and_compile_model() matches the RoBERTa vector width.
model_roberta = build_and_compile_model()
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model_roberta.summary()

None


In [156]:
# Train the model on the batched training dataset, using val_dataset_roberta as validation data
model_roberta.fit(train_dataset_roberta, epochs=num_epochs, validation_data=val_dataset_roberta)

Epoch 1/64


ValueError: Input 0 of layer "functional_21" is incompatible with the layer: expected shape=(None, 300), found shape=(None, 0)

In [None]:
# Build the x / y data for the test split
x_test_roberta, y_test_roberta = pair_list_to_x_y(mapped_test_roberta)

In [None]:
# Print the Pearson correlation coefficient of the baseline on each split
print(f"Correlación de Pearson (baseline-train): {compute_pearson_baseline(x_train_roberta, y_train_roberta)}")
print(f"Correlación de Pearson (baseline-validation): {compute_pearson_baseline(x_val_roberta, y_val_roberta)}")
print(f"Correlación de Pearson (baseline-test): {compute_pearson_baseline(x_test_roberta, y_test_roberta)}")
print('\n')
# Print the Pearson correlation coefficient of model_roberta on each split
print(f"Correlación de Pearson (train): {compute_pearson(x_train_roberta, y_train_roberta, model_roberta)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val_roberta, y_val_roberta, model_roberta)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test_roberta, y_test_roberta, model_roberta)}")

#### 4.2. RoBERTa Mean

In [None]:
def map_roberta_means(sentence: str, nlp=nlp_roberta) -> np.ndarray:
    '''
    Map a sentence to a single vector by averaging transformer hidden states.

    :param sentence: the sentence to map
    :param nlp: the spaCy pipeline used to process the sentence
    :return: the mean, along axis 0, of every row of the last hidden layer
             state except the final one
    '''
    hidden_states = nlp(sentence)._.trf_data.last_hidden_layer_state.data
    # NOTE(review): the slice drops only the final row; confirm this is the
    # intended set of rows to average.
    averaged_rows = hidden_states[:-1]
    return np.mean(averaged_rows, axis=0)

In [None]:
# Map input_pairs and each split with the 'ROBERTA_MEAN' mapping
roberta_mean_kwargs = dict(model='ROBERTA_MEAN', dictionary=diccionario)

mapped_roberta_mean = map_pairs(input_pairs, **roberta_mean_kwargs)

mapped_train_roberta_mean = map_pairs(train, **roberta_mean_kwargs)
mapped_val_roberta_mean = map_pairs(validation, **roberta_mean_kwargs)
mapped_test_roberta_mean = map_pairs(test, **roberta_mean_kwargs)

In [None]:
# Preview the first five mapped pairs: shapes of both vectors and the similarity score
for vectors, similitud in mapped_roberta_mean[:5]:
    print(f"Pares de vectores: {vectors[0].shape}, {vectors[1].shape}")
    print(f"Puntuación de similitud: {similitud}")

In [None]:
# Build the x / y data for the train and validation splits
x_train_roberta_mean, y_train_roberta_mean = pair_list_to_x_y(mapped_train_roberta_mean)
x_val_roberta_mean, y_val_roberta_mean = pair_list_to_x_y(mapped_val_roberta_mean)

In [None]:
# Prepare the training and validation datasets.
# x_train_roberta_mean is indexed as [0] / [1] in the shapes cell (one array
# per sentence of the pair), so its len() counts those arrays, not the
# examples. The shuffle buffer is therefore sized from the labels so that the
# whole training set is shuffled.
train_dataset_roberta_mean = tf.data.Dataset.from_tensor_slices((x_train_roberta_mean, y_train_roberta_mean))
train_dataset_roberta_mean = train_dataset_roberta_mean.shuffle(buffer_size=len(y_train_roberta_mean)).batch(batch_size)

val_dataset_roberta_mean = tf.data.Dataset.from_tensor_slices((x_val_roberta_mean, y_val_roberta_mean))
val_dataset_roberta_mean = val_dataset_roberta_mean.batch(batch_size)

In [None]:
# Show the shapes of the two input arrays and of the labels
x_train_roberta_mean[0].shape, x_train_roberta_mean[1].shape, y_train_roberta_mean.shape

In [None]:
# Build and compile the model
model_roberta_mean = build_and_compile_model()

# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model_roberta_mean.summary()

In [None]:
# Train the model on the batched training dataset, using val_dataset_roberta_mean as validation data
model_roberta_mean.fit(train_dataset_roberta_mean, epochs=num_epochs, validation_data=val_dataset_roberta_mean)

In [None]:
# Build the x / y data for the test split
x_test_roberta_mean, y_test_roberta_mean = pair_list_to_x_y(mapped_test_roberta_mean)

In [None]:
# Print the Pearson correlation coefficient of the baseline on each split
print(f"Correlación de Pearson (baseline-train): {compute_pearson_baseline(x_train_roberta_mean, y_train_roberta_mean)}")
print(f"Correlación de Pearson (baseline-validation): {compute_pearson_baseline(x_val_roberta_mean, y_val_roberta_mean)}")
print(f"Correlación de Pearson (baseline-test): {compute_pearson_baseline(x_test_roberta_mean, y_test_roberta_mean)}")
print('\n')
# Print the Pearson correlation coefficient of model_roberta_mean on each split
print(f"Correlación de Pearson (train): {compute_pearson(x_train_roberta_mean, y_train_roberta_mean, model_roberta_mean)}")
print(f"Correlación de Pearson (validation): {compute_pearson(x_val_roberta_mean, y_val_roberta_mean, model_roberta_mean)}")
print(f"Correlación de Pearson (test): {compute_pearson(x_test_roberta_mean, y_test_roberta_mean, model_roberta_mean)}")

### 5. RoBERTa fine-tuned
---

In [None]:
%pip install tf-keras

In [None]:
from transformers import pipeline, AutoTokenizer
from scipy.special import logit

# Identifier of the pretrained checkpoint used in this section
model = 'projecte-aina/roberta-base-ca-v2-cased-sts'
# Tokenizer and text-classification pipeline built from that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model)
pipe = pipeline('text-classification', model=model, tokenizer=tokenizer)

def prepare(sentence_pairs):
    '''
    Format sentence pairs as single strings with explicit special tokens.

    Each pair becomes "<cls> s1<sep><sep> s2<sep>", where <cls> and <sep>
    are the special tokens of the module-level tokenizer.
    :param sentence_pairs: iterable of (sentence_1, sentence_2) pairs
    :return: list with one formatted string per pair
    '''
    return [
        f"{tokenizer.cls_token} {s1}{tokenizer.sep_token}{tokenizer.sep_token} {s2}{tokenizer.sep_token}"
        for s1, s2 in sentence_pairs
    ]


In [None]:
# Split every (sentence_1, sentence_2, score) triple into a sentence pair ...
input_pairs_sents = [(first, second) for first, second, _ in input_pairs]
input_pairs_val_sents = [(first, second) for first, second, _ in input_pairs_val]
input_pairs_test_sents = [(first, second) for first, second, _ in input_pairs_test]

# ... and its gold similarity score
input_pairs_scores = [gold for _, _, gold in input_pairs]
input_pairs_val_scores = [gold for _, _, gold in input_pairs_val]
input_pairs_test_scores = [gold for _, _, gold in input_pairs_test]

# Lists that will hold the predicted scores of each split
predictions_list_train, predictions_list_val, predictions_list_test = [], [], []

In [None]:
# Run the pipeline on each split. prepare() already inserts the tokenizer's
# special tokens into the text, and add_special_tokens=False is passed here.
predictions_train = pipe(prepare(input_pairs_sents), add_special_tokens=False)
predictions_val = pipe(prepare(input_pairs_val_sents), add_special_tokens=False)
predictions_test = pipe(prepare(input_pairs_test_sents), add_special_tokens=False)

In [None]:
# Replace every pipeline score with its logit, in place.
# NOTE: the scores are overwritten, so running this cell a second time
# applies logit to values that were already transformed.
for split_predictions in (predictions_train, predictions_val, predictions_test):
    for prediction in split_predictions:
        prediction['score'] = logit(prediction['score'])

In [None]:
def pearson_correlation(predictions, labels):
    '''
    Return the Pearson correlation coefficient between predictions and labels.

    :param predictions: the predicted values
    :param labels: the true values
    :return: the correlation coefficient (the p-value from pearsonr is discarded)
    '''
    coefficient, _p_value = pearsonr(predictions, labels)
    return coefficient

In [None]:
# Collect the predicted score of every example, per split. The lists are
# rebuilt from scratch (rather than appended to) so that re-running this cell
# does not duplicate entries and break the length match with the gold scores.
predictions_list_train = [elem['score'] for elem in predictions_train]
predictions_list_val = [elem['score'] for elem in predictions_val]
predictions_list_test = [elem['score'] for elem in predictions_test]

In [None]:
# Print the Pearson correlation coefficient between the predicted scores and the gold scores of each split
print(f"Correlación de Pearson (train): {pearson_correlation(predictions_list_train, input_pairs_scores)}")
print(f"Correlación de Pearson (validation): {pearson_correlation(predictions_list_val, input_pairs_val_scores)}")
print(f"Correlación de Pearson (test): {pearson_correlation(predictions_list_test, input_pairs_test_scores)}")

## Model comparison
---

In [145]:
# Plot the results of all the different models
import matplotlib.pyplot as plt

# Define the models (one bar per entry, in this order)
models = ['OH', 'SPACY', 'ROBERTA', 'ROBERTA_MEAN', 'W2V + MEAN', 'W2V + TFIDF', 'ROBERTA_FT']
# One slot per model. Sizing the lists from `models` keeps them aligned with
# the labels: seven models are filled in below, so a six-element list would
# raise IndexError on index 6 and would not match the bar labels.
pearson_train = [0.0] * len(models)
pearson_val = [0.0] * len(models)
pearson_test = [0.0] * len(models)

# Fill the results
pearson_train[0] = compute_pearson(x_train_OH, y_train_OH, model_OH)
pearson_val[0] = compute_pearson(x_val_OH, y_val_OH, model_OH)
pearson_test[0] = compute_pearson(x_test_OH, y_test_OH, model_OH)

pearson_train[1] = compute_pearson(x_train_spacy, y_train_spacy, model_spacy)
pearson_val[1] = compute_pearson(x_val_spacy, y_val_spacy, model_spacy)
pearson_test[1] = compute_pearson(x_test_spacy, y_test_spacy, model_spacy)

pearson_train[2] = compute_pearson(x_train_roberta, y_train_roberta, model_roberta)
pearson_val[2] = compute_pearson(x_val_roberta, y_val_roberta, model_roberta)
pearson_test[2] = compute_pearson(x_test_roberta, y_test_roberta, model_roberta)

pearson_train[3] = compute_pearson(x_train_roberta_mean, y_train_roberta_mean, model_roberta_mean)
pearson_val[3] = compute_pearson(x_val_roberta_mean, y_val_roberta_mean, model_roberta_mean)
pearson_test[3] = compute_pearson(x_test_roberta_mean, y_test_roberta_mean, model_roberta_mean)

pearson_train[4] = compute_pearson(x_train_w2v_mean, y_train_w2v_mean, model_w2v_mean)
pearson_val[4] = compute_pearson(x_val_w2v_mean, y_val_w2v_mean, model_w2v_mean)
pearson_test[4] = compute_pearson(x_test_w2v_mean, y_test_w2v_mean, model_w2v_mean)

pearson_train[5] = compute_pearson(x_train_w2v_tfidf, y_train_w2v_tfidf, model_w2v_tfidf)
pearson_val[5] = compute_pearson(x_val_w2v_tfidf, y_val_w2v_tfidf, model_w2v_tfidf)
pearson_test[5] = compute_pearson(x_test_w2v_tfidf, y_test_w2v_tfidf, model_w2v_tfidf)

# The fine-tuned model is scored directly from its predictions
pearson_train[6] = pearson_correlation(predictions_list_train, input_pairs_scores)
pearson_val[6] = pearson_correlation(predictions_list_val, input_pairs_val_scores)
pearson_test[6] = pearson_correlation(predictions_list_test, input_pairs_test_scores)

# Plot the results: one bar chart per split, all on the same 0-1 scale
fig, ax = plt.subplots(1, 3, figsize=(18, 6))
split_results = (
    ('Train', pearson_train),
    ('Validation', pearson_val),
    ('Test', pearson_test),
)
for axis, (title, values) in zip(ax, split_results):
    axis.bar(models, values)
    axis.set_title(title)
    axis.set_ylim([0.0, 1.0])
plt.show()

[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 100ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 99ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 93ms/step
[1m65/65[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m16/16[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


ValueError: Input 0 of layer "functional_19" is incompatible with the layer: expected shape=(None, 300), found shape=(32, 0)

## Conclusions
---

After trying out all the models, we can see that the best one is the fine-tuned RoBERTa model. It has the best performance on the text similarity task, but we can assume that this performance reflects the fact that we are using the same sentences that were used to create the model. Excluding this model, the best one is the Word2Vec model using the weighted mean of the vectors. Even though this is the model that obtained the best results, it is important to mention that none of the models performed well on the text similarity task. This is probably because the dataset is too small and the sentences are too similar to each other. In a real-world scenario, it would be difficult to use any of these models to compare the similarity of two sentences.

## Future work

In the future, trying out the model with trainable embeddings could be a good idea, as stated in the requirements of the project. Due to internal problems and a lack of time, I could not complete that part, so it is not included in the final notebook. Even though the work is not finished and could be improved in many ways, I can affirm that I learned and internalized a lot of concepts and techniques that were mentioned in class but that I did not really understand.

Other improvements could be a more thorough exploration of the data, to enable better preprocessing, and an exploration of the models' hyperparameters to improve performance. A proper report would also be a good idea, to explain the results and the process in more detail and to present the results more clearly.