# Generate embedding matrices for IPIP-NEO-300
### Using several transformers referred to in the manuscript


In [8]:
# Run from the command line if needed
#!pip install tensorflow-hub
#!pip install sentence_transformers
import pandas as pd
import numpy as np

### Read in the data, take the items (ignore signs) and put them in a dataframe for embedding
The imported file is a CSV that contains the facet code and name, followed by each item; every item is in turn followed by a flag indicating whether it is positively or negatively keyed. Here we ignore the facet code, the facet name, and the item signs, on the basis that we are trying to parallel a factor analysis of ratings, where respondents will not have seen the facet code, facet name, or item signs.

In [3]:
csv_file_path = 'IPIP240_preprocessed_items.csv'  # Replace with the path to your CSV file
data = pd.read_csv(csv_file_path)

# Item columns to concatenate for each scale. Up to ten are supported
# ('item1' ... 'item10'); only the ones actually present in the CSV are used.
item_columns = [f'item{i}' for i in range(1, 11) if f'item{i}' in data.columns]
if not item_columns:
    raise ValueError(
        f"No item columns ('item1' ... 'item10') found in {csv_file_path!r}"
    )

# Prepare the sentences: one string per CSV row, made of that row's items joined
# by a single space. Missing cells are skipped; without the check, str() would turn
# them into the literal token 'nan' and that token would end up in the embedded text.
sentences = [
    ' '.join(str(value) for value in row if pd.notna(value))
    for row in data[item_columns].itertuples(index=False, name=None)
]

### Generate the embedding similarity matrices for the item content
This creates scale embeddings with ALL items of each scale concatenated in one go. For item-level embeddings, see the other notebooks in this folder. At the item level (in the other folders), we use the raw item-level embeddings, and item-level embeddings where we reverse the embedding.

In [10]:
# USE-DAN
# TODO (note left for Damiano): please see whether TensorFlow can be used to do
# what is done below for all the other models.
import tensorflow_hub as hub
from huggingface_hub import snapshot_download
from sentence_transformers import SentenceTransformer, util

# Fetch the model snapshot from the Hugging Face Hub and wrap it as a Keras layer
model_path = snapshot_download(repo_id="Dimitre/universal-sentence-encoder")
model_dan = hub.KerasLayer(handle=model_path)

# Embed the concatenated scale texts; .numpy() turns the layer output into an array
sentence_embeddings = model_dan(sentences).numpy()

# Pairwise cosine similarity of every scale embedding with every other one
cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)

# Tensor -> DataFrame (the original note expects a 30x30 matrix here)
cosine_sim_matrix = pd.DataFrame(data=cosine_similarities.numpy())

# Persist the matrix without the row index
cosine_sim_matrix.to_csv(path_or_buf='matrix_concatenated_USE_DAN.csv', index=False)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

variables/variables.index:   0%|          | 0.00/3.19k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.26k [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.41k [00:00<?, ?B/s]

saved_model.pb:   0%|          | 0.00/8.22M [00:00<?, ?B/s]

variables.data-00000-of-00001:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

In [None]:
# nli-distilroberta-base-v2: load the model, embed the scales, save the similarities
from sentence_transformers import SentenceTransformer, util

model = SentenceTransformer(model_name_or_path='nli-distilroberta-base-v2')

# One embedding per concatenated scale text
sentence_embeddings = model.encode(sentences)

# Pairwise cosine similarity of every scale embedding with every other one
cosine_similarities = util.pytorch_cos_sim(sentence_embeddings, sentence_embeddings)

# Tensor -> DataFrame (the original note expects a 30x30 matrix here)
cosine_sim_matrix = pd.DataFrame(data=cosine_similarities.numpy())

# Persist the matrix without the row index
cosine_sim_matrix.to_csv(path_or_buf='matrix_concatenated_roberta.csv', index=False)

In [None]:
# all-mpnet-base-v2: load the model, embed the scales, save the similarities
from sentence_transformers import SentenceTransformer, util

model2 = SentenceTransformer(model_name_or_path='all-mpnet-base-v2')

# One embedding per concatenated scale text
sentence_embeddings2 = model2.encode(sentences)

# Pairwise cosine similarity of every scale embedding with every other one
cosine_similarities2 = util.pytorch_cos_sim(sentence_embeddings2, sentence_embeddings2)

# Tensor -> DataFrame (the original note expects a 30x30 matrix here)
cosine_sim_matrix2 = pd.DataFrame(data=cosine_similarities2.numpy())

# Persist the matrix without the row index
cosine_sim_matrix2.to_csv(path_or_buf='matrix_concatenated_mpnet.csv', index=False)

In [None]:
# sentence-t5-base: load the model, embed the scales, save the similarities
from sentence_transformers import SentenceTransformer, util

model3 = SentenceTransformer(model_name_or_path='sentence-t5-base')

# One embedding per concatenated scale text
sentence_embeddings3 = model3.encode(sentences)

# Pairwise cosine similarity of every scale embedding with every other one
cosine_similarities3 = util.pytorch_cos_sim(sentence_embeddings3, sentence_embeddings3)

# Tensor -> DataFrame (the original note expects a 30x30 matrix here)
cosine_sim_matrix3 = pd.DataFrame(data=cosine_similarities3.numpy())

# Persist the matrix without the row index
cosine_sim_matrix3.to_csv(path_or_buf='matrix_concatenated_t5.csv', index=False)

In [None]:
# Initialize stsb-distilroberta-base-v2
# (a different checkpoint from nli-distilroberta-base-v2, whose matrix is written
# to 'matrix_concatenated_roberta.csv')
from sentence_transformers import SentenceTransformer, util

# The variables in this cell carry the suffix 4. They previously used the suffix 5,
# which is also used by the MiniLM cell, so running both cells overwrote these
# objects with the MiniLM ones.
model4 = SentenceTransformer('sentence-transformers/stsb-distilroberta-base-v2')
# Encode sentences: one embedding per concatenated scale text
sentence_embeddings4 = model4.encode(sentences)
# Compute cosine similarities between every pair of scale embeddings
cosine_similarities4 = util.pytorch_cos_sim(sentence_embeddings4, sentence_embeddings4)
# Convert PyTorch tensor to a pandas DataFrame (expected to be 30x30 per the original note)
cosine_sim_matrix4 = pd.DataFrame(cosine_similarities4.numpy())
# Write to csv without the row index
cosine_sim_matrix4.to_csv('matrix_concatenated_distilroberta.csv', index=False)

In [None]:
# microsoft/MiniLM-L12-H384-uncased: load the model, embed the scales, save the similarities
from sentence_transformers import SentenceTransformer, util

# NOTE: when this cell was last run, loading this checkpoint printed
# "No sentence-transformers model found with name ... Creating a new one with MEAN pooling."
model5 = SentenceTransformer(model_name_or_path='microsoft/MiniLM-L12-H384-uncased')

# One embedding per concatenated scale text
sentence_embeddings5 = model5.encode(sentences)

# Pairwise cosine similarity of every scale embedding with every other one
cosine_similarities5 = util.pytorch_cos_sim(sentence_embeddings5, sentence_embeddings5)

# Tensor -> DataFrame (the original note expects a 30x30 matrix here)
cosine_sim_matrix5 = pd.DataFrame(data=cosine_similarities5.numpy())

# Persist the matrix without the row index
cosine_sim_matrix5.to_csv(path_or_buf='matrix_concatenated_miniLM.csv', index=False)

No sentence-transformers model found with name /Users/nigelguenole/.cache/torch/sentence_transformers/microsoft_MiniLM-L12-H384-uncased. Creating a new one with MEAN pooling.


### Here we average the matrices for the 5 transformers; we exclude USE-DAN as that's our baseline.

In [None]:
# Paths of the per-transformer similarity matrices to average.
# 'matrix_concatenated_USE_DAN.csv' is not in this list.

csv_files = [
    'matrix_concatenated_distilroberta.csv',  # sentence-transformers/stsb-distilroberta-base-v2
    'matrix_concatenated_miniLM.csv',  # microsoft/MiniLM-L12-H384-uncased
    'matrix_concatenated_mpnet.csv',  # all-mpnet-base-v2
    'matrix_concatenated_roberta.csv',  # nli-distilroberta-base-v2
    'matrix_concatenated_t5.csv'  # sentence-t5-base
]

# Function to read CSV files and compute average matrix
def average_matrices(files):
    """Return the element-wise mean of the matrices stored in ``files``.

    Parameters
    ----------
    files : iterable of str or path-like
        CSV files, each read with ``pd.read_csv`` (first line taken as the header).

    Returns
    -------
    pandas.DataFrame
        The sum of all matrices divided by the number of files.

    Raises
    ------
    ValueError
        If ``files`` is empty, or if a matrix differs from the first one in
        shape or column labels.
    """
    files = list(files)  # materialize once so generators are supported
    if not files:
        raise ValueError("average_matrices() needs at least one CSV file")

    matrices = [pd.read_csv(file) for file in files]  # Read each CSV into a DataFrame

    # pandas aligns on labels when adding DataFrames, so a matrix with a different
    # shape or header would silently introduce NaN cells instead of failing.
    reference = matrices[0]
    for file, matrix in zip(files, matrices):
        if matrix.shape != reference.shape or not matrix.columns.equals(reference.columns):
            raise ValueError(
                f"{file!r} has shape {matrix.shape} / columns that do not match "
                f"{files[0]!r} with shape {reference.shape}"
            )

    average_matrix = sum(matrices) / len(matrices)  # Compute average of all matrices
    return average_matrix

# Average the similarity matrices listed in csv_files
average_matrix = average_matrices(files=csv_files)

# Save the averaged matrix as CSV, leaving out the row index
average_matrix.to_csv(path_or_buf='matrix_concatenated_average.csv', index=False)