# Generate embedding matrices for IPIP-NEO-300 
### Using several transformers referred to in the manuscript


In [9]:
# run from command line if needed
# pip install tensorflow
# pip install tensorflow-hub
# pip install sentence-transformers
import pandas as pd
import numpy as np

In [10]:
csv_file_path = 'IPIP300_preprocessed_items.csv'
data = pd.read_csv(csv_file_path)

# Resolve the item/sign column pairs once, up front: which columns exist does
# not change from row to row. Only pairs where both `item{i}` and `sign{i}`
# are present in the CSV are used (i = 1..10).
item_sign_columns = [
    (f'item{i}', f'sign{i}')
    for i in range(1, 11)
    if f'item{i}' in data.columns and f'sign{i}' in data.columns
]

# One entry per CSV row: a list of (sentence, sign) tuples, where sign is +1
# when the sign cell is exactly '+' and -1 for any other value.
row_sentences_and_signs = []

for _, row in data.iterrows():
    current_row_data = []

    for item_col, sign_col in item_sign_columns:
        item_text = row[item_col]

        # An empty item cell is read as NaN; str(NaN) would otherwise end up
        # in the list as the literal sentence "nan", so skip such cells.
        if pd.isna(item_text):
            continue

        sign = 1 if row[sign_col] == '+' else -1
        current_row_data.append((str(item_text), sign))

    # Keep one list per row (possibly shorter than ten entries) so that list
    # positions still line up with the rows of `data`.
    row_sentences_and_signs.append(current_row_data)

In [12]:
from sentence_transformers import SentenceTransformer, util
import numpy as np

model = SentenceTransformer('nli-distilroberta-base-v2')

# One averaged, sign-adjusted embedding per entry of `row_sentences_and_signs`.
averaged_row_embeddings = []

for row_number, sentences_and_signs in enumerate(row_sentences_and_signs):
    # Fail early with a clear message: averaging zero embeddings yields NaN,
    # and the per-row results could then no longer be stacked into one matrix.
    if not sentences_and_signs:
        raise ValueError(
            f'Entry {row_number} of row_sentences_and_signs is empty; '
            'there is nothing to embed for this row.'
        )

    # Encode each sentence on its own and multiply by its sign, so that items
    # with sign -1 contribute their negated embedding to the row average.
    signed_embeddings = np.stack(
        [model.encode(sentence) * sign for sentence, sign in sentences_and_signs]
    )

    # Average over the sentences (axis 0) to get a single vector for the row.
    averaged_row_embeddings.append(signed_embeddings.mean(axis=0))

# Stack the per-row vectors into one 2-D array for the similarity computation.
averaged_row_embeddings_array = np.stack(averaged_row_embeddings)

# Cosine similarity of every row embedding against every other row embedding.
cosine_similarities = util.pytorch_cos_sim(averaged_row_embeddings_array, averaged_row_embeddings_array)

# Convert PyTorch tensor to a pandas DataFrame
cosine_similarities_df = pd.DataFrame(cosine_similarities.numpy())

# Write to csv (no index column; the header row is the default integer labels)
cosine_similarities_df.to_csv('matrix_reversed_items_roberta.csv', index=False)