# Generate embedding matrices for IPIP-NEO-300 
### Using several transformers referred to in the manuscript


In [1]:
# run from command line if needed
# pip install tensorflow
# pip install tensorflow-hub 
# pip install sentence-transformers
import pandas as pd
import numpy as np

### Read in the data, take the items (ignore signs) and put them in a dataframe for embedding
The imported file is a CSV file that contains the facet code and name, followed by each item, with every item followed by an indicator of whether it is positively or negatively keyed. Here we ignore the facet code, the facet name, and the item signs, on the basis that we are trying to parallel a factor analysis of ratings, where respondents will not have seen the facet code, facet name, or item signs.

In [2]:
csv_file_path = 'IPIP300_preprocessed_items.csv'
data = pd.read_csv(csv_file_path)

# Item-text columns to embed. Only item1..item10 are read; every other column
# in the file is ignored. Columns absent from the file are skipped rather than
# raising, and the check is done once here instead of once per row.
item_columns = [
    column
    for column in (f'item{number}' for number in range(1, 11))
    if column in data.columns
]

# One inner list of sentences per CSV row, in file order.
# Missing cells (NaN, which is how pandas reads an empty CSV field) are
# dropped: passing them through str() would yield the literal sentence "nan",
# which would then be embedded as if it were real item text.
row_sentences = [
    [str(cell) for cell in row_cells if pd.notna(cell)]
    for row_cells in data[item_columns].itertuples(index=False, name=None)
]

### Generate the embedding similarity matrices for the item content
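This creates one embedding per scale from ALL of that scale's items in one go: each item is embedded separately and the item embeddings are then averaged within the scale. For item-level embeddings, see the other notebooks in this folder. At the item level (in the other folders), we use both the raw item-level embeddings and item-level embeddings where we reverse the embedding.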

In [4]:
# Import necessary libraries
from sentence_transformers import SentenceTransformer, util
import numpy as np
import pandas as pd

# Load the nli-distilroberta-base-v2 sentence-embedding model
model = SentenceTransformer('nli-distilroberta-base-v2')

# Build one vector per entry of row_sentences (defined in an earlier cell):
# every sentence in the row is encoded on its own, and the resulting
# embeddings are averaged element-wise (axis=0) into a single row vector.
averaged_row_embeddings = [
    np.mean([model.encode(text) for text in sentences], axis=0)
    for sentences in row_sentences
]

# Stack the per-row vectors into one NumPy array (one row per input row)
averaged_row_embeddings_np = np.array(averaged_row_embeddings)

# Pairwise cosine similarity of every row vector against every other row vector
cosine_similarities = util.pytorch_cos_sim(
    averaged_row_embeddings_np,
    averaged_row_embeddings_np,
)

# The similarity matrix is converted with .numpy() and wrapped in a DataFrame
# so it can be written out with pandas
cosine_similarities_df = pd.DataFrame(cosine_similarities.numpy())

# Save the matrix as CSV; index=False omits the row-index column
cosine_similarities_df.to_csv('matrix_items_roberta.csv', index=False)