In [1]:
# Mount Google Drive at /content/drive so later cells can reach the project
# folder; this import only exists inside Google Colab.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
import os

Mounted at /content/drive


In [8]:
# Make the project folder on the mounted Drive the working directory, so the
# relative paths used later (e.g. './cos_matrices/') resolve inside it.
# NOTE(review): this absolute path is specific to one Drive layout - adjust it
# before running the notebook from another account.
os.chdir('/content/drive/My Drive/LLM e Psicometria/00 Validazione PFA/Analisi/Dark_Triad_EN')

In [9]:
# Sanity check: list the working directory to confirm the chdir landed in the project folder.
!ls

Analyses_Dark_Triad.R	    Results_target_item.csv
cos_matrices		    Results_target_item_labeled_cor.csv
Dark_triad_items_EN.gsheet  Results_target_item_labeled.csv
PFA_for_DarkTriad_EN.ipynb  Results_target_item_labeled_reordered.csv


In [10]:
# Check and install the required packages
import subprocess
import sys

# Function to install packages
def install(package):
    """Install ``package`` with pip unless it can already be imported.

    The availability check uses the import system rather than ``sys.modules``
    alone: ``sys.modules`` only lists modules that were already imported in
    this kernel, so an installed-but-not-yet-imported package would be
    reinstalled on every run.

    Args:
        package: Name handed to ``pip install``. It may carry extras or a
            version specifier (e.g. ``"pkg==1.2"``); these are stripped only
            for the availability check, not for the pip call.

    Raises:
        subprocess.CalledProcessError: if ``pip install`` exits non-zero.
    """
    import importlib.util
    import re

    # Drop extras / version specifiers, then map the distribution name onto the
    # usual module-name convention ('factor-analyzer' -> 'factor_analyzer').
    # This is a heuristic: when the module name differs from the distribution
    # name the lookup fails and we fall back to calling pip, as before.
    dist_name = re.split(r'[\[<>=!~; ]', package, maxsplit=1)[0]
    module_name = dist_name.replace('-', '_')

    try:
        already_available = (
            module_name in sys.modules
            or importlib.util.find_spec(module_name) is not None
        )
    except (ImportError, ValueError):
        # Raised for dotted names whose parent is missing, or for empty names.
        already_available = False

    if not already_available:
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# List of required packages
# Each entry is passed verbatim to `pip install` by install().
packages = ['xlwt', 'sentence_transformers', 'factor-analyzer', 'scipy']

# Installing the required packages
# NOTE(review): no versions are pinned, so a re-run may pick up newer releases.
for package in packages:
    install(package)

In [11]:
# Make gsheet readable
# Build the CSV-export URL of the Google Sheet that holds the item texts.
from urllib.parse import quote

sheet_name = 'Dark_triad_items_EN' # replace with your own sheet name
sheet_id = '1mEa_4-CaVP19QwCy2zHYGzZgbtgQAr-Bc4uyu1nBT0Y' # replace with your sheet's ID
# The sheet name goes into the query string, so it is percent-encoded: names
# containing spaces, '&' or '/' would otherwise yield a malformed URL.
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={quote(sheet_name, safe='')}"

In [12]:
import pandas as pd
import numpy as np
import xlwt
# read in file with items text etc.
# `url` is the Google Sheets CSV-export link built in the previous cell, so
# this read needs network access.
df_items = pd.read_csv(url)
# Preview the first rows to check that the expected columns were loaded.
df_items.head()

Unnamed: 0,Number,Factor,Item,Item simplified,Sign
0,1,Machiavellanism,I tend to manipulate others to get my way,,+
1,2,Machiavellanism,I have used deceit or lied to get my way,,+
2,3,Machiavellanism,I have use flattery to get my way,,+
3,4,Machiavellanism,I tend to exploit others towards my own end,,+
4,5,Psychopathy,I tend to lack remorse,,+


# Part I: Create embeddings for PFA

Below we do the following:
- Group items in a single list per facet
- Calculate item embeddings
- Reverse item embeddings (if necessary)
- Aggregate item embeddings to create facet embedding
- Calculate facet list embedding
- Compute cosine similarities
- Store results

In [14]:
# Calculate item embeddings
# Reverse item embeddings if necessary

# To make the code shorter and slightly more efficient we loop through the models that we are using in the paper.
# First we create a list of model identifiers; each one is later used both to load
# the model and as the prefix of the embedding columns added to df_items.
# NOTE(review): an earlier version of this comment stated that every model lists
# Italian among its supported languages on Hugging Face - confirm language
# coverage per model, since this notebook works on English items.
models = ['nli-distilroberta-base-v2',
          'paraphrase-multilingual-mpnet-base-v2',
          'paraphrase-multilingual-MiniLM-L12-v2',
          'intfloat/multilingual-e5-base',
          'LaBSE',
          'dwulff/mpnet-personality',
          'all-mpnet-base-v2',
          'sentence-t5-base'] # TODO: consider adding the fine-tuned model for psicometrista

# Import the necessary libraries and functions
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import snapshot_download
import tensorflow_hub as hub

# Create an empty data frame, which we will then populate with the different type of embeddings
# NOTE(review): nothing in this cell writes to it; kept in case later cells rely on it.
facet_embeddings_sentences = pd.DataFrame()

# The item texts and their keying are identical for every model, so they are
# extracted once instead of being re-read from the frame for each item.
item_texts = df_items['Item'].tolist()
# True where the item is negatively keyed (Sign starts with '-'). Going through
# str makes a missing Sign (NaN) count as positively keyed instead of raising
# a TypeError when it is indexed.
is_negatively_keyed = df_items['Sign'].astype(str).str.strip().str.startswith('-').to_numpy()

for mod in models:
  # Encode all items in one batched call: each item is embedded once, and the
  # sign-adjusted version is derived from that result rather than re-encoded.
  if mod == 'Dimitre/universal-sentence-encoder':
    # This model is loaded through tensorflow_hub from a local snapshot of the
    # Hugging Face repo; it is only reached if the id is added to `models`.
    model_path = snapshot_download(repo_id = 'Dimitre/universal-sentence-encoder')
    model = hub.KerasLayer(handle = model_path)
    embeddings = model(item_texts).numpy()
  else:
    model = SentenceTransformer(mod) #call the model
    embeddings = model.encode(item_texts)
  embeddings = np.asarray(embeddings) # one row per item

  # +1 / -1 per item, in the embeddings' own dtype so the product keeps that dtype
  keying = np.where(is_negatively_keyed, -1, 1).astype(embeddings.dtype)

  item_embed = list(embeddings) #item-level embeddings
  item_embed_rev = list(embeddings * keying[:, None]) #item-level embeddings accounting for sign

  #then, we store the two item-level embedding lists under names based on the model we used
  df_items[mod + '_embeddings'] = item_embed
  df_items[mod + '_embeddings_rev'] = item_embed_rev

In [None]:
df_items.columns

Index(['Number', 'Factor', 'Item', 'Item simplified', 'Sign',
       'nli-distilroberta-base-v2_embeddings',
       'nli-distilroberta-base-v2_embeddings_rev',
       'paraphrase-multilingual-mpnet-base-v2_embeddings',
       'paraphrase-multilingual-mpnet-base-v2_embeddings_rev',
       'paraphrase-multilingual-MiniLM-L12-v2_embeddings',
       'paraphrase-multilingual-MiniLM-L12-v2_embeddings_rev',
       'intfloat/multilingual-e5-base_embeddings',
       'intfloat/multilingual-e5-base_embeddings_rev', 'LaBSE_embeddings',
       'LaBSE_embeddings_rev', 'dwulff/mpnet-personality_embeddings',
       'dwulff/mpnet-personality_embeddings_rev'],
      dtype='object')

In [16]:
# To avoid having too long names for the output datasets, we create a list of names, which we will then use to save the embedding cosine matrices
# make sure that the names here are meaningful and aligned, position by position, with the entries of `models`.
model_short = ['distilroberta', 'mpnet', 'miniLM', 'e5', 'labse', 'Wulff', 'mpnet_en', 't5']

# The two lists are paired by index below, so a length mismatch would either
# raise midway through or silently mislabel the output files.
if len(model_short) != len(models):
  raise ValueError('model_short and models must have the same length')

# Make sure the output folder exists before writing into it.
os.makedirs('./cos_matrices', exist_ok = True)

# Row/column labels: one per item row. The full column is used (not .unique())
# so the label count always matches the matrix size, even with duplicated texts.
item_labels = df_items['Item'].tolist()

# Below, we loop over the different models we use for the study and compute the cosine sim. matrices.
for mod in range(0, len(models)):
  # create temporary empty lists for the item and one-pop method embeddings
  # NOTE(review): unused in this cell; kept in case later cells rely on them.
  facet_embeddings_item = []
  facet_embeddings_item_rev = []

  # Stack the per-item vectors into one 2-D array (items x dimensions). Passing
  # the pandas column directly makes the library build a tensor from a list of
  # arrays, which is slow and emits a warning.
  item_matrix = np.stack(df_items[models[mod] + '_embeddings'].to_numpy())
  item_matrix_rev = np.stack(df_items[models[mod] + '_embeddings_rev'].to_numpy())

  #create cosine similarity matrix for each embedding calculation approach
  cosine_similarities_item = util.pytorch_cos_sim(item_matrix, item_matrix).numpy()
  cosine_similarities_item_rev = util.pytorch_cos_sim(item_matrix_rev, item_matrix_rev).numpy()
  #fill diagonal with 1. This is done to avoid efa functions reading the cosine matrix as covariance
  np.fill_diagonal(cosine_similarities_item,1)
  np.fill_diagonal(cosine_similarities_item_rev,1)

  #store results (index = False: only the header row carries the item labels)
  pd.DataFrame(cosine_similarities_item, columns = item_labels, index = item_labels).to_csv('./cos_matrices/matrix_concatenated_item_'+model_short[mod]+'.csv', index = False)
  pd.DataFrame(cosine_similarities_item_rev, columns = item_labels, index = item_labels).to_csv('./cos_matrices/matrix_concatenated_item_rev_'+model_short[mod]+'.csv', index = False)


  a = torch.tensor(a)
