In [1]:
# Colab-only setup: mount the user's Google Drive at /content/drive, requesting a forced remount.
from google.colab import drive
drive.mount('/content/drive', force_remount = True)
# os is used by the following cell to change the working directory.
import os

Mounted at /content/drive


In [2]:
# Work from the DASS21 analysis folder on the mounted Drive, so that the relative
# paths used later in the notebook (e.g. './cos_matrices/...') resolve inside it.
os.chdir('/content/drive/My Drive/LLM e Psicometria/00 Validazione PFA/Analisi/DASS21')

In [3]:
# List the contents of the current working directory (sanity check after the chdir).
!ls

DASS21.gsheet  PFA_for_DASS.ipynb


In [4]:
# Check and install the required packages
import subprocess
import sys

# Function to install packages
def install(package):
    """Install ``package`` with pip unless it is already installed.

    The check looks at the installed *distributions* (what ``pip list`` shows)
    instead of ``sys.modules``. ``sys.modules`` only contains modules that have
    already been imported in this kernel, and it is keyed by import name
    (``factor_analyzer``) rather than by pip name (``factor-analyzer``), so a
    test against it would trigger a pip run on practically every execution.

    Parameters
    ----------
    package : str
        Name of the distribution as it would be passed to ``pip install``.
        Anything that is not the plain name of an installed distribution
        (e.g. a version-pinned requirement string) is handed to pip as-is.

    Raises
    ------
    subprocess.CalledProcessError
        If the ``pip install`` command exits with a non-zero status.
    """
    # Local import so the cell does not depend on an extra top-level import.
    from importlib import metadata

    try:
        metadata.distribution(package)
    except metadata.PackageNotFoundError:
        # Use the kernel's own interpreter so the package lands in the right environment.
        subprocess.check_call([sys.executable, "-m", "pip", "install", package])

# Third-party packages this notebook relies on (names as understood by pip)
packages = ['xlwt', 'sentence_transformers', 'factor-analyzer', 'scipy']

# Make sure every dependency is available before the cells below import it
for pkg in packages:
    install(pkg)

In [5]:
# Make gsheet readable
# Builds the CSV-export URL of one tab of a Google Sheet, so that pandas can read it directly.
from urllib.parse import quote

sheet_name = 'DASS21' # replace with your own sheet name
sheet_id = '1wbnRdOBL1W3zpbTKTXWEjkbgzOjoTx2XXNMN4u9FCys' # replace with your sheet's ID
# The tab name goes into the query string, so it is percent-encoded: a name containing
# spaces, '&', '#', ... would otherwise produce a broken (or silently different) request.
url = f"https://docs.google.com/spreadsheets/d/{sheet_id}/gviz/tq?tqx=out:csv&sheet={quote(sheet_name, safe='')}"

In [6]:
import pandas as pd
import numpy as np
import xlwt
# read in file with items text etc.
# `url` is the CSV-export link of the Google Sheet tab, built in the previous cell.
df_items = pd.read_csv(url)
# Preview the first rows to check that the sheet was parsed as expected.
df_items.head()

Unnamed: 0,Number,Number mod,Factor,Item,Item simplified,Sign
0,3,1,Dep,Non riuscivo proprio a provare delle emozioni ...,,+
1,5,2,Dep,Ho avuto un'estrema difficoltà nel cominciare ...,,+
2,10,3,Dep,Non vedevo nulla di buono nel mio futuro,,+
3,13,4,Dep,Mi sono sentito scoraggiato e depresso,,+
4,16,5,Dep,Non c'era nulla che mi dava entusiasmo,,+


# Part I: Create embeddings for PFA

Below we do the following:
- Group items in a single list per facet
- Calculate item embeddings
- Reverse item embeddings (if necessary)
- Aggregate item embeddings to create facet embedding
- Calculate facet list embedding
- Compute cosine similarities
- Store results

In [7]:
# Calculate item embeddings
# Reverse item embeddings if necessary
#
# For every model in `models`, two columns are added to df_items:
#   '<model>_embeddings'      one embedding vector per item
#   '<model>_embeddings_rev'  the same vector, multiplied by -1 for negatively keyed items

# To make the code shorter and slightly more efficient we loop through the models that we are using in the paper.
# First we create a list of models (which all have italian in the language they can be used in according to huggingface)
models = ['nli-distilroberta-base-v2',
          'paraphrase-multilingual-mpnet-base-v2',
          'paraphrase-multilingual-MiniLM-L12-v2',
          'intfloat/multilingual-e5-base',
          'LaBSE',
          'dwulff/mpnet-personality'] #consider adding the finetuned model for psicometrista

# Import the necessary libraries and functions
from sentence_transformers import SentenceTransformer, util
from huggingface_hub import snapshot_download
import tensorflow_hub as hub

# Create an empty data frame, which we will then populate with the different type of embeddings
facet_embeddings_sentences = pd.DataFrame()


def _is_reverse_keyed(sign):
  """Return True when a 'Sign' cell marks a negatively keyed item.

  A cell counts as negative when its text starts with '-' (surrounding
  whitespace ignored). Empty / missing cells (read by pandas as NaN) are
  treated as positively keyed instead of crashing the whole loop.
  """
  return isinstance(sign, str) and sign.strip().startswith('-')


for mod in models:
  # Pick the encoder: a callable that maps ONE item text to a 1-D numpy vector.
  if mod == 'Dimitre/universal-sentence-encoder':
    # TF-Hub branch: only taken if this repo id is added to `models` above.
    model_path = snapshot_download(repo_id = 'Dimitre/universal-sentence-encoder')
    model = hub.KerasLayer(handle = model_path)
    encode_item = lambda text: model([text]).numpy().flatten()
  else:
    model = SentenceTransformer(mod) #call the model
    encode_item = model.encode

  # Encode every item exactly once (one text per call, as before) ...
  item_embed = [encode_item(text) for text in df_items['Item']]
  # ... and derive the sign-aware version from the vectors we already have,
  # instead of running the model a second time on the same text.
  # .copy() keeps the two columns from sharing the same array objects.
  item_embed_rev = [emb * -1 if _is_reverse_keyed(sign) else emb.copy()
                    for emb, sign in zip(item_embed, df_items['Sign'])]

  #then, we append the two item-level embeddings list and give them a name based on the model we used
  df_items[mod + '_embeddings'] = item_embed
  df_items[mod + '_embeddings_rev'] = item_embed_rev

  from tqdm.autonotebook import tqdm, trange
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.74k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.12k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.13k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/723 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/402 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/4.12k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/645 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/471M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/480 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.08M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/179k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/461 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/2.22k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/804 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/397 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/5.22M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.62M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

2_Dense/config.json:   0%|          | 0.00/114 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/2.36M [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/164 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.81k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/613 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [8]:
# Inspect the columns: the embedding cell adds one '<model>_embeddings' and one
# '<model>_embeddings_rev' column per model to the original sheet columns.
df_items.columns

Index(['Number', 'Number mod', 'Factor', 'Item', 'Item simplified', 'Sign',
       'nli-distilroberta-base-v2_embeddings',
       'nli-distilroberta-base-v2_embeddings_rev',
       'paraphrase-multilingual-mpnet-base-v2_embeddings',
       'paraphrase-multilingual-mpnet-base-v2_embeddings_rev',
       'paraphrase-multilingual-MiniLM-L12-v2_embeddings',
       'paraphrase-multilingual-MiniLM-L12-v2_embeddings_rev',
       'intfloat/multilingual-e5-base_embeddings',
       'intfloat/multilingual-e5-base_embeddings_rev', 'LaBSE_embeddings',
       'LaBSE_embeddings_rev', 'dwulff/mpnet-personality_embeddings',
       'dwulff/mpnet-personality_embeddings_rev'],
      dtype='object')

In [11]:
# To avoid having too long names for the output datasets, we create a list of short names, which we will then use to save the embedding cosine matrices
# make sure that the names here are meaningful and in the SAME ORDER as the entries of `models` in the embedding cell.
model_short = ['distilroberta', 'mpnet', 'miniLM', 'e5', 'labse', 'Wulff']

# The two lists are paired position by position; fail loudly if they ever get out of sync.
assert len(model_short) == len(models), "model_short needs exactly one entry per entry of `models`"

# to_csv does not create missing folders, so make sure the output folder exists.
os.makedirs('./cos_matrices', exist_ok = True)

# One label per row of df_items. (Using .unique() here would yield fewer labels than
# matrix columns as soon as two items share the same text, and the export would fail.)
item_labels = df_items['Item'].tolist()

# Below, we loop over the different models we use for the study and compute the cosine sim. matrices.
for model_name, short_name in zip(models, model_short):
  # create temporary empty lists for the item and one-pop method embeddings
  # (not filled in this cell; kept so that the names stay defined for other cells)
  facet_embeddings_item = []
  facet_embeddings_item_rev = []

  # Turn each column of per-item vectors into one (n_items, dim) array
  embeddings = np.stack(df_items[model_name + '_embeddings'].to_numpy())
  embeddings_rev = np.stack(df_items[model_name + '_embeddings_rev'].to_numpy())

  #create cosine similarity matrix for each embedding calculation approach
  cosine_similarities_item = util.pytorch_cos_sim(embeddings, embeddings).numpy()
  cosine_similarities_item_rev = util.pytorch_cos_sim(embeddings_rev, embeddings_rev).numpy()
  #fill diagonal with 1. This is done to avoid efa functions reading the cosine matrix as covariance
  np.fill_diagonal(cosine_similarities_item,1)
  np.fill_diagonal(cosine_similarities_item_rev,1)

  #store results (item texts as header row, no index column)
  pd.DataFrame(cosine_similarities_item, columns = item_labels).to_csv('./cos_matrices/matrix_concatenated_item_'+short_name+'.csv', index = False)
  pd.DataFrame(cosine_similarities_item_rev, columns = item_labels).to_csv('./cos_matrices/matrix_concatenated_item_rev_'+short_name+'.csv', index = False)
