# What we do here

- Load items dataframe
- Embed items 
- Compute cosine similarity matrix per scale
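- Construct final data frame with the following layout
    - Row = one scale
    - Col1 = scale id
    - Remaining cols = one column per embedding model, each holding that scale's item-by-item cosine similarity matrix (stored as a JSON string)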


In [1]:
#source Jonas/bin/activate 
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
from openai import OpenAI
import torch
import json
import os

  from .autonotebook import tqdm as notebook_tqdm


# 1 Load items

In [2]:
# Read the item table from disk and preview the first rows.
# Later cells rely on the `item_text` and `scale_id` columns.
items_path = 'items.csv'
df = pd.read_csv(items_path)
df.head()

Unnamed: 0,item_id,scale_id,item_text,source_table
0,1,360emergencymed_azami_2024,She/he performs the clinical examinations acco...,360emergencymed_azami_2024
1,2,360emergencymed_azami_2024,The student provides accurate and authoritativ...,360emergencymed_azami_2024
2,3,360emergencymed_azami_2024,Uses appropriate communication skills while ta...,360emergencymed_azami_2024
3,4,360emergencymed_azami_2024,"She/he patiently, timely, and correctly perfor...",360emergencymed_azami_2024
4,5,360emergencymed_azami_2024,She/he knows the correct use of the medical eq...,360emergencymed_azami_2024


In [17]:
# Drop items without text: they cannot be embedded.
# The explicit .copy() makes the filtered frame independent of the original,
# so adding the "<model>_embeddings" columns later does not trigger pandas'
# SettingWithCopyWarning.
df = df[df['item_text'].notna()].copy()

In [6]:
# Initialize OpenAI client.
# The API key is read from the OPENAI_API_KEY environment variable (never
# hard-code it in the notebook) and passed explicitly; previously the result
# of os.environ.get(...) was computed and thrown away.
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"))

# 2 Compute Embeddings 

In [33]:
# Input texts come from df['item_text'].

# Embedding models to compare. Names prefixed with "openai:" are routed to the
# OpenAI embeddings API; every other name is loaded as a SentenceTransformer.
sentence_transformer_models = [
    'nli-distilroberta-base-v2',
    'paraphrase-multilingual-mpnet-base-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'intfloat/multilingual-e5-base',
    'LaBSE',
]

# --- Add OpenAI embedding models here ---
openai_embedding_models = [
    'openai:text-embedding-3-large',
    'openai:text-embedding-3-small',
]

# Order matters: later cells pair this list positionally with short names.
models = sentence_transformer_models + openai_embedding_models

# Helper function
def get_openai_embedding(model_name, text):
    """Embed one input with an OpenAI embedding model.

    Args:
        model_name: OpenAI model identifier, passed as ``model``.
        text: Input to embed, passed as ``input``.

    Returns:
        The ``embedding`` attribute of the first entry in the response's
        ``data`` list.

    Uses the module-level ``client`` to issue the request.
    """
    api_result = client.embeddings.create(input=text, model=model_name)
    first_entry = api_result.data[0]
    return first_entry.embedding


# Main loop: embed every item text with each model and store the vectors in a
# new "<model>_embeddings" column of df (one vector per row).
OPENAI_PREFIX = "openai:"

# Extract the texts once; the same list is reused for every model.
item_texts = df['item_text'].tolist()

for mod in models:
    print(f"Processing model: {mod}")

    if mod.startswith(OPENAI_PREFIX):
        # Strip only the leading routing prefix (str.replace would also
        # remove any later occurrence inside the model name).
        openai_model = mod[len(OPENAI_PREFIX):]
        item_embed = [get_openai_embedding(openai_model, text) for text in item_texts]
    else:
        # SentenceTransformer model: encode() accepts the full list and
        # batches internally, which is much faster than one call per text.
        # NOTE(review): batched inference pads sequences, so values may
        # differ from per-text encoding at floating-point rounding level.
        st_model = SentenceTransformer(mod)
        item_embed = list(st_model.encode(item_texts))

    # Save embeddings to dataframe (list of per-item vectors, row-aligned).
    df[f"{mod}_embeddings"] = item_embed

Processing model: nli-distilroberta-base-v2




Processing model: paraphrase-multilingual-mpnet-base-v2
Processing model: paraphrase-multilingual-MiniLM-L12-v2




Processing model: intfloat/multilingual-e5-base
Processing model: LaBSE




Processing model: openai:text-embedding-3-large
Processing model: openai:text-embedding-3-small


# 3 construct final dataframe

In [55]:
# Debug preview of the most recently computed similarity matrix as JSON.
# `sim_matrix` is only assigned by the cell that builds the final dataframe
# further down, so on a fresh kernel (Restart & Run All) it does not exist yet
# here; guard the lookup instead of raising NameError.
sim_preview = globals().get('sim_matrix')
json.dumps(sim_preview.tolist()) if sim_preview is not None else None

'[[1.0, 0.36389562487602234, 0.5199487209320068, 0.6566143035888672, 0.6517947316169739, 0.6644923090934753, 0.42835789918899536, 0.7001331448554993, 0.3956119120121002, 0.4204137325286865, 0.4455580711364746, 0.30995866656303406, 0.4128008186817169, 0.576221227645874, 0.6197668313980103, 0.36974090337753296, 0.2510111629962921, 0.39887556433677673, 0.5285356640815735, 0.35519278049468994, 0.3754984438419342, 0.45184066891670227, 0.27808818221092224, 0.30845436453819275, 0.2945639193058014, 0.44207605719566345, 0.37067273259162903, 0.6508740782737732, 0.5258058309555054, 0.6042841672897339, 0.2767714858055115, 0.5116041898727417, 0.33767393231391907, 0.45223569869995117, 0.2719181180000305, 0.19645234942436218, 0.42564064264297485, 0.4422115683555603, 0.10344786196947098, 0.3690079152584076, 0.46122369170188904, 0.45821571350097656, 0.4590514302253723, 0.2870608866214752, 0.40879034996032715, 0.5095987915992737, 0.4956972897052765, 0.41163668036460876, 0.5154732465744019, 0.45929035544

In [69]:
# Build the final table: one row per scale, holding the scale id plus, for
# each embedding model, the item-by-item cosine-similarity matrix of that
# scale serialised as a JSON string (so it survives the CSV round trip).
scales_corr_rows = []
model_short = ['distilroberta', 'mpnet', 'miniLM', 'e5', 'labse', 'gpt3-large', 'gpt3-small'
               ]

# The short names are matched to `models` by position; fail loudly if the two
# lists ever get out of sync instead of mislabelling or dropping columns.
if len(model_short) != len(models):
    raise ValueError(
        f"model_short has {len(model_short)} entries but models has {len(models)}"
    )

# groupby(sort=False) yields scales in order of first appearance (same order
# as df['scale_id'].unique()) without re-filtering the frame for every scale.
# Rows with a missing scale_id are skipped by groupby's default dropna=True.
for scale, temp_df in df.groupby('scale_id', sort=False):
    row = {"scale_id": scale}

    for mod, short_name in zip(models, model_short):
        col = mod + '_embeddings'

        # Stack the per-item vectors into one (n_items, dim) float32 array;
        # handing torch a single array avoids the slow list-of-ndarrays path.
        emb_matrix = np.asarray(temp_df[col].to_list(), dtype=np.float32)

        # Pairwise cosine similarity between all items of this scale
        sim_tensor = util.pytorch_cos_sim(
            torch.from_numpy(emb_matrix), torch.from_numpy(emb_matrix)
        )

        # Convert to numpy for further handling (.cpu() is a no-op on CPU)
        sim_matrix = sim_tensor.cpu().numpy()

        # Force exact 1.0 self-similarity (removes floating-point drift)
        np.fill_diagonal(sim_matrix, 1.0)

        # Store a JSON string, not a raw ndarray
        row[short_name] = json.dumps(sim_matrix.tolist())

    scales_corr_rows.append(row)

# Final DataFrame: one row per scale, one column per model (plus scale_id)
scales_corr_df = pd.DataFrame(scales_corr_rows)

scales_corr_df.to_csv('cosine_scales.csv', index=False)


In [71]:
# Sanity check: dimensions of the similarity matrix left over from the last
# scale/model combination processed by the loop above.
np.shape(sim_matrix)

(8, 8)