# What we do here

- Load items dataframe
- Embed items 
- Compute cosine similarity matrix per scale
- Construct the final data frame with one row per scale and the following columns
    - scale_id = scale identifier
    - one column per embedding model = cosine similarity matrix of that scale's items


In [121]:
#source Jonas/bin/activate 
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer, util
import torch

# 1 Load items

In [122]:
# Read the item table from disk and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='items.csv')
df.head(n=5)

Unnamed: 0,item_id,scale_id,item_text,source_table
0,1,360emergencymed_azami_2024,She/he performs the clinical examinations acco...,360emergencymed_azami_2024
1,2,360emergencymed_azami_2024,The student provides accurate and authoritativ...,360emergencymed_azami_2024
2,3,360emergencymed_azami_2024,Uses appropriate communication skills while ta...,360emergencymed_azami_2024
3,4,360emergencymed_azami_2024,"She/he patiently, timely, and correctly perfor...",360emergencymed_azami_2024
4,5,360emergencymed_azami_2024,She/he knows the correct use of the medical eq...,360emergencymed_azami_2024


In [123]:
# Keep only the rows whose item_text is present (not NaN).
df = df[df['item_text'].notna()]

# 2 Compute Embeddings 

In [124]:
# The texts that get embedded are the values of df['item_text'].

# Embedding models to run (SentenceTransformer and OpenAI models can be mixed).
# Names starting with "openai:" are routed to the OpenAI embeddings API by the
# main loop; every other name is loaded with SentenceTransformer.
# NOTE: the order of this list matters -- the per-scale cell further down pairs
# it positionally with `model_short`.
models = [
    'nli-distilroberta-base-v2',
    'paraphrase-multilingual-mpnet-base-v2',
    'paraphrase-multilingual-MiniLM-L12-v2',
    'intfloat/multilingual-e5-base',
    'LaBSE',
    
    # --- OpenAI embedding models (marked by the "openai:" prefix) ---
    'openai:text-embedding-3-large',
    'openai:text-embedding-3-small'
]

# Initialize OpenAI client.
# `OpenAI` is not brought in by the notebook's import cell, so on a fresh
# kernel the constructor call below failed with NameError. Import it right
# where it is needed so this cell runs on its own.
# NOTE(review): the client presumably picks up credentials from the
# environment (OPENAI_API_KEY) -- confirm against the openai-python docs.
from openai import OpenAI

client = OpenAI()

# Helper function
def get_openai_embedding(model_name, text):
    """Embed one input with an OpenAI model.

    Sends ``text`` to the embeddings endpoint of the module-level ``client``
    using ``model_name`` and returns the ``embedding`` attribute of the first
    entry in the response's ``data`` list.
    """
    api_result = client.embeddings.create(input=text, model=model_name)
    first_entry = api_result.data[0]
    return first_entry.embedding


# Main loop: embed every item text once per model and store the vectors in a
# new "<model>_embeddings" column of df.
#
# Texts are embedded in batches rather than one call per item: a single
# SentenceTransformer.encode() call over the whole list, and chunked requests
# for the OpenAI API. This removes thousands of tiny forward passes / HTTP
# round trips. Batched encoding can differ from per-item encoding in the last
# floating-point digits.
OPENAI_PREFIX = "openai:"
# Texts per OpenAI request. Kept well below the API's per-request input limits
# -- TODO confirm against the current API reference if items get much longer.
OPENAI_BATCH_SIZE = 256

# Materialise the texts once; every model embeds the same list in the same order.
item_texts = df['item_text'].tolist()

for mod in models:
    print(f"Processing model: {mod}")

    # Detect if this model is an OpenAI embedding model
    if mod.startswith(OPENAI_PREFIX):
        # Strip only the leading marker; str.replace would also rewrite any
        # later occurrence of the marker inside the model name.
        openai_model = mod[len(OPENAI_PREFIX):]

        item_embed = []
        for start in range(0, len(item_texts), OPENAI_BATCH_SIZE):
            response = client.embeddings.create(
                model=openai_model,
                input=item_texts[start:start + OPENAI_BATCH_SIZE],
            )
            # Each returned entry carries the index of the input it belongs
            # to; sort on it so the vectors line up with the rows of df.
            ordered = sorted(response.data, key=lambda entry: entry.index)
            item_embed.extend(entry.embedding for entry in ordered)

    else:
        # SentenceTransformer model: encode() takes the full list and batches
        # internally; the 2-D result is split into one vector per item.
        st_model = SentenceTransformer(mod)
        item_embed = list(st_model.encode(item_texts))

    # Save embeddings to dataframe
    df[f"{mod}_embeddings"] = item_embed

Processing model: nli-distilroberta-base-v2
Processing model: paraphrase-multilingual-mpnet-base-v2
Processing model: paraphrase-multilingual-MiniLM-L12-v2
Processing model: intfloat/multilingual-e5-base
Processing model: LaBSE
Processing model: openai:text-embedding-3-large
Processing model: openai:text-embedding-3-small


# 3 Compute cosine similarities and construct final dataframe

In [153]:
# Embedding column of the first entry in `models`
# (with the list above: "nli-distilroberta-base-v2_embeddings").
col = f"{models[0]}_embeddings"

# Stack the per-item vectors into one 2D array, one row per item.
emb_matrix = np.vstack(list(df[col]))

# Wrap the array in a torch tensor for the similarity helper.
emb_tensor = torch.tensor(emb_matrix)

# Pairwise cosine similarity of every item with every other item.
sim_matrix = util.pytorch_cos_sim(emb_tensor, emb_tensor)

In [154]:
# Show the all-items similarity tensor as a NumPy array (notebook cell output).
sim_matrix.numpy()

array([[ 1.0000002 ,  0.3741003 ,  0.50861573, ..., -0.0713194 ,
        -0.02239083,  0.0462351 ],
       [ 0.3741003 ,  0.99999934,  0.50856173, ..., -0.13052134,
        -0.07202323, -0.01962789],
       [ 0.50861573,  0.50856173,  0.9999996 , ...,  0.02674046,
         0.17618975,  0.18011466],
       ...,
       [-0.0713194 , -0.13052134,  0.02674046, ...,  1.0000001 ,
         0.5553119 ,  0.4885304 ],
       [-0.02239083, -0.07202323,  0.17618975, ...,  0.5553119 ,
         0.99999976,  0.58691716],
       [ 0.0462351 , -0.01962789,  0.18011466, ...,  0.4885304 ,
         0.58691716,  0.99999946]], shape=(1932, 1932), dtype=float32)

In [165]:
# Build one record per scale: the scale id plus, for every embedding model,
# the item-by-item cosine similarity matrix of that scale's items.
scales_corr_rows = []
model_short = ['distilroberta', 'mpnet', 'miniLM', 'e5', 'labse', 'gpt3-large', 'gpt3-small']

# `model_short` labels `models` by position, so a length mismatch would either
# crash midway or silently mislabel columns. Fail early instead.
if len(model_short) != len(models):
    raise ValueError(
        f"model_short has {len(model_short)} entries but models has "
        f"{len(models)}; the two lists must line up one-to-one"
    )

# One pass over df instead of re-filtering the whole frame for every scale.
# sort=False keeps the scales in order of first appearance, like unique().
# Rows with a missing scale_id are skipped by groupby.
for scale, temp_df in df.groupby('scale_id', sort=False):
    row = {"scale_id": scale}

    for model_name, short_name in zip(models, model_short):
        col = model_name + '_embeddings'

        # Convert the Series of vectors to a 2D numpy array
        emb_matrix = np.vstack(temp_df[col].to_list())

        # Convert to torch tensor
        emb_tensor = torch.tensor(emb_matrix)

        # Cosine similarity matrix (torch tensor)
        sim_tensor = util.pytorch_cos_sim(emb_tensor, emb_tensor)

        # Convert to numpy for further handling
        sim_matrix = sim_tensor.numpy()

        # Self-similarity comes out as 0.9999.../1.0000002 due to float
        # rounding; pin the diagonal to exactly 1.
        np.fill_diagonal(sim_matrix, 1.0)

        # Store matrix in the row under a short model name
        row[short_name] = sim_matrix

    scales_corr_rows.append(row)

import sys

# Final DataFrame: one row per scale, one column per model (plus scale_id)
scales_corr_df = pd.DataFrame(scales_corr_rows)

# The matrix cells are NumPy arrays, which end up in the CSV as their text
# rendering. With default print options NumPy abbreviates any array of more
# than 1000 elements with "...", so every scale with 32 or more items would be
# written with most of its matrix missing. Lifting the threshold for the
# duration of the write keeps the cell format the same but writes all values.
# NOTE(review): the text rendering is still awkward to parse back; consider an
# additional pickle/npz export if the matrices must be reloaded exactly.
with np.printoptions(threshold=sys.maxsize):
    scales_corr_df.to_csv('cosine_scales.csv')