In [2]:
'''
    Input files: input/report_keywords/???
    ???: for general: report_general_embeddings_freq.csv
    ???: for med: report_med_embeddings_freq.csv

    workflow: 
    1. construct list of common attributes
    2. rank using embeddings (cosine similarity)
    3. simply run by embeddings, rank by low similarity
'''

'\n    Input files: input/report_keywords/???\n    ???: for general: report_general_embeddings_freq.csv\n    ???: for med: report_med_embeddings_freq.csv\n\n    workflow: \n    1. construct list of common attributes\n    2. rank using embeddings (cosine similarity)\n    3. simply run by embeddings, rank by low similarity\n'

In [3]:
import numpy as np
import pandas as pd
from tqdm import tqdm

  from pandas.core import (


In [4]:
# ---------------------------------------------------------------------------
# Load the general and medical keyword tables and expose the columns that the
# later cells work with.
# ---------------------------------------------------------------------------
general_path = "../input/report_keywords/report_general_embeddings_freq.csv"
med_path = "../input/report_keywords/report_med_embeddings_freq.csv"

general_df, med_df = (pd.read_csv(csv_path) for csv_path in (general_path, med_path))

# Per-table column views: attribute name, UMLS_Embeddings column, frequency.
general_attributes, general_embeddings, general_freq = (
    general_df[column] for column in ("pretty_name", "UMLS_Embeddings", "frequency")
)
med_attributes, med_embeddings, med_freq = (
    med_df[column] for column in ("pretty_name", "UMLS_Embeddings", "frequency")
)

# Attribute names that appear verbatim in both tables.
common_attributes = list(set(med_attributes) & set(general_attributes))

# Accumulator for subset pairs, i.e. pairs where the words of one attribute are
# all contained in the other: "chest" and "right chest", or "chest" and
# "chest radiograph", would be considered subset pairs.
subset_pairs = []
 

In [5]:
   
# Find every (general, medical) attribute pair in which the word set of one
# attribute is a proper subset of the other's (case-insensitive, split on
# whitespace), and collect the pairs into `df_subsets`.

# Start from an empty list in this cell so that re-running it does not append
# a second copy of every pair.
subset_pairs = []

# Tokenise each medical attribute and pull its embedding/frequency exactly once;
# doing this inside the nested loop repeats the same work for every general
# attribute.
med_records = [
    (med_attr, set(med_attr.lower().split()), med_emb, med_frequency)
    for med_attr, med_emb, med_frequency in zip(med_attributes, med_embeddings, med_freq)
]

print(f'processing {len(general_attributes)} attributes.')
general_rows = zip(general_attributes, general_embeddings, general_freq)
for general_attr, general_emb, general_frequency in tqdm(
    general_rows,
    total=len(general_attributes),  # a zip has no len(); give tqdm the total for % and ETA
    desc='processing general attr',
):
    general_words = set(general_attr.lower().split())

    for med_attr, med_words, med_emb, med_frequency in med_records:
        # `<` on sets is the proper-subset test: subset of, and not equal to.
        if general_words < med_words:
            direction = "General ⊂ Medical"
        elif med_words < general_words:
            direction = "Medical ⊂ General"
        else:
            continue  # identical or unrelated word sets: not a subset pair

        subset_pairs.append({
            "General Attribute": general_attr,
            "General UMLS_Embeddings": general_emb,
            "General Frequency": general_frequency,
            "Medical Attribute": med_attr,
            "Medical UMLS_Embeddings": med_emb,
            "Medical Frequency": med_frequency,
            "Direction": direction,
        })

df_subsets = pd.DataFrame(subset_pairs)
df_subsets

processing 14628 attributes.


processing general attr: 14628it [02:22, 102.80it/s]


Unnamed: 0,General Attribute,General UMLS_Embeddings,General Frequency,Medical Attribute,Medical UMLS_Embeddings,Medical Frequency,Direction
0,chest,[ 3.03216279e-01 -3.65474582e-01 2.09655568e-...,22576,Right Chest,[ 2.49328896e-01 -2.63318449e-01 2.64106125e-...,757,General ⊂ Medical
1,chest,[ 3.03216279e-01 -3.65474582e-01 2.09655568e-...,22576,Radiologic Examination Of Chest,[ 8.54271799e-02 5.60153602e-03 -2.43190706e-...,455,General ⊂ Medical
2,chest,[ 3.03216279e-01 -3.65474582e-01 2.09655568e-...,22576,Chest Radiograph,[-1.32556647e-01 -1.37236774e-01 -1.72968626e-...,1924,General ⊂ Medical
3,chest,[ 3.03216279e-01 -3.65474582e-01 2.09655568e-...,22576,Surface Region Of Upper Chest,[ 2.41611525e-01 -4.28055078e-01 2.20263958e-...,49,General ⊂ Medical
4,chest,[ 3.03216279e-01 -3.65474582e-01 2.09655568e-...,22576,Surface Region Of Lower Chest,[ 2.57560223e-01 -4.40750510e-01 2.06421763e-...,48,General ⊂ Medical
...,...,...,...,...,...,...,...
9824,amylase,[ 1.84100211e-01 -5.87766588e-01 -3.98403853e-...,2,Amylase Lipase,[ 3.52496982e-01 -7.75270224e-01 -2.44756028e-...,1,General ⊂ Medical
9825,amylase,[ 1.84100211e-01 -5.87766588e-01 -3.98403853e-...,2,Increased Amylase,[ 5.13469517e-01 -3.00044030e-01 -1.72634929e-...,1,General ⊂ Medical
9826,fifteen,[ 1.90505952e-01 -3.21403056e-01 -4.59401272e-...,1,Fifteen Minutes,[ 2.01778680e-01 -7.24623442e-01 -7.80900180e-...,20,General ⊂ Medical
9827,cs,[ 2.31582314e-01 -1.79011300e-01 -3.28092635e-...,1,Aorta And Iliac Artery Cs,[ 1.33606598e-01 -2.01011568e-01 -2.30009526e-...,3,General ⊂ Medical


In [6]:
'''
    some cleaning, converting string to float
'''

import re
def convert_embedding(embedding_str):
    """Parse the text form of an embedding vector into a list of floats.

    Square brackets are dropped and the remaining text is split into numbers.
    Values may be separated by whitespace (including newlines), as in
    "[ 0.30 -0.36 0.21 ]", or by commas, as in "[0.30, -0.36, 0.21]".

    Args:
        embedding_str: String holding the serialized vector.

    Returns:
        A list of floats, or None if any token cannot be parsed as a float
        (for example the "..." of an elided array repr). In that case a
        message describing the failure is printed.
    """
    try:
        # Remove square brackets
        cleaned = embedding_str.strip().replace("[", "").replace("]", "")
        # Treat commas as separators too, so list-style "[0.1, 0.2]" parses
        # the same way as the whitespace-separated form.
        tokens = cleaned.replace(",", " ").split()
        return list(map(float, tokens))
    except ValueError as e:
        print(f"Error processing embedding: {embedding_str}, Error: {e}")
        return None
    
def _parse_if_string(cell):
    """Run convert_embedding on string cells; return any other value unchanged."""
    if isinstance(cell, str):
        return convert_embedding(cell)
    return cell


# Both embedding columns get the same treatment, so handle them in one loop.
for embedding_column in ("General UMLS_Embeddings", "Medical UMLS_Embeddings"):
    df_subsets[embedding_column] = df_subsets[embedding_column].apply(_parse_if_string)


In [9]:
from pathlib import Path

from sklearn.metrics.pairwise import cosine_similarity


def compute_cosine_similarity(embed1, embed2):
    """Return the cosine similarity of two 1-D embedding vectors as a scalar."""
    # cosine_similarity operates on 2-D inputs: wrap each vector as a single
    # row, then unwrap the only entry of the resulting 1x1 matrix.
    return cosine_similarity([embed1], [embed2])[0][0]


# convert_embedding() returns None for strings it cannot parse. Such rows cannot
# be scored, so leave them out rather than crash inside cosine_similarity.
scorable_subsets = df_subsets.dropna(
    subset=["General UMLS_Embeddings", "Medical UMLS_Embeddings"]
)

result_rows = []

# One output row per general attribute: its mean similarity to every medical
# attribute it was paired with.
grouped = scorable_subsets.groupby("General Attribute")
for general_attr, group in tqdm(grouped, desc="Processing General Attributes"):
    first_row = group.iloc[0]
    general_embedding = np.array(first_row["General UMLS_Embeddings"])
    general_frequency = first_row["General Frequency"]
    # A general attribute can be a subset of some medical attributes and a
    # superset of others, so report every direction present in the group
    # instead of only the first row's.
    direction = "; ".join(group["Direction"].unique())

    similarities = group["Medical UMLS_Embeddings"].apply(
        lambda med_emb: compute_cosine_similarity(general_embedding, np.array(med_emb))
    )

    mean_similarity = similarities.mean()

    concatenated_medical_attributes = "; ".join(group["Medical Attribute"])
    concatenated_medical_frequencies = "; ".join(map(str, group["Medical Frequency"]))

    # Append results
    result_rows.append({
        "General Attribute": general_attr,
        "General Frequency": general_frequency,
        "Matched Medical Attributes": concatenated_medical_attributes,
        "Matched Medical Frequencies": concatenated_medical_frequencies,
        "Mean Cosine Similarity": mean_similarity,
        "Direction": direction,
    })

# Lowest mean similarity first.
final_df = pd.DataFrame(result_rows).sort_values(
    by="Mean Cosine Similarity", ascending=True
)

# Create the output directory on demand so that to_csv does not fail when it
# is missing.
output_path = Path("output/ranked_cosine_similarity.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
final_df.to_csv(output_path, index=False)

# Keep the frame as the cell's last expression so the notebook displays it.
final_df


Processing General Attributes: 100%|██████████| 2582/2582 [00:02<00:00, 1136.81it/s]
