# Evaluating Semantic Relevance Between Queries and LOINC Codes using Text Similarity Analysis

In [299]:
#Libraries
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
import os


Possible queries:
- glucose in blood
- bilirubin in plasma
- White blood cells count

In [300]:
#Dataset file
###################### modify each time you want to perform a new query on a different sheet of the file excel
query = 'bilirubin in plasma'  

Dataset_path = r'/home/gabriele/Desktop/Polima/Biomedical Informatic/MachineLearningRanking-main/data/loinc_dataset-v2.xlsx'
Dataset = pd.read_excel(Dataset_path, header=2, sheet_name=query) #removing the first 2 columns (metadata)

print(Dataset.head())
print(Dataset.columns)

Dataset_loinc_num = Dataset.pop('loinc_num')

print(Dataset.columns)

  loinc_num                                   long_common_name  \
0     934-0                          Blood product unit ID [#]   
1    1742-6  Alanine aminotransferase [Enzymatic activity/v...   
2   20565-8      Carbon dioxide, total [Moles/volume] in Blood   
3    1959-6                Bicarbonate [Moles/volume] in Blood   
4   18906-8                     Ciprofloxacin [Susceptibility]   

                  component    system property  
0     Blood product unit ID      Dose      Num  
1  Alanine aminotransferase  Ser/Plas     CCnc  
2            Carbon dioxide       Bld     SCnc  
3               Bicarbonate       Bld     SCnc  
4             Ciprofloxacin   Isolate     Susc  
Index(['loinc_num', 'long_common_name', 'component', 'system', 'property'], dtype='object')
Index(['long_common_name', 'component', 'system', 'property'], dtype='object')


The interest is on understand how much is initially relevant a document with respect to a query. In order to do this we can use all the informations available from the dataset (common_long_name, component, system, property) to build an initial ranking using a model.

The model that can be used is a pretrained Word2Vec model, tuned specifically for this task. The Word2Vec model is useful because it can understand the patterns of semantic relationships and contextual similarity between biomedical terms, even when faced with abbreviations, synonyms, or paraphrasing.

Before doing this we need to manage the dataset in order to have clear sentences, joining the columns of long_common_name, component, system, property

In [301]:
Dataset = Dataset.to_numpy()

for i in range(3):
    print(Dataset[i])

print()
print()

corpus = [" ".join(row) for row in Dataset]
print(corpus)

title_to_loinc = dict(zip(corpus, Dataset_loinc_num))
print(title_to_loinc)



#Cleaning of phrases
sentences = []
for sentence in corpus:
    sentence = sentence.lower()
    sentence = sentence.replace('/',' ')
    cleaned_sentence = re.sub(r'[^a-z\s]', r' ', sentence)
    tokens = [token for token in cleaned_sentence.split() if token]
    sentences.append(tokens)

print(sentences)

['Blood product unit ID [#]' 'Blood product unit ID' 'Dose' 'Num']
['Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma'
 'Alanine aminotransferase' 'Ser/Plas' 'CCnc']
['Carbon dioxide, total [Moles/volume] in Blood' 'Carbon dioxide' 'Bld'
 'SCnc']


['Blood product unit ID [#] Blood product unit ID Dose Num', 'Alanine aminotransferase [Enzymatic activity/volume] in Serum or Plasma Alanine aminotransferase Ser/Plas CCnc', 'Carbon dioxide, total [Moles/volume] in Blood Carbon dioxide Bld SCnc', 'Bicarbonate [Moles/volume] in Blood Bicarbonate Bld SCnc', 'Ciprofloxacin [Susceptibility] Ciprofloxacin Isolate Susc', 'Ampicillin+Sulbactam [Susceptibility] Ampicillin+Sulbactam Isolate Susc', 'Methicillin resistant Staphylococcus aureus [Presence] in Unspecified specimen by Organism specific culture Staphylococcus aureus.methicillin resistant isolate XXX ACnc', 'ABO & Rh group [Type] in Blood ABO+Rh group Bld Type', 'Leukocytes [#/volume] in Blood Leukocytes Bld NCnc', 'B

In [302]:
#Libraries for Word2Vec
from gensim.models import Word2Vec

w2v_model = Word2Vec(sentences, vector_size=50, window=2, min_count=2, workers=4, epochs=10)
print(w2v_model.wv.index_to_key)



['in', 'volume', 'ser', 'plas', 'serum', 'or', 'plasma', 'blood', 'mass', 'mcnc', 'moles', 'glucose', 'bld', 'scnc', 'type', 'bilirubin', 'isolate', 'susceptibility', 'susc', 'calcium', 'group', 'total', 'albumin', 'product', 'abo', 'aminotransferase', 'c', 'protein', 'lymphocytes', 'ampicillin', 'reactive', 'ncnc', 'leukocytes', 'acnc', 'by', 'presence', 'choriogonadotropin', 'cholesterol', 'dioxide', 'carbon', 'ccnc', 'chloride', 'activity', 'enzymatic', 'ionized', 'rh', 'indirect', 'post', 'xxx', 'unit', 'glucuronidated', 'bpu', 'b', 'specific', 'tyrosine', 'corrected', 'subunit', 'beta', 'antibody', 'levofloxacin', 'ciprofloxacin', 'disposition', 'dna', 'urine', 'bicarbonate', 'fld', 'fluid', 'sulbactam', 'for', 'mscnc', 'alkaline', 'phosphatase', 'hematocrit', 'amylase', 'trimethoprim', 'cortisol', 'carcinoembryonic', 'sulfamethoxazole', 'alanine', 'id', 'bilirubinate', 'virus', 'aureus', 'methicillin', 'resistant', 'nitrofurantoin', 'crossmatch', 'major', 'screen', 'gentamicin', 

Now the word2vec model is trained, so there is the possibility to represent the words as vectors of numer (in this case of size 100).

To get a representation of a title is just needed to calculate the mean vector of all the sentences.

In [303]:
from nltk.tokenize import word_tokenize
# Function to get the vector representation of a phrase by averaging word vectors
def sentence_vector(phrase, model):
    words = word_tokenize(phrase.lower())  # Tokenize the phrase into words
    #print(words)
    word_vectors = [model.wv[word] for word in words if word in model.wv]  # Get vectors for words in the model's vocabulary
    #print(len(word_vectors))
    #print(word_vectors)

    #in this moment we have 300 numbers for each word, then 6*300 numbers (6 matrices)
    # due to the fact that we have 6 words

    if word_vectors:
        #print(len((np.mean(word_vectors, axis=0))))
        #we are summing 6 matrices of dimension 1x300, and then dividing everything for 6

        return np.mean(word_vectors, axis=0)  # Average the word vectors
    else:
        return np.zeros(model.vector_size)  # Return a zero vector if no words are in the model



Now it is possible to apply this function easily to all the sentences and get a measure of similarity (like using cosine similarity)

In [304]:
from sklearn.metrics.pairwise import cosine_similarity


query_vec = sentence_vector(query, w2v_model)

print(query)
#print(query_vec)

sentences_vecs = [sentence_vector(" ".join(sentence), w2v_model) for sentence in sentences]
#print(sentences_vecs)

similarities = cosine_similarity([query_vec], sentences_vecs).flatten()

# Ranking
ranking = np.argsort(similarities)[::-1]



results = []
for i in ranking:
    title = corpus[i]
    loinc_num = title_to_loinc.get(title, "NA")  # prende il numero LOINC, "NA" se non trovato
    relevance = similarities[i]
    results.append([title, loinc_num, relevance])
    #print(f"{title} → LOINC: {loinc_num} → relevance: {relevance:.3f}")

ranking_df = pd.DataFrame(results, columns=['long_common_name', 'loinc_num', 'similarity'])

print(ranking_df.head())

folder_path = os.path.join("data")
os.makedirs(folder_path, exist_ok=True)  # Crea la cartella se non esiste

# Definizione del path completo del file
output_path = os.path.join(folder_path, f"starting ranking {query}.xlsx")

ranking_df.to_excel(output_path, index=False)



bilirubin in plasma
                                    long_common_name loinc_num  similarity
0  Bilirubin.direct [Mass/volume] in Serum or Pla...    1968-7    0.802994
1  Bilirubin.total [Mass/volume] in Serum or Plas...    1975-2    0.776385
2  Bilirubin.indirect [Mass/volume] in Serum or P...    1971-1    0.751710
3  Bilirubin.indirect [Mass or Moles/volume] in S...   35192-4    0.746804
4  Cholesterol in HDL [Mass/volume] in Serum or P...    2085-9    0.737445
