In [1]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.metrics.pairwise import cosine_similarity

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
Load CPT Codes from Excel File
cpt_file = "cpt_codes.xlsx"
cpt_data = pd.read_excel(cpt_file)

# Extract codes and descriptions
cpt_descriptions = cpt_data["Description"].tolist()

In [3]:
pitch_text = "Behavioral Health Support for Primary Care Integrated, Virtual Mental Health Care for Family Medicine, Pediatrics and OBGYN OUR  MISSION Increasing Access to Mental Health Services in Underresourced Areas  While about a quarter of adults have a mental illness, 50 %  of the US population lives in a mental health shortage area.  [company name] sets out to help patients in areas where support is otherwise unavailable.  THE PROBLEM 25% 80% 50%  of PCP visits involve mental health Patients overwhelmingly bring their mental health concerns to primary care providers instead of going directly to specialty care. PCPs act as the de facto triage point but have limited support.of patients with mental health concerns seek support from primary care PCPs work in 15-minute appointments and typically have limited resources and training, putting a strain both on them and their patients.Of the US population lives in a mental health shortage area Given massive shortages, there are frequently 6 +  month waitlists for patients who need specialty mental health support.Communities Lack Access to Mental Healthcare Patients look to primary care as their front line of support, however providers are poorly equipped Collaborative Care ([name]) [name]  integrates physical and mental health care in the primary care setting through counseling and consulting psychiatry.  There have been 80 +  academic studies *  showing :  [name] is highly effective (2x vs. usual care) 1. [name] reduces cost of care substantially 2. [name] drives up patient and provider satisfaction 3. *See details here THE CARE MODEL [company name] Has Aligned Incentives With Primary Care Clinics Improve Outcomes  & Patient Satisfaction Studies of the [name] Model have shown that it is more than 2x as effective as usual care in driving mental health symptoms to remission.  Moreover, 75 %  of patients who are treated through [name] models are highly satisfied with their care. Activate A New  FFS Revenue Stream Collaborative care can generate significant new revenue for partner practices through FFS reimbursement. [company name] generates ~$ 10k in contrbution margin per engaged PCP / year. Improve Risk Adjustments & Reduce Cost of Care We can drive an additional increase in revenue through refining mental health diagnoses and improving risk adjustments. Additionally, [name] has been proven to reduce patient cost of care by $ 3,400 over 4 years. VALUE PROPOSITION. Thank you For more info, contact : [email]"


### BioBert & ClinicalBert

In [21]:
# BioBERT
biobert_tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
biobert_model = AutoModel.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

# ClinicalBERT
clinicalbert_tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
clinicalbert_model = AutoModel.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")


Some weights of the model checkpoint at dmis-lab/biobert-base-cased-v1.1 were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing Bert

In [24]:
def get_embedding(text, tokenizer, model, max_length=128):
    """Generate [CLS] token embeddings for the input text."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding="max_length", max_length=max_length)
    with torch.no_grad():
        outputs = model(**inputs)
    # Return the embedding for the [CLS] token
    return outputs.last_hidden_state[:, 0, :].squeeze(0).numpy()

# Step 4: Generate Embeddings for the Pitch Text and CPT Descriptions
pitch_embedding = get_embedding(pitch_text, clinicalbert_tokenizer, clinicalbert_model)

cpt_embeddings = []
for description in cpt_descriptions:
    cpt_embeddings.append(get_embedding(description, clinicalbert_tokenizer, clinicalbert_model))

# Convert CPT embeddings to a numpy array for similarity computation
import numpy as np
cpt_embeddings = np.array(cpt_embeddings)

# Step 5: Compute Cosine Similarities
similarities = cosine_similarity([pitch_embedding], cpt_embeddings).flatten()

# Step 6: Find the Top N Matches
top_n = 5
top_indices = similarities.argsort()[-top_n:][::-1]  # Indices of top N most similar descriptions
top_matches = cpt_data.iloc[top_indices].copy()
top_matches["Similarity"] = similarities[top_indices]

# Step 7: Display the Top Matches
print("Top Relevant CPT Codes:")
print(top_matches)


Top Relevant CPT Codes:
        Code                                        Description  Similarity
10658  3066F  Nephropathy is a kidney disease, which may occ...    0.924085
5328   59426  This service is considered a mini global code ...    0.920627
5327   59425  This service is considered a mini global code ...    0.917729
7062   80344  The lab analyst measures the amount of or dete...    0.913344
7190   80344  The lab analyst measures the amount of or dete...    0.913344


In [4]:
# Step 4: Define Helper Functions for Embedding Generation
def generate_embeddings(text, tokenizer, model):
    """Generates embeddings for the input text using the given tokenizer and model."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    outputs = model(**inputs)
    # Use [CLS] token embedding as the sentence embedding
    cls_embedding = outputs.last_hidden_state[:, 0, :].detach().numpy()
    return cls_embedding


In [5]:
# Step 5: Generate Embeddings for Pitch Deck Text and CPT Descriptions
def get_embeddings_for_cpt(cpt_descriptions, tokenizer, model):
    """Generates embeddings for all CPT descriptions."""
    cpt_embeddings = []
    for desc in cpt_descriptions:
        embedding = generate_embeddings(desc, tokenizer, model)
        cpt_embeddings.append(embedding)
    return torch.tensor(cpt_embeddings).squeeze()


In [8]:
# Generate embeddings for pitch text and CPT codes
biobert_pitch_embedding = generate_embeddings(pitch_text, biobert_tokenizer, biobert_model)
biobert_cpt_embeddings = get_embeddings_for_cpt(cpt_descriptions, biobert_tokenizer, biobert_model)

clinicalbert_pitch_embedding = generate_embeddings(pitch_text, clinicalbert_tokenizer, clinicalbert_model)
clinicalbert_cpt_embeddings = get_embeddings_for_cpt(cpt_descriptions, clinicalbert_tokenizer, clinicalbert_model)


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  return torch.tensor(cpt_embeddings).squeeze()
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [9]:
# Step 6: Calculate Cosine Similarities
def find_top_matches(pitch_embedding, cpt_embeddings, cpt_data, top_n=5):
    """Finds the top N CPT codes most similar to the input pitch text."""
    similarities = cosine_similarity(pitch_embedding, cpt_embeddings)
    top_indices = similarities.argsort()[0][-top_n:][::-1]
    top_matches = cpt_data.iloc[top_indices]
    top_matches["Similarity"] = similarities[0, top_indices]
    return top_matches


In [10]:

# BioBERT Results
print("Top Matches using BioBERT:")
biobert_top_matches = find_top_matches(biobert_pitch_embedding, biobert_cpt_embeddings, cpt_data)
print(biobert_top_matches)


Top Matches using BioBERT:
       Code                                        Description  Similarity
3481  37215  The provider places an intravascular stent int...    0.871257
6984  78812  PET scans are highly effective in the detectio...    0.868899
4738  51595  In this procedure, the provider surgically rem...    0.863706
6988  78816  The patient is injected with a radiopharmaceut...    0.863442
9808  95941  A provider other than the surgeon or anesthesi...    0.860780


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_matches["Similarity"] = similarities[0, top_indices]


In [11]:

# ClinicalBERT Results
print("Top Matches using ClinicalBERT:")
clinicalbert_top_matches = find_top_matches(clinicalbert_pitch_embedding, clinicalbert_cpt_embeddings, cpt_data)
print(clinicalbert_top_matches)


Top Matches using ClinicalBERT:
        Code                                        Description  Similarity
10907  0020M  This is an administrative multianalyte assay w...    0.907281
6803   77300  Radiation therapy is one of the most effective...    0.896859
10229  99392  Preventive medicine services are provided to i...    0.894350
10228  99391  Preventive medicine services are provided to i...    0.894350
7649   81560  The lab analyst performs a blood test to evalu...    0.893371


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top_matches["Similarity"] = similarities[0, top_indices]


### Fuzzy

In [27]:
from fuzzywuzzy import process

# Function to find the most relevant CPT codes for the given input text
def find_relevant_cpt_codes(pitch_text, cpt_data, top_n=5):
    cpt_descriptions = cpt_data["Description"].tolist()
    # Find the top matches for the extracted text
    matches = process.extract(pitch_text, cpt_descriptions, limit=top_n)
    
    # Get the corresponding codes and descriptions for the best matches
    relevant_codes = []
    for match in matches:
        description = match[0]
        score = match[1]
        code = cpt_data[cpt_data['Description'] == description]['Code'].values[0]
        relevant_codes.append({'Code': code, 'Description': description, 'Score': score})
    
    return relevant_codes

relevant_codes = find_relevant_cpt_codes(pitch_text, cpt_data)

# Display the relevant CPT codes
for code in relevant_codes:
    print(f"Code: {code['Code']}, Description: {code['Description']}, Score: {code['Score']}")


Code: 212, Description: The provider performs anesthesia services for a patient undergoing intracranial procedures involving subdural taps. In this procedure, another provider removes a little fluid from the subdural space, the fluid filled space in between the outer and middle membrane layers covering the brain. The surgical provider performs the procedure to decrease the excess intracranial fluid pressure being exerted on the brain tissues., Score: 86
Code: 474, Description: The provider performs anesthesia services for a patient undergoing a partial rib resection in which the surgical provider removes a portion of the ribs. The radical procedure involves other extensive measures for treating conditions such as pectus excavatum, which is a hollow depression in the center of the lower chest., Score: 86
Code: 563, Description: The provider performs anesthesia services for a patient undergoing a procedure involving the heart, the sac around the heart, and the great vessels of the chest,

### Name Entity Recognition

In [6]:
import spacy
nlp = spacy.load('en_core_web_sm')

# Function to find the most relevant CPT codes for the given input text using NER
def find_relevant_cpt_codes(pitch_text, cpt_data):
    # Process the extracted text with spaCy to identify named entities
    doc = nlp(pitch_text)
    
    # Extract relevant entities (e.g., medical terms, procedures)
    entities = [ent.text for ent in doc.ents if ent.label_ in ['MEDICAL_CONDITION', 'PROCEDURE']]
    
    # Find matching CPT codes based on the entities
    relevant_codes = []
    for entity in entities:
        matches = cpt_data[cpt_data['Description'].str.contains(entity, case=False, na=False)]
        for _, row in matches.iterrows():
            relevant_codes.append({'Code': row['Code'], 'Description': row['Description']})
    
    return relevant_codes

relevant_codes = find_relevant_cpt_codes(pitch_text, cpt_data)

relevant_codes


[]

### Topic Modeling

In [9]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

# Function to perform topic modeling using LDA to find the most relevant CPT codes
def find_relevant_cpt_codes_lda(pitch_text, cpt_data, num_topics=3, num_words=5):
    # Combine extracted text with CPT descriptions for LDA
    corpus = cpt_data['Description'].tolist() + [pitch_text]
    
    # Convert the corpus into a document-term matrix
    vectorizer = CountVectorizer(stop_words='english')
    dtm = vectorizer.fit_transform(corpus)
    
    # Fit the LDA model
    lda = LatentDirichletAllocation(n_components=num_topics, random_state=42)
    lda.fit(dtm)
    
    # Get the feature names (words)
    feature_names = vectorizer.get_feature_names_out()
    
    # Extract topics and words from the LDA model
    topics = lda.components_
    
    # Find the topics related to the extracted text (last document in the corpus)
    pitch_text_idx = len(corpus) - 1
    text_topics = lda.transform(dtm[pitch_text_idx])
    
    # Get the top words for each topic
    relevant_words = []
    for topic_idx, topic in enumerate(topics):
        top_words_idx = topic.argsort()[-num_words:][::-1]
        relevant_words.extend([feature_names[i] for i in top_words_idx])
    
    # Find matching CPT codes based on the relevant words
    relevant_codes = []
    for word in set(relevant_words):
        matches = cpt_data[cpt_data['Description'].str.contains(word, case=False, na=False)]
        for _, row in matches.iterrows():
            relevant_codes.append({'Code': row['Code'], 'Description': row['Description']})
    
    return relevant_codes

# Example usage
relevant_codes = find_relevant_cpt_codes_lda(pitch_text, cpt_data)

# Display the relevant CPT codes
# Display the relevant CPT codes (limit output)
for code in relevant_codes[:5]:  # Display only the first 10 matches
    print(f"Code: {code['Code']}, Description: {code['Description']}")

Code: 222, Description: The provider performs anesthesia services for a patient undergoing an intracranial procedure, including electrocoagulation of an intracranial nerve, which means to stop bleeding using electrocautery, or an electrical current.
Code: 190, Description: The provider performs anesthesia services for a patient undergoing a procedure on the facial bones or the skull.
Code: 192, Description: The provider performs anesthesia services for a patient undergoing radical surgery procedures on the facial bones or skull. This may include prognathism, which is a protrusion of the lower jaw.
Code: 210, Description: The provider performs anesthesia services for a patient undergoing intracranial procedures that are not specifically described by another anesthesia code.
Code: 211, Description: The provider performs anesthesia services for a patient undergoing intracranial procedures that involve removing a small portion of skull bone to take out a hematoma or blood clot from within 

### TF-IDF Matching
Corpus: combine all CPT descriptions and pitch text

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Corpus: combine all CPT descriptions and pitch text
# Function to perform TF-IDF matching to find the most relevant CPT codes
def find_relevant_cpt_codes_tfidf(pitch_text, cpt_codes_df, top_n=5):
    # Combine extracted text with CPT descriptions for TF-IDF
    corpus = cpt_data['Description'].tolist() + [pitch_text]
    
    # Convert the corpus into a TF-IDF matrix
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(corpus)
    
    # Calculate cosine similarity between the extracted text and all CPT descriptions
    cosine_similarities = cosine_similarity(tfidf_matrix[-1], tfidf_matrix[:-1]).flatten()
    
    # Get the indices of the top matches
    top_indices = cosine_similarities.argsort()[-top_n:][::-1]
    
    # Get the corresponding codes and descriptions for the best matches
    relevant_codes = []
    for idx in top_indices:
        relevant_codes.append({
            'Code': cpt_data.iloc[idx]['Code'],
            'Description': cpt_data.iloc[idx]['Description'],
            'Score': cosine_similarities[idx]
        })
    
    return relevant_codes

relevant_codes = find_relevant_cpt_codes_tfidf(pitch_text, cpt_data)

# Display the relevant CPT codes
for code in relevant_codes:
    print(f"Code: {code['Code']}, Description: {code['Description']}, Score: {code['Score']}")


Code: 99375, Description: Report this service when the provider supervises the care provided by a home health agency, overseeing the plan of care, while the patient is at home or in a facility meant for long–term care. For 99375 the provider should spend 30 minutes or more on a patient’s indirect care., Score: 0.2989245353553636
Code: 96156, Description: The provider assesses psychological, behavioral, emotional, cognitive, and social factors that affect a patient’s physical health, rather than assessing a specific mental health disorder. There are no time limits applied to this code. Report this code for an initial or repeat health behavior assessment., Score: 0.2739189946849235
Code: 99493, Description: A provider performs psychiatric collaborative care management (CoCM) for a patient receiving behavioral health treatment and regular psychiatric interspecialty consultation in collaboration and in conjunction with a patient’s treating (or billing) primary care provider.?Report?99493?f

Corpus: combine description for the CPT in this row and pitch text

In [13]:
def find_relevant_cpt_codes_tfidf(pitch_text, cpt_data, top_n=5):
    # Create a list to store similarity scores
    relevant_codes = []
    
    # Corpus: combine description for the CPT in this row and pitch text
    # Iterate through each CPT description to compare with the extracted text
    for index, row in cpt_data.iterrows():
        corpus = [row['Description'], pitch_text]
        
        # Convert the corpus into a TF-IDF matrix
        vectorizer = TfidfVectorizer(stop_words='english')
        tfidf_matrix = vectorizer.fit_transform(corpus)
        
        # Calculate cosine similarity between the extracted text and the CPT description
        cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2]).flatten()[0]
        
        # Append the code, description, and similarity score
        relevant_codes.append({
            'Code': row['Code'],
            'Description': row['Description'],
            'Score': cosine_sim
        })
    
    # Sort the relevant codes by similarity score in descending order and return the top matches
    relevant_codes = sorted(relevant_codes, key=lambda x: x['Score'], reverse=True)[:top_n]
    return relevant_codes

relevant_codes = find_relevant_cpt_codes_tfidf(pitch_text, cpt_data)

# Display the relevant CPT codes
for code in relevant_codes:
    print(f"Code: {code['Code']}, Description: {code['Description']}, Score: {code['Score']}")

Code: 99375, Description: Report this service when the provider supervises the care provided by a home health agency, overseeing the plan of care, while the patient is at home or in a facility meant for long–term care. For 99375 the provider should spend 30 minutes or more on a patient’s indirect care., Score: 0.2920122671021555
Code: 99493, Description: A provider performs psychiatric collaborative care management (CoCM) for a patient receiving behavioral health treatment and regular psychiatric interspecialty consultation in collaboration and in conjunction with a patient’s treating (or billing) primary care provider.?Report?99493?for the first 60 minutes of CoCM in a subsequent month after the first month of care.?, Score: 0.2581073391292682
Code: 99374, Description: Report this service when the provider oversees the plan of care provided by a home health agency while the patient is at home or in a facility meant for long–term care. For 99374 the provider should spend a minimum 15–2