## Fuzzy Matching

In [None]:
from fuzzywuzzy import process

def fuzzy_match(preprocessed_text, preprocessed_cpt_df, threshold=70):
    """
    Match each pitch-deck text to its closest CPT description by fuzzy score.

    Args:
        preprocessed_text (list): Preprocessed text strings from the pitch deck.
        preprocessed_cpt_df (pd.DataFrame): DataFrame with 'Code' and
            'Description' columns.
        threshold (int): Minimum score for fuzzy matching (0-100).

    Returns:
        matched_results (list): One (cpt_code, description, score) tuple per
            input text whose best score reached ``threshold``. Texts without a
            qualifying match are omitted.
    """
    descriptions = preprocessed_cpt_df['Description'].tolist()
    codes = preprocessed_cpt_df['Code'].to_numpy()

    # Key the choices by row position. fuzzywuzzy treats any object exposing
    # .items() (a dict, but also a pandas Series) as a mapping and then returns
    # (choice, score, key); passing the Series directly therefore breaks a
    # two-value unpack. With an explicit dict the returned key is the row
    # position, so no DataFrame scan is needed to recover the code.
    choices = dict(enumerate(descriptions))

    matched_results = []
    if not choices:
        return matched_results

    for text in preprocessed_text:
        # score_cutoff makes extractOne return None when nothing reaches the
        # threshold (it keeps candidates with score >= cutoff).
        best = process.extractOne(text, choices, score_cutoff=threshold)
        if best is None:
            continue
        best_match, score, position = best
        matched_results.append((codes[position], best_match, score))

    return matched_results

# Demo: run the fuzzy matcher on processed_text / processed_cpt_df
# (both defined outside this section) and print every hit.
fuzzy_matches = fuzzy_match(
    preprocessed_text=processed_text,
    preprocessed_cpt_df=processed_cpt_df,
)
for match in fuzzy_matches:
    print(f"CPT Code: {match[0]}, Description: {match[1]}, Score: {match[2]}")

## Synonym-Based Matching (Word Embeddings with Word2Vec)

In [None]:
import gensim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# preprocessed_text entries may be token lists or raw strings; CPT descriptions
# are plain strings and are tokenized inside synonym_match.
def synonym_match(preprocessed_text, preprocessed_cpt_df):
    """
    Perform synonym-based matching using Word2Vec embeddings.

    A Word2Vec model is trained on the pitch-deck tokens plus the tokenized
    CPT descriptions, and each text is paired with the description that has
    the highest ``n_similarity`` to it.

    Args:
        preprocessed_text (list): Pitch-deck entries, each either a list of
            tokens or a raw string (strings are tokenized with word_tokenize).
        preprocessed_cpt_df (pd.DataFrame): DataFrame with 'Code' and
            'Description' columns; descriptions are tokenized here.

    Returns:
        matched_results (list): (cpt_code, description, similarity) tuples,
            one per text that has at least one description with similarity
            above 0. Texts or descriptions with no tokens are skipped.
    """
    text_tokens = [
        word_tokenize(entry) if isinstance(entry, str) else list(entry)
        for entry in preprocessed_text
    ]

    # Tokenize every description once instead of once per (text, row) pair.
    cpt_rows = [
        (code, description, word_tokenize(description))
        for code, description in zip(
            preprocessed_cpt_df['Code'].tolist(),
            preprocessed_cpt_df['Description'].tolist(),
        )
    ]

    matched_results = []
    if not text_tokens or not cpt_rows:
        return matched_results

    # Combine all text data (pitch deck + CPT descriptions) to train the model.
    combined_data = text_tokens + [tokens for _, _, tokens in cpt_rows]
    if not any(combined_data):
        # No tokens at all: Word2Vec cannot build a vocabulary.
        return matched_results

    model = Word2Vec(sentences=combined_data, vector_size=100, window=5, min_count=1, workers=4)

    for tokens in text_tokens:
        if not tokens:
            # n_similarity raises ZeroDivisionError on an empty token list.
            continue

        # Starting at 0 means only strictly positive similarities can win.
        max_similarity = 0
        best_match = None
        best_code = None

        for code, description, description_tokens in cpt_rows:
            if not description_tokens:
                continue
            similarity = model.wv.n_similarity(tokens, description_tokens)
            if similarity > max_similarity:
                max_similarity = similarity
                best_match = description
                best_code = code

        if best_match is not None:
            matched_results.append((best_code, best_match, max_similarity))

    return matched_results

# Demo: run the Word2Vec matcher on processed_text / processed_cpt_df
# (both defined outside this section) and print every hit.
synonym_matches = synonym_match(
    preprocessed_text=processed_text,
    preprocessed_cpt_df=processed_cpt_df,
)
for match in synonym_matches:
    print(f"CPT Code: {match[0]}, Description: {match[1]}, Similarity: {match[2]}")

## TF-IDF Matching

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_match(preprocessed_text, preprocessed_cpt_df):
    """
    Perform matching using TF-IDF and cosine similarity.

    The vectorizer is fitted on the pitch-deck texts and the CPT descriptions
    together, and each text is paired with the description whose TF-IDF vector
    has the highest cosine similarity to it.

    Args:
        preprocessed_text (list): Preprocessed text strings from the pitch deck.
        preprocessed_cpt_df (pd.DataFrame): DataFrame with 'Code' and
            'Description' columns.

    Returns:
        matched_results (list): One (cpt_code, description, similarity) tuple
            per input text. Empty when there are no texts or no CPT rows.
            The best row is reported even when its similarity is 0.
    """
    texts = list(preprocessed_text)
    descriptions = preprocessed_cpt_df['Description'].tolist()

    matched_results = []
    if not texts or not descriptions:
        # Nothing to compare; avoids fitting on an empty corpus and calling
        # argmax over zero CPT rows.
        return matched_results

    # Fit on the combined corpus so both sides share one vocabulary.
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(texts + descriptions)

    # Split the matrix back into pitch-deck rows and CPT rows.
    text_matrix = tfidf_matrix[:len(texts)]
    cpt_matrix = tfidf_matrix[len(texts):]

    # One call yields the full texts-by-descriptions similarity table.
    similarity = cosine_similarity(text_matrix, cpt_matrix)
    best_indices = similarity.argmax(axis=1)

    codes = preprocessed_cpt_df['Code']
    for text_idx, best_match_idx in enumerate(best_indices):
        matched_results.append((
            codes.iloc[best_match_idx],
            descriptions[best_match_idx],
            similarity[text_idx, best_match_idx],
        ))

    return matched_results

# Demo: run the TF-IDF matcher on processed_text / processed_cpt_df
# (both defined outside this section) and print every hit.
tfidf_matches = tfidf_match(
    preprocessed_text=processed_text,
    preprocessed_cpt_df=processed_cpt_df,
)
for match in tfidf_matches:
    print(f"CPT Code: {match[0]}, Description: {match[1]}, Similarity: {match[2]}")

## Cosine Similarity (With BERT Embeddings)

In [None]:
from transformers import BertTokenizer, BertModel
import torch

def cosine_similarity_bert(preprocessed_text, preprocessed_cpt_df):
    """
    Perform cosine similarity matching using BERT embeddings.

    Each text and each CPT description is embedded as the mean of BERT's last
    hidden state over the token dimension; every text is then paired with the
    description whose embedding has the highest cosine similarity.

    Args:
        preprocessed_text (list): Preprocessed text strings from the pitch deck.
        preprocessed_cpt_df (pd.DataFrame): DataFrame with 'Code' and
            'Description' columns.

    Returns:
        matched_results (list): One (cpt_code, description, similarity) tuple
            per input text. Empty when there are no texts or no CPT rows.
    """
    texts = list(preprocessed_text)
    descriptions = preprocessed_cpt_df['Description'].tolist()

    matched_results = []
    if not texts or not descriptions:
        # torch.cat rejects an empty list; returning here also skips the
        # model load when there is nothing to match.
        return matched_results

    # Load pre-trained BERT model and tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    def get_embeddings(text):
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        # Inference only: no_grad avoids building an autograd graph that the
        # original discarded with .detach() after the fact.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    # Each string is encoded on its own, then the results are stacked row-wise.
    text_embeddings = torch.cat([get_embeddings(text) for text in texts])
    cpt_embeddings = torch.cat([get_embeddings(desc) for desc in descriptions])

    codes = preprocessed_cpt_df['Code']
    for text_embedding in text_embeddings:
        # Compare this text against every CPT description at once.
        similarity = torch.nn.functional.cosine_similarity(text_embedding.unsqueeze(0), cpt_embeddings)

        best_match_idx = similarity.argmax().item()
        matched_results.append((
            codes.iloc[best_match_idx],
            descriptions[best_match_idx],
            similarity[best_match_idx].item(),
        ))

    return matched_results

# Demo: run the BERT cosine-similarity matcher on processed_text /
# processed_cpt_df (both defined outside this section) and print every hit.
bert_matches = cosine_similarity_bert(
    preprocessed_text=processed_text,
    preprocessed_cpt_df=processed_cpt_df,
)
for match in bert_matches:
    print(f"CPT Code: {match[0]}, Description: {match[1]}, Similarity: {match[2]}")

## Named Entity Recognition (NER)

In [None]:
import spacy

# Load a pre-trained spaCy pipeline once at module level; ner_match reads
# this global `nlp` for every text it processes.
nlp = spacy.load('en_core_web_sm')  # You can use 'en_core_sci_md' for a biomedical model

def ner_match(preprocessed_text, preprocessed_cpt_df,
              entity_labels=('MEDICAL_CONDITION', 'PROCEDURE', 'SYMPTOM')):
    """
    Perform NER-based matching: extract entities with spaCy and report every
    CPT row whose description contains an entity's text as a substring.

    Args:
        preprocessed_text (list): Preprocessed text strings from the pitch deck.
        preprocessed_cpt_df (pd.DataFrame): DataFrame with 'Code' and
            'Description' columns.
        entity_labels (iterable of str or None): Entity labels to keep. Pass
            None to keep every entity the pipeline produces. The default
            preserves the labels this function has always filtered on.

    Returns:
        matched_results (list): (cpt_code, description, entity_text) tuples,
            ordered by text, then entity, then CPT row. The same CPT row can
            appear several times if several entities match it.
    """
    # NOTE(review): the default labels are not in the label scheme documented
    # for 'en_core_web_sm', so with that pipeline this filter is expected to
    # reject every entity. Confirm which model/labels are intended, or pass
    # entity_labels explicitly.
    wanted_labels = None if entity_labels is None else set(entity_labels)

    # Materialize the CPT rows once instead of running iterrows per entity.
    cpt_rows = list(zip(
        preprocessed_cpt_df['Code'].tolist(),
        preprocessed_cpt_df['Description'].tolist(),
    ))

    matched_results = []
    if not cpt_rows:
        return matched_results

    for text in preprocessed_text:
        # Process text with spaCy NER
        doc = nlp(text)

        for ent in doc.ents:
            if wanted_labels is not None and ent.label_ not in wanted_labels:
                continue
            entity = ent.text
            # Plain substring containment, case-sensitive.
            for code, description in cpt_rows:
                if entity in description:
                    matched_results.append((code, description, entity))

    return matched_results

# Demo: run the NER matcher on processed_text / processed_cpt_df
# (both defined outside this section) and print every hit.
ner_matches = ner_match(
    preprocessed_text=processed_text,
    preprocessed_cpt_df=processed_cpt_df,
)
for match in ner_matches:
    print(f"CPT Code: {match[0]}, Description: {match[1]}, Matched Entity: {match[2]}")


## Topic Modeling (LDA)

In [None]:
from gensim import corpora, models
from gensim.utils import simple_preprocess

def lda_topic_modeling(preprocessed_text, preprocessed_cpt_df, num_topics=5):
    """
    Perform topic modeling using LDA to match the pitch deck with CPT
    descriptions based on topics.

    An LDA model is trained on the pitch-deck texts plus the CPT descriptions.
    For each text, its highest-probability topic is found and every CPT row
    whose reported topics include that topic is emitted.

    Args:
        preprocessed_text (list): Preprocessed text strings from the pitch deck.
        preprocessed_cpt_df (pd.DataFrame): DataFrame with 'Code' and
            'Description' columns.
        num_topics (int): Number of topics to extract.

    Returns:
        matched_results (list): (cpt_code, description, topic_words) tuples,
            where topic_words is the ``show_topic`` output for the text's best
            topic. Empty when there are no texts, no CPT rows, or no terms.
    """
    texts = list(preprocessed_text)
    descriptions = preprocessed_cpt_df['Description'].tolist()
    codes = preprocessed_cpt_df['Code'].tolist()

    matched_results = []
    if not texts or not descriptions:
        return matched_results

    # Tokenize each document once and reuse the tokens for both the
    # dictionary and the bag-of-words corpus.
    text_tokens = [simple_preprocess(text) for text in texts]
    cpt_tokens = [simple_preprocess(description) for description in descriptions]

    dictionary = corpora.Dictionary(text_tokens + cpt_tokens)
    if len(dictionary) == 0:
        # No terms survived preprocessing; LDA cannot be trained.
        return matched_results

    text_bows = [dictionary.doc2bow(tokens) for tokens in text_tokens]
    cpt_bows = [dictionary.doc2bow(tokens) for tokens in cpt_tokens]

    # Train LDA model (same document order as before: texts, then CPT rows).
    lda_model = models.LdaModel(
        corpus=text_bows + cpt_bows,
        id2word=dictionary,
        num_topics=num_topics,
        random_state=100,
    )

    # Infer each CPT row's topics once, rather than once per pitch-deck text.
    cpt_topic_ids = [
        {topic_id for topic_id, _ in lda_model.get_document_topics(bow)}
        for bow in cpt_bows
    ]

    for bow in text_bows:
        topics = lda_model.get_document_topics(bow)
        if not topics:
            # No topic was reported for this text, so there is no best topic.
            continue

        best_topic_id, _ = max(topics, key=lambda pair: pair[1])
        best_topic_words = lda_model.show_topic(best_topic_id)

        for code, description, topic_ids in zip(codes, descriptions, cpt_topic_ids):
            if best_topic_id in topic_ids:
                matched_results.append((code, description, best_topic_words))

    return matched_results

# Demo: run the LDA topic matcher on processed_text / processed_cpt_df
# (both defined outside this section) and print every hit.
lda_matches = lda_topic_modeling(
    preprocessed_text=processed_text,
    preprocessed_cpt_df=processed_cpt_df,
)
for match in lda_matches:
    print(f"CPT Code: {match[0]}, Description: {match[1]}, Topic Words: {match[2]}")

## BERT-Based Matching

In [None]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT tokenizer and model once at module level;
# bert_match reads these globals instead of loading them on every call.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

def bert_match(preprocessed_text, preprocessed_cpt_df):
    """
    Perform BERT-based matching using embeddings and cosine similarity.

    Uses the module-level ``tokenizer`` and ``model``. Each text and each CPT
    description is embedded as the mean of BERT's last hidden state over the
    token dimension; every text is then paired with the description whose
    embedding has the highest cosine similarity.

    Args:
        preprocessed_text (list): Preprocessed text strings from the pitch deck.
        preprocessed_cpt_df (pd.DataFrame): DataFrame with 'Code' and
            'Description' columns.

    Returns:
        matched_results (list): One (cpt_code, description, similarity) tuple
            per input text. Empty when there are no texts or no CPT rows.
    """
    texts = list(preprocessed_text)
    descriptions = preprocessed_cpt_df['Description'].tolist()

    matched_results = []
    if not texts or not descriptions:
        # torch.cat rejects an empty list, and there is nothing to match.
        return matched_results

    def get_bert_embeddings(text):
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True)
        # Inference only: no_grad avoids building an autograd graph that the
        # original discarded with .detach() after the fact.
        with torch.no_grad():
            outputs = model(**inputs)
        return outputs.last_hidden_state.mean(dim=1)

    # Each string is encoded on its own, then the results are stacked row-wise.
    text_embeddings = torch.cat([get_bert_embeddings(text) for text in texts])
    cpt_embeddings = torch.cat([get_bert_embeddings(desc) for desc in descriptions])

    codes = preprocessed_cpt_df['Code']
    for text_embedding in text_embeddings:
        # Compare this text against every CPT description at once.
        similarity = torch.nn.functional.cosine_similarity(text_embedding.unsqueeze(0), cpt_embeddings)

        best_match_idx = similarity.argmax().item()
        matched_results.append((
            codes.iloc[best_match_idx],
            descriptions[best_match_idx],
            similarity[best_match_idx].item(),
        ))

    return matched_results

# Demo: run the BERT matcher on processed_text / processed_cpt_df
# (both defined outside this section) and print every hit.
bert_matches = bert_match(
    preprocessed_text=processed_text,
    preprocessed_cpt_df=processed_cpt_df,
)
for match in bert_matches:
    print(f"CPT Code: {match[0]}, Description: {match[1]}, Similarity: {match[2]}")