In [None]:
import pandas as pd
from rouge_score import rouge_scorer
import spacy

# Load the spaCy pipeline. preprocess_text() relies on it for the stop-word
# and punctuation flags, the POS tags and the lemmas of each token.
nlp = spacy.load("en_core_web_sm")

# Load CSV files
# patients_df: cases to classify; the 'Patient', 'Description' and 'Specialist'
# columns are read by the functions below.
patients_df = pd.read_csv("../filtered_specialists.csv")  # All rows processed
# Uncomment the next line to try the pipeline on the first 10 rows only.
# patients_df=patients_df.head(10)
# specialities_df: candidate specialities; the 'Speciality' and 'Description'
# columns are read by find_best_speciality().
specialities_df = pd.read_csv("specialist_description.csv")

# Preprocessing function
def preprocess_text(text):
    """Reduce *text* to a space-separated string of lower-cased lemmas.

    Missing values (anything ``pd.isna`` reports as NA) yield ``""``.
    Stop words and punctuation are always removed.  Every remaining token is
    emitted ``int(weight)`` times, where the weight depends on its POS tag.
    Because ``int`` truncates, verbs (0.8) and adpositions (0.5) are dropped
    and every other token is kept exactly once.
    """
    if pd.isna(text):
        return ""

    # POS tag -> weight; tags that are not listed use the default of 1.0.
    pos_weights = {
        "NOUN": 1.5,
        "PROPN": 1.5,
        "ADJ": 1.2,
        "ADV": 1.2,
        "VERB": 0.8,
        "ADP": 0.5,
    }

    lemmas = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        # Truncation is intentional here: weights below 1 remove the token.
        copies = int(pos_weights.get(tok.pos_, 1.0))
        lemmas += [tok.lemma_.lower()] * copies

    return " ".join(lemmas)

# Calculate weighted ROUGE scores
def calculate_rouge_scores(speciality, symptom_desc, diagnosis, speciality_desc):
    """Score how well one speciality matches one patient case.

    The patient text is the preprocessed symptom description followed by the
    preprocessed diagnosis; the speciality text is the preprocessed speciality
    name followed by its preprocessed description.  Both pairs are joined by a
    single space.

    Returns:
        ``0.7 * ROUGE-1 F-measure + 0.3 * ROUGE-L F-measure`` of the
        speciality text scored against the patient text.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    patient_text = " ".join(
        (preprocess_text(symptom_desc), preprocess_text(diagnosis))
    )
    speciality_text = " ".join(
        (preprocess_text(speciality), preprocess_text(speciality_desc))
    )

    result = scorer.score(patient_text, speciality_text)

    # Fixed blend: unigram overlap dominates, longest common subsequence refines.
    return result['rouge1'].fmeasure * 0.7 + result['rougeL'].fmeasure * 0.3

# Find the best speciality
def find_best_speciality(symptom_desc, diagnosis, specialities_df):
    """Return the highest-scoring speciality for one patient case.

    Every row of *specialities_df* (columns ``'Speciality'`` and
    ``'Description'``) is scored with ``calculate_rouge_scores``.  On ties the
    earliest row wins because only a strictly greater score replaces the
    current best.

    Returns:
        ``(speciality, score)``; ``(None, -1.0)`` when the table has no rows.
    """
    winner, top_score = None, -1.0

    candidates = zip(specialities_df['Speciality'], specialities_df['Description'])
    for name, description in candidates:
        candidate_score = calculate_rouge_scores(name, symptom_desc, diagnosis, description)
        if candidate_score > top_score:
            winner, top_score = name, candidate_score

    return winner, top_score

# Assign predicted specialities to DataFrame
def assign_predicted_speciality(patients_df, specialities_df):
    """Add a ``'Predicted_Speciality'`` column to *patients_df* and return it.

    For each row, the ``'Patient'`` (symptom description) and ``'Description'``
    (diagnosis) values are passed to ``find_best_speciality`` and the winning
    speciality name is stored.

    The predictions are collected in row order and assigned as a whole column.
    Unlike per-row ``.at[index, ...]`` writes, this does not depend on the
    index labels being unique, and the column is created even when the frame
    has no rows.

    Args:
        patients_df: DataFrame with ``'Patient'`` and ``'Description'`` columns.
            It is modified in place.
        specialities_df: Candidate specialities, forwarded unchanged to
            ``find_best_speciality``.

    Returns:
        The same *patients_df* object, now with ``'Predicted_Speciality'``.
    """
    predictions = [
        find_best_speciality(symptom_desc, diagnosis, specialities_df)[0]
        for symptom_desc, diagnosis in zip(patients_df['Patient'], patients_df['Description'])
    ]
    patients_df['Predicted_Speciality'] = predictions
    return patients_df

# Calculate accuracy (optional)
def calculate_accuracy(patients_df):
    """Return the fraction of rows whose prediction matches the reference.

    ``'Predicted_Speciality'`` is compared with ``'Specialist'`` after
    lower-casing both, so the match is case-insensitive.
    """
    predicted = patients_df['Predicted_Speciality'].str.lower()
    expected = patients_df['Specialist'].str.lower()
    hits = predicted.eq(expected).sum()
    return hits / len(patients_df)

# Main execution
# Assign predicted specialities: one find_best_speciality() call per patient
# row; the result is stored in a 'Predicted_Speciality' column.
patients_df = assign_predicted_speciality(patients_df, specialities_df)

# Save to new CSV with all original data plus predicted speciality
# (index=False keeps the DataFrame index out of the file).
patients_df.to_csv("patients_with_predicted_speciality.csv", index=False)

# Optional: Calculate and print accuracy, i.e. the share of rows where
# 'Predicted_Speciality' equals 'Specialist' ignoring case, as a percentage.
accuracy = calculate_accuracy(patients_df)
print(f"Accuracy: {accuracy * 100:.2f}%")

In [None]:
import pandas as pd
from rouge_score import rouge_scorer
import spacy

# Load the spaCy pipeline. preprocess_text() relies on it for the POS tags,
# the stop-word and punctuation flags and the lemmas of each token.
nlp = spacy.load("en_core_web_sm")

# Load CSV files
# patients_df supplies the 'Patient', 'Description' and 'Specialist' columns;
# specialities_df supplies 'Speciality' and 'Description'.
patients_df = pd.read_csv("../filtered_specialists.csv")  # All rows processed
specialities_df = pd.read_csv("specialist_description.csv")

# Preprocessing function to extract nouns, adjectives, and adverbs
def preprocess_text(text):
    """Keep only the content words of *text*, as lower-cased lemmas.

    A token survives when its POS tag is NOUN, PROPN, ADJ or ADV and it is
    neither a stop word nor punctuation.  Missing values (per ``pd.isna``)
    yield ``""``.

    Returns:
        The surviving lemmas joined by single spaces.
    """
    if pd.isna(text):
        return ""

    content_pos = ("NOUN", "PROPN", "ADJ", "ADV")
    return " ".join(
        tok.lemma_.lower()
        for tok in nlp(text)
        if tok.pos_ in content_pos and not (tok.is_stop or tok.is_punct)
    )

# Calculate ROUGE scores
def calculate_rouge_scores(speciality, symptom_desc, diagnosis, speciality_desc):
    """Score how well one speciality matches one patient case.

    All four inputs go through ``preprocess_text``.  The patient side is
    "symptoms diagnosis" and the speciality side is "name description", each
    joined by one space.

    Returns:
        The plain average of the ROUGE-1 and ROUGE-L F-measures.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    patient_parts = [preprocess_text(symptom_desc), preprocess_text(diagnosis)]
    speciality_parts = [preprocess_text(speciality), preprocess_text(speciality_desc)]

    result = scorer.score(" ".join(patient_parts), " ".join(speciality_parts))
    unigram_f = result['rouge1'].fmeasure
    lcs_f = result['rougeL'].fmeasure

    # Equal weighting of the two metrics.
    return (unigram_f + lcs_f) / 2

# Find the best speciality
def find_best_speciality(symptom_desc, diagnosis, specialities_df):
    """Pick the speciality whose text best matches the patient case.

    Each ``('Speciality', 'Description')`` pair in *specialities_df* is scored
    with ``calculate_rouge_scores``.  A later row replaces the current best
    only when its score is strictly higher, so ties keep the earlier row.

    Returns:
        ``(speciality, score)``; ``(None, -1.0)`` for an empty table.
    """
    chosen = None
    chosen_score = -1.0

    pairs = specialities_df[['Speciality', 'Description']].itertuples(index=False, name=None)
    for name, description in pairs:
        current = calculate_rouge_scores(name, symptom_desc, diagnosis, description)
        if current > chosen_score:
            chosen, chosen_score = name, current

    return chosen, chosen_score

# Assign predicted specialities to DataFrame
def assign_predicted_speciality(patients_df, specialities_df):
    """Add a ``'Predicted_Speciality'`` column to *patients_df* and return it.

    Each row's ``'Patient'`` and ``'Description'`` values are passed to
    ``find_best_speciality``; the returned speciality name becomes that row's
    prediction.

    Predictions are gathered in row order and written as one column
    assignment.  Per-row ``.at[index, ...]`` writes address rows by label and
    so are ambiguous when labels repeat; they also leave the column missing
    when there are no rows.

    Args:
        patients_df: DataFrame with ``'Patient'`` and ``'Description'`` columns.
            It is modified in place.
        specialities_df: Candidate specialities, forwarded unchanged to
            ``find_best_speciality``.

    Returns:
        The same *patients_df* object, now with ``'Predicted_Speciality'``.
    """
    predicted = []
    for symptom_desc, diagnosis in zip(patients_df['Patient'], patients_df['Description']):
        speciality, _score = find_best_speciality(symptom_desc, diagnosis, specialities_df)
        predicted.append(speciality)

    patients_df['Predicted_Speciality'] = predicted
    return patients_df

# Calculate accuracy (optional)
def calculate_accuracy(patients_df):
    """Return the share of rows predicted correctly, ignoring letter case.

    A row counts as correct when the lower-cased ``'Predicted_Speciality'``
    equals the lower-cased ``'Specialist'``.
    """
    is_match = (
        patients_df['Predicted_Speciality'].str.lower()
        == patients_df['Specialist'].str.lower()
    )
    n_correct = is_match.sum()
    n_rows = len(patients_df)
    return n_correct / n_rows

# Main execution: predict a speciality for every patient row and store it in
# the 'Predicted_Speciality' column.
patients_df = assign_predicted_speciality(patients_df, specialities_df)

# Save to new CSV with all original data plus predicted speciality.
# NOTE: this is the same output path the first cell writes to, so running
# both cells leaves only the later result on disk.
patients_df.to_csv("patients_with_predicted_speciality.csv", index=False)

# Optional: Calculate and print accuracy (case-insensitive match between
# 'Predicted_Speciality' and 'Specialist'), shown as a percentage.
accuracy = calculate_accuracy(patients_df)
print(f"Accuracy: {accuracy * 100:.2f}%")

## Better Pre-Processing

In [None]:
import pandas as pd
from rouge_score import rouge_scorer
import spacy
from itertools import product

# spaCy pipeline used by preprocess_text() for token flags, POS tags and lemmas.
nlp = spacy.load("en_core_web_sm")

# Input tables: the patient cases and the candidate specialities.
# To experiment on a subset, apply .head(10) to patients_df after loading.
patients_df = pd.read_csv("../filtered_specialists.csv")
specialities_df = pd.read_csv("specialist_description.csv")

# Replace missing text with empty strings so every cell is a plain str.
for _column in ('Patient', 'Description', 'Specialist'):
    patients_df[_column] = patients_df[_column].fillna('')
for _column in ('Speciality', 'Description', 'Subspeciality'):
    specialities_df[_column] = specialities_df[_column].fillna('')

def preprocess_text(text):
    """Turn *text* into a space-separated string of lower-cased lemmas.

    Missing values (per ``pd.isna``) yield ``""``.  Stop words and punctuation
    are removed.  Each remaining token is repeated ``int(weight)`` times, with
    the weight chosen by POS tag.

    NOTE: ``int`` truncates, so the fractional weights act as a filter rather
    than a weighting.  Nouns/proper nouns (1.5), adjectives/adverbs (1.2) and
    all other tags (1.0) are kept once; verbs (0.8) and adpositions (0.5) are
    dropped.
    """
    if pd.isna(text):
        return ""

    # (POS tags, weight) rules, checked in order; the first match wins.
    weight_rules = (
        (("NOUN", "PROPN"), 1.5),
        (("ADJ", "ADV"), 1.2),
        (("VERB",), 0.8),
        (("ADP",), 0.5),
    )
    default_weight = 1.0

    output = []
    for tok in nlp(text):
        if tok.is_stop or tok.is_punct:
            continue
        weight = next(
            (value for tags, value in weight_rules if tok.pos_ in tags),
            default_weight,
        )
        output.extend(int(weight) * [tok.lemma_.lower()])

    return " ".join(output)

# Calculate weighted ROUGE scores for a given symptom, diagnosis, and speciality description
def calculate_rouge_scores(symptom_desc, diagnosis, speciality_desc, subspeciality_desc, rouge1_weight, rougeL_weight):
    """Return a weighted ROUGE score between a patient case and a speciality.

    The patient text is the preprocessed symptom description plus the
    preprocessed diagnosis; the speciality text is the preprocessed speciality
    description plus the preprocessed subspeciality description.

    Args:
        symptom_desc: Free-text symptom description of the patient.
        diagnosis: Free-text diagnosis / case description.
        speciality_desc: Description of the candidate speciality.
        subspeciality_desc: Subspeciality text of the candidate speciality.
        rouge1_weight: Multiplier applied to the ROUGE-1 F-measure.
        rougeL_weight: Multiplier applied to the ROUGE-L F-measure.

    Returns:
        ``rouge1_weight * ROUGE-1 F + rougeL_weight * ROUGE-L F``.
    """
    scorer = rouge_scorer.RougeScorer(['rouge1', 'rougeL'], use_stemmer=True)

    # Join the two halves with a space.  The previous "/n" separator was a
    # literal slash + "n": after tokenisation it glued an "n" onto the first
    # word of the second half, or produced a stray "n" token on both sides
    # when the second half was empty, which counted as a spurious match.
    patient_text = preprocess_text(symptom_desc) + " " + preprocess_text(diagnosis)
    speciality_text = preprocess_text(speciality_desc) + " " + preprocess_text(subspeciality_desc)

    # A single score() call returns both metrics; no need to score twice.
    scores = scorer.score(patient_text, speciality_text)
    weighted_rouge1 = scores['rouge1'].fmeasure * rouge1_weight
    weighted_rougeL = scores['rougeL'].fmeasure * rougeL_weight

    return weighted_rouge1 + weighted_rougeL

# Find the best speciality based on ROUGE scores
def find_best_speciality(symptom_desc, diagnosis, specialities_df, rouge1_weight, rougeL_weight):
    """Return the name of the speciality that scores highest for one case.

    Every row of *specialities_df* is scored with ``calculate_rouge_scores``
    using its ``'Description'`` and ``'Subspeciality'`` text and the given
    metric weights.  Ties keep the earliest row.

    Returns:
        The winning ``'Speciality'`` value converted to ``str``; ``""`` when
        the table has no rows.
    """
    top_name, top_score = "", -1.0

    for _, spec in specialities_df.iterrows():
        candidate_score = calculate_rouge_scores(
            symptom_desc,
            diagnosis,
            spec['Description'],
            spec['Subspeciality'],
            rouge1_weight,
            rougeL_weight,
        )
        if candidate_score > top_score:
            top_name, top_score = spec['Speciality'], candidate_score

    return str(top_name)

# Calculate accuracy based on benchmark speciality
def calculate_accuracy(patients_df, specialities_df, rouge1_weight, rougeL_weight):
    """Return the fraction of patients whose predicted speciality is correct.

    For each row of *patients_df* the best speciality is predicted from the
    ``'Patient'`` and ``'Description'`` text and compared with the reference
    label in ``'Specialist'``.  The comparison ignores letter case, matching
    the accuracy computation used in the earlier cells of this notebook.

    Args:
        patients_df: DataFrame with ``'Patient'``, ``'Description'`` and
            ``'Specialist'`` columns.
        specialities_df: Candidate specialities, forwarded to
            ``find_best_speciality``.
        rouge1_weight: Weight of the ROUGE-1 F-measure.
        rougeL_weight: Weight of the ROUGE-L F-measure.

    Returns:
        Accuracy in ``[0, 1]``; ``0.0`` when *patients_df* has no rows (the
        previous code raised ``ZeroDivisionError`` in that case).
    """
    total_predictions = len(patients_df)
    if total_predictions == 0:
        return 0.0

    correct_predictions = 0
    cases = zip(patients_df['Patient'], patients_df['Description'], patients_df['Specialist'])
    for symptom_desc, diagnosis, benchmark_speciality in cases:
        best_speciality = find_best_speciality(
            symptom_desc, diagnosis, specialities_df, rouge1_weight, rougeL_weight
        )
        # Case-insensitive match so "Cardiology" and "cardiology" agree.
        if best_speciality.lower() == str(benchmark_speciality).lower():
            correct_predictions += 1

    return correct_predictions / total_predictions

def find_best_weights(patients_df, specialities_df):
    """Grid-search the ROUGE-1 / ROUGE-L weights that maximise accuracy.

    Candidate pairs come from the grid 0.1, 0.2, ..., 0.9 for each weight,
    restricted to pairs that sum to 1 within ``1e-6``.  Each pair is evaluated
    with ``calculate_accuracy`` over the whole of *patients_df*; the first
    pair with the highest accuracy wins.

    Returns:
        ``((rouge1_weight, rougeL_weight), accuracy)`` for the best pair.
    """
    grid = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # Only pairs that form a convex combination are worth evaluating.
    candidate_pairs = [
        (w1, w2)
        for w1, w2 in product(grid, repeat=2)
        if abs(w1 + w2 - 1.0) <= 1e-6
    ]

    best_weights, best_accuracy = (0, 0), -1
    for weights in candidate_pairs:
        accuracy = calculate_accuracy(patients_df, specialities_df, *weights)
        if accuracy > best_accuracy:
            best_weights, best_accuracy = weights, accuracy

    return best_weights, best_accuracy

# Search the weight grid for the ROUGE-1 / ROUGE-L combination with the highest
# accuracy on patients_df, then report the winning pair and its accuracy as a
# percentage. Every candidate pair re-scores all patients.
best_weights, best_accuracy = find_best_weights(patients_df, specialities_df)
print(f"Best weights: ROUGE-1 = {best_weights[0]}, ROUGE-L = {best_weights[1]}")
print(f"Best accuracy: {best_accuracy * 100:.2f}%")