In [None]:
import pandas as pd
from rouge_score import rouge_scorer
import re
import six 
from collections import Counter
from evaluate import load

# Load the four per-model review files in one pass. The frame names are kept
# because every later cell refers to them: df, df2, df3, df4.
df, df2, df3, df4 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '4o_ehr_review.csv',
        '35_ehr_review.csv',
        'o3_ehr_review.csv',
        'llama3_ehr.csv',
    )
)

In [None]:
def process_text(text):
    """Normalize one text cell before ROUGE scoring.

    Missing values (``None``, ``NaN``, ``pd.NA``) are mapped to an empty string
    so they cannot leak a literal "nan"/"none" token into the comparison. Any
    other non-string value is converted with ``str()``. The text is then
    lowercased and every run of characters outside ``a-z0-9`` is replaced by a
    single space.

    Args:
        text: Cell value; usually a string, but may be a missing-value marker
            or another scalar.

    Returns:
        str: The normalized text. Leading/trailing spaces produced by the
        substitution are kept, exactly as before.
    """
    if not isinstance(text, str):
        # is_scalar guards pd.isna, which returns an array for list-like input.
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ""
        text = str(text)

    # Lowercase first so the character class only needs to list a-z.
    return re.sub(r"[^a-z0-9]+", " ", text.lower())

# Each entry is (frame, reference source column, generated-summary source column).
# df, df2 and df3 share one layout; df4 reads its summary from 'Assistant_Content'.
# NOTE(review): df4's reference is read from 'Processed_One_Sentence_Extracted',
# i.e. the same column that is written back below - confirm that llama3_ehr.csv
# really ships that column rather than 'One_Sentence_Extracted'.
text_sources = [
    (df, 'One_Sentence_Extracted', 'Generated_Summary'),
    (df2, 'One_Sentence_Extracted', 'Generated_Summary'),
    (df3, 'One_Sentence_Extracted', 'Generated_Summary'),
    (df4, 'Processed_One_Sentence_Extracted', 'Assistant_Content'),
]

# Write the normalized text into the two column names the scoring cell reads.
for frame, reference_col, summary_col in text_sources:
    frame['Processed_One_Sentence_Extracted'] = frame[reference_col].apply(process_text)
    frame['Processed_Generated_Summary'] = frame[summary_col].apply(process_text)



In [None]:
## Define function to compute ROUGE scores and F1 scores
# The scorer configuration never changes, so build it (and its stemmer) once
# instead of once per scored row.
_ROUGE_TYPES = ['rouge1', 'rouge2', 'rougeL']
_ROUGE_SCORER = rouge_scorer.RougeScorer(_ROUGE_TYPES, use_stemmer=True)


def compute_rouge_scores(reference, hypothesis):
    """Score a hypothesis against a reference with ROUGE-1, ROUGE-2 and ROUGE-L.

    Args:
        reference: Reference (target) text.
        hypothesis: Generated text to evaluate.

    Returns:
        pd.Series: Six values indexed 'ROUGE-1', 'ROUGE-2', 'ROUGE-L',
        'ROUGE-1-F1', 'ROUGE-2-F1', 'ROUGE-L-F1'. The first three are the
        scorer's ``fmeasure`` values; the '-F1' entries carry the same numbers.
    """
    scores = _ROUGE_SCORER.score(reference, hypothesis)
    f_measures = [scores[rouge_type].fmeasure for rouge_type in _ROUGE_TYPES]

    # fmeasure is already 2PR / (P + R) (0 when P + R == 0), so the former
    # hand-written F1 recomputation always reproduced it. The '-F1' labels are
    # kept only so existing consumers of these columns keep working.
    return pd.Series(
        f_measures + f_measures,
        index=['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'ROUGE-1-F1', 'ROUGE-2-F1', 'ROUGE-L-F1'],
    )



# Score columns, in the order compute_rouge_scores returns them.
score_columns = ['ROUGE-1', 'ROUGE-2', 'ROUGE-L', 'ROUGE-1-F1', 'ROUGE-2-F1', 'ROUGE-L-F1']


def _score_processed_row(row):
    """Score one row's processed summary against its processed reference."""
    return compute_rouge_scores(
        row['Processed_One_Sentence_Extracted'],
        row['Processed_Generated_Summary'],
    )


# Per-row ROUGE scores for every model. All four frames are scored on the
# processed columns; df4 previously used the raw 'Assistant_Content' column,
# which skipped the missing-value handling applied during preprocessing.
for scored_frame in (df, df2, df3, df4):
    scored_frame[score_columns] = scored_frame.apply(_score_processed_row, axis=1)

# Aggregated (mean) scores per model: df, df2, df3, df4 respectively.
aggregated_rouge_f1_scores_df = df[score_columns].mean()
aggregated_rouge_f1_scores_df2 = df2[score_columns].mean()
aggregated_rouge_f1_scores_df3 = df3[score_columns].mean()
aggregated_rouge_f1_scores_df4 = df4[score_columns].mean()

# Print the aggregated scores, one block per model.
print(f'Aggregated ROUGE and F1 Scores for GPT-4o:\n{aggregated_rouge_f1_scores_df}')
print(f'Aggregated ROUGE and F1 Scores for GPT-3.5:\n{aggregated_rouge_f1_scores_df2}')
print(f'Aggregated ROUGE and F1 Scores for O3-mini:\n{aggregated_rouge_f1_scores_df3}')
print(f'Aggregated ROUGE and F1 Scores for LLama3:\n{aggregated_rouge_f1_scores_df4}')


Aggregated ROUGE and F1 Scores for GPT-4o:
ROUGE-1       0.346749
ROUGE-2       0.126664
ROUGE-L       0.285771
ROUGE-1-F1    0.346749
ROUGE-2-F1    0.126664
ROUGE-L-F1    0.285771
dtype: float64
Aggregated ROUGE and F1 Scores for GPT-3.5:
ROUGE-1       0.309570
ROUGE-2       0.131968
ROUGE-L       0.262131
ROUGE-1-F1    0.309570
ROUGE-2-F1    0.131968
ROUGE-L-F1    0.262131
dtype: float64
Aggregated ROUGE and F1 Scores for O3-mini:
ROUGE-1       0.311885
ROUGE-2       0.120407
ROUGE-L       0.271613
ROUGE-1-F1    0.311885
ROUGE-2-F1    0.120407
ROUGE-L-F1    0.271613
dtype: float64
Aggregated ROUGE and F1 Scores for LLama3:
ROUGE-1       0.293899
ROUGE-2       0.110161
ROUGE-L       0.250804
ROUGE-1-F1    0.293899
ROUGE-2-F1    0.110161
ROUGE-L-F1    0.250804
dtype: float64
