In [None]:
import os
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd

# Absolute, user-specific location of the CSV backing FinalMIMICDF; it has to be
# edited before this notebook can run on another machine.
load_path = "/scratch/koehler.ale/FinalMIMICDF.csv"


# NOTE(review): `load_dataframe` is neither defined nor imported in the visible
# notebook — presumably a project helper that reads the CSV into a DataFrame;
# confirm it is in scope before running this cell on a fresh kernel.
FinalMIMICDF = load_dataframe(load_path)
# Show the first rows as the cell output.
FinalMIMICDF.head()


In [None]:
def prepare_model_comparison_data(term_analysis: pd.DataFrame, bins: int = 20) -> Dict:
    """
    Build binned similarity data and summary statistics for both models.

    For every row, a model's similarity to the reference is computed as
    ``1 - |model_mean - reference_mean|``. The similarities are then
    histogrammed over the fixed range [0, 1] and summarised.

    Args:
        term_analysis: DataFrame providing the 'trained_mean',
            'untrained_mean' and 'reference_mean' columns.
        bins: Number of equal-width histogram bins spanning [0, 1].

    Returns:
        Dictionary with two entries, each keyed by 'trained' / 'untrained':
        'scatter_data' holds a list of ``{'tfidf': bin_center, 'count': n}``
        points (empty bins are omitted) and 'stats' holds the mean, standard
        deviation and median of the similarities.
    """
    reference = term_analysis['reference_mean']

    # Closeness of each model's per-row mean to the reference mean.
    similarity_by_model = {
        'trained': 1 - abs(term_analysis['trained_mean'] - reference),
        'untrained': 1 - abs(term_analysis['untrained_mean'] - reference),
    }

    def to_scatter_points(scores: np.ndarray) -> List[Dict[str, float]]:
        # Histogram over [0, 1]; each non-empty bin becomes one scatter point
        # located at the bin's midpoint.
        counts, edges = np.histogram(scores, bins=bins, range=(0, 1))
        midpoints = (edges[:-1] + edges[1:]) / 2
        return [
            {'tfidf': float(midpoint), 'count': int(n_terms)}
            for n_terms, midpoint in zip(counts, midpoints)
            if n_terms > 0
        ]

    def summarize(scores) -> Dict[str, float]:
        # Plain Python floats so the result is easy to serialise.
        return {
            'mean_similarity': float(scores.mean()),
            'std_similarity': float(scores.std()),
            'median_similarity': float(scores.median()),
        }

    return {
        'scatter_data': {
            model: to_scatter_points(scores)
            for model, scores in similarity_by_model.items()
        },
        'stats': {
            model: summarize(scores)
            for model, scores in similarity_by_model.items()
        },
    }

def analyze_model_adaptation(
    trained_summaries: List[str],
    untrained_summaries: List[str],
    reference_summaries: List[str]
) -> Tuple[Dict, Dict]:
    """
    Analyze how well models have adapted to reference patterns.

    Runs ``analyze_summaries`` on the three summary lists, derives the binned
    comparison data from its term analysis, and stores two adaptation metrics
    in the report under 'adaptation_metrics'.

    Args:
        trained_summaries: Summaries produced by the trained model.
        untrained_summaries: Summaries produced by the untrained model.
        reference_summaries: Reference summaries both models are compared to.

    Returns:
        ``(report, comparison_data)``: the report returned by
        ``analyze_summaries`` with 'adaptation_metrics' added, and the
        dictionary returned by ``prepare_model_comparison_data``. Note that
        the second element is a dict, not a DataFrame.
    """
    # NOTE(review): analyze_summaries is not defined in the visible notebook;
    # it is assumed to return (report dict, term-analysis DataFrame).
    report, term_analysis = analyze_summaries(
        trained_summaries,
        untrained_summaries,
        reference_summaries
    )

    comparison_data = prepare_model_comparison_data(term_analysis)

    trained_mean = comparison_data['stats']['trained']['mean_similarity']
    untrained_mean = comparison_data['stats']['untrained']['mean_similarity']

    # Python floats raise ZeroDivisionError on division by 0.0, so the ratio
    # is reported as +/-inf (or NaN when it is undefined) instead of aborting
    # the whole analysis.
    if untrained_mean != 0:
        improvement_ratio = trained_mean / untrained_mean
    elif trained_mean > 0:
        improvement_ratio = float('inf')
    elif trained_mean < 0:
        improvement_ratio = float('-inf')
    else:
        improvement_ratio = float('nan')

    report['adaptation_metrics'] = {
        'improvement_ratio': improvement_ratio,
        'absolute_improvement': trained_mean - untrained_mean
    }

    return report, comparison_data

# Example usage
def main():
    """
    Run the adaptation analysis on FinalMIMICDF and print the headline metrics.

    Reads the 'Trained_Summary', 'Untrained_Summary' and 'Real_Summary'
    columns of the module-level FinalMIMICDF frame.

    Returns:
        The ``(report, comparison_data)`` pair from analyze_model_adaptation.
    """
    summary_columns = ('Trained_Summary', 'Untrained_Summary', 'Real_Summary')
    trained, untrained, real = (
        FinalMIMICDF[column].tolist() for column in summary_columns
    )
    report, comparison_data = analyze_model_adaptation(trained, untrained, real)

    metrics = report['adaptation_metrics']
    print("\nModel Adaptation Analysis:")
    print(f"Improvement ratio: {metrics['improvement_ratio']:.2f}x")
    print(f"Absolute improvement: {metrics['absolute_improvement']:.3f}")

    return report, comparison_data

In [None]:
# NOTE(review): `calculate_tfidf_scores` is not defined in the visible notebook, and
# `sample_df` is only assigned in a later cell (the semantic-similarity example), so a
# fresh top-to-bottom run would hit a NameError here — confirm the intended cell order.
TFDF = calculate_tfidf_scores(sample_df)
# Display the resulting TF-IDF comparison table as the cell output.
TFDF

In [None]:
import os 
import statistics as stats 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

def analyze_and_visualize_tfidf(comparison_df, output_dir='./plots'):
    """
    Summarise TF-IDF comparison scores and show two top-5 bar charts.

    The first chart shows the five rows with the largest 'Score_Difference',
    the second the five rows with the largest 'Similarity_Ratio'; both use
    the 'Word' column for the bar labels.

    Args:
        comparison_df: DataFrame with 'Word', 'Score_Difference' and
            'Similarity_Ratio' columns.
        output_dir: Directory that is created if missing. The figures are
            only shown; nothing is written into this directory.

    Returns:
        Dict with 'std_difference', 'std_similarity', 'mean_difference' and
        'mean_similarity', computed with np.std / np.mean.
    """
    os.makedirs(output_dir, exist_ok=True)

    differences = comparison_df['Score_Difference']
    ratios = comparison_df['Similarity_Ratio']
    summary = {
        'std_difference': np.std(differences),
        'std_similarity': np.std(ratios),
        'mean_difference': np.mean(differences),
        'mean_similarity': np.mean(ratios)
    }

    def render_current_figure():
        # Shared tail of both charts: tidy the layout, display, release.
        plt.tight_layout()
        plt.show()
        plt.close()

    # Chart 1: the five largest score differences.
    plt.figure(figsize=(6, 4))
    largest_gaps = comparison_df.nlargest(5, 'Score_Difference')
    sns.barplot(data=largest_gaps, y='Word', x='Score_Difference',
        palette=['purple', 'blue'])
    plt.title('Top Words with Significant TF-IDF Differences')
    plt.xlabel('Score Difference')
    # Hand-built legend entries; they are not derived from the plotted data.
    legend_entries = [
        plt.Line2D([0], [0], color=swatch, lw=4, label=caption)
        for swatch, caption in (('purple', 'Generated'), ('blue', 'Real'))
    ]
    plt.legend(handles=legend_entries, loc='best')
    render_current_figure()

    # Chart 2: the five highest similarity ratios.
    plt.figure(figsize=(6,4))
    closest_matches = comparison_df.nlargest(5, 'Similarity_Ratio')
    sns.barplot(data=closest_matches, y='Word', x='Similarity_Ratio')
    plt.title('Top Words with Highest Similarity')
    plt.xlabel('Similarity Ratio')
    render_current_figure()

    return summary

def print_analysis_report(stats_results):
    """
    Print the TF-IDF summary statistics as a small text report.

    Args:
        stats_results: Mapping with 'std_difference', 'std_similarity',
            'mean_difference' and 'mean_similarity' entries; each value is
            printed with four decimal places.
    """
    print("\nTF-IDF Analysis Statistical Report")
    print("=" * 40)

    # (label, key) pairs in the order they appear in the report.
    report_rows = (
        ("Standard Deviation of Differences", 'std_difference'),
        ("Standard Deviation of Similarities", 'std_similarity'),
        ("Mean Difference", 'mean_difference'),
        ("Mean Similarity", 'mean_similarity'),
    )
    for label, key in report_rows:
        print(f"{label}: {stats_results[key]:.4f}")


# Show the two top-5 bar charts for TFDF and keep the returned summary statistics.
stats_results = analyze_and_visualize_tfidf(TFDF)

# Print those statistics as a formatted text report.
print_analysis_report(stats_results)


In [None]:

def plot_tfidf_histograms(comparison_df, output_dir='./plots'):
    """
    Show stacked histograms of score differences and similarity ratios.

    Each histogram uses 30 bins, marks the mean with a dashed vertical line
    and writes the bar count above every bar. Summary statistics for both
    columns are printed before the figure is displayed.

    Args:
        comparison_df: DataFrame with 'Score_Difference' and
            'Similarity_Ratio' columns.
        output_dir: Accepted for interface compatibility; the figure is only
            shown and nothing is written to this directory.
    """
    def draw_histogram(ax, column, bar_color, title, xlabel):
        # One panel: histogram, dashed mean marker, labels, per-bar counts.
        values = comparison_df[column]
        mean_value = values.mean()
        ax.hist(values, bins=30, edgecolor='black', alpha=0.7, color=bar_color)
        ax.axvline(mean_value,
                   color='blue',
                   linestyle='dashed',
                   linewidth=2,
                   label=f"Mean: {mean_value:.3f}")
        ax.set_title(title, pad=20)
        ax.set_xlabel(xlabel)
        ax.set_ylabel('Count')
        ax.legend()
        ax.grid(True, alpha=0.3)
        for bar in ax.patches:
            bar_height = bar.get_height()
            ax.text(bar.get_x() + bar.get_width()/2., bar_height,
                    f'{int(bar_height)}',
                    ha='center', va='bottom')

    # Resets matplotlib's global style to the defaults.
    plt.style.use('default')

    _fig, (difference_ax, ratio_ax) = plt.subplots(2, 1, figsize=(6, 4))

    draw_histogram(difference_ax, 'Score_Difference', 'red',
                   'Distribution of TF-IDF Score Differences', 'Score Difference')
    draw_histogram(ratio_ax, 'Similarity_Ratio', 'green',
                   'Distribution of Similarity Ratios', 'Similarity Ratio')

    plt.tight_layout()

    # Text summary of both columns.
    differences = comparison_df['Score_Difference']
    print("\nScore Difference Statistics:")
    print(f"Mean: {differences.mean():.3f}")
    print(f"Median: {differences.median():.3f}")
    print(f"Std Dev: {differences.std():.3f}")
    print(f"Total Words: {len(comparison_df)}")

    ratios = comparison_df['Similarity_Ratio']
    print("\nSimilarity Ratio Statistics:")
    print(f"Mean: {ratios.mean():.3f}")
    print(f"Median: {ratios.median():.3f}")
    print(f"Std Dev: {ratios.std():.3f}")

    # Display the figure, then release it.
    plt.show()

    plt.close()


# After calculating TF-IDF scores: show both histograms for TFDF and print their
# summary statistics. output_dir is passed explicitly, but the function as written
# only displays the figure and does not write anything to that directory.
plot_tfidf_histograms(TFDF, output_dir='./plots')


In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import pandas as pd
import matplotlib.pyplot as plt

def compute_semantic_similarities(df, model_name='all-MiniLM-L6-v2'):
    """
    Score generated and untrained summaries against the real summaries.

    Every summary column is embedded with a SentenceTransformer model, and
    each row's generated / untrained embedding is compared to the same row's
    real embedding with cosine similarity.

    Args:
        df: DataFrame with 'Generated_Summary', 'Real_Summary', and
            'Untrained_Summary' columns
        model_name: Name passed to SentenceTransformer to load the model

    Returns:
        Copy of ``df`` with 'Generated_to_Real_Similarity' and
        'Untrained_to_Real_Similarity' columns added
    """
    encoder = SentenceTransformer(model_name)

    # One embedding per row for each summary column.
    generated_vectors = encoder.encode(df['Generated_Summary'].tolist())
    untrained_vectors = encoder.encode(df['Untrained_Summary'].tolist())
    real_vectors = encoder.encode(df['Real_Summary'].tolist())

    def rowwise_cosine(candidates, references):
        # Compare the i-th candidate with the i-th reference only.
        scores = []
        for candidate, reference in zip(candidates, references):
            scores.append(cosine_similarity([candidate], [reference])[0][0])
        return scores

    generated_scores = rowwise_cosine(generated_vectors, real_vectors)
    untrained_scores = rowwise_cosine(untrained_vectors, real_vectors)

    # Work on a copy so the caller's frame is left untouched.
    scored = df.copy()
    scored['Generated_to_Real_Similarity'] = generated_scores
    scored['Untrained_to_Real_Similarity'] = untrained_scores

    return scored

# Example DataFrame: two hand-written rows with the three summary columns that
# compute_semantic_similarities reads.
sample_df = pd.DataFrame({
    'Generated_Summary': ["This is a fine-tuned summary.", "Another fine-tuned result."],
    'Real_Summary': ["This is the real summary.", "Another real summary here."],
    'Untrained_Summary': ["This is an untrained summary.", "Another untrained output."]
})

# Compute semantic similarities; the result is a copy of sample_df with the
# 'Generated_to_Real_Similarity' and 'Untrained_to_Real_Similarity' columns added.
Semanticdf = compute_semantic_similarities(sample_df)

# Calculate improvement from fine-tuning: positive values mean the generated
# summary scored closer to the real summary than the untrained one did.
Semanticdf['Improvement'] = (
    Semanticdf['Generated_to_Real_Similarity'] - Semanticdf['Untrained_to_Real_Similarity']
)

# Print the DataFrame with similarities and improvement
print(Semanticdf)

# Plot both similarity series against the row index so they can be compared per sample.
plt.figure(figsize=(10, 6))
plt.plot(Semanticdf.index, Semanticdf['Generated_to_Real_Similarity'], label="Fine-Tuned Similarity", marker='o')
plt.plot(Semanticdf.index, Semanticdf['Untrained_to_Real_Similarity'], label="Untrained Similarity", marker='x')
plt.title("Cosine Similarity Improvement with Fine-Tuning")
plt.xlabel("Sample Index")
plt.ylabel("Cosine Similarity")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import json


def format_summaries_for_viz(df, similarity_col=None, output_path='similarity_results.json'):
    """
    Format dataframe with similarity scores into the structure needed for visualization.

    Parameters:
    df (pandas.DataFrame): DataFrame with 'Generated_Summary' and 'Real_Summary'
        columns plus a similarity-score column.
    similarity_col (str, optional): Name of the similarity column. When omitted,
        'Semantic_Similarity' is used if present; otherwise
        'Generated_to_Real_Similarity' (the column added by
        compute_semantic_similarities) is used.
    output_path (str, optional): JSON file the result is written to. Pass None
        to skip writing. Defaults to 'similarity_results.json' in the current
        working directory.

    Returns:
    dict: Dictionary with 'mostSimilar' and 'leastSimilar' lists (up to three
        entries each, both in descending similarity order). Each entry has
        'generated', 'real' and 'similarity' keys. With fewer than six rows
        the two lists overlap.

    Raises:
    KeyError: If no usable similarity column exists in ``df``.
    """
    if similarity_col is None:
        # Accept either naming used in this notebook for the
        # generated-vs-real similarity score.
        known_names = ('Semantic_Similarity', 'Generated_to_Real_Similarity')
        similarity_col = next(
            (name for name in known_names if name in df.columns), None
        )
        if similarity_col is None:
            raise KeyError(
                "No similarity column found; expected one of "
                f"{list(known_names)} or an explicit similarity_col"
            )
    elif similarity_col not in df.columns:
        raise KeyError(f"Similarity column {similarity_col!r} not found in DataFrame")

    # sort_values returns a new frame, so the caller's frame is not modified.
    df_sorted = df.sort_values(similarity_col, ascending=False)

    def to_records(rows):
        # float() turns NumPy scalars into plain floats that json can serialise.
        return [
            {
                'generated': row['Generated_Summary'],
                'real': row['Real_Summary'],
                'similarity': float(row[similarity_col])
            }
            for _, row in rows.iterrows()
        ]

    # Format the results for the React component
    result = {
        'mostSimilar': to_records(df_sorted.head(3)),
        'leastSimilar': to_records(df_sorted.tail(3))
    }

    # Optionally save to JSON file
    if output_path is not None:
        with open(output_path, 'w', encoding='utf-8') as f:
            json.dump(result, f, indent=2)

    return result

# Semanticdf stores the generated-vs-real score under 'Generated_to_Real_Similarity',
# while format_summaries_for_viz looks for 'Semantic_Similarity'; expose the score
# under that name so the call does not fail with a KeyError.
format_summaries_for_viz(
    Semanticdf.rename(columns={'Generated_to_Real_Similarity': 'Semantic_Similarity'})
)

In [None]:
def aggregate_tfidf_to_sentence(TFDF):
    """
    Collapse word-level TF-IDF rows into one row per 'Index' value.

    Args:
        TFDF: DataFrame with an 'Index' column plus 'Similarity_Ratio' and
            'Score_Difference' score columns

    Returns:
        DataFrame with 'Index', 'Similarity_Ratio' and 'Score_Difference'
        columns, where each score is the mean over the rows sharing that Index
    """
    # Both score columns are averaged within each Index group.
    mean_of = {column: 'mean' for column in ('Similarity_Ratio', 'Score_Difference')}
    per_sentence = TFDF.groupby('Index').agg(mean_of)

    # Turn the group key back into an ordinary column.
    return per_sentence.reset_index()

def plot_sentence_level_comparison(SemanticDF, TFDF, RougeDF):
    """
    Plot semantic, TF-IDF and ROUGE scores per sentence on one set of axes.

    Word-level TF-IDF rows are first averaged per 'Index'. Each metric is
    drawn as a line over its row position together with a dashed line at its
    mean. Summary statistics and the pairwise correlations between the three
    metrics are printed before the figure is shown.

    Args:
        SemanticDF: DataFrame with a 'Semantic_Similarity' column
        TFDF: DataFrame with word-level TF-IDF scores and an 'Index' column
        RougeDF: DataFrame with a 'Rouge_Score' column
    """
    sentence_tfidf = aggregate_tfidf_to_sentence(TFDF)

    # Give all three frames a fresh 0..n-1 index so rows pair up by position.
    SemanticDF = SemanticDF.sort_index().reset_index(drop=True)
    sentence_tfidf = sentence_tfidf.sort_values('Index').reset_index(drop=True)
    RougeDF = RougeDF.sort_index().reset_index(drop=True)

    # Everything that differs between the three metrics, in plotting order.
    metrics = (
        {'frame': SemanticDF, 'column': 'Semantic_Similarity', 'fmt': 'o-',
         'color': 'blue', 'label': 'Semantic Similarity',
         'mean_label': 'Semantic Mean', 'heading': "\nSemantic Similarity:"},
        {'frame': sentence_tfidf, 'column': 'Similarity_Ratio', 'fmt': 's-',
         'color': 'red', 'label': 'Average TF-IDF Similarity',
         'mean_label': 'TF-IDF Mean',
         'heading': "\nAverage TF-IDF Similarity per Sentence:"},
        {'frame': RougeDF, 'column': 'Rouge_Score', 'fmt': '^-',
         'color': 'green', 'label': 'ROUGE Score',
         'mean_label': 'ROUGE Mean', 'heading': "\nROUGE Score:"},
    )

    plt.figure(figsize=(15, 8))

    # One line per metric, plotted against row position.
    for metric in metrics:
        frame = metric['frame']
        plt.plot(range(len(frame)),
                 frame[metric['column']],
                 metric['fmt'], color=metric['color'], alpha=0.6,
                 label=metric['label'],
                 markersize=6)

    # Dashed horizontal line at each metric's mean.
    for metric in metrics:
        average = metric['frame'][metric['column']].mean()
        plt.axhline(y=average,
                    color=metric['color'], linestyle='--', alpha=0.5,
                    label=f"{metric['mean_label']}: {average:.3f}")

    plt.title('Sentence-Level Comparison of Semantic, TF-IDF, and ROUGE Similarity Scores', pad=20)
    plt.xlabel('Sentence Index')
    plt.ylabel('Similarity Score')
    plt.grid(True, alpha=0.3)
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    # Text summary for each metric.
    print("\nSentence-Level Summary Statistics:")
    for metric in metrics:
        values = metric['frame'][metric['column']]
        print(metric['heading'])
        print(f"Mean: {values.mean():.3f}")
        print(f"Median: {values.median():.3f}")
        print(f"Std Dev: {values.std():.3f}")

    # Pairwise correlations between the three metrics.
    semantic, tfidf, rouge = (m['frame'][m['column']] for m in metrics)
    pairings = (
        ("Semantic-TF-IDF", semantic, tfidf),
        ("Semantic-ROUGE", semantic, rouge),
        ("TF-IDF-ROUGE", tfidf, rouge),
    )
    correlations = [(name, left.corr(right)) for name, left, right in pairings]

    print("\nCorrelations between metrics:")
    for name, value in correlations:
        print(f"{name}: {value:.3f}")

    plt.tight_layout()
    plt.show()
    plt.close()

def plot_sentence_correlation(SemanticDF, TFDF, RougeDF):
    """
    Show pairwise scatter plots of the three sentence-level metrics.

    Word-level TF-IDF rows are averaged per 'Index' first. Three panels of a
    2x2 grid are used (semantic vs TF-IDF, semantic vs ROUGE, TF-IDF vs
    ROUGE); each title carries the correlation of the plotted pair, and the
    fourth panel is removed.

    Args:
        SemanticDF: DataFrame with a 'Semantic_Similarity' column
        TFDF: DataFrame with word-level TF-IDF scores and an 'Index' column
        RougeDF: DataFrame with a 'Rouge_Score' column
    """
    sentence_tfidf = aggregate_tfidf_to_sentence(TFDF)

    # Give all three frames a fresh 0..n-1 index so rows pair up by position.
    SemanticDF = SemanticDF.sort_index().reset_index(drop=True)
    sentence_tfidf = sentence_tfidf.sort_values('Index').reset_index(drop=True)
    RougeDF = RougeDF.sort_index().reset_index(drop=True)

    fig, axes = plt.subplots(2, 2, figsize=(15, 15))
    fig.suptitle('Sentence-Level Metric Correlations', y=1.02, fontsize=16)

    # (grid position, x frame, x column, y frame, y column,
    #  point colour, title prefix, x label, y label)
    panels = (
        ((0, 0), SemanticDF, 'Semantic_Similarity', sentence_tfidf, 'Similarity_Ratio',
         'purple', 'Semantic vs TF-IDF', 'Semantic Similarity', 'TF-IDF Similarity'),
        ((0, 1), SemanticDF, 'Semantic_Similarity', RougeDF, 'Rouge_Score',
         'green', 'Semantic vs ROUGE', 'Semantic Similarity', 'ROUGE Score'),
        ((1, 0), sentence_tfidf, 'Similarity_Ratio', RougeDF, 'Rouge_Score',
         'blue', 'TF-IDF vs ROUGE', 'TF-IDF Similarity', 'ROUGE Score'),
    )

    for position, x_frame, x_column, y_frame, y_column, color, heading, x_label, y_label in panels:
        panel = axes[position]
        x_values = x_frame[x_column]
        y_values = y_frame[y_column]
        panel.scatter(x_values, y_values, alpha=0.6, c=color)
        correlation = x_values.corr(y_values)
        panel.set_title(f'{heading} (r={correlation:.3f})')
        panel.set_xlabel(x_label)
        panel.set_ylabel(y_label)
        panel.grid(True, alpha=0.3)

    # The bottom-right panel has nothing to show.
    fig.delaxes(axes[1, 1])

    plt.tight_layout()
    plt.show()
    plt.close()