### Imports

In [1]:
import nltk
nltk.download('punkt')
import numpy as np
import pandas as pd
from summac.model_summac import SummaCZS, SummaCImager
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_auc_score,f1_score
from tqdm import tqdm

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\hetan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


### Zero-Shot SummaC Article Pipeline

In [2]:
model_zs = SummaCZS(granularity="sentence", model_name="vitc", device="cpu")  # Use GPU

In [3]:
def get_best_threshold(validation_df,model):
    true_labels = []
    predicted_scores = []

    for index, row in tqdm(validation_df.iterrows(),total=len(validation_df)):
        document = row['Scraped Content']
        claim = row['Headline']
        score = model.score([str(document)], [str(claim)])

        true_labels.append(row['Actual Decision'])
        predicted_scores.append(score["scores"][0])

    # Evaluate performance at different thresholds
    thresholds = [i * 0.01 for i in range(-100, 101)]
    best_threshold = 0
    best_accuracy = 0

    for threshold in thresholds:
        predicted_labels = [1 if score >= threshold else 0 for score in predicted_scores]

        accuracy = accuracy_score(true_labels, predicted_labels)
        precision = precision_score(true_labels, predicted_labels, zero_division=1)
        recall = recall_score(true_labels, predicted_labels)
        f1 = f1_score(true_labels, predicted_labels)

        print(f"Threshold: {threshold:.2f}")
        print(f"Accuracy: {accuracy:.3f}")
        print(f"Precision: {precision:.3f}")
        print(f"Recall: {recall:.3f}")
        print(f"F1 Score: {f1:.3f}")
        print('-' * 50)

        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_threshold = threshold

    print(f"Best Threshold: {best_threshold:.2f}")
    print(f"Best Accuracy: {best_accuracy:.3f}")

    roc_auc = roc_auc_score(true_labels, predicted_scores)
    print(f"ROC-AUC: {roc_auc:.3f}")
    return best_threshold

In [4]:
# TEST SET
def test_accuracy(test_df, best_threshold, model, imager):
    true_test_labels = []
    predicted_test_scores = []
    
    # Create lists to store interesting cases
    correct_predictions = []
    incorrect_predictions = []
    
    for index, row in tqdm(test_df.iterrows(), total=len(test_df)):
        document = row['Scraped Content']
        claim = row['Headline']
        score = model.score([str(document)], [str(claim)])
        true_label = row['Actual Decision']
        predicted_score = score["scores"][0]
        
        predicted_label = 1 if predicted_score >= best_threshold else 0
        
        # Store interesting cases
        if predicted_label == true_label:
            correct_predictions.append(row)
        else:
            incorrect_predictions.append(row)
            
        true_test_labels.append(true_label)
        predicted_test_scores.append(predicted_score)
    
    # Calculate metrics
    predicted_test_labels = [1 if score >= best_threshold else 0 for score in predicted_test_scores]
    test_accuracy = accuracy_score(true_test_labels, predicted_test_labels)
    test_precision = precision_score(true_test_labels, predicted_test_labels, zero_division=1)
    test_recall = recall_score(true_test_labels, predicted_test_labels)
    test_f1 = f1_score(true_test_labels, predicted_test_labels)
    test_roc_auc = roc_auc_score(true_test_labels, predicted_test_scores)
    
    print(f"Test Accuracy: {test_accuracy:.3f}")
    print(f"Test Precision: {test_precision:.3f}")
    print(f"Test Recall: {test_recall:.3f}")
    print(f"Test F1 Score: {test_f1:.3f}")
    print(f"Test ROC-AUC: {test_roc_auc:.3f}")
    
    return correct_predictions, incorrect_predictions

Note: The `best_threshold` is manually set from our experiments in `Pipeline_Results_SummaC`. \
 \
Additionally, the results obtained using `best_threshold` pertaining to *F1* and *Accuracy* are calcualted on a subset of the dataset of 16 claims.

In [6]:
# Initialize the imager
imager = SummaCImager(model_name="vitc", device="cpu")  # Use GPU if available

# Your existing code with visualization added
df = pd.read_csv('./Efficiency test/SummaC_Pipeline_Article.csv', delimiter='|')
test_df, validation_df = train_test_split(df, test_size=0.2, random_state=42, stratify=df['Actual Decision'])

# Get best threshold from the Pipeline_Results_SummaC.ipynb for Article_Pipeline with SummaC_ZS
best_threshold = -0.03 

# Run test with visualization
correct_preds, incorrect_preds = test_accuracy(test_df, best_threshold, model_zs, imager)

  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
100%|██████████| 16/16 [05:59<00:00, 22.50s/it]

Test Accuracy: 0.875
Test Precision: 1.000
Test Recall: 0.750
Test F1 Score: 0.857
Test ROC-AUC: 0.859





### Showing the document segment with the most impact on sentence level granularity

In [7]:
def color_score(score, threshold=0.3):
    if score > threshold:
        return True
    return False

def analyze_text_comparison(document, claim, imager):
    """
    Analyze and display text-based comparison with the full document and color-coded segments
    Green: Entailment, Red: Contradiction, Pink: Neutral
    """
    doc_segments = imager.split_text(document, granularity="sentence")
    claim_segments = imager.split_text(claim, granularity="sentence")
    image = imager.build_image(document, claim)
    
    print("\n=== Analysis Results ===")
    print(f"\nCLAIM: {claim}\n")
    
    # Store scores and segments with their relationships
    entail_scores = []
    contra_scores = []
    neutral_scores = []
    segment_relationships = {}  # Store indices and their highest scoring relationship
    
    # First pass to identify relationships
    for i, doc_seg in enumerate(doc_segments):
        for j, claim_seg in enumerate(claim_segments):
            try:
                entail_score = float(image[0][i][j])
                contra_score = float(image[1][i][j])
                neutral_score = float(image[2][i][j])
                
                entail_scores.append(entail_score)
                contra_scores.append(contra_score)
                neutral_scores.append(neutral_score)
                
                # Determine the dominant relationship for this segment
                max_score = max(entail_score, contra_score, neutral_score)
                if max_score > 0.3:
                    if max_score == entail_score:
                        segment_relationships[i] = 'entailment'
                    elif max_score == contra_score:
                        segment_relationships[i] = 'contradiction'
                    elif max_score == neutral_score:
                        segment_relationships[i] = 'neutral'
                    
            except (IndexError, Exception):
                continue
    
    # Print the full document with color-coded segments
    print("DOCUMENT TEXT:")
    for i, segment in enumerate(doc_segments):
        if i in segment_relationships:
            relationship = segment_relationships[i]
            if relationship == 'entailment':
                print(f"\033[1;32m{segment}\033[0m")  # Green for entailment
            elif relationship == 'contradiction':
                print(f"\033[1;31m{segment}\033[0m")  # Red for contradiction
            elif relationship == 'neutral':
                print(f"\033[1;95m{segment}\033[0m")  # Pink for neutral
        else:
            print(segment)
    
    print("\nDETAILED RELATIONSHIPS:")
    # Print detailed scores for segments with relationships
    for i in segment_relationships:
        for j, claim_seg in enumerate(claim_segments):
            try:
                entail_score = float(image[0][i][j])
                contra_score = float(image[1][i][j])
                neutral_score = float(image[2][i][j])
                
                max_score = max(entail_score, contra_score, neutral_score)
                if max_score > 0.3:
                    print(f"\nSegment: {doc_segments[i]}")
                    print(f"Relationship: {segment_relationships[i].upper()}")
                    print(f"Scores - E: {entail_score:.3f} | C: {contra_score:.3f} | N: {neutral_score:.3f}")
            except (IndexError, Exception):
                continue
    
    # Create simple ASCII histogram
    print("\nScore Distribution:")
    print("Entailment:     " + "█" * int(sum(entail_scores)/len(entail_scores) * 50))
    print("Contradiction:  " + "█" * int(sum(contra_scores)/len(contra_scores) * 50))
    print("Neutral:        " + "█" * int(sum(neutral_scores)/len(neutral_scores) * 50))
    print("\n" + "-"*80 + "\n")

def analyze_sample_cases_text(df, imager, n_samples=2):
    """
    Analyze sample cases with cleaner output
    """
    true_cases = df[df['Actual Decision'] == 1]
    false_cases = df[df['Actual Decision'] == 0]
    
    n_true_samples = min(n_samples, len(true_cases))
    n_false_samples = min(n_samples, len(false_cases))
    
    print("\n=== TRUE CLAIMS ===")
    for idx, row in true_cases.sample(n=n_true_samples).iterrows():
        doc_content = str(row['Scraped Content']) if pd.notna(row['Scraped Content']) else ""
        headline = str(row['Headline']) if pd.notna(row['Headline']) else ""
        
        if doc_content and headline:
            analyze_text_comparison(doc_content, headline, imager)
    
    print("\n=== FALSE CLAIMS ===")
    for idx, row in false_cases.sample(n=n_false_samples).iterrows():
        doc_content = str(row['Scraped Content']) if pd.notna(row['Scraped Content']) else ""
        headline = str(row['Headline']) if pd.notna(row['Headline']) else ""
        
        if doc_content and headline:
            analyze_text_comparison(doc_content, headline, imager)
print("\nAnalyzing sample predictions...")
if len(correct_preds) > 0:
    analyze_sample_cases_text(pd.DataFrame(correct_preds), imager, n_samples=2)
if len(incorrect_preds) > 0:
    analyze_sample_cases_text(pd.DataFrame(incorrect_preds), imager, n_samples=2)




Analyzing sample predictions...

=== TRUE CLAIMS ===

=== Analysis Results ===

CLAIM: Sepsis tests take days  putting patients at risk. A new method may cut wait time

DOCUMENT TEXT:
[1;32mSepsis tests take days, putting patients at risk.[0m
A new method may cut wait time  Every print subscription comes with full digital access The test is crucial to figuring out what is causing a bloodstream infection, and fighting it A nurse holds tubes of collected blood from a patient.
A new method of sepsis testing could speed treatment that targets the specific bacteria causing a blood infection.
FLUXFACTORY/GETTY IMAGES By Claire Yuan JULY 24, 2024 AT 11:00 AM When bloodstream infections set in, fast treatment is crucial — but it can take several days to identify the bacteria responsible.
A new, rapid-diagnosis sepsis test could cut down on the wait, reducing testing time from as much as a few days to about 13 hours by cutting out a lengthy blood culturing step, researchers report July 24 in