In [None]:
import os
import torch
import shap
import numpy as np
import nltk
from nltk import sent_tokenize
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification
from peft import PeftModel
from shap.maskers import Text
import torch.nn.functional as F
from termcolor import colored
from tqdm import tqdm

# === Setup NLTK ===
# Keep the tokenizer resources in a project-local folder under the current
# working directory so the script does not depend on a system-wide NLTK install.
nltk_data_path = os.path.join(os.getcwd(), 'nltk_data')
os.makedirs(nltk_data_path, exist_ok=True)

# With its default arguments nltk.download() reports a failed download by
# returning False instead of raising, so the fallback has to key off the
# return value. `except Exception` (not a bare except) still covers unexpected
# errors without swallowing KeyboardInterrupt/SystemExit.
try:
    punkt_tab_ok = nltk.download('punkt_tab', download_dir=nltk_data_path)
except Exception:
    punkt_tab_ok = False
if not punkt_tab_ok:
    # Fall back to the 'punkt' resource when 'punkt_tab' could not be fetched.
    nltk.download('punkt', download_dir=nltk_data_path)

# Register the folder once; re-running this cell must not stack duplicates.
if nltk_data_path not in nltk.data.path:
    nltk.data.path.append(nltk_data_path)

# === Device setup ===
# Run on the GPU when CUDA is available, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"\U0001F680 Using device: {device}")

# === Load tokenizer and model ===
# Tokenizer and two-label classification backbone both start from the stock
# "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
base_model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)

# Wrap the backbone with the PEFT adapter stored in ./trained_distilbert_lora.
model = PeftModel.from_pretrained(
    base_model,
    "./trained_distilbert_lora",
)

# Place the model on the selected device and switch it to evaluation mode.
model.to(device).eval()

# === Wrapped model for SHAP ===
def wrapped_model(texts):
    """Return softmax class probabilities for ``texts`` as a NumPy array.

    This is the prediction callable handed to SHAP. ``texts`` may be a list of
    strings, or anything else (a single string, a NumPy scalar or array), in
    which case it is flattened through ``np.atleast_1d`` and every element is
    converted with ``str``. Inputs are tokenized with padding and truncation
    to at most 512 tokens, moved to ``device``, and run through ``model``
    without gradient tracking. The softmax is taken over the last axis of the
    logits.
    """
    # A str or NumPy scalar is never a list, so one check covers every
    # non-list input.
    if not isinstance(texts, list):
        texts = [str(item) for item in np.atleast_1d(texts)]

    encoded = tokenizer(
        texts,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=512,
    )
    batch = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        output = model(**batch)
        probabilities = F.softmax(output.logits, dim=-1)

    return probabilities.cpu().numpy()

# === Table Formatter ===
def truncate(text, length=80):
    """Shorten ``text`` to at most ``length`` characters.

    Text that already fits is returned unchanged. Longer text is cut and
    terminated with ``"..."`` so the result is exactly ``length`` characters.
    When ``length`` is smaller than 3 there is no room for the ellipsis, so
    the text is simply hard-cut (a negative ``length`` yields ``""``).

    Args:
        text: The string to shorten.
        length: Maximum number of characters in the result.

    Returns:
        The possibly shortened string, never longer than ``max(length, 0)``.
    """
    if len(text) <= length:
        return text
    if length < 3:
        # A negative slice bound here would keep almost the whole string and
        # then append the ellipsis, producing a result longer than `length`.
        return text[:max(length, 0)]
    return text[:length - 3] + "..."

def color_class(cls):
    """Return ``cls`` wrapped by ``colored``: green for 'REAL', red otherwise."""
    shade = 'green' if cls == 'REAL' else 'red'
    return colored(cls, shade)

def color_score(score):
    """Format ``score`` as a signed 4-decimal number and color it by sign.

    Scores greater than or equal to zero are colored green; all others red.
    """
    formatted = f"{score:+.4f}"
    if score >= 0:
        return colored(formatted, 'green')
    return colored(formatted, 'red')

def print_table(rows, headers):
    """Print ``rows`` under ``headers`` as a padded, pipe-separated table.

    Every body cell is converted with ``str`` and shortened with ``truncate``;
    headers are printed as given. Each column is as wide as its widest cell
    plus two spaces of padding. Widths are measured on the *visible* text:
    ANSI color sequences are ignored, so colored cells line up with plain
    ones and the horizontal rules match the printed row width.

    Args:
        rows: Iterable of rows, each an indexable sequence of cell values.
            Rows may be shorter or longer than ``headers``.
        headers: Sequence of column titles.

    Returns:
        None. The table is written to standard output.

    Note:
        ``truncate`` works on the raw string, so a colored cell whose raw
        text exceeds the truncation limit can lose its trailing reset code.
    """
    import re  # function-scope import: the module itself does not import `re`

    # Matches SGR color sequences (ESC [ ... m) - presumably the form the
    # `colored` helper emits.
    ansi_sgr = re.compile(r"\x1b\[[0-9;]*m")

    def visible_len(text):
        # Length of `text` as displayed, with color codes stripped.
        return len(ansi_sgr.sub("", text))

    def pad(text, width):
        # Left-justify `text` to `width` visible characters.
        return text + " " * max(width - visible_len(text), 0)

    header_cells = [str(title) for title in headers]
    body = [[truncate(str(cell)) for cell in row] for row in rows]

    # Size every column that appears in the header or in any row, so ragged
    # rows do not raise IndexError.
    all_lines = [header_cells] + body
    n_cols = max(len(line) for line in all_lines)
    col_widths = [
        max(visible_len(line[i]) for line in all_lines if i < len(line)) + 2
        for i in range(n_cols)
    ]

    header_line = " | ".join(
        pad(title, col_widths[i]) for i, title in enumerate(header_cells)
    )
    rule_width = visible_len(header_line)

    print("\n" + "=" * rule_width)
    print(header_line)
    print("-" * rule_width)
    for line in body:
        print(" | ".join(pad(cell, col_widths[i]) for i, cell in enumerate(line)))
    print("=" * rule_width)

# === SHAP Explain — Sentences vs Article ===
def shap_sentence_vs_article(article_text, threshold=0.05):
    """Explain an article sentence by sentence and print a colored report.

    The article is split with ``sent_tokenize`` and each sentence is explained
    on its own with a SHAP text explainer built around ``wrapped_model``. The
    whole article is classified separately to obtain the predicted class.

    The report printed to standard output contains:
      1. the predicted class of the full article,
      2. a table with each sentence's summed SHAP value toward that class,
      3. the article text with flagged sentences colored inline,
      4. the list of flagged sentences.

    A sentence is flagged when its summed SHAP contribution toward the FAKE
    class exceeds ``threshold``. The same rule drives both the inline
    highlighting and the flagged list, so the two always agree. Flagged
    sentences are red when the article is predicted FAKE and yellow when it
    is predicted REAL.

    Args:
        article_text: Full article text.
        threshold: Minimum summed FAKE-class SHAP value for a sentence to be
            flagged.

    Returns:
        None.
    """
    print("\n🔍 Mode: Sentence-wise explanation vs Article")

    # Sentence-level chunks
    chunks = sent_tokenize(article_text)
    if not chunks:
        # Nothing to explain; avoid handing an empty batch to the explainer.
        print("\n⚠️ No sentences found in the article; nothing to explain.")
        return

    explainer = shap.Explainer(wrapped_model, Text(tokenizer))
    shap_values = explainer(chunks)

    # Full-article prediction
    labels = ['FAKE', 'REAL']
    # NOTE(review): assumes model label index 0 is FAKE and 1 is REAL, as the
    # list above implies - confirm against the training label mapping.
    fake_class = labels.index('FAKE')
    article_pred_class = int(np.argmax(wrapped_model([article_text])[0]))
    class_name = labels[article_pred_class]
    print(f"\n🎯 Predicted Class for Full Article: {color_class(class_name)}\n")

    # Tag and color for flagged sentences depend only on the article verdict.
    if article_pred_class == fake_class:
        flag_label, flag_color = "[SUSPECTED FAKE]", 'red'
    else:
        flag_label, flag_color = "[REAL ➜ slightly FAKE-ish]", 'yellow'

    table_rows = []
    flagged_sentences = []
    highlighted_article = []

    for i, (chunk, shap_val) in enumerate(tqdm(zip(chunks, shap_values.values), total=len(chunks), desc="🔎 Explaining")):
        sentence = chunk.strip()

        # The table reports the contribution toward the predicted class.
        pred_score = shap_val[:, article_pred_class].sum()
        table_rows.append([f"Sentence {i+1}", sentence, color_score(pred_score)])

        # Flagging always uses the contribution toward FAKE, so a sentence in
        # a REAL article is only flagged when it pushes toward FAKE.
        fake_score = shap_val[:, fake_class].sum()
        if fake_score > threshold:
            flagged_sentences.append(colored(f"{flag_label} Sentence {i+1}: {sentence}", flag_color))
            highlighted_article.append(colored(sentence, flag_color))
        else:
            highlighted_article.append(sentence)

    # Print SHAP score table
    print_table(table_rows, headers=["Chunk", "Content", f"SHAP → {class_name}"])

    # === Print full paragraph with inline highlights ===
    print(f"\n📄 Full Article with Highlighted Sentences (Threshold: {threshold}):\n")
    print(" ".join(highlighted_article))

    # === Show flagged sentence list for reference ===
    if flagged_sentences:
        print(f"\n🚨 Highlighted Sentences (Score > {threshold}):\n")
        for flagged in flagged_sentences:
            print(flagged)
    else:
        print(f"\n✅ No sentences passed the threshold for suspicion (Score > {threshold})")

# === Batch run multiple examples ===
if __name__ == "__main__":
    # Demo corpus: each article pairs a dubious claim with factual context.
    examples = [
        # Lemon water as a cancer cure vs. medical consensus
        """A viral post claims that drinking warm lemon water every morning cures cancer by 'alkalizing the body' and eliminating harmful toxins.
        While lemons are a good source of vitamin C and may support immunity, there is no scientific evidence that they cure cancer or significantly change blood pH.
        Oncologists emphasize that such claims can be dangerous if they lead patients to forgo evidence-based treatments like chemotherapy or radiation.""",

        # Claims of sentient AI vs. how language models work
        """Recent headlines suggest that AI systems like ChatGPT have become fully sentient, capable of experiencing emotions and forming independent opinions.
        However, experts from OpenAI and academic institutions clarify that these systems are large-scale language models trained on massive datasets—they simulate understanding but do not possess consciousness.
        Studies confirm that while LLMs can generate human-like responses, their outputs are the result of pattern recognition, not awareness.""",

        # Federal Reserve conspiracy vs. monetary policy facts
        """A popular financial influencer claims that the U.S. Federal Reserve is intentionally printing money to crash the dollar and usher in a new digital currency under global control.
        While the Fed has indeed increased money supply during times of crisis, such as the 2008 recession and COVID-19 pandemic, there is no evidence of a coordinated plan to collapse the economy.
        Central bank digital currencies (CBDCs) are being explored worldwide, but their adoption is guided by transparency, not secrecy.""",

        # Standardized testing as surveillance vs. its stated purpose
        """Some articles claim that standardized tests like the SAT are part of a global intelligence-tracking system funded by secret government agencies.
        In reality, the SAT was developed to assess readiness for college and has undergone numerous reforms to reduce bias.
        While criticisms remain, there is no evidence it is used to feed data to intelligence organizations.""",

        # Vaccine microchip hoax
        """A widely circulated video alleges that COVID-19 vaccines contain microchips for government surveillance.
        This claim has been debunked by multiple independent investigations and health agencies.
        Vaccine ingredients are publicly disclosed, and no approved vaccine contains microchips or tracking devices.""",

        # Faked Mars rover imagery vs. published mission data
        """A forum thread suggests that the Mars rover images are actually filmed in a desert in Nevada.
        NASA has consistently published telemetry, satellite data, and photographic evidence to support its missions.
        Thousands of engineers and scientists across international agencies have confirmed the integrity of Mars exploration missions.""",

        # Raw-vegetable diet exaggeration vs. dietary advice
        """Some influencers say eating only raw vegetables for 30 days can reverse autoimmune disorders.
        While vegetables are a crucial part of a healthy diet, there is no conclusive evidence that raw-only diets cure autoimmune diseases.
        Doctors advise balanced nutrition and caution against extreme dietary restrictions.""",

        # Cold winters as "proof" against climate change
        """A blog post argues that recent cold winters prove that climate change is a hoax.
        However, climate scientists explain that global warming can disrupt weather patterns and lead to increased frequency of extreme events—including cold snaps.
        Long-term temperature trends, not short-term events, determine climate patterns.""",

        # Digital textbooks as thought control
        """A TikTok video claims that digital textbooks are a tool for governments to control student thoughts.
        Digital education tools are designed to increase access and reduce costs, not manipulate beliefs.
        Content is often curated by educators and reviewed by academic institutions.""",

        # Moon landing denial
        """A viral claim says the moon landing never happened and was staged in a Hollywood studio.
        However, physical evidence like moon rocks, engineering blueprints, and decades of third-party tracking contradict this.
        Multiple space agencies, including Russia’s, acknowledged the success of Apollo missions at the time."""
    ]

    # Explain every sample in order; banners are numbered starting from 1.
    for number, article in enumerate(examples, start=1):
        print(f"\n\n================ EXAMPLE {number} ================\n")
        shap_sentence_vs_article(article)
