In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import torch
from transformers import GPT2LMHeadModel, GPT2TokenizerFast
from collections import Counter
import warnings
import os
import fasttext

# CONFIGURATION


In [None]:
# Where the pipeline writes its output, and the sample budget for costly checks.
DATA_PATH = '../data/processed/cleaned_dataset.jsonl'
SAMPLE_SIZE = 10000  # Number of samples for expensive checks (Perplexity/Safety)

# Load the cleaned corpus when it exists; otherwise fall back to an empty frame.
if not os.path.exists(DATA_PATH):
    # The pipeline output is missing: say so explicitly instead of failing later.
    print(f"ERROR: File not found at {DATA_PATH}. Please ensure the pipeline has been run successfully.")
    # An empty frame carrying the expected columns keeps the `df.empty` guards
    # in the following cells working, so the notebook still runs end to end.
    df = pd.DataFrame({column: [] for column in ('doc_id', 'text', 'char_count', 'word_count')})
else:
    print(f"Loading dataset from {DATA_PATH}...")
    # JSON Lines input: one document per line. A plain read_json is adequate
    # for files in the ~500MB range; revisit for much larger files.
    df = pd.read_json(DATA_PATH, lines=True)
    print(f"Loaded {len(df):,} documents.")


# ## 1. Deduplication Analysis

In [None]:
# Verify that every `doc_id` is unique and measure exact-text duplication.
if not df.empty:
    total_docs = len(df)
    # nunique() ignores missing values, so a missing doc_id/text also shows up
    # as a shortfall against the row count.
    unique_ids = df['doc_id'].nunique()
    unique_texts = df['text'].nunique()

    print(f"Total Rows: {total_docs:,}")
    print(f"Unique IDs: {unique_ids:,}")
    print(f"Unique Texts: {unique_texts:,}")

    # Actually compare the ID count against the row count; printing the number
    # alone leaves the uniqueness check to the reader.
    if total_docs == unique_ids:
        print("✅ SUCCESS: All doc_ids are unique.")
    else:
        print(f"⚠️ WARNING: Found {total_docs - unique_ids:,} rows with a repeated or missing doc_id.")

    if total_docs == unique_texts:
        print("✅ SUCCESS: Dataset is 100% deduplicated by exact text match.")
    else:
        dup_count = total_docs - unique_texts
        print(f"⚠️ WARNING: Found {dup_count:,} duplicate text contents (This may be expected if only near-deduplication was performed, not exact).")

# ## 2. Noise & Integrity


In [None]:
# Document-length distributions (characters and words) with summary statistics.

if not df.empty:
    # 2a. Length Distribution
    # Reuse the counts stored by the pipeline when available; derive them from
    # the raw text only when the column is missing.
    has_char_count = 'char_count' in df.columns
    df['char_length'] = df['char_count'] if has_char_count else df['text'].str.len()

    if 'word_count' not in df.columns:
        df['word_count'] = df['text'].str.split().str.len()

    fig, ax = plt.subplots(1, 2, figsize=(15, 5))

    # One histogram per panel: (column, bar colour, panel title).
    panels = (
        ('char_length', 'teal', "Character Length Distribution (Log Scale)"),
        ('word_count', 'coral', "Word Count Distribution (Log Scale)"),
    )
    for panel_ax, (column, colour, title) in zip(ax, panels):
        # Log-scaled x axis, linear y axis.
        sns.histplot(df[column], bins=50, log_scale=(True, False), ax=panel_ax, color=colour)
        panel_ax.set_title(title)

    plt.show()

    char_lengths = df['char_length']
    print(f"Mean Length: {char_lengths.mean():.0f} chars")
    print(f"Min Length: {char_lengths.min()} chars")
    print(f"Max Length: {char_lengths.max()} chars")


In [None]:
# 2b. Integrity Checks (Boilerplate & artifacts)

# This cell carries its own guard: an indented body without an enclosing `if`
# is an IndentationError when the cell is executed.
if not df.empty:
    # Regex for common HTML tags left over (should be 0 after cleaning).
    # The redaction placeholders <EMAIL>, <IP> and <PHONE> are excluded via the
    # negative lookahead; they are inserted on purpose and are not markup.
    html_re = re.compile(r'<(?!(?:EMAIL|IP|PHONE)>)[^>]+>')
    # Regex for excessive whitespace (should be low due to normalization)
    space_re = re.compile(r'\s{4,}')

    # Fixed seed so the reported percentages are reproducible across runs.
    sample_check = df.sample(min(len(df), 5000), random_state=42)

    html_hits = sample_check['text'].apply(lambda x: bool(html_re.search(x))).sum()
    space_hits = sample_check['text'].apply(lambda x: bool(space_re.search(x))).sum()

    # Report the real sample size: it is smaller than 5000 for small datasets.
    print(f"\nIntegrity Scan (Sample {len(sample_check):,}):")
    print(f"- Documents with HTML artifacts: {html_hits} ({html_hits/len(sample_check):.1%})")
    print(f"- Documents with excessive whitespace: {space_hits} ({space_hits/len(sample_check):.1%})")

# ## 3. Linguistics: Perplexity (PPL) Proxy

In [None]:
# We use a small LM (`distilgpt2` or `gpt2`) to calculate perplexity on a sample. Lower perplexity generally indicates more natural, coherent text.

if not df.empty:
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model_id = "distilgpt2"

    print(f"\nLoading {model_id} on {device} for perplexity calculation...")
    tokenizer = GPT2TokenizerFast.from_pretrained(model_id)
    model = GPT2LMHeadModel.from_pretrained(model_id).to(device)

    def calculate_perplexity(text, model, tokenizer, device, stride=512):
        """Return the sliding-window perplexity of `text` under `model`.

        The text is tokenized once and scored in windows of at most
        `model.config.n_positions` tokens that advance by `stride`. Tokens
        already scored by an earlier window are masked with -100 so each
        token contributes exactly once.

        Args:
            text: Document text to score.
            model: Causal LM returning `.loss` when called with `labels`.
            tokenizer: Tokenizer matching `model`.
            device: Device the input ids are moved to.
            stride: Number of tokens the window advances per step.

        Returns:
            Perplexity as a float, or NaN when no token can be scored
            (empty text or a single-token document).
        """
        encodings = tokenizer(text, return_tensors="pt")
        max_length = model.config.n_positions
        seq_len = encodings.input_ids.size(1)

        nll_sum = 0.0
        n_tokens = 0
        prev_end_loc = 0
        for begin_loc in range(0, seq_len, stride):
            end_loc = min(begin_loc + max_length, seq_len)
            trg_len = end_loc - prev_end_loc  # tokens not yet scored by a previous window
            input_ids = encodings.input_ids[:, begin_loc:end_loc].to(device)
            target_ids = input_ids.clone()
            target_ids[:, :-trg_len] = -100

            # The model shifts labels by one position internally, so the first
            # position of a window is never a prediction target.
            num_loss_tokens = int((target_ids[:, 1:] != -100).sum().item())
            if num_loss_tokens > 0:
                with torch.no_grad():
                    outputs = model(input_ids, labels=target_ids)
                # `loss` is the mean over scored tokens; weight it by their
                # count so windows of different size are combined correctly.
                nll_sum += outputs.loss.item() * num_loss_tokens
                n_tokens += num_loss_tokens

            prev_end_loc = end_loc
            if end_loc == seq_len:
                break

        if n_tokens == 0:
            return float('nan')
        return float(np.exp(nll_sum / n_tokens))

    # Run on a random sample (fixed seed so the result is reproducible)
    ppl_sample = df.sample(min(len(df), 100), random_state=42)['text'].tolist()
    ppls = []
    failed_docs = 0

    print(f"Calculating perplexity for {len(ppl_sample)} documents...")
    for i, text in enumerate(ppl_sample):
        try:
            # Truncate slightly to speed up check
            ppls.append(calculate_perplexity(text[:2000], model, tokenizer, device))
        except Exception as e:
            # Stay best-effort, but make skipped documents visible.
            failed_docs += 1
            print(f"Skipping doc {i}: perplexity failed ({type(e).__name__}: {e})")

    if failed_docs:
        print(f"{failed_docs} of {len(ppl_sample)} documents could not be scored.")

    # NaN/inf scores (e.g. empty documents) would poison the mean and the plot.
    valid_ppls = [p for p in ppls if np.isfinite(p)]

    if valid_ppls:
        mean_ppl = float(np.mean(valid_ppls))
        print(f"\nMean Perplexity: {mean_ppl:.2f}")

        plt.figure(figsize=(8, 5))
        sns.histplot(valid_ppls, kde=True, color='purple')
        plt.title("Perplexity Distribution (Sample)")
        plt.xlabel("Perplexity")
        plt.show()
    else:
        mean_ppl = float('nan')
        print("\nNo valid perplexity scores were produced; skipping summary and plot.")


# ## 4. Safety: PII & Toxicity


In [None]:
# We verify that PII has been redacted (checking for `<EMAIL>` tags vs real emails) and perform a heuristic scan for toxicity.
# NOTE: Toxicity check requires 'detoxify' package if implemented fully. We use a PII proxy here.
if not df.empty:
    # Regex patterns
    redaction_tag_re = re.compile(r'<EMAIL>|<IP>|<PHONE>')
    # TLD class is `[A-Za-z]`: inside a character class `|` is a literal pipe,
    # so `[A-Z|a-z]` would also accept '|' as part of the domain suffix.
    email_leak_re = re.compile(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b')

    # Check for redaction tags (Evidence that the pipeline worked)
    redaction_counts = df['text'].str.count(redaction_tag_re).sum()

    # Check for leaks (Emails that were NOT caught): matches per document
    leaks = df['text'].apply(lambda x: len(email_leak_re.findall(x)))
    total_leaks = leaks.sum()
    # Count affected documents from the boolean mask instead of materializing
    # a filtered copy of the whole frame.
    docs_with_leaks = int((leaks > 0).sum())

    print("\nPII Safety Analysis:")
    print(f"- Total Redaction Tags Found (<EMAIL>, etc): {redaction_counts:,}")
    print(f"- Potential Email Leaks found: {total_leaks:,}")
    print(f"- Documents with leaks: {docs_with_leaks:,}")

    # Inspect a redaction example; the tag is matched literally, not as a regex
    redacted_sample = df[df['text'].str.contains("<EMAIL>", na=False, regex=False)].head(1)
    if not redacted_sample.empty:
        print("\n--- Redaction Example ---")
        print(redacted_sample.iloc[0]['text'][:300] + "...")


# ## 5. Coverage: Language Distribution


In [None]:
# We use FastText (same as the pipeline) to verify the output is predominantly English.

if not df.empty:
    # Download model if not exists locally (reuse pipeline model)
    model_path = "./lid.176.bin"
    model_url = "https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin"
    if not os.path.exists(model_path):
        print("\nDownloading FastText model for verification...")
        # NOTE: This command requires the system to have `wget` or equivalent
        # os.system reports failure through its return value, not by raising,
        # and `wget -O` creates the output file even when the transfer fails.
        # Download to a temporary name and promote it only on success, so a
        # broken download is never mistaken for a usable model.
        partial_path = model_path + ".part"
        exit_status = os.system(f"wget {model_url} -O {partial_path}")
        if exit_status == 0 and os.path.exists(partial_path):
            os.replace(partial_path, model_path)
        else:
            if os.path.exists(partial_path):
                os.remove(partial_path)
            print(f"Could not download FastText model automatically: wget exit status {exit_status}")
            print(f"Please download it manually from {model_url}")

    if os.path.exists(model_path):
        ft_model = fasttext.load_model(model_path)

        def predict_lang(text):
            """Return the top FastText language label for the first 1000 characters of `text`."""
            # FastText predicts line by line, so newlines are flattened first.
            text = text.replace("\n", " ")[:1000]
            # A one-element list is passed; the result is indexed as
            # [labels][first document][top label].
            res = ft_model.predict([text])
            return res[0][0][0]

        print("\nVerifying language coverage...")
        # Fixed seed so the reported distribution is reproducible.
        sample_lang = df.sample(min(len(df), 2000), random_state=42).copy()
        sample_lang['lang'] = sample_lang['text'].apply(predict_lang)

        # Remove the '__label__' prefix for cleaner display (literal match)
        sample_lang['lang'] = sample_lang['lang'].str.replace('__label__', '', regex=False)
        lang_dist = sample_lang['lang'].value_counts()
        print(lang_dist)

        plt.figure(figsize=(10, 5))
        sns.barplot(x=lang_dist.index, y=lang_dist.values)
        plt.title("Language Distribution (Post-Pipeline)")
        plt.ylabel("Count")
        plt.show()
    else:
        print("\nSkipping Language Distribution check: FastText model not found.")
