# In [None]:

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path

# TextBlob
from textblob import TextBlob

# NLTK / VADER
import nltk
try:
    from nltk.sentiment.vader import SentimentIntensityAnalyzer
except Exception:
    nltk.download('vader_lexicon')
    from nltk.sentiment.vader import SentimentIntensityAnalyzer

from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, precision_recall_fscore_support
)

# Directory that receives every CSV and PNG produced below; it is created
# up front so later writes do not fail on a missing folder.
OUT_DIR = "outputs"
os.makedirs(name=OUT_DIR, exist_ok=True)

# PHI-safe default: when False, the 'Text' column is dropped from the
# prediction CSVs before they are written.
SAVE_RAW_TEXT = False


def ensure_dir(path):
    """Create directory *path*, including missing parents.

    Calling it again for a directory that already exists is a no-op.
    """
    os.makedirs(name=path, exist_ok=True)

def load_tsv(path):
    """Load a headerless tab-separated file as a (Text, Sentiments) frame.

    The first column is taken as the text and the last column as the label.
    Labels are converted to numbers, cast to ``int`` and clipped to the
    range [0, 1].

    Rows whose label is missing or not numeric are dropped (with a warning)
    instead of being relabelled as 0: treating them as "Negative" would
    silently distort every metric computed from the returned labels.

    Args:
        path: Path of the tab-separated file to read.

    Returns:
        A DataFrame with columns ``'Text'`` and ``'Sentiments'`` and a
        fresh 0..n-1 index.

    Raises:
        ValueError: If the file has fewer than two columns.
    """
    import warnings

    df = pd.read_csv(path, delimiter='\t', header=None)
    if df.shape[1] < 2:
        raise ValueError(f"{path}: expected at least 2 columns (text + label)")
    df = df[[df.columns[0], df.columns[-1]]].copy()
    df.columns = ['Text', 'Sentiments']

    labels = pd.to_numeric(df['Sentiments'], errors='coerce')
    invalid = labels.isna()
    if invalid.any():
        warnings.warn(
            f"{path}: dropping {int(invalid.sum())} row(s) with missing or "
            "non-numeric labels",
            stacklevel=2,
        )
        # Keep text and labels aligned and give callers a contiguous index.
        df = df.loc[~invalid].reset_index(drop=True)
        labels = labels.loc[~invalid].reset_index(drop=True)

    df['Sentiments'] = labels.astype(int).clip(0, 1)
    return df

def summarize_metrics(y_true, y_pred):
    """Return accuracy plus binary and macro precision/recall/F1 as a dict.

    Keys, in order: ``accuracy``, ``precision_pos``, ``recall_pos``,
    ``f1_pos``, ``precision_macro``, ``recall_macro``, ``f1_macro``.
    The ``*_pos`` values use ``average='binary'``; undefined scores are
    reported as 0 (``zero_division=0``).
    """
    scores = {'accuracy': accuracy_score(y_true, y_pred)}
    # (key suffix, sklearn averaging mode), in the order the keys must appear.
    for suffix, averaging in (('pos', 'binary'), ('macro', 'macro')):
        precision, recall, f1, _support = precision_recall_fscore_support(
            y_true, y_pred, average=averaging, zero_division=0
        )
        scores[f'precision_{suffix}'] = precision
        scores[f'recall_{suffix}'] = recall
        scores[f'f1_{suffix}'] = f1
    return scores

def plot_confusion(y_true, y_pred, title, save_path=None):
    """Draw a 2x2 Negative/Positive confusion matrix and optionally save it.

    Args:
        y_true: Ground-truth labels (0 or 1).
        y_pred: Predicted labels (0 or 1).
        title: Title placed above the matrix.
        save_path: If truthy, the figure is also written to this path.
    """
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['Negative', 'Positive'])
    # Create the axes ourselves and hand them to disp.plot(): without an
    # explicit `ax` it opens its own figure, which leaves a stray empty
    # figure behind and ignores the requested figsize.
    fig, ax = plt.subplots(figsize=(5, 4))
    disp.plot(ax=ax, values_format='d')
    ax.set_title(title)
    if save_path:
        fig.savefig(save_path, bbox_inches='tight', dpi=150)
    plt.show()

def print_class_style_header(df):
    """Print the first rows of *df* followed by its shape.

    Args:
        df: DataFrame to preview (only ``head()`` and ``shape`` are used).
    """
    print("\n RAW SENTIMENTS")
    print(df.head().to_string(index=True))
    # Spelling of "dimension" fixed; the rest of the line is unchanged.
    print("\n Data dimension = ", df.shape)

def print_class_style_quantitative(df_like):
    """Print a preview of *df_like* and summary statistics of its score columns.

    Only the columns ``Polarity_score``, ``subjectivity_score`` and
    ``Predicted_Label`` that are actually present are described.
    """
    wanted = ('Polarity_score', 'subjectivity_score', 'Predicted_Label')

    print("\n QUANTITATIVE RESULTS")
    preview = df_like.head()
    print(preview.to_string(index=True))

    print("\n DESCRIPTIVE STATISTICS")
    present = [name for name in wanted if name in df_like.columns]
    stats = df_like[present].describe()
    print(stats)


def predict_textblob(texts):
    """Score each text with TextBlob and derive a binary label.

    Returns a dict with NumPy arrays under ``'polarity'``,
    ``'subjectivity'`` and ``'pred'``; ``pred`` is 1 where polarity is
    strictly greater than 0 and 0 otherwise.
    """
    # One sentiment result per input, with every item coerced to str first.
    sentiments = [TextBlob(str(item)).sentiment for item in texts]
    polarity_scores = np.array([s.polarity for s in sentiments])
    subjectivity_scores = np.array([s.subjectivity for s in sentiments])
    labels = (polarity_scores > 0).astype(int)
    return {
        'polarity': polarity_scores,
        'subjectivity': subjectivity_scores,
        'pred': labels,
    }

def predict_vader_full(texts):
    """Score each text with VADER and derive a binary label.

    Args:
        texts: Iterable of inputs; each item is converted with ``str()``.

    Returns:
        A dict of NumPy arrays:
        ``'compound'`` (VADER compound score), ``'neu'`` (neutral share),
        ``'subj_proxy'`` (``1 - neu`` clipped to [0, 1], used as a stand-in
        for subjectivity) and ``'pred'`` (1 where compound > 0, else 0).
    """
    try:
        sid = SentimentIntensityAnalyzer()
    except LookupError:
        # The lexicon is loaded when the analyzer is constructed, so a missing
        # 'vader_lexicon' resource surfaces here rather than at import time.
        # Fetch it once and retry.
        import nltk  # local import keeps this fallback self-contained
        nltk.download('vader_lexicon')
        sid = SentimentIntensityAnalyzer()

    scores = [sid.polarity_scores(str(t)) for t in texts]
    comp = np.array([s['compound'] for s in scores])
    neu = np.array([s['neu'] for s in scores])
    pred = (comp > 0).astype(int)
    # subjectivity proxy = 1 - neu
    subj_proxy = (1 - neu).clip(0, 1)
    return {'compound': comp, 'neu': neu, 'subj_proxy': subj_proxy, 'pred': pred}


def evaluate_textblob_only(dataset_path, out_dir=OUT_DIR, dataset_name=None, make_scatter=True):
    """Evaluate TextBlob on one labelled dataset and write its artefacts.

    Loads the dataset, prints a preview and score statistics, writes a
    predictions CSV (without the text column unless SAVE_RAW_TEXT is set),
    optionally draws a polarity/subjectivity scatter, and draws the
    confusion matrix.

    Args:
        dataset_path: Path of the tab-separated dataset.
        out_dir: Directory for the CSV and PNG outputs (created if missing).
        dataset_name: Label used in titles and file names; defaults to the
            file stem of *dataset_path*.
        make_scatter: Whether to draw and save the scatter plot.

    Returns:
        A dict with ``dataset``, ``model``, the metrics from
        ``summarize_metrics``, and the raw ``scores_x`` / ``scores_y`` arrays.
    """
    ensure_dir(out_dir)
    dataset_name = Path(dataset_path).stem if dataset_name is None else dataset_name

    frame = load_tsv(dataset_path)
    gold = frame['Sentiments'].to_numpy()

    print(f"\n================ TextBlob Dataset: {dataset_name} ================")
    print_class_style_header(frame)

    scored = predict_textblob(frame['Text'].tolist())
    polarity, subjectivity, predicted = scored['polarity'], scored['subjectivity'], scored['pred']
    metrics = summarize_metrics(gold, predicted)

    print("\n--- TextBlob (default) ---")
    print_class_style_quantitative(pd.DataFrame({
        'Polarity_score': polarity,
        'subjectivity_score': subjectivity,
        'Predicted_Label': predicted,
    }))

    # Per-row predictions; the raw text is only kept when explicitly allowed.
    export = frame.assign(
        TB_polarity=polarity,
        TB_subjectivity=subjectivity,
        TB_pred_default=predicted,
    )
    if not SAVE_RAW_TEXT:
        export = export.drop(columns=['Text'])
    export.to_csv(os.path.join(out_dir, f"{dataset_name}_textblob_predictions.csv"), index=False)

    # Scatter of polarity against subjectivity with guide lines at 0 and 0.5.
    if make_scatter:
        fig, ax = plt.subplots(figsize=(6, 5))
        ax.scatter(polarity, subjectivity, alpha=0.6)
        ax.axvline(0, linestyle='--', linewidth=1)
        ax.axhline(0.5, linestyle='--', linewidth=1)
        ax.set_xlabel('Polarity (-1 to 1)')
        ax.set_ylabel('Subjectivity (0 to 1)')
        ax.set_title(f'TextBlob Sentiment Distribution — {dataset_name}')
        fig.savefig(os.path.join(out_dir, f"{dataset_name}_textblob_scatter.png"),
                    bbox_inches='tight', dpi=150)
        plt.show()

    plot_confusion(
        gold, predicted, f'TextBlob (default) — {dataset_name}',
        save_path=os.path.join(out_dir, f"{dataset_name}_textblob_cm_default.png"),
    )

    result = {'dataset': dataset_name, 'model': 'TextBlob_default'}
    result.update(metrics)
    result['scores_x'] = polarity
    result['scores_y'] = subjectivity
    return result


def evaluate_vader_only(dataset_path, out_dir=OUT_DIR, dataset_name=None, make_scatter=True):
    """Evaluate VADER on one labelled dataset and write its artefacts.

    Loads the dataset, prints a preview and score statistics, writes a
    predictions CSV (without the text column unless SAVE_RAW_TEXT is set),
    draws the confusion matrix, and optionally draws a scatter of the
    compound score against the ``1 - neutral`` subjectivity proxy.

    Args:
        dataset_path: Path of the tab-separated dataset.
        out_dir: Directory for the CSV and PNG outputs (created if missing).
        dataset_name: Label used in titles and file names; defaults to the
            file stem of *dataset_path*.
        make_scatter: Whether to draw and save the scatter plot.

    Returns:
        A dict with ``dataset``, ``model``, the metrics from
        ``summarize_metrics``, and the raw ``scores_x`` / ``scores_y`` arrays.
    """
    ensure_dir(out_dir)
    dataset_name = Path(dataset_path).stem if dataset_name is None else dataset_name

    frame = load_tsv(dataset_path)
    gold = frame['Sentiments'].to_numpy()

    print(f"\n================ VADER Dataset: {dataset_name} ================")
    print_class_style_header(frame)

    scored = predict_vader_full(frame['Text'].tolist())
    compound, neutral = scored['compound'], scored['neu']
    proxy, predicted = scored['subj_proxy'], scored['pred']
    metrics = summarize_metrics(gold, predicted)

    print("\n--- VADER (default) ---")
    print_class_style_quantitative(pd.DataFrame({
        'Polarity_score': compound,     # x-axis value
        'subjectivity_score': proxy,    # y-axis proxy
        'Predicted_Label': predicted,
    }))

    # Per-row predictions; the raw text is only kept when explicitly allowed.
    export = frame.assign(
        VADER_compound=compound,
        VADER_neu=neutral,
        VADER_subj_proxy=proxy,
        VADER_pred_default=predicted,
    )
    if not SAVE_RAW_TEXT:
        export = export.drop(columns=['Text'])
    export.to_csv(os.path.join(out_dir, f"{dataset_name}_vader_predictions.csv"), index=False)

    plot_confusion(
        gold, predicted, f'VADER (default) — {dataset_name}',
        save_path=os.path.join(out_dir, f"{dataset_name}_vader_cm_default.png"),
    )

    # Scatter with the same axis labels and guide lines as the TextBlob plot.
    if make_scatter:
        fig, ax = plt.subplots(figsize=(6, 5))
        ax.scatter(compound, proxy, alpha=0.6)
        ax.axvline(0, linestyle='--', linewidth=1)
        ax.axhline(0.5, linestyle='--', linewidth=1)
        ax.set_xlabel('Polarity (-1 to 1)')     # to match style
        ax.set_ylabel('Subjectivity (0 to 1)')  # proxy: 1 - neutral
        ax.set_title(f'VADER Sentiment Distribution — {dataset_name}')
        fig.savefig(os.path.join(out_dir, f"{dataset_name}_vader_scatter.png"),
                    bbox_inches='tight', dpi=150)
        plt.show()

    result = {'dataset': dataset_name, 'model': 'VADER_default'}
    result.update(metrics)
    result['scores_x'] = compound
    result['scores_y'] = proxy
    return result


def combined_scatter(tb_x, tb_y, va_x, va_y, out_dir=OUT_DIR, filename="combined_scatter_models_matched_axes.png"):
    """Overlay the TextBlob and VADER score clouds on one set of axes.

    TextBlob points are drawn as 'x' markers and VADER points as circles,
    with dashed guide lines at x=0 and y=0.5. The figure is saved to
    ``out_dir/filename`` and shown.

    Returns:
        The path the figure was saved to.
    """
    out_path = os.path.join(out_dir, filename)

    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(tb_x, tb_y, alpha=0.6, marker='x', label='TextBlob (polarity, subjectivity)')
    ax.scatter(va_x, va_y, alpha=0.6, marker='o', label='VADER (compound, 1 - neutral)')
    ax.axvline(0, linestyle='--', linewidth=1)
    ax.axhline(0.5, linestyle='--', linewidth=1)
    ax.set_xlabel('Polarity (-1 to 1)')
    ax.set_ylabel('Subjectivity (0 to 1)')
    ax.set_title('Model Comparison — Same Axes (TextBlob vs VADER)')
    ax.legend()

    fig.savefig(out_path, bbox_inches='tight', dpi=150)
    plt.show()
    return out_path


# Configure datasets: each model is evaluated on its own labelled file.
TEXTBLOB_DATASET = "health_labelled.txt"   # TextBlob uses this
VADER_DATASET = "vader_labelled.txt"       # VADER uses this

tb_result = evaluate_textblob_only(TEXTBLOB_DATASET)
va_result = evaluate_vader_only(VADER_DATASET)

# Combined scatter with matched axes
combined_path = combined_scatter(
    tb_x=tb_result['scores_x'], tb_y=tb_result['scores_y'],
    va_x=va_result['scores_x'], va_y=va_result['scores_y'],
)
print(f"Combined scatter saved to: {combined_path}")

# Summary table: one row per model, with the raw score arrays left out.
summary_df = pd.DataFrame([
    {key: value for key, value in result.items() if key not in ('scores_x', 'scores_y')}
    for result in (tb_result, va_result)
])
summary_csv = os.path.join(OUT_DIR, "summary_custom_datasets_matched_axes.csv")
summary_df.to_csv(summary_csv, index=False)

print("\n=== Summary (Custom Datasets per Model) ===")
print(summary_df)
print(f"\nSaved: {summary_csv}")

# Rich rendering is best-effort: any failure here is ignored because the
# table has already been printed above.
try:
    from IPython.display import display
    display(summary_df)
except Exception:
    pass
