# Libs and Paths

In [1]:
import os
os.chdir("/Users/finlayduff/Documents/BATH MSc/Dissertation")

In [76]:
from utils.data.results import load_combined_results
import pandas as pd
from plotly import express as px
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score, roc_curve
import plotly.graph_objects as go

In [82]:
# dataset_name = "FA-KES_10"
# experiment_id = "b7daf879-2120-4941-93bb-8533b910d1df"
# experiment_id = "eb741518-88e7-4245-b76b-8864e8cd9e18"
# experiment_id = "0f5045c4-5aa8-4866-a58e-ac969fc1d672"


# dataset_name = "recovery-news-data_100"
# experiment_id = "6277d39b-d2f6-4944-a953-fde69e80ef43"
# experiment_id = 'f2880a5c-b7ec-44f5-8fd8-aefda442a62d'
# experiment_id = "39504490-6007-44f6-81eb-2785278e45f2"
# experiment_id = "04718f2e-dfee-41ed-8b91-546d2feb4a7b"
# experiment_id = "7fde3298-3b64-461d-a482-9c140d0af92f"



# dataset_name = "recovery-news-data_None"
# experiment_id = "0ef980ca-1f45-4e25-8bb0-c7e0e9b476a0"
# experiment_id = "4809b2ed-f4bd-43c4-aa83-8dde21ae6819" # ZERO SHOT
# experiment_id = "9123e40c-a2ff-4145-9896-83cf25985e61" # bulk_signals_condensed_critic_gpt4
# experiment_id = "bcb508bd-6784-411b-891a-4ecd74c99d2f"  # bulk_signals_condensed_critic_followup_gpt4
# experiment_id = "b7e34512-1311-4096-92db-e39395dca1a0" # bulk_signals_condensed_critic_followup_selector_gpt4


dataset_name = 'isot_2000'
# experiment_id = '216435aa-8a24-426e-9a32-be413eb9974a'  # bulk_signals_condensed_critic_followup_selector_gpt4
experiment_id = '5887fb1e-a5f5-4d13-8854-27b2fb990943'  # few_shot

In [83]:
df = load_combined_results(dataset_name=dataset_name, experiment_id=experiment_id)
df['article_id'] = df.index
df["is_correct"] = df["prediction"] == df["actual"]

In [84]:
print(df.experiment_name.head(1))

0    few_shot_gpt4
Name: experiment_name, dtype: object


In [85]:
CONF_MAP = {
    "UNCERTAIN":      0.33,
    "FAIRLY CERTAIN": 0.66,
    "CERTAIN":        1.00,
}


def p_hat(row, positive_label="FAKE"):
    c = CONF_MAP[row["confidence"]]
    return c if row["prediction"] == positive_label else 1-c
       
df["p_fake"] = df.apply(p_hat, axis=1)


y_true  = (df["actual"] == 0).astype(int) 
y_score = df["p_fake"]

roc = roc_auc_score(y_true, y_score)
pr = average_precision_score(y_true, y_score)
fpr, tpr, _ = roc_curve(y_true, y_score)
print(f"ROC-AUC: {roc:.3f}   PR-AUC: {pr:.3f}")

roc_fig = go.Figure(
    data=[
        go.Scatter(x=fpr, y=tpr, mode="lines", name=f"ROC (AUC={roc:.3f})"),
        go.Scatter(x=[0,1], y=[0,1], mode="lines", name="random", line=dict(dash="dash"))
    ],
    layout=go.Layout(
        title="ROC curve (positive class = FAKE)",
        xaxis_title="False-Positive Rate",
        yaxis_title="True-Positive Rate",
        template="plotly_white",
        width=600, height=450
    )
)
roc_fig.show()

def ece(y_true, p_hat, bins=10):
    """Expected-Calibration-Error with equal-width probability bins."""
    edges  = np.linspace(0, 1, bins + 1)
    bin_id = np.digitize(p_hat, edges[1:-1])  

    e = 0.0
    for b in range(bins):
        mask = bin_id == b
        if mask.sum() == 0:                 # skip empty bin
            continue
        acc  = y_true[mask].mean()          # empirical precision
        conf = p_hat[mask].mean()           # average stated prob
        e   += mask.mean() * abs(acc - conf)
    return e

ece10 = ece(y_true.values,y_score.values, bins=10)
print(f"ECE (10 bins) = {ece10:.3f}")     

# ------------------------------------------------------------------
# 3 · Accuracy vs stated certainty  -----------------------------------
bucket_acc = (
    df.groupby("confidence", observed=True)
      .apply(lambda d: (d["prediction"] == d["actual"]).mean())
      .reset_index(name="accuracy")
)
# enforce the logical order UNCERTAIN→CERTAIN
cat_order = pd.CategoricalDtype(CONF_MAP.keys(), ordered=True)
bucket_acc["confidence"] = bucket_acc["confidence"].astype(cat_order)
bucket_acc = bucket_acc.sort_values("confidence")

bar_fig = px.bar(
    bucket_acc, x="confidence", y="accuracy",
    title="Empirical accuracy vs. LLM-stated certainty",
    text=bucket_acc["accuracy"].apply(lambda v: f"{v:.1%}"),
    labels={"confidence":"Stated certainty",
            "accuracy":"Accuracy"},
    template="plotly_white")
bar_fig.update_yaxes(range=[0,1], tickformat=".0%")
bar_fig.show()

ROC-AUC: 0.602   PR-AUC: 0.797


ECE (10 bins) = 0.471






In [86]:
# --- 1) explode the nested JSON into a flat table ---------------------------
def explode_features(df: pd.DataFrame) -> pd.DataFrame:

    exploded_rows = []

    # iterate row-by-row so we can keep the current article_id
    for _, row in df.iterrows():
        article_id = row["article_id"]
        sig_dict   = row["feature_selection"]

        # skip rows that lack parsed signals
        if not isinstance(sig_dict, dict) or "signals" not in sig_dict:
            continue

        for sig, meta in sig_dict["signals"].items():
            exploded_rows.append(
                {
                    "article_id" : article_id,
                    "signal"     : sig,
                    "label"      : meta.get("label"),
                    "confidence" : meta.get("confidence")
                }
            )

    return pd.DataFrame(exploded_rows)

flat = explode_features(df)

# --- 2) count keep-rates by signal & relevance ------------------------------
keep_rate = (
    flat.groupby(['signal', 'confidence'])
        .size()
        .rename('count')
        .reset_index()
)

import plotly.express as px
fig = px.bar(
    keep_rate, x='count', y='signal',
    color='confidence', barmode='stack',
    title='How often is each signal kept—and at what relevance level?'
)
fig.show()

KeyError: 'signal'

In [87]:
flat = explode_features(df)

# count per signal–confidence
keep_rate = (
    flat.groupby(['signal', 'confidence'])
        .size()
        .rename('count')
        .reset_index()
)

keep_rate['confidence'] = keep_rate['confidence'].replace({
    'MEDIUM':      'FAIRLY CERTAIN',
}) 

# total per signal
totals = keep_rate.groupby('signal')['count'].transform('sum')
keep_rate['pct'] = 100 * keep_rate['count'] / totals          # row-wise %

fig = px.bar(
    keep_rate, x='pct', y='signal',
    color='confidence', barmode='stack',
    text='pct',
    labels={'pct':'% within signal'},
    # title='Confidence mix for each signal (bars sum to 100 %)'
)
fig.update_traces(texttemplate='%{text:.1f}%')
fig.show()

KeyError: 'signal'

In [88]:
# merge correctness flag in
flat2 = flat.merge(df[['article_id', 'is_correct']], on='article_id')
flat2['confidence'] = flat2['confidence'].replace({
    'MEDIUM':      'FAIRLY CERTAIN',
}) 

# compute conditional accuracy
acc = (flat2
       .groupby(['signal', 'confidence'])['is_correct']
       .mean()
       .reset_index(name='accuracy'))

fig = px.scatter(
    acc, x='confidence', y='accuracy', color='signal',
    title='Accuracy as a function of confidence',
    category_orders={'confidence': ['UNCERTAIN','FAIRLY CERTAIN','CERTAIN']},
    labels={'accuracy':'P(correct | quality)'}
)
fig.update_traces(marker_size=12)
fig.show()

KeyError: 'article_id'

In [89]:
import pandas as pd
import plotly.express as px
from scipy.stats import chi2_contingency

def flatten_feature_selection_signals(df):
    """explode feature_selection → one row per (article, signal)"""
    recs = []
    for idx, row in df.iterrows():
        for sig, meta in row["feature_selection"]["signals"].items():
            recs.append(
                dict(
                    article_id       = row["article_id"],
                    signal           = sig,
                    sig_value        = 1 if meta["label"] == "TRUE" else 0,
                    sig_label        = meta["label"],
                    true_label       = int(row["actual"]),
                    predicted_label  = int(row["prediction"]),
                    is_correct       = int(row["actual"] == row["prediction"]),
                )
            )
    return pd.DataFrame(recs)

flat = flatten_feature_selection_signals(df)

TypeError: 'NoneType' object is not subscriptable

In [90]:
# accuracy when signal == 1   vs   signal == 0
cond_acc = (
    flat.groupby(["signal", "sig_label"])["is_correct"]
        .mean()
        .rename("accuracy")
        .reset_index()
)

fig = px.bar(
    cond_acc,
    x="signal", y="accuracy", color="sig_label",
    barmode="group",
    color_discrete_map={1:"#EF553B", 0:"#636EFA"},
    # category_orders={"sig_value":[1,0]},
    labels={"sig_value":"signal label (TRUE=1 / FALSE=0)"},
    title="Prediction accuracy conditioned on each signal’s value"
)
fig.update_layout(xaxis_title="", yaxis_tickformat=".0%")
fig.show()

KeyError: 'signal'

In [91]:
assoc = []
for sig, grp in flat.groupby("signal"):
    tbl = pd.crosstab(grp["sig_value"], grp["predicted_label"])
    if tbl.shape == (2,2):
        chi2, p, _, _ = chi2_contingency(tbl)
        n = tbl.values.sum()
        cramers_v = (chi2 / n) ** 0.5
        assoc.append(dict(signal=sig, cramers_v=cramers_v, p_value=p))

assoc_df = pd.DataFrame(assoc).sort_values("cramers_v", ascending=False)

fig = px.bar(
    assoc_df, x="cramers_v", y="signal",
    color="p_value", color_continuous_scale="Blues",
    title="Cramér’s V: how strongly each signal drives the *model’s* label"
)
fig.update_layout(xaxis_title="association strength (0-1)", coloraxis_colorbar_title="p")
fig.show()

KeyError: 'signal'

In [92]:
assoc_gt = []
for sig, grp in flat.groupby("signal"):
    tbl = pd.crosstab(grp["sig_value"], grp["true_label"])
    if tbl.shape == (2,2):
        chi2, p, _, _ = chi2_contingency(tbl)
        n = tbl.values.sum()
        cramers_v = (chi2 / n) ** 0.5
        assoc_gt.append(dict(signal=sig, cramers_v=cramers_v, p_value=p))

assoc_gt = pd.DataFrame(assoc_gt).sort_values("cramers_v", ascending=False)

fig = px.bar(
    assoc_gt, x="cramers_v", y="signal",
    color="p_value", color_continuous_scale="Greens",
    title="Cramér’s V: how strongly each signal matches the *true* label"
)
fig.update_layout(xaxis_title="association strength (0-1)", coloraxis_colorbar_title="p")
fig.show()

KeyError: 'signal'

In [16]:
import plotly.express as px

# Scatter plot
fig = px.scatter(
    df,
    x="classification_prompt_user_content_length",
    y="eval_score",
    title="Correlation Between User Content Length and Accuracy",
    labels={
        "classification_prompt_user_content_length": "User Content Length",
        "eval_score": "Accuracy (Eval Score)"
    },
    opacity=0.7
)

# Update layout for better visualization
fig.update_traces(marker=dict(color="blue", size=10))
fig.update_layout(
    title_font_size=14,
    xaxis_title_font_size=12,
    yaxis_title_font_size=12,
    template="plotly_white"
)

fig.show()

In [17]:
import pandas as pd, numpy as np, plotly.express as px

# Suppose df has:  article_content_length (tokens or chars)  and  is_correct (0/1)
nbins = 6                                    # change as needed
df["len_bin"] = pd.qcut(
    df["article_content_length"],
    q=nbins,
    duplicates="drop",
    labels=False,
)

# aggregate
grouped = (
    df.groupby("len_bin")
      .agg(mean_len=("article_content_length", "mean"),
           accuracy=("is_correct", "mean"),
           n=("is_correct", "size"))
      .reset_index(drop=True)
)

fig = px.bar(
    grouped, x="mean_len", y="accuracy",
    text="n",  # show how many samples per bin
    labels={"mean_len": "Mean length in bin", "accuracy": "Accuracy"},
    title="Accuracy by article-length bins",
)
fig.update_traces(textposition="outside")
fig.update_layout(yaxis_tickformat=".0%", template="plotly_white")
fig.show()

In [18]:
import statsmodels.api as sm
import numpy as np
import plotly.graph_objects as go

X = sm.add_constant(df["article_content_length"])
model = sm.Logit(df["is_correct"], X).fit(disp=False)

# predicted probability across range
length_seq = np.linspace(df["article_content_length"].min(),
                         df["article_content_length"].max(), 200)
pred = model.predict(sm.add_constant(length_seq))

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=length_seq, y=pred, mode="lines",
    name="Predicted accuracy",
))
fig.update_layout(
    title="Logistic regression: P(correct) vs. content length",
    xaxis_title="Article length",
    yaxis=dict(title="Predicted accuracy", tickformat=".0%"),
    template="plotly_white"
)
fig.show()

print(model.summary().tables[1])  # see coefficient & p-value

                             coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------------
const                      1.8740      0.142     13.204      0.000       1.596       2.152
article_content_length  8.404e-05      0.000      0.467      0.640      -0.000       0.000


In [19]:
import statsmodels.api as sm
import numpy as np
import plotly.graph_objects as go

X = sm.add_constant(df["article_title_length"])
model = sm.Logit(df["is_correct"], X).fit(disp=False)

# predicted probability across range
length_seq = np.linspace(df["article_title_length"].min(),
                         df["article_title_length"].max(), 200)
pred = model.predict(sm.add_constant(length_seq))

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=length_seq, y=pred, mode="lines",
    name="Predicted accuracy",
))
fig.update_layout(
    title="Logistic regression: P(correct) vs. title length",
    xaxis_title="Article Title length",
    yaxis=dict(title="Predicted accuracy", tickformat=".0%"),
    template="plotly_white"
)
fig.show()

print(model.summary().tables[1])  # see coefficient & p-value

                           coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                    1.6074      0.251      6.403      0.000       1.115       2.099
article_title_length     0.0158      0.012      1.334      0.182      -0.007       0.039


In [20]:
import statsmodels.api as sm
import numpy as np
import plotly.graph_objects as go

X = sm.add_constant(df["classification_prompt_user_content_length"])
model = sm.Logit(df["is_correct"], X).fit(disp=False)

# predicted probability across range
length_seq = np.linspace(df["classification_prompt_user_content_length"].min(),
                         df["classification_prompt_user_content_length"].max(), 200)
pred = model.predict(sm.add_constant(length_seq))

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=length_seq, y=pred, mode="lines",
    name="Predicted accuracy",
))
fig.update_layout(
    title="Logistic regression: P(correct) vs. Classification Prompt User Content Length",
    xaxis_title="User prompt length",
    yaxis=dict(title="Predicted accuracy", tickformat=".0%"),
    template="plotly_white"
)
fig.show()

print(model.summary().tables[1])  # see coefficient & p-value

                                                coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------------
const                                         1.5953      0.310      5.138      0.000       0.987       2.204
classification_prompt_user_content_length     0.0002      0.000      1.092      0.275      -0.000       0.001


In [21]:
df['captured_credibility_signals']

0      {'evidence_present': {'label': 'TRUE', 'confid...
1      {'evidence_present': {'label': 'FALSE', 'confi...
2      {'evidence_present': {'label': 'FALSE', 'confi...
3      {'evidence_present': {'label': 'FALSE', 'confi...
4      {'evidence_present': {'label': 'TRUE', 'confid...
                             ...                        
856    {'evidence_present': {'label': 'FALSE', 'confi...
857    {'evidence_present': {'label': 'TRUE', 'confid...
858    {'evidence_present': {'label': 'TRUE', 'confid...
859    {'evidence_present': {'label': 'TRUE', 'confid...
860    {'evidence_present': {'label': 'TRUE', 'confid...
Name: captured_credibility_signals, Length: 861, dtype: object

In [22]:
df.columns

Index(['experiment_name', 'chunk', 'dataset_name', 'experiment_id', 'run_id',
       'start_time', 'end_time', 'article_id', 'article_title',
       'article_content', 'article_title_length', 'article_content_length',
       'actual', 'prediction', 'confidence', 'explanation', 'eval_score',
       'eval_comment', 'captured_credibility_signals',
       'captured_signals_critiques', 'follow_up_signals_analysis',
       'feature_selection', 'classification_prompt_system',
       'classification_prompt_user',
       'classification_prompt_system_content_length',
       'classification_prompt_user_content_length', 'topic', 'is_correct',
       'p_fake', 'follow_up_signals', 'follow_up_signals_count', 'n_bin',
       'len_bin'],
      dtype='object')

In [1]:
import os
os.chdir("/Users/finlayduff/Documents/BATH MSc/Dissertation")

In [2]:

import pandas as pd
import numpy as np
from utils.data.results import analyze_experiments
import plotly.express as px
# Load all experiment results
results_df = analyze_experiments(verbose=False)



In [62]:
latest_experiments = results_df.loc[results_df["start_time"] >= '2025-05-01'].sort_values("start_time").groupby(["experiment_name","dataset"]).last()
latest_experiments_reset = latest_experiments.reset_index()

In [63]:
datasets_df = latest_experiments_reset.loc[latest_experiments_reset["dataset"].isin(["recovery-news-data_None","isot_2000"])]
experiments = ["zero_shot_gpt35_turbo",
               "zero_shot_gpt4",
               "few_shot_gpt4",
               "bulk_signals_condensed_gpt4",
               "bulk_signals_followup_gpt4",
               "bulk_signals_condensed_critic_gpt4",
               "bulk_signals_condensed_critic_followup_gpt4",
               "bulk_signals_condensed_critic_followup_selector_gpt4"
      ]
experiment_name_format = {
    "zero_shot_gpt35_turbo": "zero-shot gpt-3.5",
    "zero_shot_gpt4": "zero-shot gpt-4",
    "few_shot_gpt4": "few-shot",
    "bulk_signals_condensed_gpt4": "signals",
    "bulk_signals_condensed_critic_followup_gpt4": "signals follow-up critic",
    "bulk_signals_condensed_critic_followup_selector_gpt4": "full graph",
    "bulk_signals_condensed_critic_gpt4": "signals critic",
}
dataset_name_format = {
    "recovery-news-data_None": "recovery",
    "isot_2000": "ISOT",
}

datasets_df = datasets_df.loc[datasets_df["experiment_name"].isin(experiments)]
datasets_df['experiment_name'] = datasets_df['experiment_name'].map(lambda x: experiment_name_format[x])
datasets_df['dataset'] = datasets_df['dataset'].map(lambda x: dataset_name_format[x])


In [81]:
datasets_df

Unnamed: 0,experiment_name,dataset,experiment_id,start_time,accuracy,precision,recall,f1,precision_real,recall_real,...,f1_fake,f1_macro,true_negatives,false_positives,false_negatives,true_positives,mean_s,p50_s,p99_s,mean_classification_prompt_length
0,signals follow-up critic,ISOT,6a581ec9-034a-4ba1-b87e-f3ba16a16dff,2025-05-04T18:56:40.457657+00:00,0.839721,0.691489,0.619048,0.653266,0.691489,0.619048,...,0.89577,0.774518,593,58,80,130,41.204759,41.008174,66.070727,1447.861789
1,signals follow-up critic,recovery,bcb508bd-6784-411b-891a-4ecd74c99d2f,2025-05-01T17:43:12.476898+00:00,0.693492,0.81491,0.703402,0.755062,0.81491,0.703402,...,0.590577,0.672819,445,216,401,951,49.39902,45.264651,73.192588,1955.051664
2,full graph,ISOT,216435aa-8a24-426e-9a32-be413eb9974a,2025-05-04T11:35:23.221389+00:00,0.872242,0.681159,0.895238,0.773663,0.681159,0.895238,...,0.911003,0.842333,563,88,22,188,58.852036,58.51951,85.75991,1543.802555
3,full graph,recovery,b7e34512-1311-4096-92db-e39395dca1a0,2025-05-02T19:16:17.514343+00:00,0.768505,0.80301,0.868343,0.834399,0.80301,0.868343,...,0.615512,0.724955,373,288,178,1174,57.377995,56.947896,80.449848,1946.823646
7,signals critic,ISOT,1588f424-9fd7-4c40-b969-9efd24bc1ee6,2025-05-04T21:19:18.024765+00:00,0.795587,0.548851,0.909524,0.684588,0.548851,0.909524,...,0.848797,0.766693,494,157,19,191,24.763296,24.465743,34.925705,1041.672474
8,signals critic,recovery,9123e40c-a2ff-4145-9896-83cf25985e61,2025-05-01T12:07:06.139585+00:00,0.788376,0.78651,0.940089,0.856469,0.78651,0.940089,...,0.597353,0.726911,316,345,81,1271,33.430679,32.494403,52.884759,1589.894685
9,signals,ISOT,3f988d3f-6dbb-4f7c-8886-437a000c00c8,2025-05-04T08:14:13.668346+00:00,0.872242,0.676056,0.914286,0.777328,0.676056,0.914286,...,0.910423,0.843876,559,92,18,192,19.619122,19.03799,29.095417,1533.336818
10,signals,recovery,4c3bb455-7796-4adc-8765-45d3e87c12ec,2025-05-01T09:57:17.350675+00:00,0.769995,0.803828,0.869822,0.835524,0.803828,0.869822,...,0.617671,0.726598,374,287,176,1176,19.0566,18.476418,30.744544,1932.537506
11,few-shot,ISOT,5887fb1e-a5f5-4d13-8854-27b2fb990943,2025-05-04T22:36:41.117319+00:00,0.945412,0.897561,0.87619,0.886747,0.897561,0.87619,...,0.96404,0.925393,630,21,26,184,2.253395,2.131332,4.990497,2621.806039
12,few-shot,recovery,3cdf49a6-79fb-4774-b165-38a0c7361095,2025-05-02T17:12:00.159687+00:00,0.812221,0.812179,0.93713,0.870192,0.812179,0.93713,...,0.660682,0.765437,368,293,85,1267,3.109868,2.92204,6.782434,10152.438649


In [64]:
px.bar(
    datasets_df.sort_values('mean_s', ascending=True),
    x='experiment_name',
    y='mean_s',
    color='dataset',
    # title='Mean Classification Prompt Length by Experiment and Dataset',
    labels={'experiment_name': 'Experiment Name', 'mean_s': 'Mean Elapse Time (s)'},
    template='plotly_white',
    barmode='group',
).show()

In [65]:
dataset = "recovery-news-data_None"
# dataset = "isot_2000"
df = latest_experiments_reset.loc[latest_experiments_reset['dataset'] == dataset].sort_values("start_time",ascending=False)

# COMPARISONS

In [66]:
# ── 1. Pick the metrics you want to show ──────────────────────────────
metrics = ["accuracy", "f1_macro", "f1_fake", "recall_fake", "precision_fake",]

# ── 2. Keep only the columns we need and reshape to long format ──────
plot_df = (
    df[["experiment_name"] + metrics]
    .melt(id_vars="experiment_name", var_name="metric", value_name="score")
)

plot_df = plot_df.sort_values(
    by="experiment_name",
    key=lambda s: s.str.len(),
    ascending=True,
)

# ── 3. Plot ───────────────────────────────────────────────────────────
fig = px.bar(
    plot_df,
    x="experiment_name",
    y="score",
    color="metric",
    barmode="group",
    text=plot_df["score"].map(lambda v: f"{v:.2%}"),
    labels={"score": "Metric value", "experiment_name": "Experiment"},
    title="Accuracy & F-scores by experiment",
    color_discrete_sequence=px.colors.qualitative.Set2,
)
from pandas.api.types import CategoricalDtype

order = plot_df['experiment_name'].unique()
cat_type = CategoricalDtype(categories=order, ordered=True)
plot_df['experiment_name'] = plot_df['experiment_name'].astype(cat_type)

fig.update_traces(textposition="outside")
fig.update_yaxes(tickformat=".0%", range=[0, 1])
fig.update_xaxes(categoryorder='array', categoryarray=order, autorange=False)
fig.update_layout(
    xaxis_tickangle=-35,
    uniformtext_minsize=8,
    uniformtext_mode="hide",
    legend_title_text="",
    bargap=0.25,
    template="plotly_white",
    margin=dict(l=40, r=20, t=50, b=80),
)

fig.show()

In [None]:
metrics = ["accuracy", "f1_macro",
           "recall_fake"]

long_df = (
    datasets_df[["experiment_name", "dataset"] + metrics]    
      .melt(id_vars=["experiment_name", "dataset"],  
            value_vars=metrics,                     
            var_name="metric",
            value_name="score")
)

wide_df = (
    long_df
      .pivot_table(index="experiment_name",        
                   columns=["dataset", "metric"],   
                   values="score")
      .sort_index(axis=1, level=[0, 1])             
      .sort_index()                                
)


In [68]:
wide_df

dataset,ISOT,ISOT,ISOT,recovery,recovery,recovery
metric,accuracy,f1_macro,recall_fake,accuracy,f1_macro,recall_fake
experiment_name,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
few-shot,0.945412,0.925393,0.967742,0.812221,0.765437,0.556732
full graph,0.872242,0.842333,0.864823,0.768505,0.724955,0.564297
signals,0.872242,0.843876,0.858679,0.769995,0.726598,0.565809
signals critic,0.795587,0.766693,0.758833,0.788376,0.726911,0.478064
signals follow-up critic,0.839721,0.774518,0.910906,0.693492,0.672819,0.673222
zero-shot gpt-3.5,0.681395,0.665046,0.596923,0.760437,0.687113,0.420575
zero-shot gpt-4,0.753775,0.735031,0.674347,0.795827,0.725297,0.440242


In [None]:
# Add new cell with markdown:
"""# Confusion Matrices for All Experiments"""

# Add new code cell:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Get unique combinations of experiments and datasets

# dataset_name = "recovery-news-data_None"
dataset_experiments = datasets_df.loc[datasets_df['experiment_name'].isin(["zero-shot gpt-4","few-shot","full graph"])]
experiment_combos = dataset_experiments.groupby(['experiment_name', 'dataset'])


# Calculate number of rows and columns needed for subplots
n_plots = len(experiment_combos)
n_cols = min(2, n_plots)  # Max 2 columns
n_rows = (n_plots + 1) // 2  # Ceiling division for number of rows

# Create subplot figure
fig = make_subplots(
    rows=n_rows, 
    cols=n_cols,
    subplot_titles=[f"{exp}" for exp, ds in experiment_combos.groups.keys()]
)

# Create confusion matrix for each experiment
for idx, ((exp_name, dataset), group) in enumerate(experiment_combos):
    row = idx // 2 + 1
    col = idx % 2 + 1
    
    # Create confusion matrix
    cm = np.array([
        [group['true_negatives'].iloc[0], group['false_positives'].iloc[0]],
        [group['false_negatives'].iloc[0], group['true_positives'].iloc[0]]
    ])
    
    # Add heatmap
    fig.add_trace(
        go.Heatmap(
            z=cm,
            x=['Fake', 'Real'],
            y=['Fake', 'Real'],
            text=cm,
            texttemplate="%{text}",
            textfont={"size": 14},
            colorscale='Blues',
            showscale=False,
        ),
        row=row, col=col
    )

# Update layout
fig.update_layout(
    height=400 * n_rows,
    width=1000,
    title_text="Confusion Matrices for All Experiments",
    showlegend=False
)

# Update axes labels
for i in range(1, n_plots + 1):
    fig.update_xaxes(title_text="Predicted", row=(i-1)//2 + 1, col=(i-1)%2 + 1)
    fig.update_yaxes(title_text="Actual", row=(i-1)//2 + 1, col=(i-1)%2 + 1)

fig.show()

In [None]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# ------------------------------------------------------------------
# 1.  Select the three experiments and figure out datasets
exp_order = ["zero-shot gpt-4", "few-shot", "full graph"]
dataset_experiments = datasets_df.loc[datasets_df["experiment_name"].isin(exp_order)]

datasets = sorted(dataset_experiments["dataset"].unique())  # e.g. ['ISOT', 'recovery-news-data_None']
n_rows, n_cols = len(datasets), len(exp_order)               # 2 × 3
# ------------------------------------------------------------------


fig = make_subplots(
    rows=len(datasets),
    cols=len(exp_order),
    column_titles=[e.title() for e in exp_order],
    row_titles=[d for d in datasets],     
    horizontal_spacing=0.08,
    vertical_spacing=0.10
)

# 3.  Populate each cell
for r, dataset in enumerate(datasets, start=1):
    for c, exp_name in enumerate(exp_order, start=1):
        group = dataset_experiments[
            (dataset_experiments["dataset"] == dataset) &
            (dataset_experiments["experiment_name"] == exp_name)
        ]
        if group.empty:
            continue  # no run for this combination

        cm = np.array([
            [group["true_negatives"].iloc[0], group["false_positives"].iloc[0]],
            [group["false_negatives"].iloc[0], group["true_positives"].iloc[0]]
        ])

        fig.add_trace(
            go.Heatmap(
                z=cm,
                x=["Fake", "Real"],
                y=["Fake", "Real"],
                text=cm,
                texttemplate="%{text}",
                textfont={"size": 14},
                colorscale="Blues",
                showscale=False,
            ),
            row=r, col=c
        )

# 4.  Global layout tweaks
fig.update_layout(
    height=400 * n_rows,
    width=1200,
    # title_text="Confusion Matrices by Dataset (rows) and Experiment (columns)",
    showlegend=False
)

# 5.  Axis labels (one set per row / column to avoid clutter)
for c in range(1, n_cols + 1):
    fig.update_xaxes(title_text="Predicted", row=n_rows, col=c)
for r in range(1, n_rows + 1):
    fig.update_yaxes(title_text="Actual", row=r, col=1)



fig.show()

# Parwise Comparison

In [220]:
import os
os.chdir("/Users/finlayduff/Documents/BATH MSc/Dissertation")
from utils.data.results import compare_runs,load_combined_results

In [223]:
# recovery
dataset_name = "recovery-news-data_None"
zero_shot_experiment_id = "d8dd8561-68a9-4b69-8f6f-dee1c56d48dd" # zero_shot_gpt4
just_signals_experiment_id = "4c3bb455-7796-4adc-8765-45d3e87c12ec" # bulk_signals_gpt4
full_graph_experiment_id = "b7e34512-1311-4096-92db-e39395dca1a0" # full-graph

In [224]:
# isot
dataset_name = "isot_2000"
zero_shot_experiment_id = "c77035a4-d57a-4643-b71c-c051d92ea3f3" # zero_shot_gpt4
just_signals_experiment_id = "3f988d3f-6dbb-4f7c-8886-437a000c00c8" # bulk_signals_gpt4
full_graph_experiment_id = "216435aa-8a24-426e-9a32-be413eb9974a" # full-graph

In [225]:
zero_shot_df = load_combined_results(dataset_name=dataset_name, experiment_id=zero_shot_experiment_id)
bulk_signals_df = load_combined_results(dataset_name=dataset_name, experiment_id=just_signals_experiment_id)
full_graph_df = load_combined_results(dataset_name=dataset_name, experiment_id=full_graph_experiment_id)

In [190]:
delta,p = compare_runs(df_a=full_graph_df, df_b=zero_shot_df, y_col="actual", yhat_col="prediction")

In [191]:
print(f"delta {delta:.3f}   p-value {p:.3f}")

delta -0.000   p-value 0.288


# Label Flipping

In [226]:
def flatten_credibility_signals(df):
    """explode feature_selection → one row per (article, signal)"""
    recs = []
    for idx, row in df.iterrows():
        if row["captured_credibility_signals"] is not None:
            for sig, meta in row["captured_credibility_signals"].items():
                recs.append(
                    dict(
                        article_id       = row["article_id"],
                        signal           = sig,
                        sig_value        = 1 if meta["label"] == "TRUE" else 0,
                        sig_label        = meta["label"],
                        true_label       = int(row["actual"]),
                        predicted_label  = int(row["prediction"]),
                        is_correct       = int(row["actual"] == row["prediction"]),
                    )
                )
    return pd.DataFrame(recs)
def explode_critic_followup(df):
    """explode feature_selection → one row per (article, signal)"""
    recs = []
    for idx, row in df.iterrows():
        if row["captured_signals_critiques"] is not None:
            for sig, meta in row["captured_signals_critiques"].get('follow_up',{}).items():
                recs.append(
                    dict(
                        article_id       = row["article_id"],
                        signal           = sig,
                    )
                )
    return pd.DataFrame(recs)

first_pass_signals_df = flatten_credibility_signals(bulk_signals_df)
full_graph_signals_df = flatten_credibility_signals(full_graph_df)
requested_followup_df = explode_critic_followup(full_graph_df)
followup_analysis_df = explode_features(full_graph_df)

In [227]:
# ───────────────────────── signal snapshot after FIRST LLM pass ─────────────────────────
# columns: article_id • signal • sig_label  (TRUE/FALSE)
sig_base = (
    first_pass_signals_df
      .loc[:, ["article_id", "signal", "sig_label"]]
      .rename(columns={"sig_label": "base_sig_label"})
)

# ───────────────────────── SAME signals after FOLLOW-UP (if any) ─────────────────────────
# columns: article_id • signal • label
sig_follow = (
    followup_analysis_df
      .loc[:, ["article_id", "signal", "label"]]
      .rename(columns={"label": "follow_sig_label"})
)

# ───────────────────────── which (article,signal) PAIRS were flagged? ────────────────────
flagged_pairs = (
    requested_followup_df
      .assign(flagged=True)
      .drop_duplicates(["article_id", "signal"])
)

# ───────────────────────── ARTICLE-level ground truth + preds ────────────────────────────
art_base  = first_pass_signals_df .drop_duplicates("article_id")[["article_id",
                                                         "predicted_label"]]\
                             .rename(columns={"predicted_label":"base_art_pred"})
art_final = full_graph_signals_df.drop_duplicates("article_id")[["article_id",
                                                         "predicted_label",
                                                         "true_label"]]     \
                             .rename(columns={"predicted_label":"final_art_pred"})
#merge it all together
df = (
    sig_base
      .merge(sig_follow,       on=["article_id","signal"], how="left")
      .merge(flagged_pairs,    on=["article_id","signal"], how="left")
      .merge(art_base,         on="article_id",            how="left")
      .merge(art_final,        on="article_id",            how="left")
      .fillna({"flagged": False})
)

df["sig_flip"] = df["base_sig_label"] != df["follow_sig_label"]
df["art_flip"] = df["base_art_pred"]  != df["final_art_pred"]
df["final_correct"] = df["final_art_pred"] == df["true_label"]
df["base_correct"]  = df["base_art_pred"]  == df["true_label"]
df.dropna(subset=["final_correct", "base_correct"], inplace=True)


Downcasting object dtype arrays on .fillna, .ffill, .bfill is deprecated and will change in a future version. Call result.infer_objects(copy=False) instead. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`



In [228]:
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, recall_score

# ------------------------------------------------------------
# 0 · Safety: coerce Boolean cols
# ------------------------------------------------------------
bool_cols = ['base_correct', 'final_correct',
             'flagged', 'sig_flip', 'art_flip']
df[bool_cols] = df[bool_cols].astype(bool)

# ------------------------------------------------------------------
# 1 · Headline accuracy / F1 / recall  (article level)
# ------------------------------------------------------------------
art_level = (
    df.groupby('article_id')
      .agg(true_label    = ('true_label',      'first'),
           base_pred     = ('base_art_pred',   'first'),
           final_pred    = ('final_art_pred',  'first'),
           base_correct  = ('base_correct',    'first'),
           final_correct = ('final_correct',   'first'),
           any_flagged   = ('flagged',         'any'),
           any_artflip   = ('art_flip',        'any'))
      .dropna(subset=['true_label', 'base_pred', 'final_pred'])
)

POS = 0
def _scores(y_true, y_pred):
    return dict(
        acc         = accuracy_score(y_true, y_pred),
        f1_macro    = f1_score      (y_true, y_pred, average='macro'),
        recall_fake = recall_score  (y_true, y_pred, pos_label=POS)
    )

base_m  = _scores(art_level.true_label, art_level.base_pred)
final_m = _scores(art_level.true_label, art_level.final_pred)

print("dataset: ",dataset_name)

print(f"· Articles evaluated     : {len(art_level):,d}")
print(f"· Accuracy   first pass  : {base_m['acc']:6.2%}")
print(f"· Accuracy   final pass  : {final_m['acc']:6.2%}  "
      f"({(final_m['acc']-base_m['acc'])*100:+4.2f} pp)")
print(f"· Macro-F1    first pass : {base_m['f1_macro']:6.2%}")
print(f"· Macro-F1    final pass : {final_m['f1_macro']:6.2%}  "
      f"({(final_m['f1_macro']-base_m['f1_macro'])*100:+4.2f} pp)")
print(f"· Recall(Fake) first     : {base_m['recall_fake']:6.2%}")
print(f"· Recall(Fake) final     : {final_m['recall_fake']:6.2%}  "
      f"({(final_m['recall_fake']-base_m['recall_fake'])*100:+4.2f} pp)")

# ------------------------------------------------------------------
# 2 · Critic intervention
# ------------------------------------------------------------------
crit_rate = art_level.any_flagged.mean()
flip_rate = art_level.any_artflip.mean()

fixes = ((~art_level.base_correct) & art_level.final_correct).sum()
harms = ((  art_level.base_correct) & ~art_level.final_correct).sum()

print(f"· Critic requested follow-up on  {crit_rate:6.2%} of articles")
print(f"· Final label changed in         {flip_rate:6.2%} of articles")
print(f"· Articles fixed                 {fixes:4d}")
print(f"· Articles harmed                {harms:4d}")
print(f"· Net improvement                {fixes-harms:+4d}")

# ------------------------------------------------------------------
# 2-A · Follow-up load and most-flagged signal
# ------------------------------------------------------------------
followups_per_art = (
    df.loc[df.flagged]
      .groupby('article_id')
      .size()
)
avg_followups = followups_per_art.mean()

sig_to_art_cnt = (
    df.loc[df.flagged]
      .groupby('signal')['article_id']
      .nunique()
      .sort_values(ascending=False)
)
top_sig         = sig_to_art_cnt.idxmax()
top_sig_arts    = sig_to_art_cnt.max()
top_sig_pct     = top_sig_arts / len(art_level)

print(f"· Average follow-ups requested per article : {avg_followups:4.2f}")
print(f"· Most-flagged signal                     : '{top_sig}' "
      f"({top_sig_arts:,d} articles, {top_sig_pct:5.2%} of corpus)")

# ------------------------------------------------------------------
# 3 · Per-signal diagnostics
# ------------------------------------------------------------------
def _agg_per_signal(g):
    return pd.Series(dict(
        n_flagged = g.flagged.sum(),
        n_sigflip = g.sig_flip.sum(),
        fixed     = ((~g.base_correct) &
                     g.final_correct  &
                     g.sig_flip).sum(),
        harmed    = (( g.base_correct) &
                     ~g.final_correct &
                     g.sig_flip).sum(),
    ))

sig_stats = (
    df.groupby('signal', group_keys=False)
      .apply(_agg_per_signal)
      .assign(net = lambda t: t.fixed - t.harmed)
      .sort_values('net', ascending=False)
)

print("\nTop signals by net benefit:")
print(sig_stats.head(10).to_string())

dataset:  isot_2000
· Articles evaluated     : 815
· Accuracy   first pass  : 87.12%
· Accuracy   final pass  : 86.99%  (-0.12 pp)
· Macro-F1    first pass : 84.66%
· Macro-F1    final pass : 84.35%  (-0.31 pp)
· Recall(Fake) first     : 85.48%
· Recall(Fake) final     : 86.14%  (+0.66 pp)
· Critic requested follow-up on  93.62% of articles
· Final label changed in          5.52% of articles
· Articles fixed                   22
· Articles harmed                  23
· Net improvement                  -1
· Average follow-ups requested per article : 4.15
· Most-flagged signal                     : 'external_corroboration' (677 articles, 83.07% of corpus)

Top signals by net benefit:
                            n_flagged  n_sigflip  fixed  harmed  net
signal                                                              
credible_sourcing                 612         26      5       2    3
explicit_unverified_claims        418         12      2       0    2
evidence_present                  





In [229]:
# ------------------------------------------------------------------
# 4 · Accuracy when the critic stays silent
# ------------------------------------------------------------------
mask_no_follow = ~art_level['any_flagged']        # critic never flagged
silent_subset  = art_level[mask_no_follow]

if silent_subset.empty:
    print("· No articles were accepted outright by the critic.")
else:
    acc_base  = (silent_subset['true_label'] == silent_subset['base_pred']).mean()
    acc_final = (silent_subset['true_label'] == silent_subset['final_pred']).mean()

    print(f"· Articles with NO follow-up           : {len(silent_subset):,d}")
    print(f"· Accuracy 1st-pass (no follow-up)    : {acc_base:6.2%}")
    print(f"· Accuracy final (no follow-up)       : {acc_final:6.2%}")
    delta = (acc_final - acc_base) * 100
    print(f"· Change when no follow-up requested  : {delta:+4.2f} pp")

· Articles with NO follow-up           : 52
· Accuracy 1st-pass (no follow-up)    : 84.62%
· Accuracy final (no follow-up)       : 82.69%
· Change when no follow-up requested  : -1.92 pp
