In [None]:
# save_gaico_like_report.py  (drop into your notebook/script and run)
import json
import os
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Use IPython's rich renderer when it can be imported (notebooks);
# otherwise fall back to a plain print so the same code runs as a script.
try:
    from IPython.display import display
except Exception:
    def display(x):
        """Plain-text stand-in for ``IPython.display.display``."""
        print(x)

_PRED_SUFFIX = '_pred'
_SUMMARY_COLUMNS = ['model', 'n', 'accuracy', 'precision', 'recall', 'f1']


def _model_name_from_column(column):
    """Map a prediction column name (e.g. ``distil_finetuned_pred``) to a model name.

    Only a *trailing* ``_pred`` is removed, so a name such as ``gpt_pred_v2_pred``
    keeps its inner ``_pred``; ``distil_`` is then rewritten to ``distil-``.
    """
    if column.endswith(_PRED_SUFFIX):
        column = column[:-len(_PRED_SUFFIX)]
    return column.replace('distil_', 'distil-')


def make_predictions_csv_and_gaico_report(testcases_df, out_dir='gaico_out', baseline_model='tfidf'):
    """
    Create GAICO-like artifacts from a testcases dataframe.

    testcases_df must contain columns:
      id, text, true_label, <model>_pred (e.g. tfidf_pred, distil_finetuned_pred, distil_base_emb_pred, gpt_pred)

    Files written into ``out_dir`` (created if missing):
      predictions.csv            long-form table, one row per (testcase, model)
      summary_by_model.csv       accuracy / precision / recall / f1 per model
      per_testcase_results.csv   pass/fail per (testcase, model)
      gaico_report.json          summary, per-testcase rows and deltas vs. the baseline

    Args:
        testcases_df: wide dataframe as described above. ``true_label`` and the
            ``*_pred`` values are cast with ``astype(int)`` for scoring.
        out_dir: directory that receives the artifacts.
        baseline_model: model the deltas are computed against. Either the
            normalized model name (``'distil-finetuned'``) or the raw column
            form (``'distil_finetuned'`` / ``'distil_finetuned_pred'``) is
            accepted. If no model matches, the deltas list is empty.

    Returns:
        Tuple ``(preds_path, report_path)``: paths of ``predictions.csv`` and
        ``gaico_report.json``.

    Raises:
        ValueError: if ``testcases_df`` has no column ending in ``_pred``.
    """
    # Validate before touching the filesystem so a bad call leaves no stray directory.
    pred_cols = [c for c in testcases_df.columns
                 if isinstance(c, str) and c.endswith(_PRED_SUFFIX)]
    if not pred_cols:
        raise ValueError(
            "testcases_df has no prediction columns; expected at least one "
            "column named '<model>_pred'"
        )

    os.makedirs(out_dir, exist_ok=True)

    # Melt to long form: one row per (testcase, model)
    mdf = testcases_df.melt(id_vars=['id', 'text', 'true_label'],
                            value_vars=pred_cols,
                            var_name='model_var', value_name='pred')
    # Normalize model names per column (suffix-only strip), then map onto the rows.
    model_names = {col: _model_name_from_column(col) for col in pred_cols}
    mdf['model'] = mdf['model_var'].map(model_names)
    mdf = mdf[['id', 'text', 'true_label', 'model', 'pred']]

    # Save predictions file
    preds_path = os.path.join(out_dir, 'predictions.csv')
    mdf.to_csv(preds_path, index=False)

    # Compute per-model aggregated metrics (on the small testcase set).
    # precision/recall/f1 are called with scikit-learn's default averaging;
    # zero_division=0 reports 0 instead of warning when a denominator is empty.
    summary = []
    for model, subset in mdf.groupby('model', sort=False):
        y_true = subset['true_label'].astype(int).values
        y_pred = subset['pred'].astype(int).values
        summary.append({
            'model': model,
            'n': int(len(y_true)),
            'accuracy': float(accuracy_score(y_true, y_pred)),
            'precision': float(precision_score(y_true, y_pred, zero_division=0)),
            'recall': float(recall_score(y_true, y_pred, zero_division=0)),
            'f1': float(f1_score(y_true, y_pred, zero_division=0)),
        })
    # Explicit columns keep sort_values working when there are zero testcase rows.
    summary_df = pd.DataFrame(summary, columns=_SUMMARY_COLUMNS).sort_values('model')

    # Per-testcase pass/fail table (for each model)
    passed = mdf['pred'].astype(int) == mdf['true_label'].astype(int)
    ptab = mdf.assign(**{'pass': passed})
    ptab = ptab[['id', 'text', 'model', 'pred', 'true_label', 'pass']]

    # Resolve the baseline: exact match first, then the normalized form so that
    # callers may pass the raw column spelling without silently losing the deltas.
    known_models = set(summary_df['model'])
    baseline_name = baseline_model
    if baseline_name not in known_models and isinstance(baseline_model, str):
        normalized = _model_name_from_column(baseline_model)
        if normalized in known_models:
            baseline_name = normalized

    # Baseline deltas
    baseline = summary_df[summary_df['model'] == baseline_name]
    if baseline.empty:
        deltas = []
    else:
        base = baseline.iloc[0]
        deltas = [
            {
                'model': row['model'],
                'accuracy_delta': float(row['accuracy'] - base['accuracy']),
                'f1_delta': float(row['f1'] - base['f1']),
            }
            for _, row in summary_df.iterrows()
            if row['model'] != baseline_name
        ]

    # Write gaico_report.json (simple)
    report = {
        'baseline_model': baseline_name,
        'summary': summary_df.to_dict(orient='records'),
        'per_testcase': ptab.to_dict(orient='records'),
        'deltas_vs_baseline': deltas,
    }
    report_path = os.path.join(out_dir, 'gaico_report.json')
    with open(report_path, 'w', encoding='utf-8') as f:
        json.dump(report, f, indent=2)

    # Save CSVs too
    summary_df.to_csv(os.path.join(out_dir, 'summary_by_model.csv'), index=False)
    ptab.to_csv(os.path.join(out_dir, 'per_testcase_results.csv'), index=False)

    print("Saved GAICO-like artifacts to", out_dir)
    return preds_path, report_path


# ----------------------------------------------------
# Example / demo: recreate the small testcases_df and run
# ----------------------------------------------------
if __name__ == "__main__":
    # Single source of truth for where the demo writes and then re-reads its
    # artifacts, so changing the directory cannot leave the reads pointing at
    # stale files from an earlier run.
    out_dir = 'gaico_out'
    baseline_model = 'tfidf'

    # Recreate your testcases_df manually (from earlier output)
    testcases_df = pd.DataFrame([
        {'id':'TC1','text':'A wonderful, charming movie — I loved it.','true_label':1,
         'tfidf_pred':1,'distil_finetuned_pred':1,'distil_base_emb_pred':1,'gpt_pred':1},
        {'id':'TC2','text':'I wanted to like this film, but it was dull and disappointing.','true_label':0,
         'tfidf_pred':0,'distil_finetuned_pred':0,'distil_base_emb_pred':0,'gpt_pred':0},
        {'id':'TC3','text':'If you enjoy painfully slow pacing and wooden acting, this is your masterpiece.','true_label':0,
         'tfidf_pred':0,'distil_finetuned_pred':0,'distil_base_emb_pred':1,'gpt_pred':1}
    ])

    # Call GAICO generator with this mock data
    preds_path, report_path = make_predictions_csv_and_gaico_report(
        testcases_df, out_dir=out_dir, baseline_model=baseline_model)

    # Read the saved summary and per-testcase tables for inline display
    summary_df = pd.read_csv(os.path.join(out_dir, 'summary_by_model.csv'))
    per_testcase_df = pd.read_csv(os.path.join(out_dir, 'per_testcase_results.csv'))

    print("\n=== GAICO-like Summary (Aggregated Metrics) ===")
    display(summary_df)

    print("\n=== GAICO-like Per-Testcase Results (Pass/Fail) ===")
    display(per_testcase_df)

    # Re-open the report through the path the generator returned rather than
    # rebuilding it by hand.
    with open(report_path, 'r', encoding='utf-8') as f:
        report = json.load(f)

    print("\n=== Deltas vs Baseline (TF-IDF) ===")
    display(pd.DataFrame(report.get('deltas_vs_baseline', [])))

    print("\nReport files saved to:", preds_path, "and", report_path)


Saved GAICO-like artifacts to gaico_out

=== GAICO-like Summary (Aggregated Metrics) ===


Unnamed: 0,model,n,accuracy,precision,recall,f1
0,distil-base_emb,3,0.666667,0.5,1.0,0.666667
1,distil-finetuned,3,1.0,1.0,1.0,1.0
2,gpt,3,0.666667,0.5,1.0,0.666667
3,tfidf,3,1.0,1.0,1.0,1.0



=== GAICO-like Per-Testcase Results (Pass/Fail) ===


Unnamed: 0,id,text,model,pred,true_label,pass
0,TC1,"A wonderful, charming movie — I loved it.",tfidf,1,1,True
1,TC2,"I wanted to like this film, but it was dull an...",tfidf,0,0,True
2,TC3,If you enjoy painfully slow pacing and wooden ...,tfidf,0,0,True
3,TC1,"A wonderful, charming movie — I loved it.",distil-finetuned,1,1,True
4,TC2,"I wanted to like this film, but it was dull an...",distil-finetuned,0,0,True
5,TC3,If you enjoy painfully slow pacing and wooden ...,distil-finetuned,0,0,True
6,TC1,"A wonderful, charming movie — I loved it.",distil-base_emb,1,1,True
7,TC2,"I wanted to like this film, but it was dull an...",distil-base_emb,0,0,True
8,TC3,If you enjoy painfully slow pacing and wooden ...,distil-base_emb,1,0,False
9,TC1,"A wonderful, charming movie — I loved it.",gpt,1,1,True



=== Deltas vs Baseline (TF-IDF) ===


Unnamed: 0,model,accuracy_delta,f1_delta
0,distil-base_emb,-0.333333,-0.333333
1,distil-finetuned,0.0,0.0
2,gpt,-0.333333,-0.333333



Report files saved to: gaico_out/predictions.csv and gaico_out/gaico_report.json
