# RAG Confidence Regression
Train lightweight ridge regressors that predict recall@k from retrieval-side features, with one model each for the top-10 and top-100 datasets.

## Setup & imports

In [None]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import RidgeCV
from sklearn.pipeline import Pipeline
from sklearn import metrics
from scipy.stats import pearsonr
# Use seaborn's "whitegrid" theme for the figures drawn in this notebook.
sns.set_theme(style="whitegrid")

# Seed handed to the train/test split so results are repeatable across runs.
RANDOM_STATE = 42

# Folder containing the exported training tables.
BASE = Path('results')

# Dataset label -> CSV location, one entry per retrieval depth.
DATASETS = {
    'top10': BASE.joinpath('rag_confidence_training_10.csv'),
    'top100': BASE.joinpath('rag_confidence_training_100.csv'),
}


: 

## Load datasets

In [None]:
# Read each configured CSV into memory and report its shape and label column.
dfs = {}
for name, path in DATASETS.items():
    df = dfs[name] = pd.read_csv(path)
    # The label is the first column whose name begins with 'recall_at_'.
    recall_cols = [c for c in df.columns if c.startswith('recall_at_')]
    target_col = recall_cols[0]
    print(f"{name}: {path} -> {df.shape[0]} rows, {df.shape[1]} cols, target={target_col}")


## Target distribution diagnostics
Histograms of recall@k for each dataset help highlight whether the label range is saturated (e.g., top-100).


In [None]:
# One histogram panel per dataset, laid out side by side with a shared y-axis.
n_panels = len(dfs)
fig, axes = plt.subplots(1, n_panels, figsize=(5 * n_panels, 4), sharey=True)
# A single panel comes back as a bare Axes; wrap it so zip() below still works.
axes = [axes] if n_panels == 1 else axes
for ax, (name, df) in zip(axes, dfs.items()):
    # The label is the first column whose name begins with 'recall_at_'.
    target_col = [c for c in df.columns if c.startswith('recall_at_')][0]
    sns.histplot(df[target_col], bins=20, kde=True, ax=ax)
    ax.set(title=f"{name}: {target_col}", xlabel='Recall value', ylabel='Count')
# y=1.02 lifts the overall title slightly above the panels.
fig.suptitle('Label distribution per dataset', y=1.02)
fig.tight_layout()


## Feature-target correlations
Pearson correlations between each feature and the target show which features are most strongly associated with recall and whether any feature carries usable signal.


In [None]:
# One bar chart per dataset: Pearson correlation of every feature with that
# dataset's recall label, ordered from most positive to most negative.
fig, axes = plt.subplots(len(dfs), 1, figsize=(7, 3 * len(dfs)))
if len(dfs) == 1:
    # A single subplot comes back as a bare Axes; wrap it so zip() works.
    axes = [axes]
for ax, (name, df) in zip(axes, dfs.items()):
    # The label is the first column whose name begins with 'recall_at_'.
    target_col = [c for c in df.columns if c.startswith('recall_at_')][0]
    # Everything except the identifier columns and the label is a feature.
    feature_cols = [c for c in df.columns if c not in {'query', 'query_id', target_col}]
    # Correlate each feature with the label directly rather than building the
    # full feature-by-feature matrix and throwing away all but one column.
    corr = df[feature_cols].corrwith(df[target_col])
    corr = corr.sort_values(ascending=False)
    # NOTE(review): recent seaborn releases warn when `palette` is passed
    # without `hue`; check the installed version before changing this call.
    sns.barplot(x=corr.values, y=corr.index, ax=ax, palette='viridis')
    ax.set_title(f"{name}: feature correlations vs {target_col}")
    ax.set_xlabel('Pearson correlation')
plt.tight_layout()


## Helpers: feature selection and training

In [None]:
def prepare_xy(df: pd.DataFrame):
    """Split a training table into a feature matrix and the recall label.

    The label is the first column whose name starts with ``recall_at_``.
    The ``query`` and ``query_id`` columns and the label itself are excluded
    from the features; every other column is kept in its original order.

    Args:
        df: Table holding a ``recall_at_*`` label column plus feature columns.

    Returns:
        A tuple ``(X, y, feature_cols, target_col)``: the feature DataFrame,
        the label Series, the list of feature column names, and the name of
        the label column.

    Raises:
        ValueError: If no column name starts with ``recall_at_``.
    """
    # str() keeps the prefix test from failing on non-string column labels.
    target_col = next(
        (c for c in df.columns if str(c).startswith('recall_at_')),
        None,
    )
    if target_col is None:
        raise ValueError(
            "no column starting with 'recall_at_' found; "
            f"columns are {list(df.columns)}"
        )

    # NOTE(review): if a table ever carries more than one recall_at_* column,
    # the extra ones are not dropped here and would end up in X, leaking label
    # information into the features -- confirm the CSV schema.
    drop_cols = {'query', 'query_id', target_col}
    feature_cols = [c for c in df.columns if c not in drop_cols]
    X = df[feature_cols]
    y = df[target_col]
    return X, y, feature_cols, target_col

def train_and_eval(df: pd.DataFrame, alphas=(0.01, 0.1, 1.0, 10.0, 100.0)):
    """Fit a scaled ridge regressor on an 80/20 split and score the held-out rows.

    Args:
        df: Training table in the form accepted by ``prepare_xy``.
        alphas: Candidate regularisation strengths handed to ``RidgeCV``.

    Returns:
        A tuple ``(model, metrics_dict, coef_table)``: the fitted pipeline, a
        dict with keys ``mae``, ``mse``, ``rmse``, ``r2``, ``pearson`` and
        ``best_alpha`` computed on the test split, and a DataFrame with
        ``feature`` and ``coef`` columns sorted by coefficient, largest first.
    """
    features, target, feature_cols, _target_col = prepare_xy(df)
    split = train_test_split(
        features, target, test_size=0.2, random_state=RANDOM_STATE
    )
    X_train, X_test, y_train, y_test = split

    # The scaler lives inside the pipeline, so it is fitted on the training
    # rows only; RidgeCV picks alpha by 5-fold cross-validation.
    scaler_step = ('scaler', StandardScaler())
    ridge_step = ('reg', RidgeCV(alphas=alphas, cv=5))
    model = Pipeline(steps=[scaler_step, ridge_step])
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Held-out scores.
    mae = metrics.mean_absolute_error(y_test, y_pred)
    mse = metrics.mean_squared_error(y_test, y_pred)
    r2 = metrics.r2_score(y_test, y_pred)
    pearson_r, _p_value = pearsonr(y_test, y_pred)

    # Coefficients of the fitted ridge step, paired with their feature names.
    ridge = model.named_steps['reg']
    coef_table = pd.DataFrame({'feature': feature_cols, 'coef': ridge.coef_})
    coef_table = coef_table.sort_values(by='coef', ascending=False)

    metrics_dict = {
        'mae': mae,
        'mse': mse,
        'rmse': np.sqrt(mse),
        'r2': r2,
        'pearson': pearson_r,
        'best_alpha': float(ridge.alpha_),
    }

    return model, metrics_dict, coef_table


## Train & evaluate (top-10 vs top-100)

In [None]:
# Fit one model per dataset, keep its metrics and coefficient table, and
# print/show both.
results = {}
for name, df in dfs.items():
    model, metrics_dict, coef_table = train_and_eval(df)
    results[name] = {'metrics': metrics_dict, 'coefs': coef_table}

    print(f"\n=== {name} ===")
    for metric, value in metrics_dict.items():
        print(f"{metric}: {value:.4f}")
    # train_and_eval sorts coefficients descending, so these are the ten most
    # positive ones rather than the ten largest in magnitude.
    display(coef_table.head(10))
