In [None]:
import pandas as pd
# Load both corpora up front, then print the same preview for each:
# a banner, the first three rows, the shape and the column list.
civil_df = pd.read_csv('civil_comments.csv')
toxic_df = pd.read_csv('toxic_comments.csv')

for banner, frame in (
    ("🟦 Civil Comments Dataset (Unintended Bias):", civil_df),
    ("\n🟥 Toxic Comments Dataset (Toxic Challenge):", toxic_df),
):
    print(banner)
    display(frame.head(3))
    print(f"Shape: {frame.shape}")
    print(f"Columns: {frame.columns.tolist()}")


# Importing and Loading Data
1 --> Civil comments -> Source: Jigsaw. Purpose: bias and fairness analysis. Text comments from a real platform.
2 --> Toxic comments -> Source: Kaggle. Purpose: training set.

In [None]:
import pandas as pd
# Quick look at the toxic-comments training data: size, columns, missing
# values, and how the six labels are distributed.
toxic_df = pd.read_csv("toxic_comments.csv")
print("Dataset shape:", toxic_df.shape)
print("\n Column names:", toxic_df.columns.tolist())
# A bare `toxic_df.head()` in the middle of a cell is evaluated and thrown
# away (only a cell's last expression is rendered), so display it explicitly.
display(toxic_df.head())
print("\nMissing values per column:")
print(toxic_df.isnull().sum())
label_cols = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
print("\n Label distribution:")
print(toxic_df[label_cols].sum().sort_values(ascending=False))
# Number of label columns set on each comment (0 means no label is set).
toxic_df['num_labels'] = toxic_df[label_cols].sum(axis=1)
print("\n Comments with multiple labels:")
print(toxic_df['num_labels'].value_counts().sort_index())


In [None]:
import pandas as pd
# Size, schema and missing-value overview of the Civil Comments data.
civil_df = pd.read_csv("civil_comments.csv")
print("Dataset shape:", civil_df.shape)
print("\nColumn names:", list(civil_df.columns))

missing_per_column = civil_df.isna().sum()
print("\n❓ Missing values per column:")
print(missing_per_column.sort_values(ascending=False))

# Last expression of the cell, so the notebook renders it as a table.
civil_df.head()

# Model Training

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer #Converting text into numbers
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score
# Binary task: predict the `toxic` flag from the raw comment text.
df = pd.read_csv("toxic_comments.csv")
X, y = df['comment_text'], df['toxic']

# 80/20 split with a fixed seed so later cells see the same test rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# The vocabulary is learned from the training split only, then reused for test.
vectorizer = TfidfVectorizer(stop_words='english', max_features=10000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

model = LogisticRegression().fit(X_train_tfidf, y_train)

# Column 1 of predict_proba is the probability of the positive (toxic) class.
y_prob = model.predict_proba(X_test_tfidf)[:, 1]
y_pred = model.predict(X_test_tfidf)

print(classification_report(y_test, y_pred))
print("AUC:", roc_auc_score(y_test, y_prob))


# A short Example

In [None]:
# comments = [
#     "You are so dumb and annoying",               # likely toxic
#     "Thank you for your help, I appreciate it",   # non-toxic
#     "What a stupid idea, no one cares"            # likely toxic
# ]
# from sklearn.feature_extraction.text import TfidfVectorizer
# import pandas as pd
# vectorizer = TfidfVectorizer(stop_words='english')
# X = vectorizer.fit_transform(comments)
# tfidf_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())
# print(tfidf_df.round(2))


# Simulating Low-Confidence Predictions

In [None]:
import numpy as np
# Predictions whose toxic probability falls inside [0.4, 0.6] are treated as
# "low confidence" and set aside together with their true labels.
lower_thresh, upper_thresh = 0.4, 0.6
low_conf_mask = np.logical_and(y_prob >= lower_thresh, y_prob <= upper_thresh)

X_uncertain, y_uncertain_true, y_uncertain_prob = (
    X_test[low_conf_mask],
    y_test[low_conf_mask],
    y_prob[low_conf_mask],
)

n_uncertain = len(X_uncertain)
print(f"Total low-confidence predictions: {n_uncertain}")
# The denominator is the held-out test split, not the full dataset.
print(f"Percentage of test set: {n_uncertain / len(X_test):.2%}")


# How uncertain is the Model? 

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Illustrative histogram built from SIMULATED probabilities (not the model's
# real y_prob): a low-score cluster, a high-score cluster and a narrow bump
# around 0.5. The draws keep their original order so the seeded random
# stream, and therefore the figure, is unchanged.
np.random.seed(42)
low_scores = np.random.beta(2, 8, size=5000)
high_scores = np.random.beta(8, 2, size=4000)
mid_scores = np.random.normal(0.5, 0.05, size=500)
y_prob_simulated = np.clip(
    np.concatenate([low_scores, high_scores, mid_scores]), 0, 1
)

conf_fig, conf_ax = plt.subplots(figsize=(8, 5))
conf_ax.hist(y_prob_simulated, bins=50, color='skyblue', edgecolor='black')
for threshold, caption in (
    (0.4, 'Lower Threshold (0.4)'),
    (0.6, 'Upper Threshold (0.6)'),
):
    conf_ax.axvline(threshold, color='red', linestyle='--', label=caption)
conf_ax.set_title('Model Prediction Confidence Distribution')
conf_ax.set_xlabel('Predicted Probability of Toxicity')
conf_ax.set_ylabel('Number of Comments')
conf_ax.legend()
conf_ax.grid(True)
plt.tight_layout()
plt.show()


# Simulate Human Feedback & Retrain the Model

In [None]:
# Simulate "human feedback" by revealing the true labels of the low-confidence
# test comments and appending them to the training data.
X_uncertain_cleaned = X_uncertain.astype(str)
X_uncertain_tfidf = vectorizer.transform(X_uncertain_cleaned)
print("Uncertain shape:", X_uncertain_tfidf.shape)
print("Vectorizer type:", type(vectorizer))
print("Vocabulary size:", len(vectorizer.vocabulary_))
from scipy.sparse import vstack
X_augmented = vstack([X_train_tfidf, X_uncertain_tfidf])
# ignore_index gives the stacked labels one clean 0..n-1 index, matching the
# row order of X_augmented.
y_augmented = pd.concat(
    [y_train.reset_index(drop=True), y_uncertain_true.reset_index(drop=True)],
    ignore_index=True,
)
retrained_model = LogisticRegression()
retrained_model.fit(X_augmented, y_augmented)
y_pred_new = retrained_model.predict(X_test_tfidf)
y_prob_new = retrained_model.predict_proba(X_test_tfidf)[:, 1]
print("\n--- After Human Feedback Retraining ---")
# CAUTION: this score is optimistic. The uncertain test rows were just used
# for training, so the model is partly graded on data it has already seen.
print(classification_report(y_test, y_pred_new))
print("AUC:", roc_auc_score(y_test, y_prob_new))

# Leakage-free comparison: score both models only on the test rows that were
# NOT fed back into training.
held_out = ~low_conf_mask
print("\n--- Held-out test rows only (not used for retraining) ---")
print("Before feedback:")
print(classification_report(y_test[held_out], y_pred[held_out]))
print("AUC:", roc_auc_score(y_test[held_out], y_prob[held_out]))
print("After feedback:")
print(classification_report(y_test[held_out], y_pred_new[held_out]))
print("AUC:", roc_auc_score(y_test[held_out], y_prob_new[held_out]))



# Performance Comparison after Human (y_test) Feedback

In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Before/after bar chart for the simulated-feedback experiment.
# The scores are computed from the predictions made in the cells above rather
# than typed in by hand, so the figure cannot drift from the actual model.
from sklearn.metrics import precision_score, recall_score, f1_score


def _toxic_class_scores(y_true, y_hat, y_score):
    """Return [precision, recall, F1, ROC-AUC] for the positive (toxic) class."""
    return [
        precision_score(y_true, y_hat),
        recall_score(y_true, y_hat),
        f1_score(y_true, y_hat),
        roc_auc_score(y_true, y_score),
    ]


metrics = ['Precision', 'Recall', 'F1-score', 'AUC']
before = _toxic_class_scores(y_test, y_pred, y_prob)
after = _toxic_class_scores(y_test, y_pred_new, y_prob_new)
x = np.arange(len(metrics))
width = 0.35
fig, ax = plt.subplots(figsize=(8, 5))
bars1 = ax.bar(x - width/2, before, width, label='Before Feedback', color='lightcoral')
bars2 = ax.bar(x + width/2, after, width, label='After Feedback', color='mediumseagreen')
ax.set_ylabel('Score')
ax.set_title('Model Performance: Before vs After Human Feedback')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.set_ylim(0, 1.1)
ax.legend()
ax.grid(True, axis='y', linestyle='--', alpha=0.7)
for bar in (*bars1, *bars2):
    height = bar.get_height()
    ax.annotate(f'{height:.2f}',
                xy=(bar.get_x() + bar.get_width() / 2, height),
                xytext=(0, 3),  # Offset label above bar
                textcoords="offset points",
                ha='center', va='bottom')

plt.tight_layout()
plt.show()


In [None]:
# Write a reproducible random sample of at most 10,000 Civil Comments rows.
df = pd.read_csv("civil_comments.csv")
# DataFrame.sample raises ValueError when n exceeds the number of rows
# (without replacement), so cap the request at the file's size.
civil_df = df.sample(n=min(10000, len(df)), random_state=42).copy()

civil_df.to_csv("civil_comments_sample.csv", index=False)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import numpy as np
# Score every Civil Comments row with the baseline model and export the ones
# whose toxic probability lies in the uncertain band [0.4, 0.6].
civil_df = pd.read_csv("civil_comments.csv")
# fillna must run BEFORE astype(str): astype(str) turns NaN into the literal
# text "nan", after which there is nothing left for fillna to replace.
civil_df['comment_text'] = civil_df['comment_text'].fillna("").astype(str)

# Reuse the vectorizer fitted on the toxic training split (transform, not fit).
X_civil = vectorizer.transform(civil_df['comment_text'])

civil_df['model_prob'] = model.predict_proba(X_civil)[:, 1]
civil_df['model_pred'] = model.predict(X_civil)

low_thresh = 0.4
high_thresh = 0.6
uncertain_mask = (civil_df['model_prob'] >= low_thresh) & (civil_df['model_prob'] <= high_thresh)
civil_uncertain = civil_df[uncertain_mask].copy()

civil_uncertain[['comment_text', 'model_prob']].to_csv("uncertain_civil_comments.csv", index=False)

print(f"✅ Extracted {len(civil_uncertain)} uncertain comments.")


In [None]:
# Pick short, readable uncertain comments to show to human raters.
civil_uncertain = pd.read_csv("uncertain_civil_comments.csv")
# .str.len() gives NaN for a missing comment instead of raising like
# apply(len) does; NaN then fails the range filter below.
civil_uncertain['word_count'] = civil_uncertain['comment_text'].str.split().str.len()
# .copy() so the new column is added to an independent frame, not to a slice
# of civil_uncertain (avoids pandas' SettingWithCopyWarning).
civil_filtered = civil_uncertain[civil_uncertain['word_count'].between(10, 30)].copy()

# Model prediction column (0 or 1) derived from the probability at the 0.5 cut-off
civil_filtered['model_label'] = (civil_filtered['model_prob'] >= 0.5).astype(int)

# Sample up to 50 comments (seeded); capped so a small pool does not raise.
# NOTE(review): the original comment here said 30 while the code drew 50 —
# confirm which number matches the questions actually put in the form.
form_sample_size = min(50, len(civil_filtered))
civil_form_sample = (
    civil_filtered[['comment_text', 'model_prob', 'model_label']]
    .sample(n=form_sample_size, random_state=42)
    .reset_index(drop=True)
)

# Display for form use
for i, row in civil_form_sample.iterrows():
    label = "Toxic" if row['model_label'] == 1 else "Non-Toxic"
    print(f"\nComment {i+1}:")
    print(f"Text: {row['comment_text']}")
    print(f"Model Prediction: {label}")
    print(f"Model Confidence: {row['model_prob']:.4f}")


In [None]:
# Identity subgroups: how often each group is flagged, and the model's mean
# toxic probability on the comments flagged for that group.
civil_df = pd.read_csv("civil_comments.csv")
civil_df['comment_text'] = civil_df['comment_text'].astype(str)
civil_df['word_count'] = civil_df['comment_text'].str.split().map(len)

X_civil = vectorizer.transform(civil_df['comment_text'])
civil_df['model_prob'] = model.predict_proba(X_civil)[:, 1]
civil_df['model_pred'] = model.predict(X_civil)

identity_columns = [
    'male', 'female', 'transgender', 'heterosexual',
    'homosexual_gay_or_lesbian', 'bisexual', 'christian', 'jewish', 'muslim',
    'black', 'white', 'asian', 'latino', 'psychiatric_or_mental_illness', 'hindu'
]

# Column-wise sum of the identity columns, largest first.
group_counts = civil_df[identity_columns].sum().sort_values(ascending=False)
print("\n Subgroup Mentions:\n")
print(group_counts)

print("\n Avg predicted toxicity probability by subgroup:\n")
for col in identity_columns:
    # NOTE(review): only rows where the column equals exactly 1 are used; if
    # these columns hold fractional scores, a threshold may be intended — confirm.
    avg_prob = civil_df.loc[civil_df[col] == 1, 'model_prob'].mean()
    print(f"{col:<30}: {avg_prob:.4f}")


## Basic Analysis of Human Responses

In [None]:
import pandas as pd

# Load the exported human-feedback responses.
file_path = "human_feedback_responses.csv"
df_feedback = pd.read_csv(file_path)

# info() prints its summary and returns None. Putting it in a tuple with
# head() made the cell output `(None, <plain-text frame>)`; calling it on its
# own and leaving head() as the last expression renders the table properly.
df_feedback.info()
df_feedback.head()


In [None]:
# Drop the last column (positionally) and the timestamp; what remains is one
# column per question.
df_feedback_cleaned = df_feedback.iloc[:, :-1].drop(columns=["Timestamp"])

# Agree -> 1, Disagree -> 0, Not Sure -> None (treated as missing).
response_map = {"Agree": 1, "Disagree": 0, "Not Sure": None}


def _encode_response(cell):
    """Map a text answer through response_map; pass non-strings through untouched."""
    if isinstance(cell, str):
        # Any string that is not a key of response_map also becomes None.
        return response_map.get(cell.strip())
    return cell


df_feedback_cleaned_numeric = df_feedback_cleaned.applymap(_encode_response)

# Preview cleaned data
df_feedback_cleaned_numeric.head()


In [None]:
# Agreement = share of valid (non-missing) answers that are "Agree" (coded 1).
agree_mask = df_feedback_cleaned_numeric.eq(1)
valid_per_question = df_feedback_cleaned_numeric.count()

# Pooled over every question and respondent.
total_valid_responses = valid_per_question.sum()
total_agree = agree_mask.sum().sum()
overall_agreement_rate = total_agree / total_valid_responses

# Same ratio, one value per question column.
per_question_agreement = agree_mask.sum() / valid_per_question

overall_agreement_rate, per_question_agreement.sort_values(ascending=True)


In [None]:
# # Step 1: Load the uncertain_civil_comments.csv file
# uncertain_path = "/mnt/data/uncertain_civil_comments.csv"
# civil_uncertain = pd.read_csv(uncertain_path)

# # Step 2: Filter comments between 10–30 words
# civil_uncertain['word_count'] = civil_uncertain['comment_text'].str.split().apply(len)
# civil_filtered = civil_uncertain[(civil_uncertain['word_count'] >= 10) & (civil_uncertain['word_count'] <= 30)]

# # Step 3: Add model label based on 0.5 threshold
# civil_filtered['model_label'] = (civil_filtered['model_prob'] >= 0.5).astype(int)

# # Step 4: Sample the same 30 comments (random_state=42)
# civil_form_sample = civil_filtered[['comment_text', 'model_prob', 'model_label']].sample(n=30, random_state=42).reset_index(drop=True)

# # Display to confirm
# civil_form_sample.head()


In [None]:
# Reload the raw form export so this cell does not depend on earlier cleaning.
feedback_path = "human_feedback_responses.csv"
df_feedback = pd.read_csv(feedback_path)

# Encode answers: Agree -> 1, Disagree -> 0, Not Sure / unknown text -> missing.
df_feedback_cleaned = df_feedback.drop(columns=["Timestamp"])
response_map = {"Agree": 1, "Disagree": 0, "Not Sure": None}
df_feedback_numeric = df_feedback_cleaned.applymap(lambda x: response_map.get(x.strip()) if isinstance(x, str) else x)

# Transpose so each row is one question (comment) and each column one respondent.
df_feedback_transposed = df_feedback_numeric.T
df_feedback_transposed.columns = [f"R{i+1}" for i in range(df_feedback_transposed.shape[1])]
df_feedback_transposed = df_feedback_transposed.reset_index(drop=True)
# Force a numeric dtype so the row mean skips missing answers reliably.
df_feedback_transposed = df_feedback_transposed.apply(pd.to_numeric, errors="coerce")

# Share of respondents who agreed with the model (NaN when nobody gave a
# usable answer).
df_feedback_transposed["human_agreement_score"] = df_feedback_transposed.mean(axis=1)
# This is an agree/disagree flag, NOT a toxicity label.
df_feedback_transposed["majority_agrees"] = df_feedback_transposed["human_agreement_score"] >= 0.5

# Attach the feedback to the sampled comments by row position.
# NOTE(review): assumes the form's question order matches civil_form_sample's
# row order — confirm against the form.
corrected_df = civil_form_sample.copy()
agreement = df_feedback_transposed["human_agreement_score"].reindex(corrected_df.index)

# Turn "agrees with the model" into an actual toxicity label:
#   majority agrees (score >= 0.5) -> keep the model's label
#   majority disagrees             -> flip the model's label
#   no usable answers              -> keep the model's label
# Previously the agree flag itself was stored as human_label and compared with
# model_label, which marked every agreed-upon non-toxic comment as "changed".
keep_model_label = agreement.isna() | (agreement >= 0.5)
corrected_df["human_label"] = (
    corrected_df["model_label"]
    .where(keep_model_label, 1 - corrected_df["model_label"])
    .astype(int)
)
corrected_df["agreement_score"] = agreement

# True exactly where the human majority overturned the model.
corrected_df["label_changed"] = corrected_df["model_label"] != corrected_df["human_label"]

print(corrected_df.head(10))  # Or use corrected_df to access the full DataFrame

# Save corrected DataFrame to CSV
# corrected_df.to_csv("corrected_human_feedback.csv", index=False)

# print("✅ File saved as 'corrected_human_feedback.csv'")

In [None]:
# Load the uncertain civil comments (previously uploaded)
# Load the uncertain Civil comments; missing text becomes "" (not "nan").
uncertain_df = pd.read_csv("uncertain_civil_comments.csv")
uncertain_df['comment_text'] = uncertain_df['comment_text'].fillna("").astype(str)

# Candidates: rows the model labels non-toxic at the 0.5 cut-off.
non_toxic_candidates = uncertain_df.copy()
non_toxic_candidates['model_label'] = (non_toxic_candidates['model_prob'] >= 0.5).astype(int)
non_toxic_candidates = non_toxic_candidates[non_toxic_candidates['model_label'] == 0]

# Remove rows that already exist in corrected_df. The .copy() makes the
# filtered frame independent, so adding a column below is a clean assignment.
existing_texts = corrected_df['comment_text'].tolist()
non_toxic_candidates = non_toxic_candidates[
    ~non_toxic_candidates['comment_text'].isin(existing_texts)
].copy()

# Filter by readable length (5 to 35 words).
non_toxic_candidates['word_count'] = non_toxic_candidates['comment_text'].str.split().str.len()
non_toxic_filtered = non_toxic_candidates[non_toxic_candidates['word_count'].between(5, 35)]

# Sample up to 10 and assign label 0; capped so a small pool does not raise.
n_inject = min(10, len(non_toxic_filtered))
non_toxic_sample = (
    non_toxic_filtered[['comment_text', 'model_prob', 'model_label']]
    .sample(n=n_inject, random_state=42)
    .copy()
)
non_toxic_sample['human_label'] = None
non_toxic_sample['final_label'] = 0  # Injected manually

# Add to corrected_df
corrected_augmented = pd.concat([corrected_df, non_toxic_sample], ignore_index=True)

# Check label distribution. dropna=False keeps rows with no final_label in
# the count instead of hiding them.
corrected_augmented['final_label'].value_counts(dropna=False)


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Load the feedback CSV
df = pd.read_csv("human_feedback_responses.csv")

# `agree_counts` was used below but never defined (its assignment was
# commented out), so this cell raised NameError. Build it by pooling every
# text answer from all question columns and counting Agree vs Disagree;
# any other answer (e.g. "Not Sure") and blanks are left out of the pie.
all_answers = pd.Series(
    df.drop(columns=["Timestamp"], errors="ignore").to_numpy(dtype=object).ravel()
)
all_answers = all_answers[all_answers.map(lambda v: isinstance(v, str))].str.strip()
agree_counts = all_answers.value_counts().reindex(['Agree', 'Disagree'], fill_value=0)

# Plot
colors = ['lightgreen', 'salmon']
labels = ['Agreed with Model', 'Disagreed with Model']

if agree_counts.sum() == 0:
    # A pie of all zeros cannot be drawn meaningfully.
    print("No 'Agree'/'Disagree' responses found; nothing to plot.")
else:
    plt.figure(figsize=(6, 6))
    plt.pie(agree_counts, labels=labels, autopct='%1.1f%%', startangle=140, colors=colors)
    plt.title("Participant Agreement with Model Predictions")
    plt.axis('equal')  # Equal aspect ratio ensures pie is circular
    plt.tight_layout()
    plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Grouped bar chart of four metrics, before vs after feedback.
# NOTE(review): these scores are typed in by hand, not computed in this cell —
# confirm they match the run they are meant to report.
metrics = ['Precision', 'Recall', 'F1-score', 'AUC']
before = [0.90, 0.61, 0.73, 0.9660]
after = [0.75, 1.00, 0.86, 0.25]

x = np.arange(len(metrics))
width = 0.35
half = width / 2

fig, ax = plt.subplots(figsize=(8, 5))
bars1 = ax.bar(x - half, before, width, label='Before Feedback', color='steelblue')
bars2 = ax.bar(x + half, after, width, label='After Feedback', color='darkorange')

# Axis labels, ticks and legend
ax.set_ylabel('Score')
ax.set_title('Model Performance: Before vs After Human Feedback')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.set_ylim(0, 1.1)
ax.legend()
ax.grid(True, axis='y', linestyle='--', alpha=0.6)

# Print each bar's value just above it, "before" bars first.
for container in (bars1, bars2):
    for bar in container:
        height = bar.get_height()
        ax.annotate(
            f'{height:.2f}',
            xy=(bar.get_x() + bar.get_width() / 2, height),
            xytext=(0, 3),
            textcoords="offset points",
            ha='center', va='bottom',
        )

plt.tight_layout()
plt.show()


In [None]:
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from scipy.sparse import vstack

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    classification_report, roc_auc_score, roc_curve, auc,
    precision_recall_curve, average_precision_score,
    confusion_matrix, precision_score, recall_score, f1_score
)

RANDOM_STATE = 42
LOW_THRESH   = 0.40    # uncertainty band lower
HIGH_THRESH  = 0.60    # uncertainty band upper
MAX_FEATS    = 10_000  # TF–IDF vocab size

np.random.seed(RANDOM_STATE)

# =========================
# 1) Load & Prepare Baseline Data (Jigsaw Toxic)
# =========================
toxic_df = pd.read_csv("toxic_comments.csv")
# Expecting columns: 'comment_text' + multilabels inc. 'toxic'
assert 'comment_text' in toxic_df.columns, "Missing 'comment_text' in toxic_comments.csv"
assert 'toxic' in toxic_df.columns, "Missing 'toxic' column in toxic_comments.csv"
# fillna must come first: astype(str) would turn NaN into the text "nan",
# leaving nothing for fillna to replace.
toxic_df['comment_text'] = toxic_df['comment_text'].fillna("").astype(str)

X = toxic_df['comment_text']
y = toxic_df['toxic'].astype(int)

# Stratified 80/20 split keeps the toxic rate the same in train and test.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, random_state=RANDOM_STATE, stratify=y
)

# Word-level TF–IDF over unigrams and bigrams (no character n-grams).
# This one fitted vectorizer is reused for every later transform.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=MAX_FEATS,
    ngram_range=(1,2)
)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf  = vectorizer.transform(X_test)

# Baseline model
baseline = LogisticRegression(max_iter=1000)
baseline.fit(X_train_tfidf, y_train)

# Baseline predictions; column 1 of predict_proba is P(toxic).
y_pred_base = baseline.predict(X_test_tfidf)
y_prob_base = baseline.predict_proba(X_test_tfidf)[:, 1]

print("=== Baseline (no HIL) ===")
print(classification_report(y_test, y_pred_base, digits=3))
print("AUC:", roc_auc_score(y_test, y_prob_base))

# -------------------------
# Baseline plots (saved to PNGs)
# -------------------------
def plot_baseline_curves(y_true, y_prob, y_pred, prefix="fig_4_1"):
    """Draw, save (300 dpi PNG) and show four baseline diagnostics.

    Figures written, in order: ``{prefix}a_baseline_roc.png``,
    ``{prefix}b_baseline_pr.png``, ``{prefix}c_baseline_confusion.png`` and
    ``{prefix}d_baseline_metrics.png``.

    Parameters
    ----------
    y_true : true binary labels (0 = non-toxic, 1 = toxic).
    y_prob : predicted probability of the toxic class.
    y_pred : hard 0/1 predictions.
    prefix : file-name prefix for the saved figures.
    """
    # ROC
    fpr, tpr, _ = roc_curve(y_true, y_prob)
    roc_auc = auc(fpr, tpr)
    plt.figure(figsize=(6,5))
    plt.plot(fpr, tpr, lw=2, label=f"AUC = {roc_auc:.3f}")
    plt.plot([0,1],[0,1],'--', lw=1)  # chance diagonal
    plt.xlabel('False Positive Rate'); plt.ylabel('True Positive Rate')
    plt.title('Baseline ROC Curve (LogReg + TF–IDF)')
    plt.legend(loc='lower right'); plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout(); plt.savefig(f"{prefix}a_baseline_roc.png", dpi=300); plt.show()

    # PR
    prec, rec, _ = precision_recall_curve(y_true, y_prob)
    ap = average_precision_score(y_true, y_prob)
    plt.figure(figsize=(6,5))
    plt.plot(rec, prec, lw=2, label=f"AP = {ap:.3f}")
    plt.xlabel('Recall'); plt.ylabel('Precision')
    plt.title('Baseline Precision–Recall Curve')
    plt.legend(loc='lower left'); plt.grid(True, linestyle='--', alpha=0.5)
    plt.tight_layout(); plt.savefig(f"{prefix}b_baseline_pr.png", dpi=300); plt.show()

    # Confusion. labels=[0, 1] pins the matrix to 2x2 so it always lines up
    # with the two tick labels, even if one class is absent from the inputs.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(5,5))
    plt.imshow(cm, cmap='Greys')
    plt.title('Baseline Confusion Matrix (thr=0.5)')
    plt.colorbar(fraction=0.046, pad=0.04)
    tick_labels = ['Non-Toxic (0)', 'Toxic (1)']
    plt.xticks([0,1], tick_labels, rotation=20); plt.yticks([0,1], tick_labels)
    for (i,j), v in np.ndenumerate(cm):
        # imshow puts columns on x and rows on y, hence (j, i).
        plt.text(j, i, str(v), ha='center', va='center', fontsize=12)
    plt.xlabel('Predicted'); plt.ylabel('Actual')
    plt.tight_layout(); plt.savefig(f"{prefix}c_baseline_confusion.png", dpi=300); plt.show()

    # Metric bars. zero_division=0 reports 0 (without a warning) when a score
    # is undefined, e.g. precision when nothing is predicted toxic.
    P = precision_score(y_true, y_pred, zero_division=0)
    R = recall_score(y_true, y_pred, zero_division=0)
    F = f1_score(y_true, y_pred, zero_division=0)
    A = roc_auc_score(y_true, y_prob)
    metrics = ['Precision','Recall','F1','AUC']; vals=[P,R,F,A]
    plt.figure(figsize=(7,5))
    bars = plt.bar(range(len(metrics)), vals)
    for b,v in zip(bars, vals):
        plt.text(b.get_x()+b.get_width()/2, v+0.02, f"{v:.2f}", ha='center')
    plt.xticks(range(len(metrics)), metrics); plt.ylim(0,1.05)
    plt.ylabel('Score'); plt.title('Baseline Performance')
    plt.grid(True, axis='y', linestyle='--', alpha=0.5)
    plt.tight_layout(); plt.savefig(f"{prefix}d_baseline_metrics.png", dpi=300); plt.show()

plot_baseline_curves(y_test, y_prob_base, y_pred_base)

# =========================
# 2) Score Civil Comments; Extract Uncertain Band
# =========================
civil_df = pd.read_csv("civil_comments.csv")
assert 'comment_text' in civil_df.columns, "Missing 'comment_text' in civil_comments.csv"
# fillna before astype(str); the reverse order turns NaN into the text "nan".
civil_df['comment_text'] = civil_df['comment_text'].fillna("").astype(str)

# Score with the baseline model, reusing the vectorizer fitted on toxic data.
X_civil_tfidf = vectorizer.transform(civil_df['comment_text'])
civil_df['model_prob'] = baseline.predict_proba(X_civil_tfidf)[:, 1]
civil_df['model_label'] = (civil_df['model_prob'] >= 0.50).astype(int)

# Keep only the rows inside the [LOW_THRESH, HIGH_THRESH] uncertainty band.
uncertain_mask = (civil_df['model_prob'] >= LOW_THRESH) & (civil_df['model_prob'] <= HIGH_THRESH)
civil_uncertain = civil_df.loc[uncertain_mask, ['comment_text','model_prob','model_label']].copy()
civil_uncertain.to_csv("uncertain_civil_comments.csv", index=False)
print(f"Extracted uncertain comments: {len(civil_uncertain)} "
      f"({100*len(civil_uncertain)/len(civil_df):.2f}% of Civil).")

# Optional: prepare a small, readable sample for the Google Form
def prepare_form_sample(df_uncertain, n=30, min_words=10, max_words=30, seed=RANDOM_STATE, path="civil_form_sample.csv"):
    """Sample readable-length comments for the rating form and save them.

    Parameters
    ----------
    df_uncertain : DataFrame with a 'comment_text' column.
    n : number of comments wanted; reduced to the pool size if fewer qualify.
    min_words, max_words : inclusive word-count bounds (whitespace split).
    seed : random_state passed to DataFrame.sample.
    path : CSV file the sample is written to.

    Returns
    -------
    DataFrame: the sampled rows (with an added 'word_count' column) and a
    fresh 0..n-1 index.
    """
    tmp = df_uncertain.copy()
    # .str.len() yields NaN for missing text, which then fails the range
    # filter; apply(len) would raise TypeError on a NaN entry instead.
    tmp['word_count'] = tmp['comment_text'].str.split().str.len()
    tmp = tmp[tmp['word_count'].between(min_words, max_words)]
    sample = tmp.sample(n=min(n, len(tmp)), random_state=seed).reset_index(drop=True)
    sample.to_csv(path, index=False)
    return sample

# Uncomment if you want to (re)create the form sample:
# form_sample = prepare_form_sample(civil_uncertain, n=30)

# =========================
# 3) Parse Human Feedback (Google Form -> CSV)
# =========================
# expected file from Google Sheets export:
HF_PATH = "human_feedback_responses.csv"
# Fall back to an empty frame when the export is not there yet, so the later
# steps can check `.empty` instead of failing on a missing file.
hf_raw = pd.read_csv(HF_PATH) if os.path.exists(HF_PATH) else pd.DataFrame()

def long_from_form(df):
    """
    Reshape a wide form export into one row per answered cell.

    - A column named 'Timestamp' (any letter case) is ignored.
    - Every other column is treated as a question whose header is the full
      comment text (as set in the form).
    - Missing cells and cells that are blank after stripping are skipped.
    - Answers are kept as stripped text; they are not validated here.

    Returns: long df with columns [comment_text, response], ordered row by
    row and, within a row, in column order. The two columns are always
    present, even when nothing was answered.
    """
    out_cols = ['comment_text', 'response']
    if df.empty:
        return pd.DataFrame(columns=out_cols)

    question_cols = [c for c in df.columns if str(c).lower() != 'timestamp']
    records = []
    for _, row in df.iterrows():
        for c in question_cols:
            value = row[c]
            if pd.isnull(value):
                continue
            resp = str(value).strip()
            if not resp:
                continue
            records.append({
                'comment_text': str(c).strip(),  # header contains the item
                'response'    : resp
            })
    # Passing `columns` keeps the schema stable when `records` is empty;
    # without it an all-blank form produced a frame with no columns at all.
    return pd.DataFrame(records, columns=out_cols)

# Flatten the wide form export into one (comment_text, response) row per answered cell.
hf_long = long_from_form(hf_raw)

def map_response_to_label(resp: str, model_label_for_item: int) -> float:
    """
    Convert an agree/disagree answer into a toxicity label.

    The answer is stripped and lower-cased, then matched by prefix:
      - starts with 'agree'    -> the model's own label (1 if it said toxic, else 0)
      - starts with 'disagree' -> the opposite of the model's label
      - anything else ('Not Sure', blank, ...) -> np.nan
    """
    choice = str(resp).strip().lower()
    if choice.startswith('agree'):
        rater_agrees = True
    elif choice.startswith('disagree'):
        rater_agrees = False
    else:
        return np.nan  # Not Sure / blank

    model_says_toxic = model_label_for_item == 1
    # Toxic exactly when the rater sides with a "toxic" model call or
    # rejects a "non-toxic" one.
    return 1 if model_says_toxic == rater_agrees else 0

# Join human responses with uncertain metadata by matching on the exact comment text
if not hf_long.empty and not civil_uncertain.empty:
    merged = hf_long.merge(
        civil_uncertain,
        on='comment_text',
        how='left'
    )
    # Form questions whose text matches no uncertain comment come back with
    # NaN model columns, and the groupby below drops NaN keys. Report them so
    # the loss is visible instead of silent.
    n_unmatched = merged.loc[merged['model_label'].isna(), 'comment_text'].nunique()
    if n_unmatched:
        print(f"Warning: {n_unmatched} form question(s) matched no uncertain "
              f"comment text and are excluded from the aggregation.")
    # derive human_label
    merged['human_label'] = [
        map_response_to_label(r, ml) if pd.notnull(ml) else np.nan
        for r, ml in zip(merged['response'], merged['model_label'])
    ]
    # One row per comment; human_label becomes the mean over respondents
    # (mean ignores NaNs, i.e. 'Not Sure' answers).
    agg = (merged
           .groupby(['comment_text','model_prob','model_label'], as_index=False)
           .agg(human_label=('human_label', 'mean')))
else:
    agg = pd.DataFrame(columns=['comment_text','model_prob','model_label','human_label'])

# Build corrected labels (Option 2 we discussed): backfill with model_label if human_label missing
def build_corrected_df(agg_df):
    """Turn per-comment mean human votes into a final 0/1 label.

    A mean vote of 0.5 or more becomes 1, anything lower becomes 0; where no
    human vote exists the model's label is used instead. The input frame is
    not modified.

    Returns a copy with columns
    [comment_text, model_prob, model_label, human_label, final_label].
    """
    out = agg_df.copy()
    votes = out['human_label']
    # Majority vote as 1.0/0.0, left missing where there was no vote.
    out['human_binary'] = (votes >= 0.5).astype(float).where(votes.notna())
    # Missing votes are backfilled with the model's label.
    out['final_label'] = out['human_binary'].fillna(out['model_label']).astype(int)
    keep = ['comment_text', 'model_prob', 'model_label', 'human_label', 'final_label']
    return out[keep].copy()

# Resolve a final label per comment and persist the result.
corrected_df = build_corrected_df(agg)
corrected_df.to_csv("corrected_human_feedback.csv", index=False)
n_corrected = len(corrected_df)
print(f"Human feedback rows combined: {n_corrected} (with backfill to model labels where needed).")

# Ensure we have both classes; if not, you can inject a few non-toxic or toxic examples:
def ensure_two_classes(df_corrected, civil_full_df, need_each_class=True, max_inject=10):
    """Add rows of the missing class when df_corrected holds a single class.

    If every final_label is 0, up to max_inject rows with model_prob >= 0.80
    are appended with final_label 1; if every final_label is 1, up to
    max_inject rows with model_prob <= 0.20 are appended with final_label 0.
    Appended rows carry human_label = NaN. In every other case (both classes
    present, no rows, or need_each_class False) the input is returned as-is.
    """
    present = set(df_corrected['final_label'].unique().tolist())
    if not need_each_class or present not in ({0}, {1}):
        return df_corrected

    if present == {0}:
        # Only non-toxic present: borrow comments the model scores as toxic.
        missing_label = 1
        pool = civil_full_df[civil_full_df['model_prob'] >= 0.80].copy()
    else:
        # Only toxic present: borrow comments the model scores as non-toxic.
        missing_label = 0
        pool = civil_full_df[civil_full_df['model_prob'] <= 0.20].copy()

    picked = pool.sample(min(max_inject, len(pool)), random_state=RANDOM_STATE)
    injected = picked[['comment_text', 'model_prob', 'model_label']].copy()
    injected['human_label'] = np.nan
    injected['final_label'] = missing_label
    return pd.concat([df_corrected, injected], ignore_index=True)

# Top up a missing class if needed, then show the label counts (NaN included).
corrected_df = ensure_two_classes(corrected_df, civil_df)
label_balance = corrected_df['final_label'].value_counts(dropna=False)
print("Final label balance (corrected_df):\n", label_balance)

# =========================
# 4) Retraining with Human-Corrected Items (Augment Training)
# =========================
if not corrected_df.empty:
    # Encode the corrected comments with the already-fitted vocabulary and
    # stack them under the original training rows.
    X_hil  = vectorizer.transform(corrected_df['comment_text'])
    y_hil  = corrected_df['final_label'].astype(int).reset_index(drop=True)
    X_aug  = vstack([X_train_tfidf, X_hil])
    y_aug  = pd.concat([y_train.reset_index(drop=True), y_hil], ignore_index=True)

    retrained = LogisticRegression(max_iter=1000).fit(X_aug, y_aug)

    # Score on the same toxic test split as the baseline so the two runs are
    # directly comparable.
    y_pred_hil = retrained.predict(X_test_tfidf)
    y_prob_hil = retrained.predict_proba(X_test_tfidf)[:, 1]

    print("\n=== After HIL Augmentation ===")
    print(classification_report(y_test, y_pred_hil, digits=3))
    print("AUC:", roc_auc_score(y_test, y_prob_hil))

    def compare_bars(y_true, y_pred_b, y_prob_b, y_pred_a, y_prob_a, path="fig_compare_before_after.png"):
        """Plot precision, recall, F1 and AUC side by side for two runs.

        The ``*_b`` arguments are the "before" predictions/probabilities and
        the ``*_a`` arguments the "after" ones. The figure is saved to
        ``path`` at 300 dpi and then shown.
        """
        def _scores(pred, prob):
            # Same four metrics, same order, for one run.
            return [
                precision_score(y_true, pred),
                recall_score(y_true, pred),
                f1_score(y_true, pred),
                roc_auc_score(y_true, prob),
            ]

        before = _scores(y_pred_b, y_prob_b)
        after = _scores(y_pred_a, y_prob_a)
        metrics = ['Precision','Recall','F1-score','AUC']

        positions = np.arange(len(metrics))
        bar_width = 0.35
        plt.figure(figsize=(8,5))
        containers = (
            plt.bar(positions - bar_width/2, before, bar_width, label='Before'),
            plt.bar(positions + bar_width/2, after,  bar_width, label='After'),
        )
        plt.xticks(positions, metrics)
        plt.ylim(0, 1.05)
        # Value label slightly above each bar.
        for container in containers:
            for rect in container:
                top = rect.get_height()
                plt.text(rect.get_x()+rect.get_width()/2, top+0.02, f"{top:.2f}", ha='center')
        plt.ylabel('Score')
        plt.title('Model Performance: Before vs After Human Feedback')
        plt.legend()
        plt.grid(True, axis='y', linestyle='--', alpha=0.5)
        plt.tight_layout()
        plt.savefig(path, dpi=300)
        plt.show()

    compare_bars(y_test, y_pred_base, y_prob_base, y_pred_hil, y_prob_hil)

else:
    print("No human‑feedback rows parsed; skipping retrain step.")

# =========================
# 5) Save Key Outputs
# =========================
# Test-set labels with a clean 0..n-1 index so they line up with the
# prediction arrays.
true_labels = y_test.reset_index(drop=True)

baseline_out = pd.DataFrame({
    'y_true': true_labels,
    'y_pred_baseline': y_pred_base,
    'y_prob_baseline': y_prob_base
})
baseline_out.to_csv("baseline_test_predictions.csv", index=False)

# The after-HIL predictions only exist when the retraining step actually ran.
if 'y_pred_hil' in locals():
    after_out = pd.DataFrame({
        'y_true': true_labels,
        'y_pred_after': y_pred_hil,
        'y_prob_after': y_prob_hil
    })
    after_out.to_csv("after_hil_test_predictions.csv", index=False)

print("\nDone.")


In [None]:
# === 0) Imports ===
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import (
    roc_curve, roc_auc_score,
    precision_recall_curve, average_precision_score,
    confusion_matrix, classification_report, precision_score, recall_score, f1_score
)
import itertools

# --- Small helper for pretty bars ---
def _annotate_bars(ax):
    """Label every patch on `ax` with its height (two decimals), centred just above it."""
    for patch in ax.patches:
        top = patch.get_height()
        center_x = patch.get_x() + patch.get_width() / 2
        ax.annotate(
            f"{top:.2f}",
            (center_x, top),
            ha='center', va='bottom',
            xytext=(0, 3), textcoords='offset points',
        )

# === 1) Plotters ===
def plot_roc(y_true, y_score, title="ROC Curve"):
    """Show the ROC curve for binary labels and scores, with the AUC in the legend."""
    false_pos_rate, true_pos_rate, _ = roc_curve(y_true, y_score)
    area = roc_auc_score(y_true, y_score)

    plt.figure(figsize=(5.2, 4.2))
    plt.plot(false_pos_rate, true_pos_rate, lw=2, label=f"AUC = {area:.3f}")
    # Dashed diagonal = a classifier that guesses at random.
    plt.plot([0,1], [0,1], ls='--')
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_pr(y_true, y_score, title="Precision–Recall Curve"):
    """Show the precision-recall curve, with average precision in the legend."""
    avg_precision = average_precision_score(y_true, y_score)
    prec_vals, rec_vals, _ = precision_recall_curve(y_true, y_score)

    plt.figure(figsize=(5.2,4.2))
    # Recall on the x-axis, precision on the y-axis.
    plt.plot(rec_vals, prec_vals, lw=2, label=f"AP = {avg_precision:.3f}")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()

def plot_confusion(y_true, y_pred, title="Confusion Matrix (thr=0.5)"):
    """Show a 2x2 confusion matrix (rows = actual, columns = predicted) with counts."""
    # labels=[0, 1] fixes the matrix layout to non-toxic first, toxic second.
    cm = confusion_matrix(y_true, y_pred, labels=[0,1])
    class_names = ["Non-Toxic (0)", "Toxic (1)"]

    plt.figure(figsize=(5,4.2))
    plt.imshow(cm, cmap='Greys')
    plt.title(title)
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.xticks([0,1], class_names, rotation=15)
    plt.yticks([0,1], class_names)
    n_rows, n_cols = cm.shape
    for row in range(n_rows):
        for col in range(n_cols):
            # imshow draws columns along x and rows along y.
            plt.text(col, row, cm[row, col], ha="center", va="center")
    plt.colorbar(fraction=0.046, pad=0.04)
    plt.tight_layout()
    plt.show()

def compute_metrics(y_true, y_prob, thr=0.5):
    """Threshold the scores at `thr` and return the summary metrics.

    Returns a dict with keys 'precision', 'recall', 'f1', 'auc' and 'y_pred'
    (the 0/1 predictions, where a score >= thr counts as 1). Undefined
    precision/recall/F1 values are reported as 0.
    """
    hard_pred = (y_prob >= thr).astype(int)
    results = {}
    results["precision"] = precision_score(y_true, hard_pred, zero_division=0)
    results["recall"] = recall_score(y_true, hard_pred, zero_division=0)
    results["f1"] = f1_score(y_true, hard_pred, zero_division=0)
    # AUC is threshold-free, so it uses the raw scores.
    results["auc"] = roc_auc_score(y_true, y_prob)
    results["y_pred"] = hard_pred
    return results

def plot_comparison(baseline, hil, title="Model Performance: Before vs After HIL"):
    """Draw a grouped bar chart comparing two sets of metrics.

    Parameters
    ----------
    baseline : mapping with keys ``"precision"``, ``"recall"``, ``"f1"`` and
        ``"auc"`` for the model before HIL (extra keys are ignored).
    hil : mapping with the same keys for the model after HIL.
    title : text placed above the plot.

    Returns
    -------
    None. The figure is rendered via ``plt.show()``.
    """
    metrics = ["precision", "recall", "f1", "auc"]
    before = [baseline[m] for m in metrics]
    after  = [hil[m] for m in metrics]

    x = np.arange(len(metrics))
    w = 0.35  # bar width; each pair is centred on its tick
    fig, ax = plt.subplots(figsize=(7,4.2))

    # The whole body must live in one definition: the bars below use the
    # locals defined above.
    ax.bar(x - w/2, before, w, label="Before HIL")
    ax.bar(x + w/2, after,  w, label="After HIL")
    ax.set_xticks(x)
    ax.set_xticklabels([m.capitalize() for m in metrics])
    ax.set_ylim(0, 1.05)  # a little headroom above 1.0 for the bar annotations
    ax.set_ylabel("Score")
    ax.set_title(title)
    ax.legend()
    _annotate_bars(ax)
    ax.grid(True, axis='y', linestyle='--', alpha=0.4)
    plt.tight_layout()
    plt.show()

# === 2) BASELINE predictions & plots ===
# Names this cell expects from earlier cells:
#   y_test           ground truth (0/1)
#   model            baseline LogisticRegression (already fit)
#   X_test_tfidf     TF-IDF of X_test (same features used to fit baseline model)
#   retrained_model  LogisticRegression retrained with HIL-augmented data (already fit)
def _report_and_plot(header, y_prob, scores, roc_title, pr_title, cm_title):
    """Print the text report for one model, then draw its ROC, PR and confusion plots."""
    print(header)
    print(classification_report(y_test, scores["y_pred"]))
    print("AUC:", scores["auc"])

    plot_roc(y_test, y_prob, title=roc_title)
    plot_pr(y_test, y_prob, title=pr_title)
    plot_confusion(y_test, scores["y_pred"], title=cm_title)


y_prob_base = model.predict_proba(X_test_tfidf)[:, 1]
base = compute_metrics(y_test, y_prob_base, thr=0.5)
_report_and_plot(
    "=== Baseline (no HIL) ===",
    y_prob_base,
    base,
    roc_title="Baseline ROC (LogReg + TF‑IDF)",
    pr_title="Baseline Precision–Recall",
    cm_title="Baseline Confusion Matrix (thr=0.5)",
)

# === 3) HIL (retrained) predictions & plots ===
# Scored on the same X_test_tfidf / y_test as the baseline.
y_prob_hil = retrained_model.predict_proba(X_test_tfidf)[:, 1]
hil = compute_metrics(y_test, y_prob_hil, thr=0.5)
_report_and_plot(
    "\n=== After HIL (retrained) ===",
    y_prob_hil,
    hil,
    roc_title="After HIL: ROC (Retrained)",
    pr_title="After HIL: Precision–Recall",
    cm_title="After HIL: Confusion Matrix (thr=0.5)",
)

# === 4) Side-by-side comparison bar chart ===
plot_comparison(base, hil, title="Before vs After Human Feedback")

In [None]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt

# --- 1) Inputs/assumptions ----------------------------------------------------
# Relies on names from earlier cells: civil_df, vectorizer, model, retrained_model.
# Identity annotation columns evaluated as subgroups.
identity_columns = [
    'male', 'female', 'transgender', 'heterosexual',
    'homosexual_gay_or_lesbian', 'bisexual',
    'christian', 'jewish', 'muslim',
    'black', 'white', 'asian', 'latino',
    'psychiatric_or_mental_illness',
]

# Fail fast when any identity column, the text column or the label column is absent.
missing = [
    c for c in [*identity_columns, 'comment_text', 'target']
    if c not in civil_df.columns
]
if len(missing) > 0:
    raise ValueError(f"Missing columns in civil_df: {missing}")

# Ground truth for the fairness evaluation: 'target' is thresholded at 0.5
# to obtain 0/1 labels as a NumPy array.
y_true_civil = civil_df['target'].ge(0.5).astype(int).to_numpy()

# Transform only (no refit), so features come from the already-fitted vectorizer.
X_civil_tfidf = vectorizer.transform(civil_df['comment_text'])

# Positive-class probabilities (column 1) from the baseline model, then the HIL model.
y_prob_civil_base, y_prob_civil_hil = (
    clf.predict_proba(X_civil_tfidf)[:, 1] for clf in (model, retrained_model)
)

# --- 2) Helper metrics (Borkan et al., 2019) ----------------------------------
def safe_auc(y_true, y_score):
    """Return ROC AUC, or NaN when ``y_true`` has fewer than two distinct values.

    The guard covers slices that contain a single class (or nothing at all),
    so ``roc_auc_score`` is only called when two or more distinct labels are present.
    """
    distinct_labels = np.unique(y_true)
    if distinct_labels.size >= 2:
        return roc_auc_score(y_true, y_score)
    return np.nan

def subgroup_auc(y_true, y_score, subgroup_mask):
    """AUC restricted to the rows selected by ``subgroup_mask``.

    ``subgroup_mask`` is used to index both ``y_true`` and ``y_score``; the
    result is NaN when the selection has fewer than two distinct labels
    (see ``safe_auc``).
    """
    labels_in_group = y_true[subgroup_mask]
    scores_in_group = y_score[subgroup_mask]
    return safe_auc(labels_in_group, scores_in_group)

def bpsn_auc(y_true, y_score, subgroup_mask):
    """Background Positive, Subgroup Negative AUC.

    Keeps two kinds of rows and scores them together with ``safe_auc``:
    positives outside the subgroup (mask False, label 1) and negatives
    inside the subgroup (mask True, label 0).
    """
    background_positive = (~subgroup_mask) & (y_true == 1)
    subgroup_negative = subgroup_mask & (y_true == 0)
    keep = background_positive | subgroup_negative
    return safe_auc(y_true[keep], y_score[keep])

def bnsp_auc(y_true, y_score, subgroup_mask):
    """Background Negative, Subgroup Positive AUC.

    Keeps two kinds of rows and scores them together with ``safe_auc``:
    negatives outside the subgroup (mask False, label 0) and positives
    inside the subgroup (mask True, label 1).
    """
    background_negative = (~subgroup_mask) & (y_true == 0)
    subgroup_positive = subgroup_mask & (y_true == 1)
    keep = background_negative | subgroup_positive
    return safe_auc(y_true[keep], y_score[keep])

def average_equality_gap(y_true, y_score, subgroup_mask):
    """Gap in mean score between subgroup positives and background positives.

    Computes ``mean(score | subgroup, label 1) - mean(score | background, label 1)``.
    This positives-only difference of means is used here as an approximation
    of the Average Equality Gap; a negatives-based counterpart is not computed.

    Returns
    -------
    float, or NaN when either group has no positive examples.
    """
    is_positive = (y_true == 1)
    subgroup_scores = y_score[subgroup_mask & is_positive]
    background_scores = y_score[(~subgroup_mask) & is_positive]

    # A mean over an empty selection is undefined, so report NaN instead.
    if min(len(subgroup_scores), len(background_scores)) == 0:
        return np.nan

    gap = np.mean(subgroup_scores) - np.mean(background_scores)
    return float(gap)

# --- 3) Compute metrics for each subgroup for both models ---------------------
def fairness_table(y_true, y_score, df, id_cols):
    """Build a per-subgroup table of fairness metrics for one model.

    Parameters
    ----------
    y_true : 0/1 labels, indexable by a boolean mask built from ``df``.
    y_score : model scores, indexable the same way.
    df : frame holding one column per entry of ``id_cols``.
    id_cols : names of the identity columns to evaluate.

    Returns
    -------
    pandas.DataFrame sorted by ``"subgroup"`` with columns ``subgroup``,
    ``count``, ``subgroup_auc``, ``bpsn_auc``, ``bnsp_auc``, ``aeg_pos`` and
    ``avg_prob_in_subgroup``.
    """
    def _metrics_for(col):
        # A row belongs to the subgroup when its identity value is >= 0.5.
        # NaN compares False, so such rows fall in the background.
        in_group = df[col].astype(float) >= 0.5
        return {
            "subgroup": col,
            "count": int(in_group.sum()),
            "subgroup_auc": subgroup_auc(y_true, y_score, in_group),
            "bpsn_auc": bpsn_auc(y_true, y_score, in_group),
            "bnsp_auc": bnsp_auc(y_true, y_score, in_group),
            "aeg_pos": average_equality_gap(y_true, y_score, in_group),
            # Mean score inside the subgroup; NaN when the subgroup is empty.
            "avg_prob_in_subgroup": (
                float(np.mean(y_score[in_group])) if in_group.sum() > 0 else np.nan
            ),
        }

    records = [_metrics_for(col) for col in id_cols]
    return pd.DataFrame(records).sort_values("subgroup")

# One fairness table per model, both evaluated on the same civil_df rows.
fair_base = fairness_table(y_true_civil, y_prob_civil_base, civil_df, identity_columns)
fair_hil  = fairness_table(y_true_civil, y_prob_civil_hil,  civil_df, identity_columns)

# Merge for comparison. Every non-key column exists in both tables, so each
# one (including "count") comes out with a "_base" and a "_hil" suffix.
fair_compare = fair_base.merge(fair_hil, on="subgroup", suffixes=("_base","_hil"))
# Compute deltas (HIL - Base)
for m in ["subgroup_auc","bpsn_auc","bnsp_auc","aeg_pos","avg_prob_in_subgroup"]:
    fair_compare[f"delta_{m}"] = fair_compare[f"{m}_hil"] - fair_compare[f"{m}_base"]

# Display the top-line comparison.
# "count_base" is shown once; both tables are built from the same civil_df
# and identity columns.
cols_to_show = [
    "subgroup","count_base","subgroup_auc_base","bpsn_auc_base","bnsp_auc_base","aeg_pos_base",
    "subgroup_auc_hil","bpsn_auc_hil","bnsp_auc_hil","aeg_pos_hil",
    "delta_subgroup_auc","delta_bpsn_auc","delta_bnsp_auc","delta_aeg_pos"
]

print("\n=== Fairness metrics per subgroup (Baseline vs HIL) ===")
display(fair_compare[cols_to_show])

# --- 4) Quick visualization: change in BPSN/BNSP/Subgroup AUC -----------------
metrics_for_bar = ["delta_subgroup_auc","delta_bpsn_auc","delta_bnsp_auc"]
# Sorted by the subgroup-AUC delta so the bars read from largest drop to largest gain.
plot_df = fair_compare[["subgroup"] + metrics_for_bar].set_index("subgroup").sort_values("delta_subgroup_auc")

ax = plot_df.plot(kind="barh", figsize=(10,7))
ax.set_title("Change in Fairness Metrics After HIL (HIL – Baseline)")
ax.set_xlabel("Delta AUC (positive = improvement)")
ax.grid(axis="x", linestyle="--", alpha=0.5)
plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# ===== 1) Plug your final numbers here =====
# The values below are hand-entered placeholders, not read from the computed
# results; replace them with your measured values.
# NOTE(review): this cell rebinds `hil` (assigned the compute_metrics dict in
# the evaluation cell) to a list, and reuses the names `baseline` / `metrics`.
# Re-running earlier cells after this one will see these lists instead.
metrics = ["Accuracy", "Precision", "Recall", "F1", "Subgroup AUC", "BPSN AUC", "BNSP AUC"]

baseline = [0.95, 0.92, 0.64, 0.76, 0.78, 0.77, 0.79]  # ← replace
hil      = [0.95, 0.92, 0.66, 0.77, 0.79, 0.77, 0.81]  # ← replace

# ===== 2) Radar chart prep =====
# One evenly spaced angle per metric; the first point is appended again to
# each series so the drawn outline closes on itself.
angles = np.linspace(0, 2*np.pi, len(metrics), endpoint=False)
baseline_vals = np.append(baseline, baseline[0])
hil_vals      = np.append(hil, hil[0])
angles_full   = np.append(angles, angles[0])

# ===== 3) Plot =====
fig = plt.figure(figsize=(7, 7))
ax = fig.add_subplot(111, projection="polar")

# Outline plus translucent fill per series; HIL is dashed.
for _vals, _label, _line_kwargs in (
    (baseline_vals, "Baseline", {}),
    (hil_vals, "HIL", {"linestyle": "--"}),
):
    ax.plot(angles_full, _vals, linewidth=2, label=_label, **_line_kwargs)
    ax.fill(angles_full, _vals, alpha=0.15)

ax.set_xticks(angles)
ax.set_xticklabels(metrics)

# The radial axis starts at 0.6, so differences between series look larger than on a 0-1 axis.
radial_ticks = [0.6, 0.7, 0.8, 0.9, 1.0]
ax.set_yticks(radial_ticks)
ax.set_yticklabels([f"{tick:.2f}" for tick in radial_ticks])
ax.set_ylim(0.6, 1.0)

ax.set_title("Baseline vs HIL – Final Metrics (Radar View)", pad=20)
ax.legend(loc="upper right", bbox_to_anchor=(1.25, 1.1))

plt.tight_layout()
plt.show()
