# Task 02 Solutions: Threshold Optimization

Complete solutions for the threshold optimization tasks.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import precision_score, recall_score, f1_score, precision_recall_curve

In [None]:
# Read the fixture CSV, then pull the ground-truth labels and the model's
# predicted probabilities out of the frame as plain arrays.
df = pd.read_csv('../fixtures/input/classification_data.csv')
y_true, y_prob = (
    df[column].values
    for column in ('true_label', 'predicted_probability')
)

## Task 2.1: Find Optimal Threshold for F1-Score

In [None]:
# Solution: sweep a grid of candidate thresholds and keep the one with the best F1.
#
# The grid is built from the integers 0..100 divided by 100 instead of
# np.arange(0.0, 1.01, 0.01). With a float step each point is computed as
# i * 0.01, which can land one rounding step away from the intended
# two-decimal value; a probability sitting exactly on that value would then be
# classified on the wrong side of the `>=` comparison. Integer / 100 gives the
# correctly rounded value for every grid point and always exactly 101 points.
thresholds = np.arange(101) / 100

# F1 of the positive class at each candidate threshold. zero_division=0 makes
# the score 0 (without a warning) when a threshold yields no positive predictions.
f1_scores = [
    f1_score(y_true, (y_prob >= threshold).astype(int), zero_division=0)
    for threshold in thresholds
]

# argmax returns the first maximum, i.e. the lowest threshold among ties.
optimal_idx = int(np.argmax(f1_scores))
optimal_threshold = thresholds[optimal_idx]
optimal_f1 = f1_scores[optimal_idx]

print(f"Optimal threshold: {optimal_threshold:.2f}")
print(f"F1-Score at optimal: {optimal_f1:.4f}")

# Plot the full F1 curve and mark the selected threshold
plt.figure(figsize=(10, 6))
plt.plot(thresholds, f1_scores, linewidth=2)
plt.axvline(optimal_threshold, color='red', linestyle='--', label=f'Optimal: {optimal_threshold:.2f}')
plt.xlabel('Threshold')
plt.ylabel('F1-Score')
plt.title('F1-Score vs Threshold')
plt.legend()
plt.grid(alpha=0.3)
plt.show()

# Sanity check: the selected threshold must be a valid probability cut-off
assert 0.0 <= optimal_threshold <= 1.0
print("✅ Optimal threshold found!")

## Task 2.2: High Precision Threshold (95% precision)

In [None]:
# Solution: find the threshold that reaches 95% precision while keeping as
# much recall as possible.
#
# Recall can only stay equal or fall as the threshold rises, so among all
# thresholds that meet the precision target the LOWEST one retains the most
# recall. The grid is therefore scanned from low to high and the first hit is
# kept. (Scanning from 0.99 downward and stopping at the first hit would return
# the highest qualifying threshold, i.e. the operating point with the least
# recall.)
target_precision = 0.95

# Fallback values, reported unchanged if no grid point reaches the target
high_precision_threshold = 0.9
high_precision_recall = 0.0

# Candidate thresholds 0.01 .. 0.99, built as integers / 100 so that each value
# is the correctly rounded two-decimal number (a float step in np.arange can
# drift by one rounding step).
for threshold in np.arange(1, 100) / 100:
    y_pred = (y_prob >= threshold).astype(int)
    if not y_pred.any():
        # No positive predictions: precision is undefined at this threshold
        continue
    precision = precision_score(y_true, y_pred, zero_division=0)
    if precision >= target_precision:
        high_precision_threshold = threshold
        high_precision_recall = recall_score(y_true, y_pred, zero_division=0)
        break

print(f"Threshold for 95% precision: {high_precision_threshold:.2f}")
print(f"Recall at this threshold: {high_precision_recall:.4f}")

# Verify threshold is valid
assert 0.0 <= high_precision_threshold <= 1.0
print("✅ High precision threshold found!")

## Task 2.3: High Recall Threshold (95% recall)

In [None]:
# Solution: find the threshold that reaches 95% recall while predicting as few
# positives as necessary.
#
# Recall can only stay equal or fall as the threshold rises, so the useful
# answer is the HIGHEST threshold that still meets the target. The grid is
# scanned from high to low and the first hit is kept. (Scanning upward from 0.0
# stops immediately: at threshold 0.0 every sample with a non-negative
# probability is predicted positive, recall is 1.0, and the reported threshold
# is always 0.00.)
target_recall = 0.95

# Fallback values, reported unchanged if no grid point reaches the target
high_recall_threshold = 0.1
high_recall_precision = 0.0

# Candidate thresholds 0.99 .. 0.00, built as integers / 100 so that each value
# is the correctly rounded two-decimal number (a float step in np.arange can
# drift by one rounding step).
for threshold in np.arange(99, -1, -1) / 100:
    y_pred = (y_prob >= threshold).astype(int)
    recall = recall_score(y_true, y_pred, zero_division=0)
    if recall >= target_recall:
        high_recall_threshold = threshold
        high_recall_precision = precision_score(y_true, y_pred, zero_division=0)
        break

print(f"Threshold for 95% recall: {high_recall_threshold:.2f}")
print(f"Precision at this threshold: {high_recall_precision:.4f}")

# Verify threshold is valid
assert 0.0 <= high_recall_threshold <= 1.0
print("✅ High recall threshold found!")

## Task 2.4: Youden's J Statistic

**Youden's J = Sensitivity + Specificity - 1**, where Sensitivity = TP / (TP + FN) and Specificity = TN / (TN + FP).

In [None]:
# Solution: compute Youden's J (sensitivity + specificity - 1) at every
# candidate threshold and keep the threshold that maximises it.
# Reuses the `thresholds` grid defined in the Task 2.1 cell.
from sklearn.metrics import confusion_matrix

j_scores = []

for threshold in thresholds:
    y_pred = (y_prob >= threshold).astype(int)
    # labels=[0, 1] forces a 2x2 matrix. Without it the matrix shape is
    # inferred from the classes actually present, so if only one class occurs
    # in y_true and y_pred the result is 1x1 and the 4-way unpack below raises.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()

    # Guard the ratios against an empty positive or negative class
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0.0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0.0

    j = sensitivity + specificity - 1
    j_scores.append(j)

# argmax returns the first maximum, i.e. the lowest threshold among ties.
youden_idx = int(np.argmax(j_scores))
youden_threshold = thresholds[youden_idx]
youden_j = j_scores[youden_idx]

print(f"Youden's optimal threshold: {youden_threshold:.2f}")
print(f"Youden's J statistic: {youden_j:.4f}")

# Sanity check: the selected threshold must be a valid probability cut-off
assert 0.0 <= youden_threshold <= 1.0
print("✅ Youden's J calculated!")

## Summary: Compare All Thresholds

In [None]:
# Side-by-side table: one row per strategy with its threshold, then the
# precision / recall / F1 obtained when that threshold is applied to y_prob.
strategy_thresholds = [
    ('Default (0.5)', 0.50),
    ('Optimal F1', optimal_threshold),
    ('High Precision', high_precision_threshold),
    ('High Recall', high_recall_threshold),
    ("Youden's J", youden_threshold),
]
thresholds_comparison = pd.DataFrame(
    strategy_thresholds, columns=['Strategy', 'Threshold']
)

# Metric columns are filled in this order, row by row
metric_columns = (
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1', f1_score),
)

for row_label, cutoff in thresholds_comparison['Threshold'].items():
    y_pred = (y_prob >= cutoff).astype(int)
    for column, metric in metric_columns:
        thresholds_comparison.at[row_label, column] = metric(
            y_true, y_pred, zero_division=0
        )

print(thresholds_comparison.to_string(index=False))
print("\n✅ All threshold strategies compared!")