In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
from sklearn.datasets import make_circles, make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    classification_report
)
import numpy as np

# Seed PyTorch and NumPy with one shared value so weight initialisation and
# any NumPy-driven randomness repeat after a kernel restart.
SEED = 42
torch.manual_seed(SEED)
np.random.seed(SEED)

In [None]:
print("=" * 60, "Part 1: Setup and Data Preparation", "=" * 60, sep="\n")

# Binary classification dataset generated by sklearn's make_circles
X, y = make_circles(n_samples=1000, noise=0.03, factor=0.5, random_state=42)

# Hold out 20% of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Prefer the GPU when one is available, otherwise run on the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Features become float32 tensors; labels become float32 tensors with an
# extra trailing dimension (unsqueeze(1)) so they line up with the model's
# single-logit output when the loss is computed. Everything is created
# directly on the chosen device.
X_train = torch.tensor(X_train, dtype=torch.float32, device=device)
X_test = torch.tensor(X_test, dtype=torch.float32, device=device)
y_train = torch.tensor(y_train, dtype=torch.float32, device=device).unsqueeze(1)
y_test = torch.tensor(y_test, dtype=torch.float32, device=device).unsqueeze(1)


In [None]:
print("\n" + "=" * 60, "Part 2: Define and Train Model", "=" * 60, sep="\n")


class BinaryClassifier(nn.Module):
    """Small fully connected network for a 2-feature binary problem.

    Architecture: Linear(2 -> 16) -> ReLU -> Linear(16 -> 8) -> ReLU ->
    Linear(8 -> 1). The last layer has no activation, so ``forward`` returns
    one raw logit per input row; the notebook feeds these to
    ``nn.BCEWithLogitsLoss`` and applies ``torch.sigmoid`` at evaluation time.
    """

    def __init__(self):
        super().__init__()
        widths = (2, 16, 8, 1)
        stack = []
        for fan_in, fan_out in zip(widths, widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU()]
        # Drop the trailing ReLU so the network ends on the output Linear.
        self.layers = nn.Sequential(*stack[:-1])

    def forward(self, x):
        """Return the logits produced by running ``x`` through ``self.layers``."""
        return self.layers(x)

model = BinaryClassifier().to(device)

# Binary cross-entropy computed on raw logits, optimised with Adam
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

# Full-batch training: each epoch is one gradient step over all of X_train
epochs = 100
model.train()
for epoch in range(epochs):
    optimizer.zero_grad()
    y_logits = model(X_train)
    loss = criterion(y_logits, y_train)
    loss.backward()
    optimizer.step()

    # Report the loss every 20th epoch (1-based numbering)
    step = epoch + 1
    if step % 20 == 0:
        print(f'Epoch [{step}/{epochs}], Loss: {loss.item():.4f}')

print("\nTraining complete!")

In [None]:
print("\n" + "=" * 60, "Part 3: Calculate Metrics with sklearn", "=" * 60, sep="\n")

# Score the held-out set: logits -> sigmoid probabilities -> hard 0/1 labels
# using a 0.5 cut-off. No gradients are tracked inside inference_mode.
model.eval()
with torch.inference_mode():
    test_logits = model(X_test)
    test_probs = torch.sigmoid(test_logits)
    test_preds = (test_probs > 0.5).long()

# sklearn works on 1-D NumPy arrays, so drop the trailing dimension and move
# the data back to host memory.
y_true = y_test.squeeze().cpu().numpy()
y_pred = test_preds.squeeze().cpu().numpy()

# The four headline scores, all computed on the same (y_true, y_pred) pair
accuracy, precision, recall, f1 = (
    score(y_true, y_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print("Classification Metrics:")
for label, value in (
    ("Accuracy: ", accuracy),
    ("Precision:", precision),
    ("Recall:   ", recall),
    ("F1-Score: ", f1),
):
    print(f"  {label} {value:.4f}")

# Per-class breakdown of the same scores
print("\nDetailed Classification Report:")
report = classification_report(y_true, y_pred, target_names=['Class 0', 'Class 1'])
print(report)


In [None]:
print("\n" + "=" * 60)
print("Part 4: Confusion Matrix")
print("=" * 60)

# Calculate confusion matrix.
# sklearn puts actual classes on the rows and predicted classes on the
# columns, both in label order, so for labels [0, 1] the layout is
# [[TN, FP], [FN, TP]]. Pinning `labels` guarantees a 2x2 result even if one
# class is absent, which keeps the indexing below valid. y_true is cast to int
# so its values match the integer labels.
cm = confusion_matrix(y_true.astype(int), y_pred, labels=[0, 1])
print("Confusion Matrix:")
print(cm)
print(f"\nTP={cm[1,1]}, TN={cm[0,0]}, FP={cm[0,1]}, FN={cm[1,0]}")

# Explain the confusion matrix using the SAME orientation as the matrix
# printed above (negative class first), so the table can be read directly
# against the numbers.
print("\nConfusion Matrix Explained (same layout as the matrix above):")
print("                   Predicted")
print("              Negative    Positive")
print("Actual  Neg      TN          FP")
print("        Pos      FN          TP")
print("\nTP = True Positive  (correctly predicted positive)")
print("TN = True Negative  (correctly predicted negative)")
print("FP = False Positive (incorrectly predicted positive)")
print("FN = False Negative (incorrectly predicted negative)")


In [None]:
print("\n" + "=" * 60)
print("Part 5: Visualizing Confusion Matrix")
print("=" * 60)

def plot_confusion_matrix(cm, class_names=None, title='Confusion Matrix'):
    """Draw a confusion matrix as an annotated heatmap and show it.

    Args:
        cm: Square 2-D array-like; rows are drawn on the y-axis ("True Label")
            and columns on the x-axis ("Predicted Label"). Integer matrices
            are annotated as whole numbers, any other dtype (e.g. a
            normalised matrix) with two decimals.
        class_names: Tick labels, one per row/column. Defaults to
            "Class 0", "Class 1", ... based on the number of rows.
        title: Figure title.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    cm = np.asarray(cm)
    if class_names is None:
        class_names = [f'Class {i}' for i in range(len(cm))]

    # 'd' only works for integers; fall back to a float format otherwise so
    # normalised matrices do not raise ValueError.
    cell_fmt = 'd' if np.issubdtype(cm.dtype, np.integer) else '.2f'

    fig, ax = plt.subplots(figsize=(8, 6))

    # Plot heatmap
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)

    # Configure axes
    ax.set(
        xticks=np.arange(len(class_names)),
        yticks=np.arange(len(class_names)),
        xticklabels=class_names,
        yticklabels=class_names,
        title=title,
        ylabel='True Label',
        xlabel='Predicted Label'
    )

    # Rotate x-axis labels
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right', rotation_mode='anchor')

    # Annotate cells; switch to white text on cells darker than the midpoint
    # so the numbers stay readable.
    thresh = cm.max() / 2
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], cell_fmt),
                    ha='center', va='center',
                    color='white' if cm[i, j] > thresh else 'black')

    fig.tight_layout()
    plt.show()

plot_confusion_matrix(cm, class_names=['Class 0', 'Class 1'])

In [None]:
print("\n" + "=" * 60, "Part 6: Understanding Precision vs Recall", "=" * 60, sep="\n")

print("Precision vs Recall Trade-off:")

# Each scenario: (heading, bullet points, real-world example)
tradeoff_scenarios = [
    (
        "High Precision, Low Recall",
        [
            "Very confident when predicting positive",
            "Few false positives",
            "But misses many actual positives",
        ],
        "Spam filter (don't mark legitimate email as spam)",
    ),
    (
        "Low Precision, High Recall",
        [
            "Catches most actual positives",
            "Many false positives",
            "Better to be safe than sorry",
        ],
        "Disease screening (don't miss sick patients)",
    ),
]

for heading, bullets, example in tradeoff_scenarios:
    print(f"\n{heading}:")
    for bullet in bullets:
        print(f"  - {bullet}")
    print(f"  Example: {example}")

# Put the scores computed in Part 3 into words
print("\nThis model:")
print(f"  Precision: {precision:.4f} - Of all predicted positive, {precision*100:.1f}% are correct")
print(f"  Recall:    {recall:.4f} - Of all actual positive, {recall*100:.1f}% are caught")


In [None]:
print("\n" + "=" * 60)
print("Part 7: Using torchmetrics")
print("=" * 60)

# Only the import is guarded: an ImportError raised by anything else should
# surface normally instead of being reported as "not installed".
try:
    from torchmetrics import Accuracy, Precision, Recall, F1Score, ConfusionMatrix
except ImportError:
    print("torchmetrics not installed.")
    print("Install with: pip install torchmetrics")
else:
    # Metric objects keep their running state in tensors, so they have to be
    # on the same device as the predictions/targets passed to them.
    accuracy_metric = Accuracy(task='binary').to(device)
    precision_metric = Precision(task='binary').to(device)
    recall_metric = Recall(task='binary').to(device)
    f1_metric = F1Score(task='binary').to(device)
    confmat_metric = ConfusionMatrix(task='binary').to(device)

    # y_test is stored as float for BCEWithLogitsLoss; the torchmetrics binary
    # metrics document an integer target, so cast once here.
    y_test_int = y_test.long()

    # Calculate metrics (distinct `*_tm` name for F1 so the sklearn `f1`
    # float from Part 3 is not replaced by a tensor).
    acc = accuracy_metric(test_preds, y_test_int)
    prec = precision_metric(test_preds, y_test_int)
    rec = recall_metric(test_preds, y_test_int)
    f1_tm = f1_metric(test_preds, y_test_int)
    cm_tm = confmat_metric(test_preds, y_test_int)

    print("torchmetrics Results:")
    print(f"  Accuracy:  {acc:.4f}")
    print(f"  Precision: {prec:.4f}")
    print(f"  Recall:    {rec:.4f}")
    print(f"  F1-Score:  {f1_tm:.4f}")
    print(f"  Confusion Matrix:\n{cm_tm}")

In [None]:
print("\n" + "=" * 60, "Part 8: Multi-Class Metrics", "=" * 60, sep="\n")

# Four-centre, two-feature blob dataset for the multi-class example
X_multi, y_multi = make_blobs(n_samples=1000, n_features=2, centers=4, random_state=42)

# 80/20 split, stratified on the labels so each class keeps its share
X_train_m, X_test_m, y_train_m, y_test_m = train_test_split(
    X_multi, y_multi, test_size=0.2, random_state=42, stratify=y_multi
)

# Only the test split is converted here (float32 features, int64 labels);
# the training arrays stay as NumPy and are converted where they are used.
X_test_m = torch.tensor(X_test_m, dtype=torch.float32, device=device)
y_test_m = torch.tensor(y_test_m, dtype=torch.long, device=device)
# Train simple multi-class model
class MultiClassClassifier(nn.Module):
    """Single-hidden-layer network mapping 2 features to 4 class logits.

    Architecture: Linear(2 -> 16) -> ReLU -> Linear(16 -> 4). ``forward``
    returns unnormalised logits; the notebook trains it with
    ``nn.CrossEntropyLoss`` and takes ``argmax`` over dim 1 for predictions.
    """

    def __init__(self):
        super().__init__()
        hidden = nn.Linear(2, 16)
        head = nn.Linear(16, 4)
        self.layers = nn.Sequential(hidden, nn.ReLU(), head)

    def forward(self, x):
        """Return the logits produced by running ``x`` through ``self.layers``."""
        return self.layers(x)

model_m = MultiClassClassifier().to(device)

# Train
criterion_m = nn.CrossEntropyLoss()
optimizer_m = optim.Adam(model_m.parameters(), lr=0.01)

# Convert the training split to tensors ONCE. The data does not change
# between epochs, so rebuilding and transferring it on every iteration is
# wasted work.
X_train_m_t = torch.FloatTensor(X_train_m).to(device)
y_train_m_t = torch.LongTensor(y_train_m).to(device)

# Full-batch training: one gradient step per epoch
for epoch in range(100):
    model_m.train()
    outputs = model_m(X_train_m_t)
    loss = criterion_m(outputs, y_train_m_t)
    optimizer_m.zero_grad()
    loss.backward()
    optimizer_m.step()

# Make predictions: the class with the largest logit wins
model_m.eval()
with torch.inference_mode():
    test_logits_m = model_m(X_test_m)
    test_preds_m = torch.argmax(test_logits_m, dim=1)

# Calculate metrics
y_true_m = y_test_m.cpu().numpy()
y_pred_m = test_preds_m.cpu().numpy()

# Macro averaging: unweighted mean of the per-class scores
accuracy_m = accuracy_score(y_true_m, y_pred_m)
precision_m = precision_score(y_true_m, y_pred_m, average='macro')
recall_m = recall_score(y_true_m, y_pred_m, average='macro')
f1_m = f1_score(y_true_m, y_pred_m, average='macro')

print("Multi-class Metrics:")
print(f"  Accuracy:  {accuracy_m:.4f}")
print(f"  Precision: {precision_m:.4f} (macro)")
print(f"  Recall:    {recall_m:.4f} (macro)")
print(f"  F1-Score:  {f1_m:.4f} (macro)")

# Confusion matrix. Labels are pinned to the model's four output classes so
# the matrix is always 4x4 and lines up with the four tick labels below.
cm_m = confusion_matrix(y_true_m, y_pred_m, labels=[0, 1, 2, 3])
print(f"\nConfusion Matrix:")
print(cm_m)

# Plot multi-class confusion matrix
plot_confusion_matrix(cm_m, class_names=['Class 0', 'Class 1', 'Class 2', 'Class 3'],
                     title='Multi-class Confusion Matrix')


In [None]:
print("\n" + "=" * 60, "Part 9: Comparing Models", "=" * 60, sep="\n")

# (metric name, what it tells you), printed as a numbered list
metric_summaries = [
    ("Accuracy", "Overall correctness"),
    ("Precision", "How many predicted positives are correct"),
    ("Recall", "How many actual positives were caught"),
    ("F1-Score", "Balance between precision and recall"),
]

print("When comparing models, use multiple metrics:")
for position, (metric_name, meaning) in enumerate(metric_summaries, start=1):
    print(f"  {position}. {metric_name} - {meaning}")

# (situation, recommended metric), printed as a bullet list
metric_guidance = [
    ("Spam detection", "High precision (don't delete legitimate emails)"),
    ("Disease screening", "High recall (don't miss sick patients)"),
    ("Balanced dataset", "Accuracy is fine"),
    ("Imbalanced dataset", "Use F1-score or precision/recall"),
]

print("\nWhich metric matters most depends on your problem:")
for situation, advice in metric_guidance:
    print(f"  - {situation}: {advice}")

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, classification_report, confusion_matrix
import numpy as np

# Create IMBALANCED dataset (90% class 0, 10% class 1)
X, y = make_classification(
    n_samples=1000,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    weights=[0.9, 0.1],  # 90% class 0, 10% class 1
    flip_y=0.01,
    random_state=42
)

# Check class distribution
unique, counts = np.unique(y, return_counts=True)
print(f"\nClass distribution:")
for cls, count in zip(unique, counts):
    print(f"  Class {cls}: {count} samples ({count/len(y)*100:.1f}%)")

# Split data. Stratify on the labels so the rare class keeps the same share
# in the train and test splits; with a plain random split the number of
# minority samples in the 200-row test set can drift noticeably.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Convert to tensors (labels get a trailing dimension to match the model's
# single-logit output)
X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train).unsqueeze(1)
X_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(y_test).unsqueeze(1)

# Setup device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
X_train, y_train = X_train.to(device), y_train.to(device)
X_test, y_test = X_test.to(device), y_test.to(device)

# Fresh, untrained network for this experiment
model = BinaryClassifier().to(device)

# Train model
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)

print("\nTraining model...")
for epoch in range(100):
    model.train()
    y_logits = model(X_train)
    loss = criterion(y_logits, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print("Training complete!")

# Make predictions (0.5 cut-off on the sigmoid probabilities)
model.eval()
with torch.inference_mode():
    test_logits = model(X_test)
    test_probs = torch.sigmoid(test_logits)
    test_preds = (test_probs > 0.5).long()

# Convert to numpy
y_true = y_test.cpu().numpy().squeeze()
y_pred = test_preds.cpu().numpy().squeeze()

# Calculate metrics
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print("\n" + "=" * 60)
print("Results: Accuracy vs F1-Score")
print("=" * 60)
print(f"Accuracy:  {accuracy:.4f}")
print(f"F1-Score:  {f1:.4f}")

# Show confusion matrix (sklearn layout: [[TN, FP], [FN, TP]])
cm = confusion_matrix(y_true, y_pred)
print(f"\nConfusion Matrix:")
print(cm)
print(f"TP={cm[1,1]}, TN={cm[0,0]}, FP={cm[0,1]}, FN={cm[1,0]}")


# Calculate naive baseline: the accuracy obtained by predicting class 0 for
# every sample, i.e. the fraction of test samples that are class 0.
baseline_accuracy = np.sum(y_true == 0) / len(y_true)
print(f"\n" + "=" * 60)
print("Comparison with Naive Baseline")
print("=" * 60)
print(f"Naive baseline (always predict class 0):  {baseline_accuracy:.4f}")
print(f"Model accuracy:                           {accuracy:.4f}")
print(f"Model F1-Score:                           {f1:.4f}")


In [None]:
# NOTE(review): `model` was already trained for 100 epochs in the previous
# cell and is not re-created here, so this loop CONTINUES training it, and
# every re-run of this cell trains it further. Confirm this is intended.
print("\nTraining model...")
for epoch in range(100):
    model.train()
    y_logits = model(X_train)
    loss = criterion(y_logits, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print("Training complete!")

# Make predictions (get probabilities)
model.eval()
with torch.inference_mode():
    test_logits = model(X_test)
    test_probs = torch.sigmoid(test_logits)

# Convert to numpy
y_true = y_test.cpu().numpy().squeeze()
test_probs_np = test_probs.cpu().numpy().squeeze()

print("\n" + "=" * 60)
print("Testing Different Thresholds")
print("=" * 60)

# Test different thresholds
thresholds = [0.3, 0.5, 0.7]

for threshold in thresholds:
    # Apply threshold
    y_pred = (test_probs_np > threshold).astype(int)

    # Calculate metrics. A strict threshold can leave no positive predictions
    # at all; zero_division=0 reports those undefined scores as 0.0 (the same
    # value sklearn falls back to) without emitting UndefinedMetricWarning.
    precision = precision_score(y_true, y_pred, zero_division=0)
    recall = recall_score(y_true, y_pred, zero_division=0)
    f1 = f1_score(y_true, y_pred, zero_division=0)
    accuracy = accuracy_score(y_true, y_pred)

    print(f"\nThreshold: {threshold}")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")
    print(f"  Accuracy:  {accuracy:.4f}")

In [None]:
# Score the multi-class predictions under each sklearn averaging strategy
averaging_methods = ['macro', 'weighted', 'micro']

print("\nComparing Averaging Methods:")
print("-" * 60)

for avg_method in averaging_methods:
    # Same three metrics each time; only the `average` argument changes
    precision, recall, f1 = (
        metric(y_true_m, y_pred_m, average=avg_method)
        for metric in (precision_score, recall_score, f1_score)
    )

    print(
        f"\nAverage method: '{avg_method}'\n"
        f"  Precision: {precision:.4f}\n"
        f"  Recall:    {recall:.4f}\n"
        f"  F1-Score:  {f1:.4f}"
    )

In [None]:

from sklearn.metrics import confusion_matrix

print("=" * 60)
print("Exercise 4: Create Custom Metrics")
print("=" * 60)

# Recompute the predictions at the standard 0.5 cut-off. Reusing `y_pred`
# as-is would silently pick up whatever the threshold sweep assigned last
# (the 0.7 threshold), making this cell depend on that loop's ordering.
y_pred = (test_probs_np > 0.5).astype(int)

# Pin the label order so the matrix is always 2x2 in sklearn's layout
# [[TN, FP], [FN, TP]]; y_true is cast to int to match the integer labels.
cm = confusion_matrix(y_true.astype(int), y_pred, labels=[0, 1])

# Extract values from confusion matrix (row-major order of the 2x2 matrix)
TN, FP, FN, TP = cm.ravel()

print(f"\nConfusion Matrix Values:")
print(f"  TN (True Negative):  {TN}")
print(f"  FP (False Positive): {FP}")
print(f"  FN (False Negative): {FN}")
print(f"  TP (True Positive):  {TP}")

# Both negative-class rates share the denominator "all actual negatives".
# If there are none, the rates are undefined and reported as NaN rather than
# dividing by zero.
n_actual_negative = TN + FP
n_actual_positive = TP + FN

# Custom Metric 1: Specificity
specificity = TN / n_actual_negative if n_actual_negative else float("nan")
print(f"\n" + "=" * 60)
print("Custom Metric 1: Specificity")
print("=" * 60)
print(f"Specificity = TN / (TN + FP)")
print(f"Specificity = {TN} / ({TN} + {FP})")
print(f"Specificity = {specificity:.4f}")
print("\nInterpretation:")
print(f"  Of all actual negatives, {specificity*100:.1f}% were correctly identified")
print("  High specificity = few false alarms")

# Custom Metric 2: False Positive Rate (FPR)
fpr = FP / n_actual_negative if n_actual_negative else float("nan")
print(f"\n" + "=" * 60)
print("Custom Metric 2: False Positive Rate")
print("=" * 60)
print(f"FPR = FP / (FP + TN)")
print(f"FPR = {FP} / ({FP} + {TN})")
print(f"FPR = {fpr:.4f}")
print("\nInterpretation:")
print(f"  Of all actual negatives, {fpr*100:.1f}% were incorrectly labeled positive")
print("  Low FPR = good (fewer false alarms)")

# Relationship
print(f"\n" + "=" * 60)
print("Relationship:")
print("=" * 60)
print(f"Specificity + FPR = {specificity:.4f} + {fpr:.4f} = {specificity + fpr:.4f}")
print("Specificity = 1 - FPR")
print("(They are complementary)")

# Compare with Recall (Sensitivity)
recall = TP / n_actual_positive if n_actual_positive else float("nan")
print(f"\n" + "=" * 60)
print("Comparison with Recall:")
print("=" * 60)
print(f"Recall (Sensitivity):     {recall:.4f} - focuses on positives")
print(f"Specificity:              {specificity:.4f} - focuses on negatives")
print("\nBoth are important for a complete picture!")
print("=" * 60)

In [None]:
# Class ids are assumed to be 0..n_classes-1 (the loops below index by id).
# Passing them explicitly as `labels` guarantees that position k in every
# per-class array, and row/column k of the confusion matrix, is class k.
n_classes = len(np.unique(y_true_m))
class_labels = list(range(n_classes))

cm_m = confusion_matrix(y_true_m, y_pred_m, labels=class_labels)

print("\nConfusion Matrix:")
print(cm_m)

# Calculate per-class metrics
print("\n" + "=" * 60)
print("Per-class Metrics:")
print("=" * 60)

# average=None returns one score per label, so each metric is computed once
# for all classes instead of once per class inside the loop.
precision_per_class = precision_score(y_true_m, y_pred_m, labels=class_labels, average=None)
recall_per_class = recall_score(y_true_m, y_pred_m, labels=class_labels, average=None)
f1_per_class = f1_score(y_true_m, y_pred_m, labels=class_labels, average=None)

for class_id in class_labels:
    precision = precision_per_class[class_id]
    recall = recall_per_class[class_id]
    f1 = f1_per_class[class_id]

    # Count samples whose true label is this class
    n_samples = np.sum(y_true_m == class_id)

    print(f"\nClass {class_id} ({n_samples} samples):")
    print(f"  Precision: {precision:.4f}")
    print(f"  Recall:    {recall:.4f}")
    print(f"  F1-Score:  {f1:.4f}")

# Find hardest class to classify
print("\n" + "=" * 60)
print("Analysis: Which Class is Hardest to Classify?")
print("=" * 60)

# Lowest / highest per-class F1 (array index == class id, see above)
hardest_class = np.argmin(f1_per_class)
easiest_class = np.argmax(f1_per_class)

print(f"\nHardest class: Class {hardest_class}")
print(f"  F1-Score: {f1_per_class[hardest_class]:.4f}")

print(f"\nEasiest class: Class {easiest_class}")
print(f"  F1-Score: {f1_per_class[easiest_class]:.4f}")

# Per-class accuracy from confusion matrix
print("\n" + "=" * 60)
print("Per-class Accuracy (from confusion matrix):")
print("=" * 60)

for class_id in class_labels:
    # Diagonal element = correctly classified; row sum = all samples whose
    # true label is this class
    correct = cm_m[class_id, class_id]
    total = np.sum(cm_m[class_id, :])
    class_accuracy = correct / total if total > 0 else 0

    print(f"Class {class_id}: {class_accuracy:.4f} ({correct}/{total} correct)")

print("\n" + "=" * 60)
print("Conclusion:")
print("=" * 60)
print(f"The hardest class is Class {hardest_class} with F1={f1_per_class[hardest_class]:.4f}")
print("This could be due to:")
print("  - Overlapping features with other classes")
print("  - Fewer training samples")
print("  - Inherent difficulty in the data")
print("=" * 60)

In [None]:
# Clears the output area of this cell.
# NOTE(review): wait=True presumably defers the clear until new output
# arrives, and nothing is displayed afterwards in this cell — confirm whether
# this trailing cell is still needed.
from IPython.display import clear_output
clear_output(wait=True)