In [1]:
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter

# Generate synthetic dataset
def generate_data(n_samples=200, seed=42, noise_frac=0.1):
    """Build a noisy, linearly separable 2-D binary classification dataset.

    Points are drawn uniformly from the square [0, 10) x [0, 10) and labelled
    1 when the two coordinates sum to more than 10, otherwise 0. A fraction
    of the labels is then flipped to simulate label noise.

    Args:
        n_samples: Number of points to generate.
        seed: Seed for the private random generator. The default reproduces
            the data produced by seeding the global NumPy RNG with 42.
        noise_frac: Fraction of labels to flip; ``int(n_samples * noise_frac)``
            distinct samples are affected.

    Returns:
        Tuple ``(X, y)`` where ``X`` has shape ``(n_samples, 2)`` and ``y`` is
        an integer array of 0/1 labels with shape ``(n_samples,)``.
    """
    # A private legacy RandomState yields the same stream as
    # ``np.random.seed(seed)`` followed by ``np.random.*`` calls, but does not
    # reseed the process-wide generator as a side effect of building data.
    rng = np.random.RandomState(seed)

    X = rng.rand(n_samples, 2) * 10
    y = (X[:, 0] + X[:, 1] > 10).astype(int)

    # Add noise (to simulate real-world messiness): flip a random subset
    # of labels, each chosen at most once.
    flipped = rng.choice(n_samples, size=int(n_samples * noise_frac), replace=False)
    y[flipped] = 1 - y[flipped]

    return X, y

# Single-split threshold classifier (a decision stump); this notebook refers to it as the "overfit" model
def train_overfit_model(X, y):
    """Pick the single feature/threshold split with the best training accuracy.

    For every feature column, 100 evenly spaced thresholds in [0, 10] are
    tried; a sample is predicted as class 1 when its feature value is
    strictly greater than the threshold.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        y: 1-D array of 0/1 labels, one per row of ``X``.

    Returns:
        Tuple ``(feature_index, threshold)``. Ties keep the earliest
        candidate (lowest feature index, then lowest threshold). If no
        candidate scores above zero accuracy, ``(0, 5)`` is returned.
    """
    candidates = np.linspace(0, 10, 100)
    top_score = 0
    chosen = (0, 5)  # (feature index, threshold) fallback

    for col in range(X.shape[1]):
        # One row of predictions per candidate threshold, one column per sample.
        votes = (X[:, col][np.newaxis, :] > candidates[:, np.newaxis]).astype(int)
        scores = (votes == y).mean(axis=1)

        # argmax returns the first maximum, which matches scanning the
        # thresholds in order and updating only on a strict improvement.
        first_best = int(np.argmax(scores))
        if scores[first_best] > top_score:
            top_score = scores[first_best]
            chosen = (col, candidates[first_best])

    return chosen

def predict_model(X, model):
    """Apply a single-threshold split to every row of ``X``.

    Args:
        X: 2-D array of samples (rows) by features (columns).
        model: Tuple ``(feature_index, threshold)``.

    Returns:
        Integer array with 1 where the selected feature is strictly greater
        than the threshold and 0 elsewhere.
    """
    column, cutoff = model
    is_above = np.greater(X[:, column], cutoff)
    return is_above.astype(int)

# Evaluation Metrics
def evaluate_metrics(y_true, y_pred):
    """Compute accuracy, precision, recall and F1 for binary 0/1 labels.

    Args:
        y_true: Ground-truth labels (array-like of 0/1 values).
        y_pred: Predicted labels (array-like of 0/1 values), same length
            as ``y_true``.

    Returns:
        Dict with keys ``"Accuracy"``, ``"Precision"``, ``"Recall"`` and
        ``"F1-Score"``. Precision, recall and F1 use a ``1e-8`` term in the
        denominator, so they evaluate to 0 instead of dividing by zero when
        a class is absent. Accuracy is NaN when there are no samples.

    Raises:
        ValueError: If ``y_true`` and ``y_pred`` differ in length.
    """
    # Coerce to flat arrays: comparing a plain list with ``== 1`` yields a
    # single ``False`` (every count silently becomes 0), and an (n, 1) column
    # against an (n,) vector would broadcast to an (n, n) grid.
    y_true = np.asarray(y_true).ravel()
    y_pred = np.asarray(y_pred).ravel()
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {y_true.size} and {y_pred.size}"
        )

    actual_pos = y_true == 1
    actual_neg = y_true == 0
    pred_pos = y_pred == 1
    pred_neg = y_pred == 0

    tp = np.sum(actual_pos & pred_pos)
    tn = np.sum(actual_neg & pred_neg)
    fp = np.sum(actual_neg & pred_pos)
    fn = np.sum(actual_pos & pred_neg)

    n_samples = y_true.size
    # Accuracy is undefined without samples; report NaN explicitly rather
    # than triggering a divide-by-zero warning.
    accuracy = (tp + tn) / n_samples if n_samples else np.nan
    precision = tp / (tp + fp + 1e-8)
    recall = tp / (tp + fn + 1e-8)
    f1 = 2 * precision * recall / (precision + recall + 1e-8)

    return {
        "Accuracy": accuracy,
        "Precision": precision,
        "Recall": recall,
        "F1-Score": f1
    }

# K-Fold Cross Validation
def cross_validate(X, y, k=5):
    """Run k-fold cross-validation of the threshold model and average metrics.

    The samples are split, in their given order (no shuffling), into ``k``
    contiguous folds. Each fold serves once as the validation set while the
    model is trained on the remaining folds.

    Args:
        X: 2-D array-like of samples (rows) by features (columns).
        y: 1-D array-like of 0/1 labels, one per row of ``X``.
        k: Number of folds; must satisfy ``2 <= k <= len(X)``.

    Returns:
        Dict mapping each metric name returned by ``evaluate_metrics`` to its
        mean over the ``k`` folds.

    Raises:
        ValueError: If ``X`` and ``y`` differ in length, or ``k`` is outside
            ``[2, len(X)]``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    n_samples = len(X)

    if len(y) != n_samples:
        raise ValueError(
            f"X and y must have the same length, got {n_samples} and {len(y)}"
        )
    # k < 2 leaves nothing to train on, and k > n_samples creates empty
    # validation folds whose metrics would be NaN.
    if not 2 <= k <= n_samples:
        raise ValueError(
            f"k must be between 2 and the number of samples ({n_samples}), got {k}"
        )

    metrics_list = []

    # array_split spreads any remainder over the first folds, so every sample
    # is validated exactly once even when n_samples is not a multiple of k.
    # When it is a multiple, the folds are the plain equal-sized slices.
    for val_idx in np.array_split(np.arange(n_samples), k):
        train_mask = np.ones(n_samples, dtype=bool)
        train_mask[val_idx] = False

        model = train_overfit_model(X[train_mask], y[train_mask])
        y_pred = predict_model(X[val_idx], model)
        metrics_list.append(evaluate_metrics(y[val_idx], y_pred))

    # Average across folds
    avg_metrics = {
        metric: np.mean([m[metric] for m in metrics_list])
        for metric in metrics_list[0]
    }

    return avg_metrics

# Run
# Build the synthetic dataset and score the model with 5-fold cross-validation.
X, y = generate_data()
metrics = cross_validate(X, y, k=5)

# Report each fold-averaged metric to four decimal places.
print("Cross-Validation Metrics (Overfitted Model):")
for key in metrics:
    value = metrics[key]
    print(f"{key}: {value:.4f}")


Cross-Validation Metrics (Overfitted Model):
Accuracy: 0.6400
Precision: 0.6579
Recall: 0.6864
F1-Score: 0.6417
