In [1]:
%load_ext autoreload
%autoreload 2
import numpy as np
import matplotlib.pyplot as plt
from implementations import *
from Data_cleaning import *
from helpers import *

In [2]:
# Forward slashes work on every platform and avoid the invalid "\d" escape
# sequences that the original backslash-separated literal contained.
x_train, x_test, y_train, train_ids, test_ids = load_csv_data("data/dataset/dataset")

In [105]:
# Filter the training columns with remove_nan_features (brought in by one of the
# star-imports); keep_mask records which original columns were kept.
# The 0.4 argument is presumably a NaN-fraction threshold — TODO confirm in the helper.
x_train_2, keep_mask = remove_nan_features(x_train, 0.4)
# Select the same columns in the unlabeled test matrix.
X_test = x_test[:, keep_mask]

In [106]:
# NOTE(review): this prints the shape of the unfiltered x_test; the
# column-filtered test matrix is X_test, so the two column counts shown here
# are not expected to match.
print(x_train_2.shape, x_test.shape)

(328135, 163) (109379, 321)


In [107]:
# Affine relabelling (y + 1) / 2: maps -1 -> 0 and +1 -> 1.
# Assumes y_train is encoded as -1/+1 — TODO confirm against load_csv_data.
y_tr_ = (y_train + 1) / 2

In [108]:
def stratified_three_way_split(X, y, val_ratio=0.15, test_ratio=0.15, seed=0):
    """
    Split (X, y) into train / validation / test parts, stratified on a 0/1 label.

    Positive (y == 1) and negative (y == 0) samples are shuffled and cut
    separately, so every part receives the same share of each class (up to
    integer truncation). Samples whose label is neither 0 nor 1 are not
    assigned to any part.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix, indexed by row.
    y : np.ndarray
        Label vector aligned with the rows of X.
    val_ratio : float
        Fraction of each class assigned to the validation part.
    test_ratio : float
        Fraction of each class assigned to the test part.
    seed : int
        Seed of the private random generator used for all shuffles.

    Returns
    -------
    X_train, y_train, X_val, y_val, X_test, y_test : np.ndarray
    """
    # A private generator produces the same shuffles as seeding the global
    # NumPy RNG would, but leaves the caller's global random state untouched.
    rng = np.random.RandomState(seed)

    idx_pos = np.where(y == 1)[0]
    idx_neg = np.where(y == 0)[0]
    rng.shuffle(idx_pos)
    rng.shuffle(idx_neg)

    def _cut(indices):
        """Slice one class into (train, val, test) index arrays."""
        n_val = int(len(indices) * val_ratio)
        n_test = int(len(indices) * test_ratio)
        return (
            indices[n_val + n_test:],
            indices[:n_val],
            indices[n_val:n_val + n_test],
        )

    train_pos, val_pos, test_pos = _cut(idx_pos)
    train_neg, val_neg, test_neg = _cut(idx_neg)

    train_idx = np.concatenate([train_pos, train_neg])
    val_idx = np.concatenate([val_pos, val_neg])
    test_idx = np.concatenate([test_pos, test_neg])

    # Mix the two classes inside each part (positives were concatenated first).
    rng.shuffle(train_idx)
    rng.shuffle(val_idx)
    rng.shuffle(test_idx)

    return (
        X[train_idx], y[train_idx],
        X[val_idx], y[val_idx],
        X[test_idx], y[test_idx]
    )


In [109]:
# Stratified train / validation / held-out split of the NaN-filtered training
# data, using the function's default ratios and seed.
x_tr, y_tr, x_va, y_va, x_te, y_te = stratified_three_way_split(x_train_2, y_tr_)

In [110]:
# Shape of the full matrix followed by the shapes of each split's X and y.
print(x_train_2.shape, x_tr.shape, y_tr.shape, x_va.shape, y_va.shape, x_te.shape, y_te.shape)

(328135, 163) (229695, 163) (229695,) (49220, 163) (49220,) (49220, 163) (49220,)


In [111]:
def sanity_check_split(y_tr, y_val, y_te, name_train="Train", name_val="Validation", name_test="Test"):
    """
    Print a summary of a 3-way split and verify that its labels are binary.

    For every split the sample count and the positive / negative fractions are
    printed. The function then asserts that each label vector only contains
    values from {0, 1} and reports whether the positive fraction differs by
    less than 0.01 between the splits.

    Raises
    ------
    AssertionError
        If any of the three label vectors contains a value other than 0 or 1.
    """
    splits = ((y_tr, name_train), (y_val, name_val), (y_te, name_test))

    print("\n=== Sanity Check: Stratified Split ===")
    n_total = sum(len(labels) for labels, _ in splits)
    print(f"Total samples: {n_total:,}")

    # One summary line per split.
    for labels, title in splits:
        pos_rate = np.mean(labels)
        count = len(labels)
        print(f"{title:<12}: n={count:<8} | positives={pos_rate:.4f} | negatives={1-pos_rate:.4f}")

    # Labels must be binary in every split (messages are fixed, not name-based).
    binary_checks = (
        (y_tr, "❌ Train labels not binary"),
        (y_val, "❌ Val labels not binary"),
        (y_te, "❌ Test labels not binary"),
    )
    for labels, message in binary_checks:
        assert set(np.unique(labels)) <= {0, 1}, message

    # Largest gap in positive rate between any two splits.
    rates = np.array([np.mean(labels) for labels, _ in splits])
    spread = np.max(rates) - np.min(rates)
    if spread < 0.01:
        print(f"✅ Class balance preserved (max diff = {spread:.4f})")
    else:
        print(f"⚠️ Class ratio differs across splits (max diff = {spread:.4f})")

    print("=====================================\n")


In [112]:
# Print per-split class proportions and assert that all labels are binary.
sanity_check_split(y_tr, y_va, y_te)



=== Sanity Check: Stratified Split ===
Total samples: 328,135
Train       : n=229695   | positives=0.0883 | negatives=0.9117
Validation  : n=49220    | positives=0.0883 | negatives=0.9117
Test        : n=49220    | positives=0.0883 | negatives=0.9117
✅ Class balance preserved (max diff = 0.0000)



In [113]:
import numpy as np

def detect_integer_and_categorical_features(X, unique_threshold=10, tol=1e-8):
    """
    Flag the columns of X that hold integer-like values and, among those, the
    ones with few distinct values (treated as categorical).

    NaNs are ignored when inspecting a column; a column made only of NaNs is
    flagged as neither integer-like nor categorical.

    Parameters
    ----------
    X : np.ndarray
        2D array of shape (n_samples, n_features).
    unique_threshold : int
        Largest number of distinct non-NaN values for which an integer-like
        column is still considered categorical.
    tol : float
        A value counts as integer-like when its distance to the nearest
        integer is strictly below this tolerance.

    Returns
    -------
    int_count : int
        Number of integer-like columns.
    cat_count : int
        Number of categorical columns.
    int_mask : np.ndarray (bool)
        True where the column is integer-like.
    cat_mask : np.ndarray (bool)
        True where the column is categorical.
    """
    n_cols = X.shape[1]
    int_mask = np.zeros(n_cols, dtype=bool)
    cat_mask = np.zeros(n_cols, dtype=bool)

    for j in range(n_cols):
        observed = X[:, j]
        observed = observed[~np.isnan(observed)]
        if observed.size == 0:
            # Nothing to inspect: leave both flags at False.
            continue

        distance_to_int = np.abs(observed - np.round(observed))
        if not np.all(distance_to_int < tol):
            continue

        int_mask[j] = True
        # Low cardinality on top of integer-likeness => categorical.
        cat_mask[j] = len(np.unique(observed)) <= unique_threshold

    return np.sum(int_mask), np.sum(cat_mask), int_mask, cat_mask


In [114]:
# Column-type detection runs on the whole NaN-filtered training matrix (before
# the split), with a tighter integer tolerance (1e-12) than the function default.
int_count, cat_count, int_mask, cat_mask = detect_integer_and_categorical_features(x_train_2, tol = 1e-12)
print(int_count, cat_count)

141 102


In [115]:
def _column_fill_value(col, is_categorical, numeric_strategy):
    """Return the fill value of one column: mode, mean or median of its non-NaN entries."""
    missing = np.isnan(col)
    if np.all(missing):
        # Degenerate column with no observed value: fall back to 0.0.
        return 0.0
    if is_categorical:
        vals, counts = np.unique(col[~missing], return_counts=True)
        return float(vals[np.argmax(counts)])
    if numeric_strategy == "mean":
        return float(np.nanmean(col))
    return float(np.nanmedian(col))


def impute_missing_values(X, cat_mask, reference_stats=None, numeric_strategy="median"):
    """
    Replace NaNs column by column.

      - numerical columns (cat_mask=False): median or mean (numeric_strategy)
      - categorical columns (cat_mask=True): mode (most frequent value)
      - columns with no observed value: 0.0

    If reference_stats is provided (fill values fitted on the training set),
    they are applied as-is. Otherwise the fill values are computed from X and
    returned for reuse. X itself is never modified.

    Parameters
    ----------
    X : np.ndarray, shape (n_samples, n_features)
    cat_mask : np.ndarray of bool, shape (n_features,)
    reference_stats : list/np.ndarray or None
        Per-column fill values computed on the training set.
    numeric_strategy : {"median","mean"}

    Returns
    -------
    X_imp : np.ndarray
        Copy of X with NaNs replaced.
    stats : list of length n_features (per-column fill values)

    Raises
    ------
    ValueError
        If numeric_strategy is not "median" or "mean", or if cat_mask or
        reference_stats do not match the number of columns of X.
    """
    # An unknown strategy used to fall back silently to the median; reject it
    # so that a typo cannot change the imputation unnoticed.
    if numeric_strategy not in ("median", "mean"):
        raise ValueError('numeric_strategy must be "median" or "mean".')

    X_imp = X.copy()
    n_features = X_imp.shape[1]
    if cat_mask.shape[0] != n_features:
        raise ValueError("cat_mask length must match number of columns in X.")

    if reference_stats is None:
        stats = [
            _column_fill_value(X_imp[:, j], cat_mask[j], numeric_strategy)
            for j in range(n_features)
        ]
    else:
        if len(reference_stats) != n_features:
            raise ValueError("reference_stats length does not match number of columns in X.")
        stats = list(reference_stats)

    # Each column is filled with its own value, so the order of columns does not matter.
    for j, fill in enumerate(stats):
        missing = np.isnan(X_imp[:, j])
        if np.any(missing):
            X_imp[missing, j] = fill

    return X_imp, stats


In [116]:
# Fit the per-column fill values on the training split only
# (median for numeric columns, mode for categorical ones).
X_tr_imp, impute_stats = impute_missing_values(x_tr, cat_mask, numeric_strategy="median")

# Reuse the training fill values for the validation split, the held-out split
# and the column-filtered unlabeled test matrix.
X_val_imp, _ = impute_missing_values(x_va, cat_mask, reference_stats=impute_stats)
X_te_imp,  _ = impute_missing_values(x_te,  cat_mask, reference_stats=impute_stats)
X_test_imp, _ = impute_missing_values(X_test, cat_mask, reference_stats=impute_stats)

In [117]:
# Check the imputed train / validation / held-out splits.
# NOTE(review): X_test_imp (the unlabeled test matrix) is not inspected here.
print("=== Imputation Check ===")
print(f"Train shape: {X_tr_imp.shape}")
print(f"Val shape:   {X_val_imp.shape}")
print(f"Test shape:  {X_te_imp.shape}")

# Count the NaNs that survived imputation (expected: 0 everywhere).
print(f"NaNs remaining in train: {np.isnan(X_tr_imp).sum()}")
print(f"NaNs remaining in val:   {np.isnan(X_val_imp).sum()}")
print(f"NaNs remaining in test:  {np.isnan(X_te_imp).sum()}")

# Show the fill value and detected type of the first (at most five) columns.
print("\nExample fill values:")
for j in range(min(5, len(impute_stats))):
    kind = "Categorical" if cat_mask[j] else "Numeric"
    print(f"  Feature {j}: {kind}, fill value = {impute_stats[j]}")

# Printed unconditionally: nothing above asserts on the NaN counts.
print("\n✅ Imputation completed successfully!")


=== Imputation Check ===
Train shape: (229695, 163)
Val shape:   (49220, 163)
Test shape:  (49220, 163)
NaNs remaining in train: 0
NaNs remaining in val:   0
NaNs remaining in test:  0

Example fill values:
  Feature 0: Numeric, fill value = 29.0
  Feature 1: Numeric, fill value = 6.0
  Feature 2: Numeric, fill value = 6242015.0
  Feature 3: Numeric, fill value = 6.0
  Feature 4: Numeric, fill value = 14.0

✅ Imputation completed successfully!


In [118]:
def drop_low_variance_or_correlation(X, y, cat_mask,
                                     min_var=1e-8,
                                     min_corr=0.2,
                                     min_cat_assoc=1e-1):
    """
    Remove features that are (nearly) constant or weakly related to the target.

    Numeric columns must pass a variance threshold and an absolute Pearson
    correlation threshold. Categorical columns are only scored by a
    chi2-like statistic: for every level, the squared gap between the
    observed and expected sum of y inside that level, divided by the expected
    sum, accumulated over levels and divided by the number of samples.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix (n_samples, n_features)
    y : np.ndarray
        Target vector (n_samples,)
    cat_mask : np.ndarray of bool
        True for categorical features, False for numeric ones
    min_var : float
        A numeric feature is kept only if its variance exceeds this value
    min_corr : float
        A numeric feature is kept only if |Pearson correlation| with y exceeds this value
    min_cat_assoc : float
        A categorical feature is kept only if its association score exceeds this value

    Returns
    -------
    X_filtered : np.ndarray
        Copy of X restricted to the kept columns
    keep_mask : np.ndarray of bool
        True for kept features
    dropped_info : dict
        Counts of total / kept / dropped features
    cat_mask_new : np.ndarray of bool
        Categorical mask restricted to the kept columns
    num_mask_new : np.ndarray of bool
        Complement of cat_mask_new
    """
    data = X.copy()
    n_samples, n_features = data.shape
    is_numeric = ~cat_mask

    # --- Variance filter: categorical columns always pass
    variances = np.var(data, axis=0)
    var_mask = np.ones(n_features, dtype=bool)
    var_mask[is_numeric] = variances[is_numeric] > min_var

    # --- Per-column relevance score
    base_rate = y.mean()  # same for every column, so computed once
    scores = np.zeros(n_features)

    for j in range(n_features):
        column = data[:, j]

        if cat_mask[j]:
            levels = np.unique(column)
            if len(levels) < 2:
                # A single level carries no information about y.
                scores[j] = 0.0
                continue
            stat = 0.0
            for level in levels:
                members = (column == level)
                size = members.sum()
                if size == 0:
                    # Happens for a NaN level: NaN never compares equal.
                    continue
                expected = size * base_rate
                observed = size * y[members].mean()
                stat += (observed - expected) ** 2 / (expected + 1e-12)
            scores[j] = stat / n_samples
            continue

        # Numeric column: absolute Pearson correlation, 0 for constant columns.
        if np.std(column) < 1e-12:
            scores[j] = 0.0
        else:
            rho = np.corrcoef(column, y)[0, 1]
            scores[j] = 0.0 if np.isnan(rho) else abs(rho)

    # --- Type-specific thresholds on the score
    assoc_mask = np.ones(n_features, dtype=bool)
    assoc_mask[cat_mask] = scores[cat_mask] > min_cat_assoc
    assoc_mask[is_numeric] = scores[is_numeric] > min_corr

    # --- A feature is kept only if it passes both filters
    keep_mask = var_mask & assoc_mask
    cat_mask_new = cat_mask[keep_mask]

    dropped_info = {
        "total_features": n_features,
        "kept_features": int(np.sum(keep_mask)),
        "dropped_low_variance": int(np.sum(~var_mask)),
        "dropped_low_assoc_or_corr": int(np.sum(~assoc_mask & var_mask))
    }

    return data[:, keep_mask], keep_mask, dropped_info, cat_mask_new, ~cat_mask_new


In [119]:
# Feature filtering, fitted on the imputed training split only.
# NOTE(review): this call rebinds the notebook-level cat_mask (now restricted to
# the kept columns) and keep_mask (previously the NaN-filter mask). Cells above
# that use those names — e.g. the imputation cell — cannot be re-run after this
# one without first re-running the cells that defined the original masks.
X_tr_filt, keep_mask, info, cat_mask, num_mask = drop_low_variance_or_correlation(
    X_tr_imp, y_tr, cat_mask,
    min_var=1e-2,       # numeric variance threshold
    min_corr=0.2,      # numeric correlation threshold
    min_cat_assoc=1e-2 # categorical association threshold
)

# Summary of what the filter removed.
print("=== Feature Selection Summary ===")
print(f"Total features before:       {info['total_features']}")
print(f"Kept features:               {info['kept_features']}")
print(f"Dropped (low variance):      {info['dropped_low_variance']}")
print(f"Dropped (low assoc/corr):    {info['dropped_low_assoc_or_corr']}")

# Apply the training-derived column mask to the other three matrices.
X_val_filt  = X_val_imp[:, keep_mask]
X_te_filt   = X_te_imp[:, keep_mask]
X_test_filt = X_test_imp[:, keep_mask]


=== Feature Selection Summary ===
Total features before:       163
Kept features:               35
Dropped (low variance):      0
Dropped (low assoc/corr):    128


In [120]:
import numpy as np

def standardize_numeric_features(X, num_mask, reference_stats=None):
    """
    Z-score the numeric columns of X and leave every other column as it is.

    Parameters
    ----------
    X : np.ndarray
        Feature matrix (n_samples, n_features)
    num_mask : np.ndarray of bool
        True for the columns to standardize
    reference_stats : dict or None
        None: mean/std are computed from X (training set); a zero std is
        replaced by 1.0 so constant columns do not divide by zero.
        dict: its "mean" and "std" entries are applied unchanged.

    Returns
    -------
    X_scaled : np.ndarray
        Float copy of X with the numeric columns standardized.
    stats : dict
        {"mean": mean_vector, "std": std_vector}; the very same object as
        reference_stats when one was given.

    Raises
    ------
    ValueError
        If num_mask does not have one entry per column of X.
    """
    X_scaled = X.astype(float).copy()

    if num_mask.shape[0] != X.shape[1]:
        raise ValueError("num_mask length must match number of columns in X.")

    numeric_part = X[:, num_mask]

    if reference_stats is not None:
        # Transform mode: reuse the statistics fitted elsewhere.
        stats = reference_stats
        mean, std = stats["mean"], stats["std"]
    else:
        # Fit mode: derive the statistics from X itself.
        mean = np.mean(numeric_part, axis=0)
        std = np.std(numeric_part, axis=0)
        std[std == 0] = 1.0
        stats = {"mean": mean, "std": std}

    X_scaled[:, num_mask] = (numeric_part - mean) / std
    return X_scaled, stats


In [121]:
# Fit the mean/std of the numeric columns on the training split.
X_tr_scaled, scale_stats = standardize_numeric_features(X_tr_filt, num_mask)

# Apply the training statistics to the validation split, the held-out split and
# the unlabeled test matrix (nothing is re-fitted on them).
X_va_scaled, _ = standardize_numeric_features(X_val_filt, num_mask, scale_stats)
X_te_scaled, _  = standardize_numeric_features(X_te_filt,  num_mask, scale_stats)
X_test_scaled, _ = standardize_numeric_features(X_test_filt,  num_mask, scale_stats)

In [122]:
# After standardization the numeric training columns should have mean ~0 and
# std 1; only the first five numeric columns are shown ([:5]).
print("Mean of numeric features (train):", np.mean(X_tr_scaled[:, num_mask], axis=0)[:5])
print("Std  of numeric features (train):", np.std(X_tr_scaled[:, num_mask], axis=0)[:5])


Mean of numeric features (train): [ 1.25469050e-16 -2.19756442e-16]
Std  of numeric features (train): [1. 1.]


In [123]:
# Count numeric vs. non-numeric columns after filtering and confirm that the
# mask has one entry per column of the filtered training matrix.
print("Number of numeric features:", np.sum(num_mask))
print("Number of categorical features:", np.sum(~num_mask))
print("num_mask shape:", num_mask.shape)
print("X_tr_filt shape:", X_tr_filt.shape)

Number of numeric features: 2
Number of categorical features: 33
num_mask shape: (35,)
X_tr_filt shape: (229695, 35)


In [124]:
def one_hot_encode_all_categories(X, cat_mask, reference_uniques=None):
    """
    Expand every categorical column into 0/1 indicator columns and copy the
    numeric columns through unchanged, preserving the original column order.

    The first level of each categorical column gets no indicator (reference
    level), so a column with k levels yields k - 1 indicator columns. A value
    that matches none of the encoded levels is therefore encoded as all zeros,
    exactly like the reference level.

    Parameters
    ----------
    X : np.ndarray
        Data matrix.
    cat_mask : np.ndarray of bool
        True marks the categorical columns.
    reference_uniques : list or None
        Per-column levels returned by a previous call (training set). When
        given, they are used instead of the levels found in X so that all
        data sets share the same encoded layout.

    Returns
    -------
    X_encoded : np.ndarray
        Matrix made of numeric columns and indicator columns.
    uniques_list : list
        One entry per original column: the levels of a categorical column,
        None for a numeric column.
    """
    n_rows, n_cols = X.shape
    blocks = []
    uniques_list = []

    for j in range(n_cols):
        column = X[:, j]

        if not cat_mask[j]:
            # Numeric column: passed through as a single column.
            blocks.append(column.reshape(-1, 1))
            uniques_list.append(None)
            continue

        if reference_uniques is not None:
            levels = reference_uniques[j]
        else:
            # Levels observed in this data set, NaNs excluded.
            levels = np.unique(column[~np.isnan(column)])
        uniques_list.append(levels)

        # One indicator per level except the first (reference) one.
        indicators = np.zeros((n_rows, len(levels) - 1))
        for k in range(1, len(levels)):
            indicators[:, k - 1] = (column == levels[k]).astype(float)
        blocks.append(indicators)

    return np.concatenate(blocks, axis=1), uniques_list


In [125]:
# Learn the category levels on the training split.
X_tr_encoded, uniques_list = one_hot_encode_all_categories(X_tr_scaled, cat_mask)

# Encode the validation split, the held-out split and the unlabeled test matrix
# with the training levels so that all matrices share the same columns.
X_va_encoded, _ = one_hot_encode_all_categories(X_va_scaled, cat_mask, uniques_list)
X_te_encoded, _ = one_hot_encode_all_categories(X_te_scaled, cat_mask, uniques_list)
X_test_encoded, _ = one_hot_encode_all_categories(X_test_scaled, cat_mask, uniques_list)

In [126]:
print("=== One-Hot Encoding Sanity Check ===")

# 1. Shapes of the three labelled splits after encoding
print(f"Train shape: {X_tr_encoded.shape}")
print(f"Val shape:   {X_va_encoded.shape}")
print(f"Test shape:  {X_te_encoded.shape}")

# 2. All splits must expose the same number of columns
assert X_tr_encoded.shape[1] == X_va_encoded.shape[1] == X_te_encoded.shape[1], \
    "❌ Mismatch in feature counts between splits!"

# 3. NaN counts (not asserted, only printed)
print(f"NaNs in train: {np.isnan(X_tr_encoded).sum()}")
print(f"NaNs in val:   {np.isnan(X_va_encoded).sum()}")
print(f"NaNs in test:  {np.isnan(X_te_encoded).sum()}")

# 4. Count of categorical features whose number of levels is between 3 and 5.
# NOTE(review): the encoder expands every categorical feature, so this count
# only covers a subset of them although it is printed as "Features one-hot
# encoded"; the inner `uniques is not None` test also repeats the generator's
# own filter.
encoded_features = sum(
    (uniques is not None and 3 <= len(uniques) <= 5)
    for uniques in uniques_list if uniques is not None
)
# Net change in column count caused by the encoding.
total_added = X_tr_encoded.shape[1] - X_tr_scaled.shape[1]

print(f"\nFeatures one-hot encoded: {encoded_features}")
print(f"New columns added: {total_added}")

# 5. Dtype of the encoded training matrix
print(f"\nData type of X_tr_encoded: {X_tr_encoded.dtype}")

print("\n✅ One-hot encoding looks good!")

=== One-Hot Encoding Sanity Check ===
Train shape: (229695, 124)
Val shape:   (49220, 124)
Test shape:  (49220, 124)
NaNs in train: 0
NaNs in val:   0
NaNs in test:  0

Features one-hot encoded: 23
New columns added: 89

Data type of X_tr_encoded: float64

✅ One-hot encoding looks good!


In [127]:
# NOTE(review): the min/max below are taken over the whole encoded matrix
# (standardized numeric columns included), not over the categorical columns
# only, despite the "Min/Max categorical" label.
print("Encoded train shape:", X_tr_encoded.shape)
print("Min/Max categorical:", np.min(X_tr_encoded), np.max(X_tr_encoded))


Encoded train shape: (229695, 124)
Min/Max categorical: -2.1916497651573863 1.7761965566945117


In [128]:
def rebuild_encoded_masks(num_mask, cat_mask, uniques_list):
    """
    Derive numeric / categorical masks that line up with the one-hot encoded
    matrix from the masks of the matrix before encoding.

    A numeric feature contributes one column; a categorical feature with k
    levels contributes k - 1 indicator columns. A feature flagged as neither
    numeric nor categorical contributes nothing.

    Parameters
    ----------
    num_mask : np.ndarray of bool
        Numeric mask before encoding
    cat_mask : np.ndarray of bool
        Categorical mask before encoding
    uniques_list : list
        Per-feature levels as returned by the one-hot encoder

    Returns
    -------
    num_mask_encoded, cat_mask_encoded : np.ndarray of bool
        Masks with one entry per encoded column

    Raises
    ------
    ValueError
        If a categorical feature has no levels recorded in uniques_list.
    """
    num_flags = []
    cat_flags = []

    for is_num, is_cat, levels in zip(num_mask, cat_mask, uniques_list):
        if is_num:
            width, categorical = 1, False
        elif is_cat:
            if levels is None:
                raise ValueError("Categorical feature without uniques_list entry")
            # The encoder drops the first level.
            width, categorical = len(levels) - 1, True
        else:
            continue

        num_flags.extend([not categorical] * width)
        cat_flags.extend([categorical] * width)

    return np.array(num_flags), np.array(cat_flags)


In [129]:
# Expand the pre-encoding masks to the encoded column layout, then use them to
# look at the numeric and indicator columns separately.
num_mask_encoded, cat_mask_encoded = rebuild_encoded_masks(num_mask, cat_mask, uniques_list)
print("Encoded masks:", num_mask_encoded.shape, cat_mask_encoded.shape)
num_vals = X_tr_encoded[:, num_mask_encoded]
cat_vals = X_tr_encoded[:, cat_mask_encoded]

# Statistics are pooled over all selected columns (np.mean / np.std without axis).
print("Numeric mean:", np.mean(num_vals), "std:", np.std(num_vals))
print("Categorical min:", np.min(cat_vals), "max:", np.max(cat_vals))



Encoded masks: (124,) (124,)
Numeric mean: -4.6772485969188335e-17 std: 1.0
Categorical min: 0.0 max: 1.0


In [130]:
# Mean and std pooled over every entry of the encoded training matrix.
print(np.mean(X_tr_encoded), np.std(X_tr_encoded))

0.18765136657376646 0.41056955982492044


In [131]:
def sigmoid(z):
    """
    Logistic function 1 / (1 + exp(-z)).

    The input is clipped to [-500, 500] first so that np.exp cannot overflow.
    (Defined once: the cell previously contained a second, identical
    definition that shadowed this one.)
    """
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))


def logistic_loss(y, tx, w):
    """
    Mean (unpenalized) logistic loss of weights w on data (tx, y).

    Parameters
    ----------
    y : np.ndarray
        Targets; the formula treats them as 0/1 values.
    tx : np.ndarray
        Design matrix; tx @ w gives one linear score per sample.
    w : np.ndarray
        Weight vector.

    Returns
    -------
    float
        Average negative log-likelihood.
    """
    pred = sigmoid(tx @ w)
    eps = 1e-15  # keeps both logarithms away from log(0)
    loss = -np.mean(y * np.log(pred + eps) + (1 - y) * np.log(1 - pred + eps))
    return float(loss)


def weighted_logistic_loss(y, tx, w, lambda_=0.0, pos_weight=1.0, neg_weight=1.0):
    """
    Class-weighted logistic loss, normalized by the total sample weight.

    Samples with y == 1 weigh pos_weight, all others neg_weight. lambda_ is
    accepted for signature symmetry with the gradient but is not used: the
    returned value never contains a regularization term (monitoring only).
    """
    eps = 1e-15  # keeps both logarithms away from log(0)
    prob = sigmoid(tx @ w)

    # One weight per sample, chosen by its class.
    weights = np.where(y == 1, pos_weight, neg_weight)

    log_lik = y * np.log(prob + eps) + (1 - y) * np.log(1 - prob + eps)
    weighted_nll = -np.sum(weights * log_lik)

    return float(weighted_nll / np.sum(weights))


def weighted_gradient_logistic(y, tx, w, lambda_=0.0, pos_weight=1.0, neg_weight=1.0):
    """
    Gradient of the class-weighted logistic loss plus an L2 term.

    The data term is normalized by the total sample weight. The L2 term
    2 * lambda_ * w is added to every component except the first one
    (the bias is not regularized).
    """
    weights = np.where(y == 1, pos_weight, neg_weight)
    residual = weights * (sigmoid(tx @ w) - y)

    grad = (tx.T @ residual) / np.sum(weights)
    grad[1:] = grad[1:] + 2 * lambda_ * w[1:]  # skip index 0: bias
    return grad.ravel()



def compute_gradient_logistic(y, tx, w, lambda_=0.0):
    """
    Gradient of the mean logistic loss plus an L2 term.

    The L2 term 2 * lambda_ * w is added to every component except the first
    one (the bias is not regularized).
    """
    residual = sigmoid(tx @ w) - y
    grad = (tx.T @ residual) / len(y)
    grad[1:] = grad[1:] + 2 * lambda_ * w[1:]  # skip index 0: bias
    return grad.ravel()


def logistic_regression_penalized(
    y, x, lambda_=1e-3, gamma=0.05, max_iter=10000, tol=1e-8, clip_grad=10.0, verbose=True
):
    """
    Gradient descent for logistic regression with an L2 penalty that is applied
    in the gradient step only.

    A bias column of ones is prepended to x and the weights start at zero.
    Each iteration evaluates the unpenalized loss, takes one (norm-clipped)
    gradient step, and stops early once two consecutive losses differ by less
    than tol.

    Parameters
    ----------
    y : np.ndarray
        Targets (0/1).
    x : np.ndarray
        Feature matrix without bias column.
    lambda_ : float
        L2 penalty strength used in the gradient.
    gamma : float
        Step size.
    max_iter : int
        Maximum number of gradient steps.
    tol : float
        Early-stopping threshold on the change of the loss.
    clip_grad : float
        Gradients with a larger norm are rescaled to this norm.
    verbose : bool
        Print progress every 100 iterations and on convergence.

    Returns
    -------
    loss : float
        Unpenalized logistic loss evaluated at the returned weights.
    w : np.ndarray
        Final weights (1D, bias first).
    """
    # Add bias column
    tx = np.c_[np.ones((x.shape[0], 1)), x]
    w = np.zeros(tx.shape[1])  # 1D weights
    prev_loss = None

    for it in range(max_iter):
        # Loss and gradient at the current weights
        loss = logistic_loss(y, tx, w)
        grad = compute_gradient_logistic(y, tx, w, lambda_)
        grad_norm = np.linalg.norm(grad)
        if grad_norm > clip_grad:
            grad *= clip_grad / grad_norm  # stability

        # Update weights
        w -= gamma * grad

        # Convergence check on the loss values seen before each step
        if prev_loss is not None and abs(loss - prev_loss) < tol:
            if verbose:
                print(f"✅ Converged at iteration {it}")
            break

        if verbose and it % 100 == 0:
            print(f"Iter {it:5d} | Loss = {loss:.6f} | GradNorm = {grad_norm:.4f}")

        prev_loss = loss

    # The last loss computed in the loop predates the final update, so the
    # loss is re-evaluated to match the returned weights. This also covers
    # max_iter <= 0, where the loop body never runs.
    return logistic_loss(y, tx, w), w

def logistic_regression_weighted_gd(
    y, x, lambda_=1e-3, gamma=0.05, pos_weight=1.0, neg_weight=1.0,
    max_iter=10000, tol=1e-8, clip_grad=10.0, verbose=True
):
    """
    Gradient descent for class-weighted logistic regression with an L2 penalty
    applied in the gradient step only.

    A bias column of ones is prepended to x and the weights start at zero.
    Each iteration evaluates the weighted loss, takes one (norm-clipped)
    gradient step, and stops early once two consecutive losses differ by less
    than tol.

    Parameters
    ----------
    y : np.ndarray
        Targets (0/1).
    x : np.ndarray
        Feature matrix without bias column.
    lambda_ : float
        L2 penalty strength used in the gradient.
    gamma : float
        Step size.
    pos_weight, neg_weight : float
        Sample weights of the y == 1 class and of all other samples.
    max_iter : int
        Maximum number of gradient steps.
    tol : float
        Early-stopping threshold on the change of the loss.
    clip_grad : float
        Gradients with a larger norm are rescaled to this norm.
    verbose : bool
        Print progress every 100 iterations and on convergence.

    Returns
    -------
    loss : float
        Weighted logistic loss evaluated at the returned weights.
    w : np.ndarray
        Final weights (1D, bias first).
    """
    tx = np.c_[np.ones((x.shape[0], 1)), x]
    w = np.zeros(tx.shape[1])
    prev_loss = None

    for it in range(max_iter):
        # Loss and gradient at the current weights
        loss = weighted_logistic_loss(y, tx, w, lambda_, pos_weight, neg_weight)
        grad = weighted_gradient_logistic(y, tx, w, lambda_, pos_weight, neg_weight)
        grad_norm = np.linalg.norm(grad)
        if grad_norm > clip_grad:
            grad *= clip_grad / grad_norm

        w -= gamma * grad

        # Convergence check on the loss values seen before each step
        if prev_loss is not None and abs(loss - prev_loss) < tol:
            if verbose:
                print(f"✅ Converged at iteration {it}")
            break

        if verbose and it % 100 == 0:
            print(f"Iter {it:5d} | Loss = {loss:.6f} | GradNorm = {grad_norm:.4f}")

        prev_loss = loss

    # The last loss computed in the loop predates the final update, so the
    # loss is re-evaluated to match the returned weights. This also covers
    # max_iter <= 0, where the loop body never runs.
    return weighted_logistic_loss(y, tx, w, lambda_, pos_weight, neg_weight), w

def accuracy_numpy(y_true, y_pred):
    """
    Fraction of positions where the prediction equals the true label.
    """
    hits = y_true == y_pred
    return np.mean(hits)


# --- Full evaluation wrapper ---
def evaluate_model(y_true, X, w, threshold=0.5):
    """
    Score trained logistic-regression weights on a labelled data set.

    Predictions are obtained with predict_with_threshold at the given
    threshold; accuracy and F1 are printed and returned as (acc, f1).
    """
    preds = predict_with_threshold(X, w, threshold=threshold)[0]  # probabilities not needed
    acc = accuracy_numpy(y_true, preds)
    f1 = f1_score(y_true, preds)
    print(f"✅ Accuracy: {acc*100:.2f}%")
    print(f"✅ F1 Score: {f1:.4f}")
    return acc, f1

In [132]:
def predict_with_threshold(x, w, threshold=0.5):
    """
    Predict 0/1 labels from features x and weights w (bias weight first).

    A column of ones is prepended to x, the sigmoid of the linear score gives
    the probabilities, and a sample is labelled 1 when its probability is at
    least `threshold`. Returns (preds, probs).
    """
    bias = np.ones((x.shape[0], 1))
    probs = sigmoid(np.c_[bias, x] @ w)
    return (probs >= threshold).astype(int), probs

In [133]:
def f1_score(y_true, y_pred):
    """
    F1 score of binary (0/1) predictions, computed with NumPy only.

    Both inputs are cast to int first. A small constant (1e-15) in every
    denominator avoids division by zero, so the score is 0 when there are no
    true positives.
    """
    truth = y_true.astype(int)
    guess = y_pred.astype(int)

    actual_pos = truth == 1
    predicted_pos = guess == 1

    tp = np.sum(actual_pos & predicted_pos)
    fp = np.sum((truth == 0) & predicted_pos)
    fn = np.sum(actual_pos & (guess == 0))

    precision = tp / (tp + fp + 1e-15)
    recall = tp / (tp + fn + 1e-15)
    return 2 * precision * recall / (precision + recall + 1e-15)


In [134]:
def safe_grid_search(
    y_train, X_train,
    y_val, X_val,
    pos_weights=[8, 9],
    lambdas=[1e-7, 1e-3],
    thresholds=np.linspace(0.6, 0.85, 40),
    max_iter=10000,
    gamma=0.5
):
    """
    Grid search over positive-class weight, L2 strength and decision threshold
    for the weighted logistic regression.

    One model is trained per (pos_weight, lambda_) pair on the training data;
    every threshold is then scored by F1 on the validation data. A pair whose
    training raises an exception or ends with a NaN / infinite / > 10 loss is
    reported and skipped.

    Returns
    -------
    best_params : tuple or None
        (pos_weight, lambda_, threshold) with the highest validation F1.
    best_f1 : float
        That F1 score (-1 if nothing was evaluated).
    results : list of tuple
        All (pos_weight, lambda_, threshold, f1) records, best F1 first.
    """
    top_f1 = -1
    top_params = None
    records = []

    n_evals = len(pos_weights) * len(lambdas) * len(thresholds)
    run_no = 0

    for pw in pos_weights:
        for lam in lambdas:
            run_no += 1
            print(f"\n=== Run {run_no}/{n_evals//len(thresholds)} (pos_weight={pw}, lambda_={lam}) ===")

            try:
                # Fit one model for this (pos_weight, lambda_) pair
                loss, w = logistic_regression_weighted_gd(
                    y_train, X_train,
                    lambda_=lam,
                    gamma=gamma,
                    pos_weight=pw,
                    neg_weight=1.0,
                    max_iter=max_iter,
                    verbose=False
                )

                # Diverged or otherwise unusable fit
                unusable = np.isnan(loss) or np.isinf(loss) or loss > 10
                if unusable:
                    print(f"⚠️  Invalid loss ({loss:.4f}), skipping.")
                    continue

                # Score every threshold with the same fitted weights
                for th in thresholds:
                    preds = predict_with_threshold(X_val, w, threshold=th)[0]
                    f1 = f1_score(y_val, preds)
                    records.append((pw, lam, th, f1))

                    print(f"   → threshold={th:.2f} | F1={f1:.4f}")

                    if f1 > top_f1:
                        top_f1 = f1
                        top_params = (pw, lam, th)
                        print(f"   ✅ New best F1 = {top_f1:.4f}")

            except Exception as e:
                # Best effort: report the failure and move on to the next pair
                print(f"❌ Error for pos_weight={pw}, lambda_={lam}: {e}")
                continue

    # Best F1 first
    records = sorted(records, key=lambda rec: rec[3], reverse=True)

    print("\n=== 🏁 Grid Search Complete ===")
    if top_params:
        best_pw, best_lam, best_th = top_params
        print(f"🏆 Best F1 = {top_f1:.4f} at pos_weight={best_pw}, λ={best_lam}, threshold={best_th}")
    else:
        print("⚠️ No valid runs completed.")

    return top_params, top_f1, records


In [86]:
# Coarse grid: 3 positive-class weights x 3 L2 strengths, 10 thresholds in [0.65, 0.75].
# NOTE(review): this cell's execution count is lower than that of the cell
# defining safe_grid_search, so its recorded output may come from an earlier
# kernel state — re-run after Restart & Run All.
best_params, best_f1, results = safe_grid_search(
    y_tr, X_tr_encoded,
    y_va, X_va_encoded,
    pos_weights=[8, 9, 10],
    lambdas=[1e-8, 1e-7, 1e-4],
    thresholds=np.linspace(0.65, 0.75, 10), 
    gamma = 0.5
)



=== Run 1/9 (pos_weight=8, lambda_=1e-08) ===
   → threshold=0.65 | F1=0.3792
   ✅ New best F1 = 0.3792
   → threshold=0.66 | F1=0.3827
   ✅ New best F1 = 0.3827
   → threshold=0.67 | F1=0.3861
   ✅ New best F1 = 0.3861
   → threshold=0.68 | F1=0.3900
   ✅ New best F1 = 0.3900
   → threshold=0.69 | F1=0.3929
   ✅ New best F1 = 0.3929
   → threshold=0.71 | F1=0.3958
   ✅ New best F1 = 0.3958
   → threshold=0.72 | F1=0.3982
   ✅ New best F1 = 0.3982
   → threshold=0.73 | F1=0.4003
   ✅ New best F1 = 0.4003
   → threshold=0.74 | F1=0.4039
   ✅ New best F1 = 0.4039
   → threshold=0.75 | F1=0.4058
   ✅ New best F1 = 0.4058

=== Run 2/9 (pos_weight=8, lambda_=1e-07) ===
   → threshold=0.65 | F1=0.3792
   → threshold=0.66 | F1=0.3827
   → threshold=0.67 | F1=0.3861
   → threshold=0.68 | F1=0.3900
   → threshold=0.69 | F1=0.3929
   → threshold=0.71 | F1=0.3958
   → threshold=0.72 | F1=0.3982
   → threshold=0.73 | F1=0.4003
   → threshold=0.74 | F1=0.4039
   → threshold=0.75 | F1=0.4058

=== R

In [97]:
# Second grid: pos_weight in {9, 10}, wider range of L2 strengths, same thresholds.
# NOTE(review): best_params / best_f1 / results from the previous search are
# overwritten by this call.
best_params, best_f1, results = safe_grid_search(
    y_tr, X_tr_encoded,
    y_va, X_va_encoded,
    pos_weights=[9, 10],
    lambdas=[1e-8, 1e-5, 1e-3],
    thresholds=np.linspace(0.65, 0.75, 10), 
    gamma = 0.5
)


=== Run 1/6 (pos_weight=9, lambda_=1e-08) ===
   ✅ New best F1 = 0.4087
   ✅ New best F1 = 0.4109
   ✅ New best F1 = 0.4125
   ✅ New best F1 = 0.4156

=== Run 2/6 (pos_weight=9, lambda_=1e-05) ===

=== Run 3/6 (pos_weight=9, lambda_=0.001) ===

=== Run 4/6 (pos_weight=10, lambda_=1e-08) ===

=== Run 5/6 (pos_weight=10, lambda_=1e-05) ===

=== Run 6/6 (pos_weight=10, lambda_=0.001) ===

=== 🏁 Grid Search Complete ===
🏆 Best F1 = 0.4156 at pos_weight=9, λ=1e-08, threshold=0.6833333333333333


In [98]:
# Third grid: fixed L2 strength (1e-8), pos_weight in {8, 9}, and a finer,
# wider threshold sweep (30 values in [0.6, 0.85]).
# NOTE(review): again overwrites best_params / best_f1 / results.
best_params, best_f1, results = safe_grid_search(
    y_tr, X_tr_encoded,
    y_va, X_va_encoded,
    pos_weights=[8, 9],
    lambdas=[1e-8],
    thresholds=np.linspace(0.6, 0.85, 30), 
    gamma = 0.5
)


=== Run 1/2 (pos_weight=8, lambda_=1e-08) ===
   ✅ New best F1 = 0.4064
   ✅ New best F1 = 0.4070
   ✅ New best F1 = 0.4093
   ✅ New best F1 = 0.4105
   ✅ New best F1 = 0.4124
   ✅ New best F1 = 0.4146
   ✅ New best F1 = 0.4153

=== Run 2/2 (pos_weight=9, lambda_=1e-08) ===

=== 🏁 Grid Search Complete ===
🏆 Best F1 = 0.4153 at pos_weight=8, λ=1e-08, threshold=0.6603448275862068


In [33]:
# Stack the encoded training and validation splits (features and labels) and
# refit on the combined data.
# NOTE(review): the hyperparameters are typed in by hand rather than read from
# best_params.
x_tr_final = np.vstack((X_tr_encoded, X_va_encoded))
y_tr_final = np.hstack((y_tr, y_va))
loss, w = logistic_regression_weighted_gd(
    y_tr_final, x_tr_final, lambda_=1e-8, gamma=0.5, pos_weight=8.0, neg_weight=1.0,
    max_iter=10000, tol=1e-8, clip_grad=10.0, verbose=True
)

Iter     0 | Loss = 0.693147 | GradNorm = 0.5733
Iter   100 | Loss = 0.478305 | GradNorm = 0.0115
Iter   200 | Loss = 0.474415 | GradNorm = 0.0067
Iter   300 | Loss = 0.472858 | GradNorm = 0.0046
Iter   400 | Loss = 0.472064 | GradNorm = 0.0034
Iter   500 | Loss = 0.471606 | GradNorm = 0.0027
Iter   600 | Loss = 0.471317 | GradNorm = 0.0022
Iter   700 | Loss = 0.471121 | GradNorm = 0.0018
Iter   800 | Loss = 0.470980 | GradNorm = 0.0016
Iter   900 | Loss = 0.470871 | GradNorm = 0.0014
Iter  1000 | Loss = 0.470785 | GradNorm = 0.0012
Iter  1100 | Loss = 0.470714 | GradNorm = 0.0011
Iter  1200 | Loss = 0.470654 | GradNorm = 0.0011
Iter  1300 | Loss = 0.470601 | GradNorm = 0.0010
Iter  1400 | Loss = 0.470556 | GradNorm = 0.0009
Iter  1500 | Loss = 0.470515 | GradNorm = 0.0009
Iter  1600 | Loss = 0.470479 | GradNorm = 0.0008
Iter  1700 | Loss = 0.470446 | GradNorm = 0.0008
Iter  1800 | Loss = 0.470417 | GradNorm = 0.0007
Iter  1900 | Loss = 0.470390 | GradNorm = 0.0007
Iter  2000 | Loss = 

In [35]:

# Variant fit on the combined train+validation data: no L2 penalty (lambda_=0)
# and pos_weight=9.0. Requires x_tr_final / y_tr_final from the cell that
# stacks the training and validation splits.
loss2, w2 = logistic_regression_weighted_gd(
    y_tr_final, x_tr_final, lambda_=0, gamma=0.5, pos_weight=9.0, neg_weight=1.0,
    max_iter=10000, tol=1e-8, clip_grad=10.0, verbose=True
)

Iter     0 | Loss = 0.693147 | GradNorm = 0.5279
Iter   100 | Loss = 0.481214 | GradNorm = 0.0117
Iter   200 | Loss = 0.477209 | GradNorm = 0.0068
Iter   300 | Loss = 0.475595 | GradNorm = 0.0047
Iter   400 | Loss = 0.474771 | GradNorm = 0.0035
Iter   500 | Loss = 0.474296 | GradNorm = 0.0027
Iter   600 | Loss = 0.473997 | GradNorm = 0.0022
Iter   700 | Loss = 0.473795 | GradNorm = 0.0018
Iter   800 | Loss = 0.473648 | GradNorm = 0.0016
Iter   900 | Loss = 0.473536 | GradNorm = 0.0014
Iter  1000 | Loss = 0.473446 | GradNorm = 0.0013
Iter  1100 | Loss = 0.473372 | GradNorm = 0.0012
Iter  1200 | Loss = 0.473310 | GradNorm = 0.0011
Iter  1300 | Loss = 0.473255 | GradNorm = 0.0010
Iter  1400 | Loss = 0.473208 | GradNorm = 0.0009
Iter  1500 | Loss = 0.473166 | GradNorm = 0.0009
Iter  1600 | Loss = 0.473128 | GradNorm = 0.0008
Iter  1700 | Loss = 0.473094 | GradNorm = 0.0008
Iter  1800 | Loss = 0.473064 | GradNorm = 0.0008
Iter  1900 | Loss = 0.473036 | GradNorm = 0.0007
Iter  2000 | Loss = 

In [36]:
# Same fit as the lambda_=0 variant except for a very small L2 penalty (1e-12).
loss3, w3 = logistic_regression_weighted_gd(
    y_tr_final, x_tr_final, lambda_=1e-12, gamma=0.5, pos_weight=9.0, neg_weight=1.0,
    max_iter=10000, tol=1e-8, clip_grad=10.0, verbose=True
)

Iter     0 | Loss = 0.693147 | GradNorm = 0.5279
Iter   100 | Loss = 0.481214 | GradNorm = 0.0117
Iter   200 | Loss = 0.477209 | GradNorm = 0.0068
Iter   300 | Loss = 0.475595 | GradNorm = 0.0047
Iter   400 | Loss = 0.474771 | GradNorm = 0.0035
Iter   500 | Loss = 0.474296 | GradNorm = 0.0027
Iter   600 | Loss = 0.473997 | GradNorm = 0.0022
Iter   700 | Loss = 0.473795 | GradNorm = 0.0018
Iter   800 | Loss = 0.473648 | GradNorm = 0.0016
Iter   900 | Loss = 0.473536 | GradNorm = 0.0014
Iter  1000 | Loss = 0.473446 | GradNorm = 0.0013
Iter  1100 | Loss = 0.473372 | GradNorm = 0.0012
Iter  1200 | Loss = 0.473310 | GradNorm = 0.0011
Iter  1300 | Loss = 0.473255 | GradNorm = 0.0010
Iter  1400 | Loss = 0.473208 | GradNorm = 0.0009
Iter  1500 | Loss = 0.473166 | GradNorm = 0.0009
Iter  1600 | Loss = 0.473128 | GradNorm = 0.0008
Iter  1700 | Loss = 0.473094 | GradNorm = 0.0008
Iter  1800 | Loss = 0.473064 | GradNorm = 0.0008
Iter  1900 | Loss = 0.473036 | GradNorm = 0.0007
Iter  2000 | Loss = 

In [55]:
# Requires a trained weight vector, e.g. from
# loss, w = logistic_regression_weighted_gd(...)

# Score the w2 weights on the held-out split (x_te / y_te) with a decision
# threshold of 0.69.
# NOTE(review): both the choice of w2 and the 0.69 threshold are hard-coded
# here rather than taken from best_params.
acc, f1 = evaluate_model(y_te, X_te_encoded, w2, 0.69)

✅ Accuracy: 86.54%
✅ F1 Score: 0.4153
