In [None]:
import numpy as np
import pandas as pd
import cvxopt
from scipy.optimize import minimize
from collections import defaultdict

def generate_kmers(sequence, k):
    """Return every length-k window of `sequence`, in left-to-right order.

    Parameters:
        sequence: Any sliceable object with a length (e.g. a DNA string).
        k: Window length.

    Returns:
        List of the `len(sequence) - k + 1` consecutive slices; an empty list
        when `k` is larger than the sequence.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers

def weighted_degree_kernel(X1, X2, k_max, weights=None):
    """
    Compute the Weighted Degree Kernel between two sets of sequences.

    For a pair of sequences the kernel value is the sum over d = 1..k_max of
    ``weights[d - 1]`` times the number of positions p at which the two
    length-d substrings starting at p are identical. A k-mer therefore only
    counts when it occurs at the *same* position in both sequences (this is
    what distinguishes the kernel from a spectrum kernel). Sequences of
    different lengths are compared over their common prefix.

    Parameters:
        X1, X2: Arrays of DNA sequences.
        k_max: Maximum k-mer length.
        weights: Indexable of at least k_max weights, ``weights[d - 1]`` being
            the weight of k-mer length d (default: ``1 / (d + 1)``, i.e.
            longer k-mers get less weight).

    Returns:
        Kernel matrix of shape (len(X1), len(X2)).
    """
    if weights is None:
        weights = np.array([1 / (d + 1) for d in range(1, k_max + 1)])  # Decay weighting

    # Cache of run length -> weighted k-mer count, shared by all pairs.
    run_scores = {}

    def run_score(run_length):
        """Return the weighted count of matching k-mers inside one run of matches."""
        if run_length not in run_scores:
            # A run of r consecutive matching positions contains exactly
            # r - d + 1 matching d-mers for every d <= r.
            longest = min(run_length, k_max)
            run_scores[run_length] = sum(
                weights[d - 1] * (run_length - d + 1) for d in range(1, longest + 1)
            )
        return run_scores[run_length]

    kernel_matrix = np.zeros((len(X1), len(X2)))

    for i, seq1 in enumerate(X1):
        for j, seq2 in enumerate(X2):
            total = 0.0
            run_length = 0
            # One pass over the aligned characters handles every k-mer length
            # at once: each maximal run of matches is scored when it ends.
            for base1, base2 in zip(seq1, seq2):
                if base1 == base2:
                    run_length += 1
                elif run_length:
                    total += run_score(run_length)
                    run_length = 0
            if run_length:  # run that reaches the end of the shorter sequence
                total += run_score(run_length)
            kernel_matrix[i, j] = total

    return kernel_matrix

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Parameters:
        z: Scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise an array shaped like `z`,
        with values in [0, 1].
    """
    z = np.asarray(z, dtype=float)
    # exp(-|z|) lies in (0, 1], so it cannot overflow however large |z| is.
    decay = np.exp(-np.abs(z))
    # z >= 0: 1 / (1 + e^-z);  z < 0: e^z / (1 + e^z) -- the same function.
    result = np.where(z >= 0, 1.0, decay) / (1.0 + decay)
    return result[()]  # unwrap 0-d results so scalar input yields a scalar

def logistic_loss(beta, K, y, C):
    """Logistic loss function for kernelized logistic regression.

    Computes ``mean(log(1 + exp(-y * (K @ beta)))) + (C / 2) * dot(beta, beta)``.

    Parameters:
        beta: Coefficient vector, one entry per column of `K`.
        K: Kernel matrix with one row per label in `y`.
        y: Label vector; callers in this file pass values in {-1, 1}.
        C: Weight of the squared-norm penalty (larger C means stronger
            regularization).

    Returns:
        The scalar loss value.
    """
    margins = y * (K @ beta)
    # logaddexp(0, -m) equals log(1 + exp(-m)) but stays finite when -m is
    # large, where exp(-m) alone would overflow to inf.
    data_term = np.mean(np.logaddexp(0, -margins))
    penalty = (C / 2) * np.dot(beta, beta)
    return data_term + penalty

def train_logistic_regression(K_train, y_train, C=1.0):
    """Train logistic regression with kernel trick using optimization.

    Minimizes the same objective as `logistic_loss`,
    ``mean(log(1 + exp(-y * (K @ beta)))) + (C / 2) * dot(beta, beta)``,
    with L-BFGS-B starting from beta = 0. The analytic gradient is supplied
    so SciPy does not approximate it by finite differences, which would cost
    one extra objective evaluation per coefficient at every step.

    Parameters:
        K_train: Square kernel matrix of the training samples.
        y_train: Training labels; callers in this file pass values in {-1, 1}.
        C: Weight of the squared-norm penalty on beta.

    Returns:
        The coefficient vector found by the optimizer (``res.x``). A
        RuntimeWarning is issued if the optimizer reports failure; the last
        iterate is still returned.
    """
    import warnings

    K_train = np.asarray(K_train, dtype=float)
    y_train = np.asarray(y_train, dtype=float)
    n_samples = len(y_train)

    def objective(beta):
        """Return the regularized logistic loss and its gradient at `beta`."""
        margins = y_train * (K_train @ beta)
        loss = np.mean(np.logaddexp(0, -margins)) + (C / 2) * np.dot(beta, beta)
        # d/dm log(1 + exp(-m)) = -1 / (1 + exp(m)), written so that large
        # |m| cannot overflow.
        slope = -np.exp(-np.logaddexp(0, margins))
        grad = K_train.T @ (y_train * slope) / n_samples + C * beta
        return loss, grad

    beta_init = np.zeros(K_train.shape[0])

    # jac=True tells minimize that `objective` returns (value, gradient).
    res = minimize(fun=objective, x0=beta_init, jac=True, method='L-BFGS-B')
    if not res.success:
        warnings.warn(f"L-BFGS-B did not converge: {res.message}", RuntimeWarning)

    return res.x  # Optimal beta

def predict_logistic_regression(K_test, beta):
    """Classify test points from their kernel rows and trained coefficients.

    Parameters:
        K_test: Kernel matrix between test samples (rows) and training
            samples (columns).
        beta: Coefficient vector returned by training.

    Returns:
        Array of labels: 1 where the predicted probability is at least 0.5,
        -1 otherwise.
    """
    decision_scores = K_test @ beta
    positive_probability = sigmoid(decision_scores)
    is_positive = positive_probability >= 0.5
    return np.where(is_positive, 1, -1)

def train_and_predict_weighted_degree_logistic(X_train_path, Y_train_path, X_test_path, k_max=6, C=1.0):
    """End-to-end run: load CSVs, build kernels, fit the model, label the test set.

    Parameters:
        X_train_path: CSV with a "seq" column holding the training sequences.
        Y_train_path: CSV with a "Bound" column holding the training labels
            (rows are used in the same order as the training sequences).
        X_test_path: CSV with a "seq" column holding the test sequences.
        k_max: Maximum k-mer length passed to the kernel.
        C: Regularization weight passed to the trainer.

    Returns:
        The test DataFrame with an added "Bound" column of 0/1 predictions.
    """
    # --- Read the three input tables ---
    train_frame = pd.read_csv(X_train_path)
    label_frame = pd.read_csv(Y_train_path)
    test_frame = pd.read_csv(X_test_path)

    train_seqs = train_frame["seq"].values
    # The model works with signed labels: 1 stays 1, anything else becomes -1.
    is_bound = label_frame["Bound"] == 1
    signed_labels = np.where(is_bound, 1, -1)
    test_seqs = test_frame["seq"].values

    # --- Fit on the train-vs-train Gram matrix ---
    gram_train = weighted_degree_kernel(train_seqs, train_seqs, k_max)
    coefficients = train_logistic_regression(gram_train, signed_labels, C)

    # --- Score the test set against the training sequences ---
    gram_test = weighted_degree_kernel(test_seqs, train_seqs, k_max)
    signed_predictions = predict_logistic_regression(gram_test, coefficients)

    # Map -1 -> 0 and 1 -> 1 for the output column.
    test_frame["Bound"] = (signed_predictions + 1) // 2
    return test_frame

# === Run the function ===
# Input paths are relative to the current working directory.
df_predictions = train_and_predict_weighted_degree_logistic(
    "./data/Xtr0.csv", "./data/Ytr0.csv", "./data/Xte0.csv", k_max=6, C=1.0
)

# Save predictions
df_predictions.to_csv("predictions.csv", index=False)


Why This Might Improve Performance

✅ Better than Spectrum Kernel:

    The Spectrum Kernel only counts shared k-mers and ignores where they occur, while the Weighted Degree Kernel compares k-mers position by position and so captures positional patterns in DNA.
✅ More Robust to Small Variations:
    By summing matches over every k-mer length from 1 to k_max, it still rewards sequences that agree on short stretches even when a longer k-mer is broken by a mismatch.
    
✅ Less Sensitive to Overfitting:
    The default decay weighting (1 / (d + 1)) gives longer k-mers less weight, so rare long matches contribute less noise.