# In [None]:
from itertools import combinations, product

import numpy as np
import pandas as pd
from scipy.optimize import minimize

def generate_kmers(sequence, k):
    """Return every length-`k` window of `sequence`, in left-to-right order.

    Args:
        sequence: Sliceable sequence (typically a string).
        k: Window length.

    Returns:
        List of the `len(sequence) - k + 1` consecutive slices; empty when
        `k` is larger than the sequence.
    """
    window_count = len(sequence) - k + 1
    windows = []
    for start in range(window_count):
        windows.append(sequence[start:start + k])
    return windows

def hamming_distance(s1, s2):
    """Count the positions at which `s1` and `s2` differ.

    The two inputs are walked in lockstep; if their lengths differ, the
    comparison stops at the end of the shorter one.
    """
    mismatches = 0
    for left, right in zip(s1, s2):
        if left != right:
            mismatches += 1
    return mismatches

def generate_mismatch_kmers(kmer, m, alphabet="ACGT"):
    """
    Generate all possible k-mers within Hamming distance `m` of the given `kmer`.

    Positions are chosen as unordered subsets and each chosen position is
    replaced only by letters that differ from the original one, so every
    neighbour is produced once instead of many times.

    Args:
        kmer: Original k-mer string.
        m: Maximum number of allowed mismatches (must be >= 0).
        alphabet: Allowed nucleotide characters used as replacements.

    Returns:
        Set of k-mers within `m` mismatches. The original `kmer` is always
        included (distance 0), even if it contains characters that are not
        in `alphabet`.

    Raises:
        ValueError: If `m` is negative.
    """
    if m < 0:
        raise ValueError("m must be non-negative")

    k = len(kmer)
    mismatched_kmers = {kmer}

    # A k-mer cannot differ from another in more positions than it has.
    for n_substitutions in range(1, min(m, k) + 1):
        for positions in combinations(range(k), n_substitutions):
            # Per-position replacement letters, excluding the current letter,
            # so each candidate has exactly `n_substitutions` mismatches.
            choices = [
                [letter for letter in alphabet if letter != kmer[pos]]
                for pos in positions
            ]
            for replacements in product(*choices):
                kmer_list = list(kmer)
                for pos, new_char in zip(positions, replacements):
                    kmer_list[pos] = new_char
                mismatched_kmers.add("".join(kmer_list))

    return mismatched_kmers

def _mismatch_feature_counts(sequence, k, m):
    """Map each mismatch k-mer reachable from `sequence` to its hit count."""
    counts = {}
    for kmer in generate_kmers(sequence, k):
        for mismatch_kmer in generate_mismatch_kmers(kmer, m):
            counts[mismatch_kmer] = counts.get(mismatch_kmer, 0) + 1
    return counts


def _sparse_dot(counts_a, counts_b):
    """Dot product of two count vectors stored as {k-mer: count} dicts."""
    # Walk the smaller dict; keys missing from the other contribute zero.
    if len(counts_a) > len(counts_b):
        counts_a, counts_b = counts_b, counts_a
    return sum(count * counts_b.get(kmer, 0) for kmer, count in counts_a.items())


def mismatch_kernel(X1, X2, k, m):
    """
    Compute the Mismatch Kernel matrix.

    Each sequence is mapped to a dictionary counting, for every k-mer within
    `m` mismatches of one of its windows, how many windows reach it. The
    kernel value for a pair of sequences is the dot product of their count
    dictionaries.

    The count dictionary of every sequence is built exactly once (the ones
    for `X2` are kept in memory for the duration of the call) rather than
    being rebuilt for every pair.

    Args:
        X1: List of sequences (training or test).
        X2: List of sequences (training).
        k: k-mer length.
        m: Max number of mismatches allowed.

    Returns:
        Kernel matrix of shape (len(X1), len(X2)).
    """
    kernel_matrix = np.zeros((len(X1), len(X2)))

    features_2 = [_mismatch_feature_counts(seq2, k, m) for seq2 in X2]
    # When the same object is passed twice (train-vs-train), reuse the maps.
    same_inputs = X1 is X2

    for i, seq1 in enumerate(X1):
        if same_inputs:
            kmer_counts_1 = features_2[i]
        else:
            kmer_counts_1 = _mismatch_feature_counts(seq1, k, m)

        for j, kmer_counts_2 in enumerate(features_2):
            kernel_matrix[i, j] = _sparse_dot(kmer_counts_1, kmer_counts_2)

    return kernel_matrix

def sigmoid(z):
    """Return the logistic function 1 / (1 + exp(-z)), element-wise.

    Evaluated as exp(-log(1 + exp(-z))) through `np.logaddexp`, which never
    forms exp(-z) directly and therefore does not overflow for large
    negative `z`.

    Args:
        z: Scalar or array of real values.

    Returns:
        Value(s) in [0, 1] with the same shape as `z`.
    """
    return np.exp(-np.logaddexp(0, -z))

def logistic_loss(beta, K, y, C):
    """Logistic loss function for kernelized logistic regression.

    Computes mean(log(1 + exp(-y * (K @ beta)))) + (C / 2) * ||beta||^2.

    Args:
        beta: Coefficient vector.
        K: Kernel matrix; `K @ beta` gives one decision score per sample.
        y: Labels in {-1, 1}, one per row of `K`.
        C: Weight of the squared-norm penalty on `beta`.

    Returns:
        The scalar loss value.
    """
    margins = y * (K @ beta)
    # log(1 + exp(-margin)) written as logaddexp(0, -margin): the naive form
    # overflows to inf once -margin exceeds ~709.
    data_term = np.mean(np.logaddexp(0, -margins))
    penalty = (C / 2) * np.dot(beta, beta)
    return data_term + penalty

def train_logistic_regression(K_train, y_train, C=1.0):
    """Train logistic regression with kernel trick using optimization.

    Minimizes, with L-BFGS-B starting from beta = 0,

        mean(log(1 + exp(-y * (K @ beta)))) + (C / 2) * ||beta||^2

    which is the objective implemented by `logistic_loss`. The analytic
    gradient is supplied to the optimizer so it does not have to approximate
    it by finite differences (one extra loss evaluation per coefficient).

    Args:
        K_train: Square kernel matrix between training samples.
        y_train: Training labels in {-1, 1}, one per row of `K_train`.
        C: Weight of the squared-norm penalty on `beta`.

    Returns:
        The coefficient vector found by the optimizer (one entry per
        training sample).
    """
    K = np.asarray(K_train, dtype=float)
    y = np.asarray(y_train, dtype=float)
    n_samples = K.shape[0]

    def objective(beta):
        """Return (loss, gradient) at `beta`."""
        margins = y * (K @ beta)
        # Stable log(1 + exp(-margin)); the naive form overflows to inf.
        loss = np.mean(np.logaddexp(0.0, -margins)) + (C / 2) * np.dot(beta, beta)
        # sigmoid(-margin), again without forming a large exp().
        misfit = np.exp(-np.logaddexp(0.0, margins))
        grad = K.T @ (-y * misfit) / n_samples + C * beta
        return loss, grad

    beta_init = np.zeros(n_samples)
    res = minimize(fun=objective, x0=beta_init, jac=True, method='L-BFGS-B')

    return res.x  # Optimal beta

def predict_logistic_regression(K_test, beta):
    """Make predictions using kernelized logistic regression.

    Args:
        K_test: Kernel matrix between the samples to classify (rows) and the
            training samples (columns).
        beta: Coefficient vector, one entry per column of `K_test`.

    Returns:
        Array with one label per row of `K_test`: 1 where the predicted
        probability is at least 0.5, otherwise -1.
    """
    scores = K_test @ beta
    # sigmoid(score) >= 0.5 exactly when score >= 0, so threshold the raw
    # score; this avoids exp() overflow warnings for large negative scores.
    return np.where(scores >= 0, 1, -1)

def train_and_predict_mismatch_logistic(X_train_path, Y_train_path, X_test_path, k=6, m=1, C=1.0):
    """Pipeline for training and predicting with Mismatch Kernel Logistic Regression.

    Args:
        X_train_path: CSV file with a "seq" column of training sequences.
        Y_train_path: CSV file with a "Bound" column of training labels;
            rows equal to 1 are treated as positive, everything else as
            negative.
        X_test_path: CSV file with a "seq" column of sequences to classify.
        k: k-mer length for the mismatch kernel.
        m: Max number of mismatches allowed in the kernel.
        C: Weight of the squared-norm penalty used when training.

    Returns:
        The test DataFrame with an added "Bound" column holding the
        predicted label (0 or 1) for each row.

    Raises:
        ValueError: If the training sequence and label files do not have the
            same number of rows.
    """
    # Load data
    df_train = pd.read_csv(X_train_path)
    df_labels = pd.read_csv(Y_train_path)
    df_test = pd.read_csv(X_test_path)

    # Fail before the expensive kernel computation; a one-row label file
    # would otherwise broadcast silently against every training sample.
    if len(df_train) != len(df_labels):
        raise ValueError(
            f"Training data has {len(df_train)} sequences but {len(df_labels)} labels"
        )

    X_train = df_train["seq"].values
    y_train = np.where(df_labels["Bound"] == 1, 1, -1)  # Convert labels to {-1,1}
    X_test = df_test["seq"].values

    # Compute kernel matrices (test rows are compared against training columns)
    K_train = mismatch_kernel(X_train, X_train, k, m)
    K_test = mismatch_kernel(X_test, X_train, k, m)

    # Train logistic regression model
    beta = train_logistic_regression(K_train, y_train, C)

    # Make predictions
    predictions = predict_logistic_regression(K_test, beta)

    # Convert {-1,1} predictions to {0,1}
    df_test["Bound"] = (predictions + 1) // 2

    return df_test

# === Train on dataset 0 and classify its test sequences ===
df_predictions = train_and_predict_mismatch_logistic(
    X_train_path="./data/Xtr0.csv",
    Y_train_path="./data/Ytr0.csv",
    X_test_path="./data/Xte0.csv",
    k=6,
    m=1,
    C=1.0,
)

# Write the labelled test set to disk, without the DataFrame index
df_predictions.to_csv("mismatch_predictions.csv", index=False)
