In [None]:
import numpy as np
import pandas as pd
import cvxopt
from collections import Counter
import optuna

cvxopt.solvers.options["show_progress"] = False

from scipy.linalg import solve

In [None]:
#============Useful functions==============
def load_data(seq_file: str, label_file: str) -> tuple:
    """
    加载多组CSV文件并合并数据
    
    参数:
        seq_files: 多个x.csv文件路径列表（如["x.csv", "x1.csv"]）
        label_files: 多个y.csv文件路径列表（如["y.csv", "y1.csv"]）
    
    返回:
        (sequences, labels): 合并后的序列列表和标签数组
    """
    # 合并所有数据
    df_seq = pd.read_csv(seq_file)
    df_label = pd.read_csv(label_file)
    
    merged = pd.merge(df_seq, df_label, on='Id', how='inner')
    if merged.empty:
        print(f"警告: {seq_file}和{label_file}中没有匹配的ID")

    # 提取数据
    sequences = merged['seq'].values
    labels = np.where(merged['Bound'] == 1, 1, -1)
    return sequences, labels

def manual_kfold_split(X, y, n_splits=3, seed=42):
    """Manually splits X and y into K folds for cross-validation."""
    np.random.seed(seed)
    indices = np.arange(len(X))
    np.random.shuffle(indices)
    
    folds = np.array_split(indices, n_splits)
    return folds

def save_to_csv(df_0, df_1, df_2, suffixe = ""):
    res = pd.concat([df_0, df_1, df_2], ignore_index = True)
    res["Bound"] = res["Bound"].astype("int")
    res[["Id", "Bound"]].to_csv(f"./results/predictions_{suffixe}.csv", index = False)
    return
    # df_pred.to_csv("predictions.csv"

In [None]:
#===============Computation of the spectrum kernel====================
def generate_kmers(sequence, k):
    """Extract all k-mers from a sequence."""
    return [sequence[i:i+k] for i in range(len(sequence) - k + 1)]

def build_kmer_dict(sequences, k):
    """Create a dictionary mapping each unique k-mer to an index."""
    kmer_set = set()
    for seq in sequences:
        kmer_set.update(generate_kmers(seq, k))
    return {kmer: idx for idx, kmer in enumerate(sorted(kmer_set))}

def compute_kmer_feature_matrix(sequences, k, kmer_dict):
    """Convert sequences into k-mer frequency vectors."""
    num_samples = len(sequences)
    num_kmers = len(kmer_dict)
    feature_matrix = np.zeros((num_samples, num_kmers))

    for i, seq in enumerate(sequences):
        kmer_counts = Counter(generate_kmers(seq, k))
        for kmer, count in kmer_counts.items():
            if kmer in kmer_dict:
                feature_matrix[i, kmer_dict[kmer]] = count
    return feature_matrix

def spectrum_kernel_matrix(sequences1, sequences2, k):
    """Compute the Spectrum Kernel matrix between two sets of sequences."""
    all_sequences = np.concatenate([sequences1, sequences2])
    kmer_dict = build_kmer_dict(all_sequences, k)

    # Convert sequences to feature matrices
    X1 = compute_kmer_feature_matrix(sequences1, k, kmer_dict)
    X2 = compute_kmer_feature_matrix(sequences2, k, kmer_dict)

    return X1 @ X2.T  # Compute the dot product



# With SVM

In [None]:
def train_svm(K, y, C=1.0):
    """Train an SVM using the precomputed kernel matrix K."""
    n = len(y)
    y = y.astype(float).reshape(-1, 1)  # Ensure y is a column vector

    # Construct the quadratic programming matrices
    P = cvxopt.matrix(np.outer(y, y) * K)  # P_ij = y_i * y_j * K_ij
    q = cvxopt.matrix(-np.ones(n))        # q_i = -1
    G = cvxopt.matrix(np.vstack((-np.eye(n), np.eye(n))))  # Constraints 0 <= α <= C
    h = cvxopt.matrix(np.hstack((np.zeros(n), C * np.ones(n))))
    A = cvxopt.matrix(y.T)  # Equality constraint sum(α_i * y_i) = 0
    b = cvxopt.matrix(0.0)

    # Solve the quadratic program
    solution = cvxopt.solvers.qp(P, q, G, h, A, b)
    alphas = np.ravel(solution['x'])

    # Support vectors: α > 0
    sv_indices = alphas > 1e-5
    support_vectors = np.where(sv_indices)[0]
    alphas = alphas[sv_indices]
    support_y = y[sv_indices].flatten()

    # Compute the bias term (intercept)
    bias = np.mean(support_y - np.sum(alphas * support_y * K[support_vectors][:, support_vectors], axis=1))

    return alphas, support_vectors, bias

def predict_svm(K_test, alphas, support_vectors, support_y, bias):
    """Make predictions using the trained SVM."""
    return np.sign(np.sum(alphas * support_y * K_test[:, support_vectors], axis=1) + bias)


In [None]:


def cross_val_score_manual(X, y, k, C, n_splits=3):
    """Performs cross-validation without using sklearn."""
    folds = manual_kfold_split(X, y, n_splits)
    accuracies = []

    for i in range(n_splits):
        # print("i = ", i)
        val_indices = folds[i]  # Current fold is validation set
        train_indices = np.hstack([folds[j] for j in range(n_splits) if j != i])  # Rest are training

        X_train, y_train = X[train_indices], y[train_indices]
        X_val, y_val = X[val_indices], y[val_indices]

        # Compute kernel matrices
        K_train = spectrum_kernel_matrix(X_train, X_train, k)
        K_val = spectrum_kernel_matrix(X_val, X_train, k)

        # Train and predict
        alphas, support_vectors, bias = train_svm(K_train, y_train, C)
        predictions = predict_svm(K_val, alphas, support_vectors, y_train[support_vectors], bias)

        # Compute accuracy
        accuracy = np.mean(predictions == y_val)
        accuracies.append(accuracy)

        # print("accuracies: ", accuracies)
    return np.mean(accuracies)

def objective(trial):
    """Objective function for Optuna to optimize k and C."""
    k = trial.suggest_int("k", 4, 10)  # k-mer length between 2 and 6
    C = trial.suggest_float("C", 1e-3, 1e2, log = True)  # C in [0.01, 100]

    cross_val = cross_val_score_manual(X_train, Y_train, k, C)

    print(f"C = {C}, k = {k}: {cross_val}")
    return cross_val

def train_and_predict_spectrum_svm(X_train_path, Y_train_path, X_test_path, n_trials=20):
    """Hyperparameter optimization with Optuna, then train and predict."""
    global X_train, Y_train  # Needed for Optuna's objective function
    X_train, Y_train = load_data(X_train_path, Y_train_path)

    df_pred = pd.read_csv(X_test_path)
    X_test = df_pred["seq"].values
    # X_test = load_data(X_test_path)

    # Run Optuna optimization
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Best hyperparameters
    best_k = study.best_params["k"]
    best_C = study.best_params["C"]
    print(f"Best k: {best_k}, Best C: {best_C}")

    # Train final model with best hyperparameters
    K_train = spectrum_kernel_matrix(X_train, X_train, best_k)
    K_test = spectrum_kernel_matrix(X_test, X_train, best_k)
    alphas, support_vectors, bias = train_svm(K_train, Y_train, best_C)

    # Predict on test set
    predictions = predict_svm(K_test, alphas, support_vectors, Y_train[support_vectors], bias)

    # Convert {-1,1} predictions to {0,1}
    predictions = (predictions + 1) // 2

    df_pred["Bound"] = predictions

    return df_pred

In [6]:

# === Run the function with hyperparameter tuning ===
prediction_0 = train_and_predict_spectrum_svm("./data/Xtr0.csv", "./data/Ytr0.csv", "./data/Xte0.csv", n_trials=20)
prediction_1 = train_and_predict_spectrum_svm("./data/Xtr1.csv", "./data/Ytr1.csv", "./data/Xte1.csv", n_trials=20)
prediction_2 = train_and_predict_spectrum_svm("./data/Xtr2.csv", "./data/Ytr2.csv", "./data/Xte2.csv", n_trials=20)

[I 2025-03-14 10:28:20,849] A new study created in memory with name: no-name-3caa849b-46f7-4f74-9c36-79f98be1f011
[I 2025-03-14 10:28:36,454] Trial 0 finished with value: 0.5905072989031009 and parameters: {'k': 5, 'C': 0.06387016921007889}. Best is trial 0 with value: 0.5905072989031009.


C = 0.06387016921007889, k = 5: 0.5905072989031009


[I 2025-03-14 10:28:57,187] Trial 1 finished with value: 0.594998296647472 and parameters: {'k': 7, 'C': 53.59841649588178}. Best is trial 1 with value: 0.594998296647472.


C = 53.59841649588178, k = 7: 0.594998296647472


[I 2025-03-14 10:29:16,240] Trial 2 finished with value: 0.608499053776415 and parameters: {'k': 7, 'C': 0.010238600219771014}. Best is trial 2 with value: 0.608499053776415.


C = 0.010238600219771014, k = 7: 0.608499053776415


[I 2025-03-14 10:29:51,929] Trial 3 finished with value: 0.5839955397676537 and parameters: {'k': 10, 'C': 0.08088900316264791}. Best is trial 2 with value: 0.608499053776415.


C = 0.08088900316264791, k = 10: 0.5839955397676537


[I 2025-03-14 10:30:17,592] Trial 4 finished with value: 0.5789920355137746 and parameters: {'k': 9, 'C': 0.0030310511626014795}. Best is trial 2 with value: 0.608499053776415.


C = 0.0030310511626014795, k = 9: 0.5789920355137746


[I 2025-03-14 10:30:45,013] Trial 5 finished with value: 0.5889937913925919 and parameters: {'k': 9, 'C': 0.48928818106654115}. Best is trial 2 with value: 0.608499053776415.


C = 0.48928818106654115, k = 9: 0.5889937913925919


[I 2025-03-14 10:30:54,887] Trial 6 finished with value: 0.592509300905103 and parameters: {'k': 4, 'C': 0.2252691017382136}. Best is trial 2 with value: 0.608499053776415.


C = 0.2252691017382136, k = 4: 0.592509300905103


[I 2025-03-14 10:31:15,684] Trial 7 finished with value: 0.5909980445212829 and parameters: {'k': 8, 'C': 0.06529876197834737}. Best is trial 2 with value: 0.608499053776415.


C = 0.06529876197834737, k = 8: 0.5909980445212829


[I 2025-03-14 10:31:33,106] Trial 8 finished with value: 0.583012547780164 and parameters: {'k': 4, 'C': 10.522135524686643}. Best is trial 2 with value: 0.608499053776415.


C = 10.522135524686643, k = 4: 0.583012547780164


[I 2025-03-14 10:32:01,364] Trial 9 finished with value: 0.5889937913925919 and parameters: {'k': 9, 'C': 3.1929314193450167}. Best is trial 2 with value: 0.608499053776415.


C = 3.1929314193450167, k = 9: 0.5889937913925919


[I 2025-03-14 10:32:12,858] Trial 10 finished with value: 0.5949952951452201 and parameters: {'k': 6, 'C': 0.0013577819250302266}. Best is trial 2 with value: 0.608499053776415.


C = 0.0013577819250302266, k = 6: 0.5949952951452201


[I 2025-03-14 10:32:27,143] Trial 11 finished with value: 0.594998296647472 and parameters: {'k': 7, 'C': 76.94398515570495}. Best is trial 2 with value: 0.608499053776415.


C = 76.94398515570495, k = 7: 0.594998296647472


[I 2025-03-14 10:32:39,121] Trial 12 finished with value: 0.6094970532751642 and parameters: {'k': 7, 'C': 0.006967238737746815}. Best is trial 12 with value: 0.6094970532751642.


C = 0.006967238737746815, k = 7: 0.6094970532751642


[I 2025-03-14 10:32:51,647] Trial 13 finished with value: 0.6104980542761652 and parameters: {'k': 7, 'C': 0.008094842686455033}. Best is trial 13 with value: 0.6104980542761652.


C = 0.008094842686455033, k = 7: 0.6104980542761652


[I 2025-03-14 10:33:02,240] Trial 14 finished with value: 0.6025005515260388 and parameters: {'k': 6, 'C': 0.00931443520861501}. Best is trial 13 with value: 0.6104980542761652.


C = 0.00931443520861501, k = 6: 0.6025005515260388


[I 2025-03-14 10:33:11,948] Trial 15 finished with value: 0.6045033039036037 and parameters: {'k': 6, 'C': 0.009945830475354597}. Best is trial 13 with value: 0.6104980542761652.


C = 0.009945830475354597, k = 6: 0.6045033039036037


[I 2025-03-14 10:33:29,099] Trial 16 finished with value: 0.5859922891407149 and parameters: {'k': 8, 'C': 0.0014031126264034389}. Best is trial 13 with value: 0.6104980542761652.


C = 0.0014031126264034389, k = 8: 0.5859922891407149


[I 2025-03-14 10:33:44,316] Trial 17 finished with value: 0.5940032986509748 and parameters: {'k': 8, 'C': 0.01535732524423759}. Best is trial 13 with value: 0.6104980542761652.


C = 0.01535732524423759, k = 8: 0.5940032986509748


[I 2025-03-14 10:33:53,309] Trial 18 finished with value: 0.5755012883948416 and parameters: {'k': 5, 'C': 1.0504276728257824}. Best is trial 13 with value: 0.6104980542761652.


C = 1.0504276728257824, k = 5: 0.5755012883948416


[I 2025-03-14 10:34:02,611] Trial 19 finished with value: 0.5945068006537272 and parameters: {'k': 5, 'C': 0.032491038004026154}. Best is trial 13 with value: 0.6104980542761652.


C = 0.032491038004026154, k = 5: 0.5945068006537272
Best k: 7, Best C: 0.008094842686455033


[I 2025-03-14 10:34:12,692] A new study created in memory with name: no-name-3ecc32df-5974-49a3-94ef-63ad0c0daf2b
[I 2025-03-14 10:34:25,377] Trial 0 finished with value: 0.6530030780405592 and parameters: {'k': 5, 'C': 45.75111486532372}. Best is trial 0 with value: 0.6530030780405592.


C = 45.75111486532372, k = 5: 0.6530030780405592


[I 2025-03-14 10:34:48,580] Trial 1 finished with value: 0.7380086233159696 and parameters: {'k': 9, 'C': 0.006224109351870698}. Best is trial 1 with value: 0.7380086233159696.


C = 0.006224109351870698, k = 9: 0.7380086233159696


[I 2025-03-14 10:34:59,462] Trial 2 finished with value: 0.6320015667841755 and parameters: {'k': 4, 'C': 1.3511260462339876}. Best is trial 1 with value: 0.7380086233159696.


C = 1.3511260462339876, k = 4: 0.6320015667841755


[I 2025-03-14 10:35:10,550] Trial 3 finished with value: 0.6530030780405592 and parameters: {'k': 5, 'C': 17.604284023876147}. Best is trial 1 with value: 0.7380086233159696.


C = 17.604284023876147, k = 5: 0.6530030780405592


[I 2025-03-14 10:35:27,130] Trial 4 finished with value: 0.7155071113092103 and parameters: {'k': 8, 'C': 4.491263229907515}. Best is trial 1 with value: 0.7380086233159696.


C = 4.491263229907515, k = 8: 0.7155071113092103


[I 2025-03-14 10:35:55,053] Trial 5 finished with value: 0.7165096130613371 and parameters: {'k': 10, 'C': 20.276557442331217}. Best is trial 1 with value: 0.7380086233159696.


C = 20.276557442331217, k = 10: 0.7165096130613371


[I 2025-03-14 10:36:11,892] Trial 6 finished with value: 0.7285103694399048 and parameters: {'k': 8, 'C': 0.006758829143178599}. Best is trial 1 with value: 0.7380086233159696.


C = 0.006758829143178599, k = 8: 0.7285103694399048


[I 2025-03-14 10:36:22,117] Trial 7 finished with value: 0.6345055700378038 and parameters: {'k': 4, 'C': 0.008929693745023634}. Best is trial 1 with value: 0.7380086233159696.


C = 0.008929693745023634, k = 4: 0.6345055700378038


[I 2025-03-14 10:36:31,832] Trial 8 finished with value: 0.6795003399201299 and parameters: {'k': 6, 'C': 1.8873476023722529}. Best is trial 1 with value: 0.7380086233159696.


C = 1.8873476023722529, k = 6: 0.6795003399201299


[I 2025-03-14 10:36:42,274] Trial 9 finished with value: 0.6530030780405592 and parameters: {'k': 5, 'C': 1.3531365508555728}. Best is trial 1 with value: 0.7380086233159696.


C = 1.3531365508555728, k = 5: 0.6530030780405592


[I 2025-03-14 10:37:06,494] Trial 10 finished with value: 0.7165096130613371 and parameters: {'k': 10, 'C': 0.06466970975933817}. Best is trial 1 with value: 0.7380086233159696.


C = 0.06466970975933817, k = 10: 0.7165096130613371


[I 2025-03-14 10:37:24,726] Trial 11 finished with value: 0.7270066168117143 and parameters: {'k': 8, 'C': 0.0019102150851897821}. Best is trial 1 with value: 0.7380086233159696.


C = 0.0019102150851897821, k = 8: 0.7270066168117143


[I 2025-03-14 10:37:38,802] Trial 12 finished with value: 0.7160068614341478 and parameters: {'k': 8, 'C': 0.021218342782665586}. Best is trial 1 with value: 0.7380086233159696.


C = 0.021218342782665586, k = 8: 0.7160068614341478


[I 2025-03-14 10:37:59,676] Trial 13 finished with value: 0.7235091163127145 and parameters: {'k': 9, 'C': 0.0016855307292960517}. Best is trial 1 with value: 0.7380086233159696.


C = 0.0016855307292960517, k = 9: 0.7235091163127145


[I 2025-03-14 10:38:09,824] Trial 14 finished with value: 0.7030058544301423 and parameters: {'k': 7, 'C': 0.10095325038271338}. Best is trial 1 with value: 0.7380086233159696.


C = 0.10095325038271338, k = 7: 0.7030058544301423


[I 2025-03-14 10:38:33,456] Trial 15 finished with value: 0.7350101225663445 and parameters: {'k': 9, 'C': 0.007085492810783733}. Best is trial 1 with value: 0.7380086233159696.


C = 0.007085492810783733, k = 9: 0.7350101225663445


[I 2025-03-14 10:38:55,667] Trial 16 finished with value: 0.7190098644371509 and parameters: {'k': 9, 'C': 0.18308797241716132}. Best is trial 1 with value: 0.7380086233159696.


C = 0.18308797241716132, k = 9: 0.7190098644371509


[I 2025-03-14 10:39:18,417] Trial 17 finished with value: 0.7190098644371509 and parameters: {'k': 9, 'C': 0.020077281073614523}. Best is trial 1 with value: 0.7380086233159696.


C = 0.020077281073614523, k = 9: 0.7190098644371509


[I 2025-03-14 10:39:33,035] Trial 18 finished with value: 0.7055083569326448 and parameters: {'k': 7, 'C': 0.001232674665419028}. Best is trial 1 with value: 0.7380086233159696.


C = 0.001232674665419028, k = 7: 0.7055083569326448


[I 2025-03-14 10:40:00,052] Trial 19 finished with value: 0.7180148664406536 and parameters: {'k': 10, 'C': 0.005321218307053443}. Best is trial 1 with value: 0.7380086233159696.


C = 0.005321218307053443, k = 10: 0.7180148664406536
Best k: 9, Best C: 0.006224109351870698


[I 2025-03-14 10:40:20,147] A new study created in memory with name: no-name-f4a0a376-fca2-48ea-832a-327b64ed849f
[I 2025-03-14 10:40:43,552] Trial 0 finished with value: 0.6415035725380553 and parameters: {'k': 9, 'C': 5.898253036375837}. Best is trial 0 with value: 0.6415035725380553.


C = 5.898253036375837, k = 9: 0.6415035725380553


[I 2025-03-14 10:41:05,456] Trial 1 finished with value: 0.6425038231634933 and parameters: {'k': 9, 'C': 0.014236754071536224}. Best is trial 1 with value: 0.6425038231634933.


C = 0.014236754071536224, k = 9: 0.6425038231634933


[I 2025-03-14 10:41:22,186] Trial 2 finished with value: 0.64700382541462 and parameters: {'k': 8, 'C': 0.014359907523629523}. Best is trial 2 with value: 0.64700382541462.


C = 0.014359907523629523, k = 8: 0.64700382541462


[I 2025-03-14 10:41:32,742] Trial 3 finished with value: 0.6120055587821704 and parameters: {'k': 6, 'C': 0.716273301737162}. Best is trial 2 with value: 0.64700382541462.


C = 0.716273301737162, k = 6: 0.6120055587821704


[I 2025-03-14 10:41:49,400] Trial 4 finished with value: 0.6500023261642452 and parameters: {'k': 8, 'C': 0.01862740621123094}. Best is trial 4 with value: 0.6500023261642452.


C = 0.01862740621123094, k = 8: 0.6500023261642452


[I 2025-03-14 10:42:00,678] Trial 5 finished with value: 0.5954995475235356 and parameters: {'k': 4, 'C': 0.5730270443631178}. Best is trial 4 with value: 0.6500023261642452.


C = 0.5730270443631178, k = 4: 0.5954995475235356


[I 2025-03-14 10:42:13,065] Trial 6 finished with value: 0.6120055587821704 and parameters: {'k': 6, 'C': 66.91533247037248}. Best is trial 4 with value: 0.6500023261642452.


C = 66.91533247037248, k = 6: 0.6120055587821704


[I 2025-03-14 10:42:34,776] Trial 7 finished with value: 0.6415035725380553 and parameters: {'k': 9, 'C': 1.4474201417909436}. Best is trial 4 with value: 0.6500023261642452.


C = 1.4474201417909436, k = 9: 0.6415035725380553


[I 2025-03-14 10:42:45,571] Trial 8 finished with value: 0.6120055587821704 and parameters: {'k': 6, 'C': 0.23139387619652454}. Best is trial 4 with value: 0.6500023261642452.


C = 0.23139387619652454, k = 6: 0.6120055587821704


[I 2025-03-14 10:43:07,462] Trial 9 finished with value: 0.6415035725380553 and parameters: {'k': 9, 'C': 0.40691408374409893}. Best is trial 4 with value: 0.6500023261642452.


C = 0.40691408374409893, k = 9: 0.6415035725380553


[I 2025-03-14 10:43:18,637] Trial 10 finished with value: 0.6060160610385498 and parameters: {'k': 4, 'C': 0.00103801684579861}. Best is trial 4 with value: 0.6500023261642452.


C = 0.00103801684579861, k = 4: 0.6060160610385498


[I 2025-03-14 10:43:31,334] Trial 11 finished with value: 0.6230053141597369 and parameters: {'k': 7, 'C': 0.015425469141999253}. Best is trial 4 with value: 0.6500023261642452.


C = 0.015425469141999253, k = 7: 0.6230053141597369


[I 2025-03-14 10:43:49,909] Trial 12 finished with value: 0.6520020770395584 and parameters: {'k': 8, 'C': 0.019553012635754607}. Best is trial 12 with value: 0.6520020770395584.


C = 0.019553012635754607, k = 8: 0.6520020770395584


[I 2025-03-14 10:44:17,650] Trial 13 finished with value: 0.6550043296669984 and parameters: {'k': 10, 'C': 0.00201371347012383}. Best is trial 13 with value: 0.6550043296669984.


C = 0.00201371347012383, k = 10: 0.6550043296669984


[I 2025-03-14 10:44:46,061] Trial 14 finished with value: 0.6450003226614921 and parameters: {'k': 10, 'C': 0.001144059751038491}. Best is trial 13 with value: 0.6550043296669984.


C = 0.001144059751038491, k = 10: 0.6450003226614921


[I 2025-03-14 10:45:11,495] Trial 15 finished with value: 0.6420138279208745 and parameters: {'k': 10, 'C': 0.07282528766368182}. Best is trial 13 with value: 0.6550043296669984.


C = 0.07282528766368182, k = 10: 0.6420138279208745


[I 2025-03-14 10:45:29,202] Trial 16 finished with value: 0.6425053239146192 and parameters: {'k': 8, 'C': 0.00367041805838318}. Best is trial 13 with value: 0.6550043296669984.


C = 0.00367041805838318, k = 8: 0.6425053239146192


[I 2025-03-14 10:45:56,549] Trial 17 finished with value: 0.6635030832931883 and parameters: {'k': 10, 'C': 0.004263322916787863}. Best is trial 17 with value: 0.6635030832931883.


C = 0.004263322916787863, k = 10: 0.6635030832931883


[I 2025-03-14 10:46:23,442] Trial 18 finished with value: 0.6630040835438137 and parameters: {'k': 10, 'C': 0.0036759618109881657}. Best is trial 17 with value: 0.6635030832931883.


C = 0.0036759618109881657, k = 10: 0.6630040835438137


[I 2025-03-14 10:46:51,089] Trial 19 finished with value: 0.6420138279208745 and parameters: {'k': 10, 'C': 0.07299381650288257}. Best is trial 17 with value: 0.6635030832931883.


C = 0.07299381650288257, k = 10: 0.6420138279208745
Best k: 10, Best C: 0.004263322916787863


In [9]:
save_to_csv(prediction_0, prediction_1, prediction_2)

# Spectrum kernel with logistic regression

In [8]:
def sigmoid(z):
    """Sigmoid activation function."""
    return 1 / (1 + np.exp(-z))

def train_logistic_regression(K_train, y_train, C, lr=0.01, epochs=1000):
    """
    Train logistic regression using batch gradient descent.
    
    Parameters:
    - K_train: (n_samples, n_samples) kernel matrix
    - y_train: (n_samples,) labels {-1,1}
    - C: regularization parameter
    - lr: learning rate
    - epochs: number of gradient updates
    
    Returns:
    - w: optimized weight vector
    - b: bias term
    """
    n_samples = K_train.shape[0]
    
    # Initialize weights
    w = np.zeros(n_samples)
    b = 0
    
    # Gradient Descent
    for _ in range(epochs):
        linear_model = K_train @ w + b
        y_pred = sigmoid(linear_model)
        
        # Compute gradients
        error = y_pred - (y_train + 1) / 2  # Convert {-1,1} -> {0,1} for logistic loss
        dw = (K_train.T @ error) / n_samples + C * w
        db = np.mean(error)
        
        # Update weights
        w -= lr * dw
        b -= lr * db

    return w, b


def predict_logistic_regression(K_test, w, b, threshold = 0.5):
    """
    Predict using logistic regression with kernel.
    
    Parameters:
    - K_test: (n_samples, n_train_samples) kernel matrix
    - w: trained weight vector
    - b: bias term
    - threshold: threshold for prediction 
    
    Returns:
    - predictions: {-1,1}
    """
    y_pred_prob = sigmoid(K_test @ w + b)
    return np.where(y_pred_prob >= threshold, 1, -1)


def cross_val_score_manual_logistic_regression(X, y, k, C, threshold, n_splits=3):
    """Performs cross-validation without using sklearn."""
    folds = manual_kfold_split(X, y, n_splits)
    accuracies = []

    for i in range(n_splits):
        val_indices = folds[i]  
        train_indices = np.hstack([folds[j] for j in range(n_splits) if j != i])

        X_train, y_train = X[train_indices], y[train_indices]
        X_val, y_val = X[val_indices], y[val_indices]

        # Compute kernel matrices
        K_train = spectrum_kernel_matrix(X_train, X_train, k)
        K_val = spectrum_kernel_matrix(X_val, X_train, k)

        # Train and predict
        w, b = train_logistic_regression(K_train, y_train, C)
        predictions = predict_logistic_regression(K_val, w, b, threshold = threshold)

        # Compute accuracy
        accuracy = np.mean(predictions == y_val)
        accuracies.append(accuracy)

    return np.mean(accuracies)


def objective(trial):
    """Objective function for Optuna to optimize k and C."""
    k = trial.suggest_int("k", 2, 6)  # k-mer length between 2 and 6
    C = trial.suggest_float("C", 1e-2, 1e0, log = True)  # Regularization term
    threshold = trial.suggest_float("threshold", 0.48, 0.51)

    cross_val = cross_val_score_manual_logistic_regression(X_train, Y_train, k, C, threshold)

    print(f"C = {C}, k = {k}: {cross_val}")
    return cross_val


def train_and_predict_spectrum_logistic_regression(X_train_path, Y_train_path, X_test_path, n_trials=30):
    """Hyperparameter optimization with Optuna, then train and predict."""
    global X_train, Y_train  
    X_train, Y_train = load_data(X_train_path, Y_train_path)

    df_pred = pd.read_csv(X_test_path)
    X_test = df_pred["seq"].values

    # Run Optuna optimization
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Best hyperparameters
    best_k = study.best_params["k"]
    best_C = study.best_params["C"]


    #A MODIFIER !!!
    best_threshold = study.best_params["threshold"]
    print(f"Best k: {best_k}, Best C: {best_C}, Best threshold: {best_threshold}")

    # Train final model
    K_train = spectrum_kernel_matrix(X_train, X_train, best_k)
    K_test = spectrum_kernel_matrix(X_test, X_train, best_k)
    w, b = train_logistic_regression(K_train, Y_train, best_C)

    # Predict on test set
    predictions = predict_logistic_regression(K_test, w, b, threshold = best_threshold)

    # Convert {-1,1} predictions to {0,1}
    predictions = (predictions + 1) // 2

    df_pred["Bound"] = predictions

    return df_pred


# === Run the function with hyperparameter tuning ===
prediction_0_LR = train_and_predict_spectrum_logistic_regression("./data/Xtr0.csv", "./data/Ytr0.csv", "./data/Xte0.csv", n_trials=30)
prediction_1_LR = train_and_predict_spectrum_logistic_regression("./data/Xtr1.csv", "./data/Ytr1.csv", "./data/Xte1.csv", n_trials=30)
prediction_2_LR = train_and_predict_spectrum_logistic_regression("./data/Xtr2.csv", "./data/Ytr2.csv", "./data/Xte2.csv", n_trials=30)

[I 2025-02-24 16:06:21,728] A new study created in memory with name: no-name-b0f00558-8592-4025-8971-68258e4fa337
  return 1 / (1 + np.exp(-z))
[I 2025-02-24 16:06:24,630] Trial 0 finished with value: 0.49450800125462796 and parameters: {'k': 3, 'C': 0.1522700303229262, 'threshold': 0.5071690092608194}. Best is trial 0 with value: 0.49450800125462796.


C = 0.1522700303229262, k = 3: 0.49450800125462796


[I 2025-02-24 16:06:27,603] Trial 1 finished with value: 0.5160002581291936 and parameters: {'k': 6, 'C': 0.3915350126854863, 'threshold': 0.4886432415948424}. Best is trial 1 with value: 0.5160002581291936.


C = 0.3915350126854863, k = 6: 0.5160002581291936


[I 2025-02-24 16:06:29,976] Trial 2 finished with value: 0.488500494497496 and parameters: {'k': 2, 'C': 0.02769345731545997, 'threshold': 0.5070951983618185}. Best is trial 1 with value: 0.5160002581291936.


C = 0.02769345731545997, k = 2: 0.488500494497496


[I 2025-02-24 16:06:32,592] Trial 3 finished with value: 0.5005057531294413 and parameters: {'k': 4, 'C': 0.016143017299670912, 'threshold': 0.49790708950667606}. Best is trial 1 with value: 0.5160002581291936.


C = 0.016143017299670912, k = 4: 0.5005057531294413


[I 2025-02-24 16:06:35,270] Trial 4 finished with value: 0.49950775363069216 and parameters: {'k': 5, 'C': 0.34802770574802827, 'threshold': 0.4972895250912335}. Best is trial 1 with value: 0.5160002581291936.


C = 0.34802770574802827, k = 5: 0.49950775363069216


[I 2025-02-24 16:06:38,013] Trial 5 finished with value: 0.5025070047558803 and parameters: {'k': 5, 'C': 0.2789423532704591, 'threshold': 0.5098924941944678}. Best is trial 1 with value: 0.5160002581291936.


C = 0.2789423532704591, k = 5: 0.5025070047558803


[I 2025-02-24 16:06:40,610] Trial 6 finished with value: 0.49300124712418564 and parameters: {'k': 4, 'C': 0.612685045427793, 'threshold': 0.4992106657333515}. Best is trial 1 with value: 0.5160002581291936.


C = 0.612685045427793, k = 4: 0.49300124712418564


[I 2025-02-24 16:06:43,688] Trial 7 finished with value: 0.5250115182648917 and parameters: {'k': 6, 'C': 0.470350374155682, 'threshold': 0.5023812335439868}. Best is trial 7 with value: 0.5250115182648917.


C = 0.470350374155682, k = 6: 0.5250115182648917


[I 2025-02-24 16:06:46,804] Trial 8 finished with value: 0.5040175107641375 and parameters: {'k': 6, 'C': 0.9298564999583202, 'threshold': 0.5057908926734325}. Best is trial 7 with value: 0.5250115182648917.


C = 0.9298564999583202, k = 6: 0.5040175107641375


[I 2025-02-24 16:06:49,607] Trial 9 finished with value: 0.5115085100092597 and parameters: {'k': 5, 'C': 0.1620398490070341, 'threshold': 0.4865912663832645}. Best is trial 7 with value: 0.5250115182648917.


C = 0.1620398490070341, k = 5: 0.5115085100092597


[I 2025-02-24 16:06:53,476] Trial 10 finished with value: 0.5444980212596404 and parameters: {'k': 6, 'C': 0.05527774672114904, 'threshold': 0.49154779019248745}. Best is trial 10 with value: 0.5444980212596404.


C = 0.05527774672114904, k = 6: 0.5444980212596404


[I 2025-02-24 16:06:57,139] Trial 11 finished with value: 0.5650102876489683 and parameters: {'k': 6, 'C': 0.053544648178012764, 'threshold': 0.48004993143439206}. Best is trial 11 with value: 0.5650102876489683.


C = 0.053544648178012764, k = 6: 0.5650102876489683


[I 2025-02-24 16:07:01,070] Trial 12 finished with value: 0.5515102808955882 and parameters: {'k': 6, 'C': 0.048748957755943484, 'threshold': 0.4806712463695473}. Best is trial 11 with value: 0.5650102876489683.


C = 0.048748957755943484, k = 6: 0.5515102808955882


[I 2025-02-24 16:07:04,409] Trial 13 finished with value: 0.5210037623830727 and parameters: {'k': 5, 'C': 0.06208610306282273, 'threshold': 0.48127985399082635}. Best is trial 11 with value: 0.5650102876489683.


C = 0.06208610306282273, k = 5: 0.5210037623830727


[I 2025-02-24 16:07:07,568] Trial 14 finished with value: 0.49250299775037404 and parameters: {'k': 3, 'C': 0.03444959663481846, 'threshold': 0.4801826783343527}. Best is trial 11 with value: 0.5650102876489683.


C = 0.03444959663481846, k = 3: 0.49250299775037404


[I 2025-02-24 16:07:11,131] Trial 15 finished with value: 0.5010047528788159 and parameters: {'k': 4, 'C': 0.010154354037281228, 'threshold': 0.4842692418098473}. Best is trial 11 with value: 0.5650102876489683.


C = 0.010154354037281228, k = 4: 0.5010047528788159


[I 2025-02-24 16:07:15,044] Trial 16 finished with value: 0.5325032678855767 and parameters: {'k': 6, 'C': 0.07908581020216288, 'threshold': 0.4923592568232516}. Best is trial 11 with value: 0.5650102876489683.


C = 0.07908581020216288, k = 6: 0.5325032678855767


[I 2025-02-24 16:07:18,094] Trial 17 finished with value: 0.5240052646349498 and parameters: {'k': 5, 'C': 0.03397463650830244, 'threshold': 0.4838239534633762}. Best is trial 11 with value: 0.5650102876489683.


C = 0.03397463650830244, k = 5: 0.5240052646349498


[I 2025-02-24 16:07:20,986] Trial 18 finished with value: 0.4950077513795655 and parameters: {'k': 3, 'C': 0.12680150604687956, 'threshold': 0.4883069884344368}. Best is trial 11 with value: 0.5650102876489683.


C = 0.12680150604687956, k = 3: 0.4950077513795655


[I 2025-02-24 16:07:24,374] Trial 19 finished with value: 0.582013797905852 and parameters: {'k': 6, 'C': 0.02024923961544178, 'threshold': 0.48393262784412083}. Best is trial 19 with value: 0.582013797905852.


C = 0.02024923961544178, k = 6: 0.582013797905852


[I 2025-02-24 16:07:27,357] Trial 20 finished with value: 0.5000052526289408 and parameters: {'k': 4, 'C': 0.018049382589429924, 'threshold': 0.48373213164806017}. Best is trial 19 with value: 0.582013797905852.


C = 0.018049382589429924, k = 4: 0.5000052526289408


[I 2025-02-24 16:07:30,609] Trial 21 finished with value: 0.5724802763783273 and parameters: {'k': 6, 'C': 0.04612092923803916, 'threshold': 0.4811076631985286}. Best is trial 19 with value: 0.582013797905852.


C = 0.04612092923803916, k = 6: 0.5724802763783273


[I 2025-02-24 16:07:33,994] Trial 22 finished with value: 0.5795172984078532 and parameters: {'k': 6, 'C': 0.01798983673058243, 'threshold': 0.48631646890947416}. Best is trial 19 with value: 0.582013797905852.


C = 0.01798983673058243, k = 6: 0.5795172984078532


[I 2025-02-24 16:07:37,074] Trial 23 finished with value: 0.5265055160107633 and parameters: {'k': 5, 'C': 0.017420586943529638, 'threshold': 0.486107878143836}. Best is trial 19 with value: 0.582013797905852.


C = 0.017420586943529638, k = 5: 0.5265055160107633


[I 2025-02-24 16:07:40,626] Trial 24 finished with value: 0.573513543528536 and parameters: {'k': 6, 'C': 0.01082612163979407, 'threshold': 0.49109466137664115}. Best is trial 19 with value: 0.582013797905852.


C = 0.01082612163979407, k = 6: 0.573513543528536


[I 2025-02-24 16:07:44,966] Trial 25 finished with value: 0.5325040182611397 and parameters: {'k': 5, 'C': 0.010250102678983698, 'threshold': 0.4905519183265818}. Best is trial 19 with value: 0.582013797905852.


C = 0.010250102678983698, k = 5: 0.5325040182611397


[I 2025-02-24 16:07:49,644] Trial 26 finished with value: 0.5709847778813296 and parameters: {'k': 6, 'C': 0.02382832255253754, 'threshold': 0.49503400847139295}. Best is trial 19 with value: 0.582013797905852.


C = 0.02382832255253754, k = 6: 0.5709847778813296


[I 2025-02-24 16:07:52,361] Trial 27 finished with value: 0.488500494497496 and parameters: {'k': 2, 'C': 0.014114058985173532, 'threshold': 0.4946258726273587}. Best is trial 19 with value: 0.582013797905852.


C = 0.014114058985173532, k = 2: 0.488500494497496


[I 2025-02-24 16:07:55,347] Trial 28 finished with value: 0.5285037661349504 and parameters: {'k': 5, 'C': 0.02361353087050417, 'threshold': 0.489208163230158}. Best is trial 19 with value: 0.582013797905852.


C = 0.02361353087050417, k = 5: 0.5285037661349504


[I 2025-02-24 16:07:58,401] Trial 29 finished with value: 0.4950077513795655 and parameters: {'k': 3, 'C': 0.01257561820104729, 'threshold': 0.48602911953544226}. Best is trial 19 with value: 0.582013797905852.


C = 0.01257561820104729, k = 3: 0.4950077513795655
Best k: 6, Best C: 0.02024923961544178, Best threshold: 0.48393262784412083


[I 2025-02-24 16:08:01,891] A new study created in memory with name: no-name-b60ec731-4659-4fd3-9628-e4c40d420a6b
[I 2025-02-24 16:08:06,500] Trial 0 finished with value: 0.66697582139861 and parameters: {'k': 6, 'C': 0.02402154421302285, 'threshold': 0.5039186881606753}. Best is trial 0 with value: 0.66697582139861.


C = 0.02402154421302285, k = 6: 0.66697582139861


  return 1 / (1 + np.exp(-z))
[I 2025-02-24 16:08:09,733] Trial 1 finished with value: 0.5105030067548809 and parameters: {'k': 2, 'C': 0.2898630600385778, 'threshold': 0.49073862454487127}. Best is trial 0 with value: 0.66697582139861.


C = 0.2898630600385778, k = 2: 0.5105030067548809


[I 2025-02-24 16:08:13,197] Trial 2 finished with value: 0.5669897783840812 and parameters: {'k': 5, 'C': 0.011251866673727646, 'threshold': 0.48677612396237646}. Best is trial 0 with value: 0.66697582139861.


C = 0.011251866673727646, k = 5: 0.5669897783840812


[I 2025-02-24 16:08:16,166] Trial 3 finished with value: 0.5105030067548809 and parameters: {'k': 2, 'C': 0.12027316739173624, 'threshold': 0.48410421071681425}. Best is trial 0 with value: 0.66697582139861.


C = 0.12027316739173624, k = 2: 0.5105030067548809


[I 2025-02-24 16:08:19,628] Trial 4 finished with value: 0.5564890227558893 and parameters: {'k': 5, 'C': 0.016831154389176244, 'threshold': 0.4992532761297553}. Best is trial 0 with value: 0.66697582139861.


C = 0.016831154389176244, k = 5: 0.5564890227558893


[I 2025-02-24 16:08:22,838] Trial 5 finished with value: 0.5269895082488786 and parameters: {'k': 4, 'C': 0.0680568054785008, 'threshold': 0.48106798090803193}. Best is trial 0 with value: 0.66697582139861.


C = 0.0680568054785008, k = 4: 0.5269895082488786


[I 2025-02-24 16:08:26,529] Trial 6 finished with value: 0.6780025902964434 and parameters: {'k': 6, 'C': 0.03167153005145385, 'threshold': 0.4844154047095179}. Best is trial 6 with value: 0.6780025902964434.


C = 0.03167153005145385, k = 6: 0.6780025902964434


[I 2025-02-24 16:08:29,784] Trial 7 finished with value: 0.5069904987446218 and parameters: {'k': 5, 'C': 0.9820495247973137, 'threshold': 0.4908204158262195}. Best is trial 6 with value: 0.6780025902964434.


C = 0.9820495247973137, k = 5: 0.5069904987446218


[I 2025-02-24 16:08:33,073] Trial 8 finished with value: 0.502491246869058 and parameters: {'k': 4, 'C': 0.6490385632065075, 'threshold': 0.5023378679000247}. Best is trial 6 with value: 0.6780025902964434.


C = 0.6490385632065075, k = 4: 0.502491246869058


[I 2025-02-24 16:08:37,084] Trial 9 finished with value: 0.6664790727759243 and parameters: {'k': 6, 'C': 0.027355379793740368, 'threshold': 0.4990790858424138}. Best is trial 6 with value: 0.6780025902964434.


C = 0.027355379793740368, k = 6: 0.6664790727759243


[I 2025-02-24 16:08:40,111] Trial 10 finished with value: 0.5000142571356964 and parameters: {'k': 3, 'C': 0.06126203624994909, 'threshold': 0.4932763468778013}. Best is trial 6 with value: 0.6780025902964434.


C = 0.06126203624994909, k = 3: 0.5000142571356964


[I 2025-02-24 16:08:43,726] Trial 11 finished with value: 0.6845008426717571 and parameters: {'k': 6, 'C': 0.031168890471893106, 'threshold': 0.5045736544096422}. Best is trial 11 with value: 0.6845008426717571.


C = 0.031168890471893106, k = 6: 0.6845008426717571


[I 2025-02-24 16:08:47,667] Trial 12 finished with value: 0.6209973091532311 and parameters: {'k': 6, 'C': 0.03551372971784097, 'threshold': 0.5083921338742348}. Best is trial 11 with value: 0.6845008426717571.


C = 0.03551372971784097, k = 6: 0.6209973091532311


[I 2025-02-24 16:08:52,089] Trial 13 finished with value: 0.5764827796312054 and parameters: {'k': 6, 'C': 0.13706796652627007, 'threshold': 0.5095672240393027}. Best is trial 11 with value: 0.6845008426717571.


C = 0.13706796652627007, k = 6: 0.5764827796312054


[I 2025-02-24 16:08:55,787] Trial 14 finished with value: 0.571489030259645 and parameters: {'k': 5, 'C': 0.046548701508994275, 'threshold': 0.4802367111990016}. Best is trial 11 with value: 0.6845008426717571.


C = 0.046548701508994275, k = 5: 0.571489030259645


[I 2025-02-24 16:08:59,245] Trial 15 finished with value: 0.5334915125020072 and parameters: {'k': 4, 'C': 0.010927820657831782, 'threshold': 0.4962424234278626}. Best is trial 11 with value: 0.6845008426717571.


C = 0.010927820657831782, k = 4: 0.5334915125020072


[I 2025-02-24 16:09:03,584] Trial 16 finished with value: 0.6000580790685738 and parameters: {'k': 6, 'C': 0.09678776893355719, 'threshold': 0.4875584560176722}. Best is trial 11 with value: 0.6845008426717571.


C = 0.09678776893355719, k = 6: 0.6000580790685738


[I 2025-02-24 16:09:07,491] Trial 17 finished with value: 0.5309875092483788 and parameters: {'k': 5, 'C': 0.20983847369333597, 'threshold': 0.504031383302474}. Best is trial 11 with value: 0.6845008426717571.


C = 0.20983847369333597, k = 5: 0.5309875092483788


[I 2025-02-24 16:09:11,397] Trial 18 finished with value: 0.6990101045573308 and parameters: {'k': 6, 'C': 0.01920829840800954, 'threshold': 0.5068671179278016}. Best is trial 18 with value: 0.6990101045573308.


C = 0.01920829840800954, k = 6: 0.6990101045573308


[I 2025-02-24 16:09:14,664] Trial 19 finished with value: 0.5040145092618856 and parameters: {'k': 3, 'C': 0.017403469663691138, 'threshold': 0.5068086854479475}. Best is trial 18 with value: 0.6990101045573308.


C = 0.017403469663691138, k = 3: 0.5040145092618856


[I 2025-02-24 16:09:18,134] Trial 20 finished with value: 0.5339912626269449 and parameters: {'k': 4, 'C': 0.01771541265346576, 'threshold': 0.5060315466325422}. Best is trial 18 with value: 0.6990101045573308.


C = 0.01771541265346576, k = 4: 0.5339912626269449


[I 2025-02-24 16:09:21,639] Trial 21 finished with value: 0.6010410710560635 and parameters: {'k': 6, 'C': 0.04263300239580066, 'threshold': 0.5007001102824721}. Best is trial 18 with value: 0.6990101045573308.


C = 0.04263300239580066, k = 6: 0.6010410710560635


[I 2025-02-24 16:09:25,218] Trial 22 finished with value: 0.6750033391712552 and parameters: {'k': 6, 'C': 0.032042610476621707, 'threshold': 0.49680472143361176}. Best is trial 18 with value: 0.6990101045573308.


C = 0.032042610476621707, k = 6: 0.6750033391712552


[I 2025-02-24 16:09:28,504] Trial 23 finished with value: 0.5679892786339562 and parameters: {'k': 5, 'C': 0.02339314522523844, 'threshold': 0.5050391498500582}. Best is trial 18 with value: 0.6990101045573308.


C = 0.02339314522523844, k = 5: 0.5679892786339562


[I 2025-02-24 16:09:31,904] Trial 24 finished with value: 0.6535283409346379 and parameters: {'k': 6, 'C': 0.06225952426655213, 'threshold': 0.49374149005885537}. Best is trial 18 with value: 0.6990101045573308.


C = 0.06225952426655213, k = 6: 0.6535283409346379


[I 2025-02-24 16:09:35,154] Trial 25 finished with value: 0.5679817748783266 and parameters: {'k': 5, 'C': 0.01572168757249131, 'threshold': 0.5017492709541465}. Best is trial 18 with value: 0.6990101045573308.


C = 0.01572168757249131, k = 5: 0.5679817748783266


[I 2025-02-24 16:09:38,694] Trial 26 finished with value: 0.6235440838139489 and parameters: {'k': 6, 'C': 0.04360657786434326, 'threshold': 0.5074291255766302}. Best is trial 18 with value: 0.6990101045573308.


C = 0.04360657786434326, k = 6: 0.6235440838139489


[I 2025-02-24 16:09:41,774] Trial 27 finished with value: 0.5200027613820717 and parameters: {'k': 3, 'C': 0.08112245858294165, 'threshold': 0.48312368022287044}. Best is trial 18 with value: 0.6990101045573308.


C = 0.08112245858294165, k = 3: 0.5200027613820717


[I 2025-02-24 16:09:44,944] Trial 28 finished with value: 0.5604930267598932 and parameters: {'k': 5, 'C': 0.02277247151156676, 'threshold': 0.5094872770734897}. Best is trial 18 with value: 0.6990101045573308.


C = 0.02277247151156676, k = 5: 0.5604930267598932


[I 2025-02-24 16:09:48,564] Trial 29 finished with value: 0.6545143344243793 and parameters: {'k': 6, 'C': 0.012769375255630648, 'threshold': 0.5034262345065803}. Best is trial 18 with value: 0.6990101045573308.


C = 0.012769375255630648, k = 6: 0.6545143344243793
Best k: 6, Best C: 0.01920829840800954, Best threshold: 0.5068671179278016


[I 2025-02-24 16:09:52,154] A new study created in memory with name: no-name-d36352d5-f0c5-422e-a164-5a4198bf2e11
  return 1 / (1 + np.exp(-z))
[I 2025-02-24 16:09:55,095] Trial 0 finished with value: 0.5575170372771572 and parameters: {'k': 3, 'C': 0.09996570445050645, 'threshold': 0.4853405818119471}. Best is trial 0 with value: 0.5575170372771572.


C = 0.09996570445050645, k = 3: 0.5575170372771572


[I 2025-02-24 16:09:58,694] Trial 1 finished with value: 0.5960060510285398 and parameters: {'k': 6, 'C': 0.07292356601122152, 'threshold': 0.4907732837754635}. Best is trial 1 with value: 0.5960060510285398.


C = 0.07292356601122152, k = 6: 0.5960060510285398


[I 2025-02-24 16:10:01,601] Trial 2 finished with value: 0.5690127909018464 and parameters: {'k': 3, 'C': 0.020041278529962427, 'threshold': 0.49119723178807156}. Best is trial 1 with value: 0.5960060510285398.


C = 0.020041278529962427, k = 3: 0.5690127909018464


[I 2025-02-24 16:10:04,796] Trial 3 finished with value: 0.5640122881502192 and parameters: {'k': 5, 'C': 0.8726728937366919, 'threshold': 0.5037828185162471}. Best is trial 1 with value: 0.5960060510285398.


C = 0.8726728937366919, k = 5: 0.5640122881502192


[I 2025-02-24 16:10:08,264] Trial 4 finished with value: 0.5940047994021008 and parameters: {'k': 6, 'C': 0.057164739027093764, 'threshold': 0.49112067715336805}. Best is trial 1 with value: 0.5960060510285398.


C = 0.057164739027093764, k = 6: 0.5940047994021008


[I 2025-02-24 16:10:11,250] Trial 5 finished with value: 0.5510165337751545 and parameters: {'k': 3, 'C': 0.31308085118213175, 'threshold': 0.48156832349932155}. Best is trial 1 with value: 0.5960060510285398.


C = 0.31308085118213175, k = 3: 0.5510165337751545


[I 2025-02-24 16:10:14,736] Trial 6 finished with value: 0.5520145332739036 and parameters: {'k': 3, 'C': 0.6103074883520305, 'threshold': 0.48033115458056885}. Best is trial 1 with value: 0.5960060510285398.


C = 0.6103074883520305, k = 3: 0.5520145332739036


[I 2025-02-24 16:10:17,544] Trial 7 finished with value: 0.5605140372756564 and parameters: {'k': 2, 'C': 0.030891932374528332, 'threshold': 0.4985177134613736}. Best is trial 1 with value: 0.5960060510285398.


C = 0.030891932374528332, k = 2: 0.5605140372756564


[I 2025-02-24 16:10:21,527] Trial 8 finished with value: 0.5740065402734068 and parameters: {'k': 6, 'C': 0.13972974726040785, 'threshold': 0.5078327329753111}. Best is trial 1 with value: 0.5960060510285398.


C = 0.13972974726040785, k = 6: 0.5740065402734068


[I 2025-02-24 16:10:24,339] Trial 9 finished with value: 0.5645135390262827 and parameters: {'k': 2, 'C': 0.012205432154956774, 'threshold': 0.48404988657785536}. Best is trial 1 with value: 0.5960060510285398.


C = 0.012205432154956774, k = 2: 0.5645135390262827


[I 2025-02-24 16:10:27,361] Trial 10 finished with value: 0.570512041276659 and parameters: {'k': 5, 'C': 0.15080127961038325, 'threshold': 0.49783600473145867}. Best is trial 1 with value: 0.5960060510285398.


C = 0.15080127961038325, k = 5: 0.570512041276659


[I 2025-02-24 16:10:30,821] Trial 11 finished with value: 0.5870120495307901 and parameters: {'k': 6, 'C': 0.04317264046438225, 'threshold': 0.4909048273274188}. Best is trial 1 with value: 0.5960060510285398.


C = 0.04317264046438225, k = 6: 0.5870120495307901


[I 2025-02-24 16:10:34,116] Trial 12 finished with value: 0.5775077926502215 and parameters: {'k': 5, 'C': 0.06393140872305934, 'threshold': 0.49053158188071705}. Best is trial 1 with value: 0.5960060510285398.


C = 0.06393140872305934, k = 5: 0.5775077926502215


[I 2025-02-24 16:10:37,670] Trial 13 finished with value: 0.5885128006567286 and parameters: {'k': 6, 'C': 0.06053494555267592, 'threshold': 0.48858473385610246}. Best is trial 1 with value: 0.5960060510285398.


C = 0.06053494555267592, k = 6: 0.5885128006567286


[I 2025-02-24 16:10:40,864] Trial 14 finished with value: 0.5625130377754065 and parameters: {'k': 5, 'C': 0.24719856999131534, 'threshold': 0.49721384816493713}. Best is trial 1 with value: 0.5960060510285398.


C = 0.24719856999131534, k = 5: 0.5625130377754065


[I 2025-02-24 16:10:44,885] Trial 15 finished with value: 0.5680125402764084 and parameters: {'k': 4, 'C': 0.025851574205672776, 'threshold': 0.49464952643174065}. Best is trial 1 with value: 0.5960060510285398.


C = 0.025851574205672776, k = 4: 0.5680125402764084


[I 2025-02-24 16:10:48,422] Trial 16 finished with value: 0.5945068006537272 and parameters: {'k': 6, 'C': 0.08284111169966517, 'threshold': 0.48700016249346817}. Best is trial 1 with value: 0.5960060510285398.


C = 0.08284111169966517, k = 6: 0.5945068006537272


[I 2025-02-24 16:10:51,501] Trial 17 finished with value: 0.5670107888998444 and parameters: {'k': 4, 'C': 0.11620660336806515, 'threshold': 0.4860310089601209}. Best is trial 1 with value: 0.5960060510285398.


C = 0.11620660336806515, k = 4: 0.5670107888998444


[I 2025-02-24 16:10:55,046] Trial 18 finished with value: 0.569021045033039 and parameters: {'k': 6, 'C': 0.27288426597115606, 'threshold': 0.4945492856628212}. Best is trial 1 with value: 0.5960060510285398.


C = 0.27288426597115606, k = 6: 0.569021045033039


[I 2025-02-24 16:10:58,152] Trial 19 finished with value: 0.5695177936557246 and parameters: {'k': 5, 'C': 0.08527708415038715, 'threshold': 0.501466250920309}. Best is trial 1 with value: 0.5960060510285398.


C = 0.08527708415038715, k = 5: 0.5695177936557246


[I 2025-02-24 16:11:01,248] Trial 20 finished with value: 0.5650147899023462 and parameters: {'k': 4, 'C': 0.19490356866948003, 'threshold': 0.4869905281241621}. Best is trial 1 with value: 0.5960060510285398.


C = 0.19490356866948003, k = 4: 0.5650147899023462


[I 2025-02-24 16:11:04,659] Trial 21 finished with value: 0.5895062979021 and parameters: {'k': 6, 'C': 0.0570443263305881, 'threshold': 0.49248465852805723}. Best is trial 1 with value: 0.5960060510285398.


C = 0.0570443263305881, k = 6: 0.5895062979021


[I 2025-02-24 16:11:08,214] Trial 22 finished with value: 0.5970138054096076 and parameters: {'k': 6, 'C': 0.04702410502959134, 'threshold': 0.48855778512832665}. Best is trial 22 with value: 0.5970138054096076.


C = 0.04702410502959134, k = 6: 0.5970138054096076


[I 2025-02-24 16:11:11,649] Trial 23 finished with value: 0.5680102891497194 and parameters: {'k': 6, 'C': 0.03797323643157507, 'threshold': 0.4829331119489867}. Best is trial 22 with value: 0.5970138054096076.


C = 0.03797323643157507, k = 6: 0.5680102891497194


[I 2025-02-24 16:11:14,753] Trial 24 finished with value: 0.5755185470327899 and parameters: {'k': 5, 'C': 0.017829959480734882, 'threshold': 0.4885837997224746}. Best is trial 22 with value: 0.5970138054096076.


C = 0.017829959480734882, k = 5: 0.5755185470327899


[I 2025-02-24 16:11:18,168] Trial 25 finished with value: 0.5865115490302896 and parameters: {'k': 6, 'C': 0.08093574415099448, 'threshold': 0.4879546320350588}. Best is trial 22 with value: 0.5970138054096076.


C = 0.08093574415099448, k = 6: 0.5865115490302896


[I 2025-02-24 16:11:21,271] Trial 26 finished with value: 0.584015049532291 and parameters: {'k': 5, 'C': 0.04199285327304349, 'threshold': 0.4935464122636891}. Best is trial 22 with value: 0.5970138054096076.


C = 0.04199285327304349, k = 5: 0.584015049532291


[I 2025-02-24 16:11:24,808] Trial 27 finished with value: 0.5885120502811657 and parameters: {'k': 6, 'C': 0.46339290367314545, 'threshold': 0.484076418218948}. Best is trial 22 with value: 0.5970138054096076.


C = 0.46339290367314545, k = 6: 0.5885120502811657


[I 2025-02-24 16:11:27,773] Trial 28 finished with value: 0.5705135420277849 and parameters: {'k': 4, 'C': 0.01067371317662793, 'threshold': 0.4887295834630342}. Best is trial 22 with value: 0.5970138054096076.


C = 0.01067371317662793, k = 4: 0.5705135420277849


[I 2025-02-24 16:11:31,309] Trial 29 finished with value: 0.583009546277912 and parameters: {'k': 6, 'C': 0.10569969542094641, 'threshold': 0.48607478578110125}. Best is trial 22 with value: 0.5970138054096076.


C = 0.10569969542094641, k = 6: 0.583009546277912
Best k: 6, Best C: 0.04702410502959134, Best threshold: 0.48855778512832665


In [10]:
save_to_csv(prediction_0_LR, prediction_1_LR, prediction_2_LR, suffixe = "lr_separately")

# Prediction only (without Optuna)

In [11]:
def predict_spectrum_svm(X_train_path, Y_train_path, X_test_path, best_k, best_C):
    X_train, Y_train = load_data(X_train_path, Y_train_path)

    df_pred = pd.read_csv(X_test_path)
    X_test = df_pred["seq"].values

    K_train = spectrum_kernel_matrix(X_train, X_train, best_k)
    K_test = spectrum_kernel_matrix(X_test, X_train, best_k)
    alphas, support_vectors, bias = train_svm(K_train, Y_train, best_C)

    # Predict on test set
    predictions = predict_svm(K_test, alphas, support_vectors, Y_train[support_vectors], bias)

    # Convert {-1,1} predictions to {0,1}
    predictions = (predictions + 1) // 2

    df_pred["Bound"] = predictions

    return df_pred

In [12]:
def predict_spectrum_LR(X_train_path, 
                        Y_train_path, 
                        X_test_path, 
                        best_k, 
                        best_C, 
                        best_threshold):
    X_train, Y_train = load_data(X_train_path, Y_train_path)

    df_pred = pd.read_csv(X_test_path)
    X_test = df_pred["seq"].values

    K_train = spectrum_kernel_matrix(X_train, X_train, best_k)
    K_test = spectrum_kernel_matrix(X_test, X_train, best_k)
    w, b = train_logistic_regression(K_train, Y_train, best_C)

    # Predict on test set
    predictions = predict_logistic_regression(K_test, w, b, threshold = best_threshold)

    # Convert {-1,1} predictions to {0,1}
    predictions = (predictions + 1) // 2

    df_pred["Bound"] = predictions

    return df_pred


# With Kernel Ridge Regression 

In [None]:



# ===== Kernel Ridge Regression (KRR) =====
def train_kernel_ridge_regression(K_train, y_train, lambda_reg=1.0):
    """Train Kernel Ridge Regression: Solves (K + λI)α = y."""
    n = K_train.shape[0]
    alpha = solve(K_train + lambda_reg * np.eye(n), y_train, assume_a='pos')
    return alpha

def predict_kernel_ridge_regression(K_test, alpha):
    """Predict using Kernel Ridge Regression."""
    return np.sign(K_test @ alpha)  # Predict {-1,1}


def cross_val_score_krr(X, y, k, lambda_reg, n_splits=3):
    """Performs cross-validation for KRR with Spectrum Kernel."""
    folds = manual_kfold_split(X, y, n_splits)
    accuracies = []

    for i in range(n_splits):
        val_indices = folds[i]  
        train_indices = np.hstack([folds[j] for j in range(n_splits) if j != i])

        X_train, y_train = X[train_indices], y[train_indices]
        X_val, y_val = X[val_indices], y[val_indices]

        K_train = spectrum_kernel_matrix(X_train, X_train, k)
        K_val = spectrum_kernel_matrix(X_val, X_train, k)

        alpha = train_kernel_ridge_regression(K_train, y_train, lambda_reg)
        predictions = predict_kernel_ridge_regression(K_val, alpha)

        accuracy = np.mean(predictions == y_val)
        accuracies.append(accuracy)

    return np.mean(accuracies)

# ===== Optuna Optimization =====
def objective(trial):
    """Objective function for Optuna to optimize k and lambda_reg."""
    k = trial.suggest_int("k", 4, 10)  # k-mer length between entre 4 et 10
    lambda_reg = trial.suggest_float("lambda_reg", 1e0, 1e3, log = True)  # λ in [0.01, 100]

    cross_val = cross_val_score_krr(X_train, Y_train, k, lambda_reg)
    return cross_val  # Maximize accuracy

def train_and_predict_spectrum_krr(X_train_path, Y_train_path, X_test_path, n_trials=20):
    """Hyperparameter tuning + final prediction with Kernel Ridge Regression."""
    global X_train, Y_train  # Needed for Optuna

    # Load data
    X_train, Y_train = load_data(X_train_path, Y_train_path)

    df_test = pd.read_csv(X_test_path)
    X_test = df_test["seq"].values

    # Run Optuna optimization
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=n_trials)

    # Best hyperparameters
    best_k = study.best_params["k"]
    best_lambda = study.best_params["lambda_reg"]
    print(f"Best k: {best_k}, Best lambda_reg: {best_lambda}")

    # Train final model with best hyperparameters
    K_train = spectrum_kernel_matrix(X_train, X_train, best_k)
    K_test = spectrum_kernel_matrix(X_test, X_train, best_k)
    alpha = train_kernel_ridge_regression(K_train, Y_train, best_lambda)

    # Predict on test set
    predictions = predict_kernel_ridge_regression(K_test, alpha)

    # Convert {-1,1} predictions to {0,1}
    df_test["Bound"] = (predictions + 1) // 2

    return df_test


In [5]:
# === Run the function ===
prediction_0 = train_and_predict_spectrum_krr("./data/Xtr0.csv", "./data/Ytr0.csv", "./data/Xte0.csv", n_trials=40)
prediction_1 = train_and_predict_spectrum_krr("./data/Xtr1.csv", "./data/Ytr1.csv", "./data/Xte1.csv", n_trials=40)
prediction_2 = train_and_predict_spectrum_krr("./data/Xtr2.csv", "./data/Ytr2.csv", "./data/Xte2.csv", n_trials=40)

# Save predictions
save_to_csv(prediction_0, prediction_1, prediction_2, suffixe="krr_with greater_k_2")

[I 2025-03-10 17:02:59,981] A new study created in memory with name: no-name-02272cc9-927d-4b3d-b19a-11b780b49ff1


[I 2025-03-10 17:03:07,589] Trial 0 finished with value: 0.5995073034053543 and parameters: {'k': 8, 'lambda_reg': 37.441727772169614}. Best is trial 0 with value: 0.5995073034053543.
[I 2025-03-10 17:03:24,160] Trial 1 finished with value: 0.5909972941457199 and parameters: {'k': 10, 'lambda_reg': 15.711130527696206}. Best is trial 0 with value: 0.5995073034053543.
[I 2025-03-10 17:03:25,637] Trial 2 finished with value: 0.5610010310160235 and parameters: {'k': 5, 'lambda_reg': 3.137818811464361}. Best is trial 0 with value: 0.5995073034053543.
[I 2025-03-10 17:03:27,497] Trial 3 finished with value: 0.5894972933953443 and parameters: {'k': 6, 'lambda_reg': 54.420278531939154}. Best is trial 0 with value: 0.5995073034053543.
[I 2025-03-10 17:03:29,568] Trial 4 finished with value: 0.6079993036514776 and parameters: {'k': 6, 'lambda_reg': 291.95410818783324}. Best is trial 4 with value: 0.6079993036514776.
[I 2025-03-10 17:03:31,094] Trial 5 finished with value: 0.5625002813908361 and 

Best k: 5, Best lambda_reg: 358.2057585584608


[I 2025-03-10 17:08:58,127] A new study created in memory with name: no-name-65973cab-efcc-4c51-bc39-bac5130d0e01
[I 2025-03-10 17:09:15,148] Trial 0 finished with value: 0.7155071113092103 and parameters: {'k': 9, 'lambda_reg': 2.5608022706733906}. Best is trial 0 with value: 0.7155071113092103.
[I 2025-03-10 17:09:30,972] Trial 1 finished with value: 0.7160083621852738 and parameters: {'k': 9, 'lambda_reg': 470.97737077267726}. Best is trial 1 with value: 0.7160083621852738.
[I 2025-03-10 17:09:46,619] Trial 2 finished with value: 0.715005860433147 and parameters: {'k': 9, 'lambda_reg': 822.259108135477}. Best is trial 1 with value: 0.7160083621852738.
[I 2025-03-10 17:10:06,717] Trial 3 finished with value: 0.6900048474261368 and parameters: {'k': 10, 'lambda_reg': 19.719087994432293}. Best is trial 1 with value: 0.7160083621852738.
[I 2025-03-10 17:10:31,488] Trial 4 finished with value: 0.7160061110585847 and parameters: {'k': 9, 'lambda_reg': 77.54051675380039}. Best is trial 1 w

Best k: 8, Best lambda_reg: 532.6006476964751


[I 2025-03-10 17:18:20,009] A new study created in memory with name: no-name-5aeed77b-c7ca-4b10-8703-88cb731f6f8d
[I 2025-03-10 17:18:21,939] Trial 0 finished with value: 0.6135063099081091 and parameters: {'k': 5, 'lambda_reg': 68.78778002779663}. Best is trial 0 with value: 0.6135063099081091.
[I 2025-03-10 17:18:42,508] Trial 1 finished with value: 0.6374995685340513 and parameters: {'k': 9, 'lambda_reg': 11.433122623152883}. Best is trial 1 with value: 0.6374995685340513.
[I 2025-03-10 17:19:09,238] Trial 2 finished with value: 0.6360033196614906 and parameters: {'k': 10, 'lambda_reg': 59.01678442152743}. Best is trial 1 with value: 0.6374995685340513.
[I 2025-03-10 17:19:11,187] Trial 3 finished with value: 0.5885030457744101 and parameters: {'k': 5, 'lambda_reg': 18.35651133397762}. Best is trial 1 with value: 0.6374995685340513.
[I 2025-03-10 17:19:23,967] Trial 4 finished with value: 0.6429975702839271 and parameters: {'k': 8, 'lambda_reg': 6.8190849757811005}. Best is trial 4 

Best k: 8, Best lambda_reg: 39.08345223804072
