In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import optuna

# Load data
train_features = pd.read_csv("train_features.csv")
train_targets = pd.read_csv("train_targets_scored.csv")

# Use a fraction of data for faster iterations
fraction = 0.4  # Adjust fraction for computational feasibility
train_features = train_features.sample(frac=fraction, random_state=42).reset_index(drop=True)
train_targets = train_targets.loc[train_features.index].reset_index(drop=True)

# Preprocessing
X = train_features.drop(columns=["sig_id"])
y = train_targets.drop(columns=["sig_id"])

# One-hot encode categorical features
X = pd.get_dummies(X, columns=["cp_type", "cp_time", "cp_dose"], drop_first=True)

# Standardize features
scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X[X.columns])

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X = pca.fit_transform(X)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)  # Removed stratify=y

# Define Optuna objective function for hyperparameter tuning
def objective(trial):
    # Hyperparameters to optimize
    n_neighbors = trial.suggest_int("n_neighbors", 3, 30)
    weights = trial.suggest_categorical("weights", ["uniform", "distance"])
    p = trial.suggest_int("p", 1, 2)  # Manhattan (p=1) or Euclidean (p=2)
    
    # KNN model
    knn_model = MultiOutputClassifier(KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p))
    knn_model.fit(X_train, y_train)
    
    # Predict probabilities for validation set
    y_val_pred = np.column_stack([
        clf.predict_proba(X_val)[:, 1] if clf.predict_proba(X_val).shape[1] > 1 else clf.predict_proba(X_val)[:, 0]
        for clf in knn_model.estimators_
    ])
    return log_loss(y_val, y_val_pred, eps=1e-15)

# Run Optuna optimization
print("Starting Optuna optimization...")
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=3)  # Increase number of trials for better optimization

# Get best parameters from Optuna
best_params = study.best_params
print(f"Best parameters: {best_params}")

# Train the optimized model
print("Training the optimized KNN model...")
optimized_knn_model = MultiOutputClassifier(
    KNeighborsClassifier(
        n_neighbors=best_params["n_neighbors"],
        weights=best_params["weights"],
        p=best_params["p"]
    )
)
optimized_knn_model.fit(X_train, y_train)

# Predict probabilities for validation set
y_val_pred = np.column_stack([
    clf.predict_proba(X_val)[:, 1] if clf.predict_proba(X_val).shape[1] > 1 else clf.predict_proba(X_val)[:, 0]
    for clf in optimized_knn_model.estimators_
])

# Calculate and display validation log loss
log_loss_score = log_loss(y_val, y_val_pred, eps=1e-15)
print(f"Validation Log Loss (optimized): {log_loss_score:.4f}")


[I 2024-12-27 10:53:35,724] A new study created in memory with name: no-name-d98b9fa6-ef01-469f-9a97-7e6dcd4474df


Starting Optuna optimization...


[I 2024-12-27 11:05:02,243] Trial 0 finished with value: 23.896101628737412 and parameters: {'n_neighbors': 5, 'weights': 'distance', 'p': 1}. Best is trial 0 with value: 23.896101628737412.
[I 2024-12-27 11:16:26,660] Trial 1 finished with value: 21.171960547815942 and parameters: {'n_neighbors': 23, 'weights': 'distance', 'p': 1}. Best is trial 1 with value: 21.171960547815942.
[I 2024-12-27 11:17:12,919] Trial 2 finished with value: 22.663984526899775 and parameters: {'n_neighbors': 12, 'weights': 'distance', 'p': 2}. Best is trial 1 with value: 21.171960547815942.


Best parameters: {'n_neighbors': 23, 'weights': 'distance', 'p': 1}
Training the optimized KNN model...
Validation Log Loss (optimized): 21.1720


