In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
import optuna

# load data
train_features = pd.read_csv("train_features.csv")
train_targets = pd.read_csv("train_targets_scored.csv")

# Increase the fraction of data used for training
fraction = 0.4  # Use a larger fraction of data
train_features = train_features.sample(frac=fraction, random_state=42).reset_index(drop=True)
train_targets = train_targets.loc[train_features.index].reset_index(drop=True)

# Preprocessing
X = train_features.drop(columns=["sig_id"])
y = train_targets.drop(columns=["sig_id"])

# Encoding of categorical features
X = pd.get_dummies(X, columns=["cp_type", "cp_time", "cp_dose"], drop_first=True)

# Standardize features
scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X[X.columns])

# Apply Principal Component Analysis (PCA)
pca = PCA(n_components=0.95)  # Retain 95% of variance
X = pca.fit_transform(X)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define an Optuna objective function for hyperparameter tuning
def objective(trial):
    param = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 400),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "subsample": trial.suggest_float("subsample", 0.6, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "random_state": 42,
    }
    model = MultiOutputClassifier(XGBClassifier(**param, eval_metric='logloss'))
    model.fit(X_train, y_train)
    y_val_pred = np.column_stack([
        clf.predict_proba(X_val)[:, 1] for clf in model.estimators_
    ])
    return log_loss(y_val, y_val_pred, eps=1e-15)

# optuna optimization
print("Starting Optuna optimization...")
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=3)

# parameters from Optuna
best_params = study.best_params
print(f"Best parameters: {best_params}")

# training optimized Model
print("Training the optimized model...")
optimized_model = MultiOutputClassifier(XGBClassifier(**best_params, eval_metric='logloss'))
optimized_model.fit(X_train, y_train)

# predict probabilities
y_val_pred = np.column_stack([
    clf.predict_proba(X_val)[:, 1] for clf in optimized_model.estimators_
])

# log loss
log_loss_score = log_loss(y_val, y_val_pred, eps=1e-15)
print(f"Validation Log Loss (optimized): {log_loss_score:.4f}")

# Feature importance listing
feature_importances = optimized_model.estimators_[0].feature_importances_
print("Feature importances:")
for i, importance in enumerate(feature_importances):
    print(f"Feature {i + 1}: {importance:.4f}")


[I 2024-12-22 19:38:49,003] A new study created in memory with name: no-name-839b43a1-9a0c-471c-9555-675621ccfa22


Starting Optuna optimization...


[I 2024-12-22 19:47:25,188] Trial 0 finished with value: 3.668514753831667 and parameters: {'n_estimators': 105, 'max_depth': 9, 'learning_rate': 0.010832175019578921, 'subsample': 0.6249674978772852, 'colsample_bytree': 0.9939023088180211}. Best is trial 0 with value: 3.668514753831667.
[I 2024-12-22 19:52:35,635] Trial 1 finished with value: 3.615892207701256 and parameters: {'n_estimators': 80, 'max_depth': 5, 'learning_rate': 0.022538684115441972, 'subsample': 0.7345150477760821, 'colsample_bytree': 0.5786323711008818}. Best is trial 1 with value: 3.615892207701256.
[I 2024-12-22 20:13:48,939] Trial 2 finished with value: 3.551105842111734 and parameters: {'n_estimators': 248, 'max_depth': 7, 'learning_rate': 0.05129596324167866, 'subsample': 0.6252259043941274, 'colsample_bytree': 0.9102142375640712}. Best is trial 2 with value: 3.551105842111734.
[I 2024-12-22 20:39:39,490] Trial 3 finished with value: 3.992198137568356 and parameters: {'n_estimators': 397, 'max_depth': 9, 'learn

Best parameters: {'n_estimators': 341, 'max_depth': 5, 'learning_rate': 0.019742998791995415, 'subsample': 0.9148006382280481, 'colsample_bytree': 0.5508829418971352}
Training the optimized model...
Validation Log Loss (optimized): 3.4283
Feature importances:
Feature 1: 0.0025
Feature 2: 0.0000
Feature 3: 0.0008
Feature 4: 0.0005
Feature 5: 0.0000
Feature 6: 0.0000
Feature 7: 0.0007
Feature 8: 0.0000
Feature 9: 0.0000
Feature 10: 0.0000
Feature 11: 0.0000
Feature 12: 0.0000
Feature 13: 0.0000
Feature 14: 0.0000
Feature 15: 0.0040
Feature 16: 0.0000
Feature 17: 0.0000
Feature 18: 0.0000
Feature 19: 0.0000
Feature 20: 0.0000
Feature 21: 0.0000
Feature 22: 0.0000
Feature 23: 0.0000
Feature 24: 0.0000
Feature 25: 0.0000
Feature 26: 0.0000
Feature 27: 0.0041
Feature 28: 0.0009
Feature 29: 0.0000
Feature 30: 0.0000
Feature 31: 0.0040
Feature 32: 0.0000
Feature 33: 0.0005
Feature 34: 0.0006
Feature 35: 0.0006
Feature 36: 0.0070
Feature 37: 0.0000
Feature 38: 0.0008
Feature 39: 0.0072
Feature 

