In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.decomposition import PCA
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MultiLabelBinarizer

print("Applying SMOTE for multi-label resampling...")

# NOTE(review): X_train / y_train are assigned further down in this file (by
# the train/validation split), so this section only runs if they already
# exist in the session -- confirm the intended execution order.

# Collapse each row's label vector into one "0_1_0_..." string so SMOTE can
# treat every distinct label combination as a single class.
combined_labels = (
    y_train.astype(int).astype(str).apply(lambda row: "_".join(row), axis=1)
)

# SMOTE interpolates between a sample and its k nearest same-class
# neighbours, so a class needs more than k_neighbors rows. Combinations that
# are too rare are passed through unchanged instead of crashing the sampler.
k_neighbors = 2
combo_sizes = combined_labels.map(combined_labels.value_counts()).to_numpy()
resample_mask = combo_sizes > k_neighbors
rare_X = X_train.loc[~resample_mask]
rare_labels = combined_labels.loc[~resample_mask]

# Apply SMOTE
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
if combined_labels.loc[resample_mask].nunique() > 1:
    X_resampled, combined_labels_resampled = smote.fit_resample(
        X_train.loc[resample_mask], combined_labels.loc[resample_mask]
    )
    X_resampled = pd.concat(
        [pd.DataFrame(X_resampled, columns=X_train.columns), rare_X],
        ignore_index=True,
    )
    combined_labels_resampled = pd.concat(
        [pd.Series(combined_labels_resampled), rare_labels], ignore_index=True
    )
else:
    # Fewer than two resample-able classes: nothing for SMOTE to balance.
    X_resampled = X_train.reset_index(drop=True)
    combined_labels_resampled = combined_labels.reset_index(drop=True)

# Convert combined labels back to multi-label format. Each "_"-separated
# token is the 0/1 value of one target column, in column order, so the string
# is split positionally (a MultiLabelBinarizer keyed on the column names would
# never match the "0"/"1" tokens and would return an all-zero matrix).
y_train_resampled = combined_labels_resampled.str.split("_", expand=True).astype(int)
y_train_resampled.columns = y_train.columns

# Reset indices for consistency
X_train_resampled = pd.DataFrame(X_resampled, columns=X_train.columns).reset_index(drop=True)
y_train_resampled = y_train_resampled.reset_index(drop=True)

print(f"Resampled X_train shape: {X_train_resampled.shape}")
print(f"Resampled y_train shape: {y_train_resampled.shape}")

# Load Data
# NOTE(review): rows of the two CSVs are assumed to be in the same order
# (matched by sig_id) -- confirm, since sig_id is dropped before pairing.
data = pd.read_csv("train_features.csv")
targets = pd.read_csv("train_targets_scored.csv")

# Drop 'sig_id' and one-hot encode the categorical features
data = data.drop(columns=["sig_id"], errors="ignore")
categorical_cols = ["cp_type", "cp_time", "cp_dose"]
data = pd.get_dummies(data, columns=categorical_cols, drop_first=True)

# Drop 'sig_id' from targets
targets = targets.drop(columns=["sig_id"], errors="ignore")

# Train-validation split, stratified on the number of active labels per row.
# Row positions are split first so the scaler below can be fitted on the
# training rows only; the resulting partition is the same one obtained by
# splitting `data` and `targets` directly with these arguments.
train_rows, val_rows = train_test_split(
    np.arange(len(data)),
    test_size=0.2,
    random_state=42,
    stratify=targets.sum(axis=1),
)

# Scale features using statistics from the training rows only, so nothing
# about the validation rows leaks into the preprocessing. A new float frame
# is built instead of assigning into `data.iloc[:, :]`, because the dummy
# columns are not float and cannot hold the scaled values in place.
scaler = StandardScaler()
scaler.fit(data.iloc[train_rows])
data = pd.DataFrame(scaler.transform(data), columns=data.columns, index=data.index)

X_train, X_val = data.iloc[train_rows], data.iloc[val_rows]
y_train, y_val = targets.iloc[train_rows], targets.iloc[val_rows]

# Apply SMOTE on label combinations so features and targets stay row-aligned
from imblearn.over_sampling import SMOTE


def _smote_label_powerset(features, labels, k_neighbors=2, random_state=42,
                          sampling_strategy="auto"):
    """Oversample a multi-label dataset while keeping X and y row-aligned.

    Running SMOTE once per label column yields a different number of rows
    for every label, so the results cannot share one feature matrix. Here
    each distinct label combination is treated as one class instead, so every
    synthetic row carries a complete label vector.

    Args:
        features: DataFrame of input features.
        labels: DataFrame of 0/1 targets with the same number of rows.
        k_neighbors: Neighbour count passed to SMOTE. Combinations with
            ``k_neighbors`` or fewer rows are kept as-is and not resampled.
        random_state: Seed passed to SMOTE.
        sampling_strategy: Passed to SMOTE unchanged.
            NOTE(review): the default balances every eligible combination,
            which can produce a very large frame -- consider a dict here.

    Returns:
        ``(X, y)`` DataFrames with fresh 0..n-1 indices and matching rows.
    """
    feature_frame = features.reset_index(drop=True)
    label_frame = labels.reset_index(drop=True).astype(int)

    # One string key per row, e.g. "0_1_0", identifying its label combination.
    combo_keys = label_frame.astype(str).apply(lambda row: "_".join(row), axis=1)
    combo_sizes = combo_keys.map(combo_keys.value_counts()).to_numpy()
    eligible = combo_sizes > k_neighbors

    if combo_keys.loc[eligible].nunique() < 2:
        # Nothing to balance against; return the data untouched.
        return feature_frame, label_frame

    sampler = SMOTE(
        random_state=random_state,
        k_neighbors=k_neighbors,
        sampling_strategy=sampling_strategy,
    )
    X_new, keys_new = sampler.fit_resample(
        feature_frame.loc[eligible], combo_keys.loc[eligible]
    )

    # Decode the combination keys back into one 0/1 column per label.
    y_new = pd.Series(keys_new).str.split("_", expand=True).astype(int)
    y_new.columns = label_frame.columns

    X_out = pd.concat(
        [pd.DataFrame(X_new, columns=feature_frame.columns),
         feature_frame.loc[~eligible]],
        ignore_index=True,
    )
    y_out = pd.concat([y_new, label_frame.loc[~eligible]], ignore_index=True)
    return X_out, y_out


print("Applying SMOTE on combined label sets...")
X_train_smote, y_train_smote = _smote_label_powerset(X_train, y_train)

# Kept under the previous name as well, for the shape report below.
X_train_resampled = X_train_smote

print(f"Resampled X_train shape: {X_train_resampled.shape}")
print(f"Resampled y_train shape: {y_train_smote.shape}")


# Optional step: project the features onto a fixed number of principal
# components. The projection is learned from the training features and then
# applied, unchanged, to the validation features.
n_components = 100
print(f"Reducing features to {n_components} dimensions with PCA...")
pca = PCA(random_state=42, n_components=n_components)
X_train_pca, X_val_pca = (
    pca.fit_transform(X_train_smote),  # evaluated first: fits the projection
    pca.transform(X_val),
)

# Hyperparameters shared by every per-label XGBoost classifier
xgb_params = dict(
    max_depth=6,
    n_estimators=500,
    learning_rate=0.05,
    subsample=0.9,
    colsample_bytree=0.9,
    min_child_weight=3,
    gamma=1.0,
    random_state=42,
)

# MultiOutputClassifier fits one copy of the base classifier per target
# column; n_jobs=-1 asks it to use all available cores for that.
xgb_model = MultiOutputClassifier(
    XGBClassifier(**xgb_params),
    n_jobs=-1,
)

# Train and evaluate using Stratified K-Fold Cross-Validation


def _positive_class_probabilities(model, features):
    """Return an (n_samples, n_labels) array of P(label == 1).

    ``model`` is a fitted MultiOutputClassifier; its ``predict_proba``
    returns one array per label. The positive-class column is located via
    each estimator's ``classes_`` rather than assumed to be column 1, so a
    label that had no positives in the training fold yields zeros instead of
    an indexing error.
    """
    per_label = model.predict_proba(features)
    columns = []
    for estimator, proba in zip(model.estimators_, per_label):
        classes = list(estimator.classes_)
        if 1 in classes:
            columns.append(proba[:, classes.index(1)])
        else:
            columns.append(np.zeros(proba.shape[0]))
    return np.column_stack(columns)


def _mean_columnwise_log_loss(y_true, y_prob, eps=1e-15):
    """Mean binary log loss over every (sample, label) cell.

    Probabilities are clipped to [eps, 1 - eps] so log() stays finite.
    Computed directly with NumPy because sklearn's ``log_loss`` no longer
    accepts ``eps`` in recent releases and scores a 2-D indicator target as a
    single multiclass problem rather than as independent binary labels.
    """
    truth = np.asarray(y_true, dtype=float)
    prob = np.clip(np.asarray(y_prob, dtype=float), eps, 1.0 - eps)
    cell_loss = truth * np.log(prob) + (1.0 - truth) * np.log(1.0 - prob)
    return float(-cell_loss.mean())


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
log_loss_scores = []

print("Training and evaluating the model...")
# Folds are stratified on the number of active labels per row.
fold_strata = y_train_smote.sum(axis=1)
for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_pca, fold_strata)):
    print(f"Processing fold {fold + 1}...")

    # Split data into training and validation sets for the fold
    X_train_fold, X_val_fold = X_train_pca[train_idx], X_train_pca[val_idx]
    y_train_fold, y_val_fold = y_train_smote.iloc[train_idx], y_train_smote.iloc[val_idx]

    # Fit the model
    xgb_model.fit(X_train_fold, y_train_fold)

    # Predict positive-class probabilities on the fold's validation rows
    y_val_pred = _positive_class_probabilities(xgb_model, X_val_fold)

    # Calculate log loss for the fold
    fold_log_loss = _mean_columnwise_log_loss(y_val_fold, y_val_pred, eps=1e-15)
    log_loss_scores.append(fold_log_loss)
    print(f"Fold {fold + 1} Log Loss: {fold_log_loss:.4f}")

# Average log loss across all folds
avg_log_loss = np.mean(log_loss_scores)
print(f"Average Validation Log Loss: {avg_log_loss:.4f}")


Applying SMOTE for multi-label resampling...
