In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance

# Load data
train_features = pd.read_csv("train_features.csv")
train_targets = pd.read_csv("train_targets_scored.csv")

# Subsample for testing purposes
fraction = 0.5
train_features = train_features.sample(frac=fraction, random_state=42).reset_index(drop=True)
train_targets = train_targets.loc[train_features.index].reset_index(drop=True)

# Preprocessing
X = train_features.drop(columns=["sig_id"])
y = train_targets.drop(columns=["sig_id"])

# Encoding of categorical features
X = pd.get_dummies(X, columns=["cp_type", "cp_time", "cp_dose"], drop_first=True)

# Standardize features
scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X[X.columns])

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Neural Network Model (MLPClassifier)
mlp_model = MLPClassifier(
    hidden_layer_sizes=(100, 50),  # Two hidden layers with 100 and 50 units
    activation='relu',  # ReLU activation function
    solver='adam',  # Adam optimizer
    max_iter=1000,  # Increased number of iterations to allow the model to converge
    random_state=42,
    verbose=True
)

# XGBoost Model
xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    random_state=42
)

# Train both models
print("Training the MLP model...")
mlp_model.fit(X_train, y_train)

print("Training the XGBoost model...")
xgb_model.fit(X_train, y_train)

# Predict probabilities for both models
mlp_pred_prob = mlp_model.predict_proba(X_val)
xgb_pred_prob = xgb_model.predict_proba(X_val)

# Combine the predictions using averaging (soft voting)
# Note: We assume the output probabilities are of shape (n_samples, n_labels)
combined_pred_prob = (mlp_pred_prob + xgb_pred_prob) / 2

# Calculate log loss for multilabel classification
log_loss_score = log_loss(y_val, combined_pred_prob, eps=1e-15)
print(f"Validation Log Loss (ensemble): {log_loss_score:.4f}")

# Feature importance analysis using permutation importance on XGBoost (as it supports feature importance)
print("Calculating feature importance from XGBoost...")

# Using permutation importance on the XGBoost model
result = permutation_importance(xgb_model, X_val, y_val, n_repeats=10, random_state=42)

# Sorting and displaying the feature importances
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': result.importances_mean
}).sort_values(by='Importance', ascending=False)

print(importance_df.head(10))  # Displaying top 10 important features


Training the MLP model...
Iteration 1, loss = 51.91990347
Iteration 2, loss = 6.04483452
Iteration 3, loss = 5.06611163
Iteration 4, loss = 4.76672442
Iteration 5, loss = 4.59725801
Iteration 6, loss = 4.48237406
Iteration 7, loss = 4.38802219
Iteration 8, loss = 4.31103979
Iteration 9, loss = 4.23979563
Iteration 10, loss = 4.17681570
Iteration 11, loss = 4.11262199
Iteration 12, loss = 4.05514570
Iteration 13, loss = 3.99722483
Iteration 14, loss = 3.94065938
Iteration 15, loss = 3.88502050
Iteration 16, loss = 3.81770666
Iteration 17, loss = 3.75793830
Iteration 18, loss = 3.69169682
Iteration 19, loss = 3.62355850
Iteration 20, loss = 3.54939146
Iteration 21, loss = 3.47485137
Iteration 22, loss = 3.39362811
Iteration 23, loss = 3.31632956
Iteration 24, loss = 3.23659063
Iteration 25, loss = 3.15376209
Iteration 26, loss = 3.06306595
Iteration 27, loss = 2.97495522
Iteration 28, loss = 2.88502845
Iteration 29, loss = 2.79939200
Iteration 30, loss = 2.70712060
Iteration 31, loss = 2



Validation Log Loss (ensemble): 4.5969
Calculating feature importance from XGBoost...
    Feature  Importance
0       g-0         0.0
588   g-588         0.0
577   g-577         0.0
578   g-578         0.0
579   g-579         0.0
580   g-580         0.0
581   g-581         0.0
582   g-582         0.0
583   g-583         0.0
584   g-584         0.0


In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.inspection import permutation_importance

# Load data
train_features = pd.read_csv("train_features.csv")
train_targets = pd.read_csv("train_targets_scored.csv")

# Subsample for testing purposes
fraction = 0.5
train_features = train_features.sample(frac=fraction, random_state=42).reset_index(drop=True)
train_targets = train_targets.loc[train_features.index].reset_index(drop=True)

# Preprocessing
X = train_features.drop(columns=["sig_id"])
y = train_targets.drop(columns=["sig_id"])

# Encoding of categorical features
X = pd.get_dummies(X, columns=["cp_type", "cp_time", "cp_dose"], drop_first=True)

# Standardize features
scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X[X.columns])

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Neural Network Model (MLPClassifier)
mlp_model = MLPClassifier(
    hidden_layer_sizes=(256, 128, 64),  # Deeper and wider network
    activation='relu',  # ReLU activation function
    solver='adam',  # Adam optimizer
    alpha=0.001,  # L2 regularization
    batch_size=64,  # Smaller batch size
    learning_rate_init=0.0005,  # Smaller learning rate
    max_iter=2000,  # Allow more iterations
    early_stopping=True,  # Enable early stopping
    n_iter_no_change=20,  # Patience for early stopping
    random_state=42,
    verbose=True
)

# XGBoost Model
xgb_model = XGBClassifier(
    n_estimators=300,
    learning_rate=0.05,
    max_depth=8,
    random_state=42
)

# Train both models
print("Training the MLP model...")
mlp_model.fit(X_train, y_train)

print("Training the XGBoost model...")
xgb_model.fit(X_train, y_train)

# Predict probabilities for both models
mlp_pred_prob = mlp_model.predict_proba(X_val)
xgb_pred_prob = xgb_model.predict_proba(X_val)

# Combine the predictions using weighted averaging (soft voting)
mlp_weight = 0.6
xgb_weight = 0.4
combined_pred_prob = mlp_weight * mlp_pred_prob + xgb_weight * xgb_pred_prob

# Calculate log loss for multilabel classification
log_loss_score = log_loss(y_val, combined_pred_prob, eps=1e-15)
print(f"Validation Log Loss (ensemble): {log_loss_score:.4f}")

# Feature importance analysis using permutation importance on XGBoost (as it supports feature importance)
print("Calculating feature importance from XGBoost...")

# Using permutation importance on the XGBoost model
result = permutation_importance(xgb_model, X_val, y_val, n_repeats=10, random_state=42)

# Sorting and displaying the feature importances
importance_df = pd.DataFrame({
    'Feature': X.columns,
    'Importance': result.importances_mean
}).sort_values(by='Importance', ascending=False)

print(importance_df.head(10))  # Displaying top 10 important features


Training the MLP model...
Iteration 1, loss = 33.27243452
Validation score: 0.418678
Iteration 2, loss = 4.58276695
Validation score: 0.418678
Iteration 3, loss = 4.37525461
Validation score: 0.418678
Iteration 4, loss = 4.27606034
Validation score: 0.418678
Iteration 5, loss = 4.19290671
Validation score: 0.418678
Iteration 6, loss = 4.10397044
Validation score: 0.418678
Iteration 7, loss = 4.01420491
Validation score: 0.418678
Iteration 8, loss = 3.92008747
Validation score: 0.418678
Iteration 9, loss = 3.80943635
Validation score: 0.418678
Iteration 10, loss = 3.67907091
Validation score: 0.418678
Iteration 11, loss = 3.51407271
Validation score: 0.418678
Iteration 12, loss = 3.33405585
Validation score: 0.418678
Iteration 13, loss = 3.11555705
Validation score: 0.417629
Iteration 14, loss = 2.88997837
Validation score: 0.410283
Iteration 15, loss = 2.63678578
Validation score: 0.411333
Iteration 16, loss = 2.39996863
Validation score: 0.412382
Iteration 17, loss = 2.15153786
Valida



Validation Log Loss (ensemble): 3.6018
Calculating feature importance from XGBoost...
    Feature  Importance
0       g-0         0.0
588   g-588         0.0
577   g-577         0.0
578   g-578         0.0
579   g-579         0.0
580   g-580         0.0
581   g-581         0.0
582   g-582         0.0
583   g-583         0.0
584   g-584         0.0


In [39]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from imblearn.over_sampling import SMOTE
from sklearn.multioutput import MultiOutputClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
import numpy as np

# Preprocessing pipeline
categorical_columns = X_train.select_dtypes(include=['object']).columns  # Identify categorical columns
numerical_columns = X_train.select_dtypes(exclude=['object']).columns  # Identify numerical columns

# Preprocessing steps for numerical and categorical columns
preprocessor = ColumnTransformer(
    transformers=[
        ('num', SimpleImputer(strategy='mean'), numerical_columns),  # Handle missing numerical values
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns)  # Handle categorical columns with OneHotEncoder
    ])

# Apply preprocessing to the dataset before SMOTE
X_train_processed = preprocessor.fit_transform(X_train)

# Initialize y_resampled as an empty list
y_resampled = []

# Resample each label using SMOTE individually with k_neighbors set to 1
for i in range(y_train.shape[1]):  # Loop over each label in the multi-label problem
    # Check if there is more than one class in the label column
    if len(np.unique(y_train.iloc[:, i])) > 1:
        smote = SMOTE(random_state=42, k_neighbors=1) 
        X_resampled_label, y_resampled_label = smote.fit_resample(X_train_processed, y_train.iloc[:, i])  # Access label correctly using iloc
        
        # Append resampled labels to y_resampled
        y_resampled.append(y_resampled_label)
    else:
        # If there is only one class, append the original label to y_resampled without modification
        y_resampled.append(y_train.iloc[:, i].values)

# Convert y_resampled to a numpy array after resampling all labels
y_resampled = np.array(y_resampled).T  # Transpose so it matches the shape of X_resampled

# Now, X_resampled should be used for training the models
X_resampled = X_resampled_label  # Assign the X_resampled data to the variable

# Define multi-label classifiers
mlp_model = MultiOutputClassifier(MLPClassifier(hidden_layer_sizes=(50, 25), max_iter=100, random_state=42))
xgb_model = MultiOutputClassifier(XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='mlogloss'))

# Train MLP Model
print("Training MLP model...")
mlp_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', mlp_model)
])
mlp_pipeline.fit(X_resampled, y_resampled)

# Train XGBoost Model
print("Training XGBoost model...")
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_model)
])
xgb_pipeline.fit(X_resampled, y_resampled)

# Evaluate the models
print("Evaluating MLP model...")
mlp_score = mlp_pipeline.score(X_test, y_test)
print(f"MLP Model Accuracy: {mlp_score}")

print("Evaluating XGBoost model...")
xgb_score = xgb_pipeline.score(X_test, y_test)
print(f"XGBoost Model Accuracy: {xgb_score}")


ValueError: Expected n_neighbors <= n_samples_fit, but n_neighbors = 2, n_samples_fit = 1, n_samples = 1