In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import log_loss

# Load data
train_features = pd.read_csv("train_features.csv")
train_targets = pd.read_csv("train_targets_scored.csv")

# Subsample for testing purposes
fraction = 0.1
train_features = train_features.sample(frac=fraction, random_state=42).reset_index(drop=True)
train_targets = train_targets.loc[train_features.index].reset_index(drop=True)

# Preprocessing
X = train_features.drop(columns=["sig_id"])
y = train_targets.drop(columns=["sig_id"])

# One-hot encode categorical features
X = pd.get_dummies(X, columns=["cp_type", "cp_time", "cp_dose"], drop_first=True)

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the base RandomForestClassifier
rf_model = RandomForestClassifier(random_state=42)

# Wrap the RandomForestClassifier in MultiOutputClassifier
multi_rf_model = MultiOutputClassifier(rf_model)

# Define hyperparameter grid for Random Forest
param_grid = {
    "estimator__n_estimators": [50, 100, 200],
    "estimator__max_depth": [10, 20, None],
    "estimator__min_samples_split": [2, 5, 10],
    "estimator__min_samples_leaf": [1, 2, 4],
    "estimator__max_features": ["sqrt", "log2", None]
}

# Use GridSearchCV to find the best parameters
grid_search = GridSearchCV(
    estimator=multi_rf_model,
    param_grid=param_grid,
    scoring="neg_log_loss",
    cv=3,
    n_jobs=-1,
    verbose=1
)

# Train the grid search
print("Optimizing hyperparameters...")
grid_search.fit(X_train, y_train)

# Retrieve the best model
best_rf_model = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Predict probabilities on the validation set
y_val_pred = []
for clf in best_rf_model.estimators_:
    proba = clf.predict_proba(X_val)
    if proba.shape[1] == 2:  # Binary classification
        y_val_pred.append(proba[:, 1])  # Use the probability for class 1
    else:  # Handle cases with only one class in training data
        y_val_pred.append(np.zeros(proba.shape[0]))

y_val_pred = np.column_stack(y_val_pred)

# Calculate log loss
log_loss_score = log_loss(y_val, y_val_pred, eps=1e-15)
print(f"Validation Log Loss (optimized): {log_loss_score:.4f}")


Optimizing hyperparameters...
Fitting 3 folds for each of 243 candidates, totalling 729 fits


Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/model_selection/_validation.py", line 982, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 253, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/metrics/_scorer.py", line 350, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
                        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.12/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib

KeyboardInterrupt: 

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler

# Load data
train_features = pd.read_csv("train_features.csv")
train_targets = pd.read_csv("train_targets_scored.csv")

# Subsample for testing purposes
fraction = 0.1
train_features = train_features.sample(frac=fraction, random_state=42).reset_index(drop=True)
train_targets = train_targets.loc[train_features.index].reset_index(drop=True)

# Preprocessing
X = train_features.drop(columns=["sig_id"])
y = train_targets.drop(columns=["sig_id"])

# Encoding categorical features
X = pd.get_dummies(X, columns=["cp_type", "cp_time", "cp_dose"], drop_first=True)

# Standardize features
scaler = StandardScaler()
X[X.columns] = scaler.fit_transform(X[X.columns])

# Train-validation split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
multi_rf_model = MultiOutputClassifier(rf_model)

# Train the model
print("Training the model...")
multi_rf_model.fit(X_train, y_train)

# Predict probabilities and handle shape inconsistencies
y_val_pred = []
for clf in multi_rf_model.estimators_:
    proba = clf.predict_proba(X_val)
    if proba.shape[1] == 2:  # Binary classification: use probability for class 1
        y_val_pred.append(proba[:, 1])
    else:  # Single-class case: all probabilities are 0 for class 1
        y_val_pred.append(np.zeros(proba.shape[0]))

# Combine predictions into a single array
y_val_pred = np.column_stack(y_val_pred)

# Calculate log loss
log_loss_score = log_loss(y_val, y_val_pred, eps=1e-15)
print(f"Validation Log Loss: {log_loss_score:.4f}")


Training the model...
Validation Log Loss: 11.1043


