In [2]:
!pip install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.1.1-py3-none-any.whl (365 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m365.7/365.7 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.10.4-py3-none-any.whl (212 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m212.9/212.9 kB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting cmaes>=0.9.1 (from optuna)
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.7/78.7 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: Mako, colorlog, cmaes, alembic, optuna
Successfully i

In [3]:
!wget https://technionmail-my.sharepoint.com/:u:/g/personal/ploznik_campus_technion_ac_il/EQc79uRBeO1FqtH6ILFDx78BuuWui3DuRaBtnzTB6Aqxqg?download=1 -O data.tar
!tar -xvf data.tar

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
data/train/patient_5499.psv
data/train/patient_55.psv
data/train/patient_550.psv
data/train/patient_5500.psv
data/train/patient_5501.psv
data/train/patient_5502.psv
data/train/patient_5503.psv
data/train/patient_5504.psv
data/train/patient_5505.psv
data/train/patient_5506.psv
data/train/patient_5507.psv
data/train/patient_5508.psv
data/train/patient_5509.psv
data/train/patient_551.psv
data/train/patient_5510.psv
data/train/patient_5511.psv
data/train/patient_5512.psv
data/train/patient_5513.psv
data/train/patient_5514.psv
data/train/patient_5515.psv
data/train/patient_5516.psv
data/train/patient_5517.psv
data/train/patient_5518.psv
data/train/patient_5519.psv
data/train/patient_552.psv
data/train/patient_5520.psv
data/train/patient_5521.psv
data/train/patient_5522.psv
data/train/patient_5523.psv
data/train/patient_5524.psv
data/train/patient_5525.psv
data/train/patient_5526.psv
data/train/patient_5527.psv
data/train/patie

In [4]:
# Import libraries
import pandas as pd
import numpy as np
import os
import optuna # For hyperparameter optimization
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
import xgboost as xgb
import matplotlib.pyplot as plt

# Define data directories
# The archive is extracted into the current working directory, so the data
# folders are resolved relative to it rather than assuming Colab's /content.
# The trailing separator is kept so the values stay drop-in compatible with
# the previous string constants.
data_root = os.path.join(os.getcwd(), "data")
train_data_dir = os.path.join(data_root, "train", "")
test_data_dir = os.path.join(data_root, "test", "")

# Define a function to load and preprocess data from a directory
def load_and_preprocess_data(data_dir):
  """Build one feature row and one binary label per patient file in a directory.

  Each file is read as a pipe-separated table and forward-filled. A patient
  whose ``SepsisLabel`` column ever equals 1 contributes the first such row
  with label 1; every other patient contributes their last row with label 0.
  The ``SepsisLabel`` column itself is removed from the returned features.

  Args:
    data_dir: Directory containing one pipe-separated file per patient.

  Returns:
    A tuple ``(features, labels)``: a DataFrame with one row per patient
    (indexed by the row label picked inside each patient file) and a NumPy
    array of 0/1 labels in the same order.
  """
  data = []
  labels = []
  # os.listdir returns entries in arbitrary order; sorting keeps the row order
  # identical between runs and machines.
  for file in sorted(os.listdir(data_dir)):
    path = os.path.join(data_dir, file)
    # Skip sub-directories (e.g. notebook checkpoint folders): read_csv cannot parse them.
    if not os.path.isfile(path):
      continue
    # DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
    patient_data = pd.read_csv(path, sep='|').ffill()
    # Index of the first row flagged as septic, or None when the patient never is.
    sepsis_index = patient_data[patient_data["SepsisLabel"] == 1].first_valid_index()
    if sepsis_index is None:
      # No sepsis row: fall back to the patient's last recorded row.
      sepsis_index = len(patient_data) - 1
      label = 0
    else:
      label = 1
    # The label column must not be part of the feature row.
    features = patient_data.drop(columns=["SepsisLabel"])
    data.append(features.iloc[sepsis_index])
    labels.append(label)
  return pd.DataFrame(data), np.array(labels)

# Load and preprocess train and test data using the function
X_train, y_train = load_and_preprocess_data(train_data_dir)
X_test, y_test = load_and_preprocess_data(test_data_dir)

# Fail fast on a broken load instead of deep inside model fitting.
assert len(X_train) > 0 and len(X_test) > 0, "no patient files were loaded"
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

In [None]:
# Define a function to create a classifier pipeline with a given name and parameters
def create_classifier(name, params):
  """Return an impute -> scale -> classify pipeline for the requested model.

  Args:
    name: One of "LogisticRegression", "RandomForestClassifier", "SVC" or
      "XGBClassifier".
    params: Keyword arguments forwarded to the chosen classifier's constructor.

  Returns:
    A Pipeline whose "preprocessing" step mean-imputes and then standardizes
    the features, and whose "classifier" step is the configured estimator.

  Raises:
    ValueError: If ``name` ` is not one of the supported classifier names.
  """
  # Supported names paired with the estimator class each one stands for.
  supported = (
    ("LogisticRegression", LogisticRegression),
    ("RandomForestClassifier", RandomForestClassifier),
    ("SVC", SVC),
    ("XGBClassifier", xgb.XGBClassifier),
  )
  matching_classes = [estimator_class for known_name, estimator_class in supported if name == known_name]
  if not matching_classes:
    raise ValueError(f"Invalid classifier name: {name}")
  model = matching_classes[0](**params)

  # Missing values are imputed before scaling, exactly in this order.
  impute_and_scale = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler()),
  ])
  return Pipeline(steps=[
    ("preprocessing", impute_and_scale),
    ("classifier", model),
  ])

# Define a function to optimize a classifier using Optuna
def optimize_classifier(trial):
  """Optuna objective: sample a classifier configuration and score it by validation F1.

  The classifier family and its hyperparameters are drawn from ``trial``. The
  resulting pipeline is fitted on 80% of the module-level ``X_train`` /
  ``y_train`` and evaluated on the remaining 20%.

  Args:
    trial: The Optuna trial used to suggest the classifier name and its
      hyperparameters.

  Returns:
    The F1 score of the fitted pipeline on the held-out validation split.
  """
  # Define the classifier name and hyperparameters to be optimized
  classifier_name = trial.suggest_categorical("classifier", ["LogisticRegression", "RandomForestClassifier", "SVC", "XGBClassifier"])
  if classifier_name == "LogisticRegression":
    params = {
      "C": trial.suggest_float("C", 1e-5, 1e5, log=True),
      "penalty": trial.suggest_categorical("penalty", ["l1", "l2"]),
      # liblinear is the single choice offered because it supports both penalties above
      "solver": trial.suggest_categorical("solver", ["liblinear"])
    }
  elif classifier_name == "RandomForestClassifier":
    params = {
      "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
      "max_depth": trial.suggest_int("max_depth", 2, 32),
      # "auto" is no longer offered: for classifiers it was an alias of "sqrt"
      # and scikit-learn 1.3 removed it, so sampling it makes the trial raise.
      "max_features": trial.suggest_categorical("max_features", ["sqrt", "log2"]),
      "bootstrap": trial.suggest_categorical("bootstrap", [True, False])
    }
  elif classifier_name == "SVC":
    params = {
      "C": trial.suggest_float("C", 1e-5, 1e5, log=True),
      "kernel": trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"]),
      "gamma": trial.suggest_categorical("gamma", ["scale", "auto"])
    }
  elif classifier_name == "XGBClassifier":
    params = {
      "n_estimators": trial.suggest_int("n_estimators", 10, 1000),
      "max_depth": trial.suggest_int("max_depth", 2, 32),
      "learning_rate": trial.suggest_float("learning_rate", 1e-5, 1e-1, log=True),
      "subsample": trial.suggest_float("subsample", 0.5, 1.0),
      "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0)
    }

  # Create a classifier pipeline using the create_classifier function
  pipeline = create_classifier(classifier_name, params)

  # Every trial is scored on the same stratified hold-out split. With an
  # unseeded split, differences in F1 between trials would partly reflect
  # which rows happened to land in validation rather than the hyperparameters.
  X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
  )

  # Fit the pipeline on the train set
  pipeline.fit(X_train_split, y_train_split)

  # Predict on the validation set
  y_pred = pipeline.predict(X_valid_split)

  # Calculate and return the F1 score
  f1 = f1_score(y_valid_split, y_pred)
  return f1

# Create a study object and optimize the classifier using the optimize_classifier function
# Seed the sampler so its random hyperparameter proposals are repeatable across runs
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(optimize_classifier, n_trials=20)

# Print the best trial information
print(f"Best F1 score: {study.best_value}")
print(f"Best parameters: {study.best_params}")

# Rebuild the winning pipeline: "classifier" selects the model family, every
# other entry of best_params is a constructor argument for that model
best_classifier_name = study.best_params["classifier"]
best_params = {k: v for k, v in study.best_params.items() if k != "classifier"}
best_pipeline = create_classifier(best_classifier_name, best_params)

# Fit the best pipeline on the whole train set
best_pipeline.fit(X_train, y_train)

# Predict on the test set
y_pred = best_pipeline.predict(X_test)

# Calculate and print the F1 score on the test set
test_f1 = f1_score(y_test, y_pred)
print(f"Test F1 score: {test_f1}")

# optuna.visualization returns Plotly figures; matplotlib's plt.show() does not
# render them, so each figure is shown explicitly.
history_fig = optuna.visualization.plot_optimization_history(study)
history_fig.show()

# Plot the parameter importance
importance_fig = optuna.visualization.plot_param_importances(study)
importance_fig.show()

[32m[I 2023-05-14 20:07:46,105][0m A new study created in memory with name: no-name-ef607b1d-5afd-460f-b568-d4c8d41448a4[0m


In [6]:
# Save the best model to a file
import joblib
# Persist the fitted pipeline to disk so it can be reloaded without retraining
joblib.dump(value=best_pipeline, filename="best_model.pkl")

['best_model.pkl']