<a href="https://colab.research.google.com/github/dkapitan/jads-nhs-proms/blob/master/notebooks/4.0-modeling-classification-auto-sklearn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!sudo apt-get install build-essential swig
!curl https://raw.githubusercontent.com/automl/auto-sklearn/master/requirements.txt | xargs -n 1 -L 1 pip install
!pip install auto-sklearn

Reading package lists... Done
Building dependency tree       
Reading state information... Done
build-essential is already the newest version (12.4ubuntu1).
swig is already the newest version (3.0.12-1).
The following package was automatically installed and is no longer required:
  libnvidia-common-440
Use 'sudo apt autoremove' to remove it.
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100   206  100   206    0     0   1450      0 --:--:-- --:--:-- --:--:--  1450


In [10]:
# Parquet engine for pandas - presumably required by the pd.read_parquet call
# that loads the data further down (TODO confirm pyarrow is not already available).
!pip install fastparquet

Collecting fastparquet
[?25l  Downloading https://files.pythonhosted.org/packages/28/b9/844e32d0e3739e5695057dff3a3b9f4abc0fcccff466fdaadb8fedb0ee1d/fastparquet-0.4.1.tar.gz (28.6MB)
[K     |████████████████████████████████| 28.6MB 128kB/s 
Collecting thrift>=0.11.0
[?25l  Downloading https://files.pythonhosted.org/packages/97/1e/3284d19d7be99305eda145b8aa46b0c33244e4a496ec66440dac19f8274d/thrift-0.13.0.tar.gz (59kB)
[K     |████████████████████████████████| 61kB 7.2MB/s 
Building wheels for collected packages: fastparquet, thrift
  Building wheel for fastparquet (setup.py) ... [?25l[?25hdone
  Created wheel for fastparquet: filename=fastparquet-0.4.1-cp36-cp36m-linux_x86_64.whl size=7125488 sha256=a9758c5162869cfef4b9d8a2b69a1443c265fb8d1a3da1973c228ebbfed03cc0
  Stored in directory: /root/.cache/pip/wheels/10/45/cf/492ccb908adde1dd2551bb509a56e4096cce9487167f525120
  Building wheel for thrift (setup.py) ... [?25l[?25hdone
  Created wheel for thrift: filename=thrift-0.13.0-cp3

In [14]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import StratifiedShuffleSplit
import autosklearn.classification


In [15]:
# Read the interim knee-provider parquet file directly from the GitHub repository.
KNEE_PROVIDER_URL = 'https://github.com/dkapitan/jads-nhs-proms/blob/master/data/interim/knee-provider.parquet?raw=true'
df = pd.read_parquet(KNEE_PROVIDER_URL)

# handy function to select oks columns
def oks_questions(t='t0', frame=None):
    """Return the names of the ``oks_<t>*`` columns, excluding ``*_score`` columns.

    Parameters
    ----------
    t : str
        Measurement-moment tag embedded in the column names (e.g. ``'t0'``, ``'t1'``).
    frame : pandas.DataFrame, optional
        Frame whose columns are inspected. Defaults to the notebook-level ``df``,
        which keeps existing ``oks_questions('t0')`` calls working unchanged.

    Returns
    -------
    list of str
        Matching column names, in the order they appear in ``frame.columns``.

    Notes
    -----
    Matching is purely name-based: any column that starts with ``oks_<t>`` and
    does not end in ``_score`` is returned, including derived columns that are
    added to the frame later on.
    """
    if frame is None:
        frame = df  # fall back to the notebook-level DataFrame
    prefix = f"oks_{t}"
    return [
        col
        for col in frame.columns
        if col.startswith(prefix) and not col.endswith("_score")
    ]

# Recode the sentinel value 9 in the oks question columns (t0 and t1) to each
# column's most frequent value.
# This is done on the original dataframe (rather than in the pipeline later on)
# so that it happens prior to StratifiedShuffleSplit.
oks_no9 = [*oks_questions('t0'), *oks_questions('t1')]
impute_oks = SimpleImputer(strategy="most_frequent", missing_values=9)
oks_imputed = impute_oks.fit_transform(df[oks_no9])
df.loc[:, oks_no9] = oks_imputed

# Named groups of t0 columns, used to route columns to preprocessing steps.
age_band = ["age_band"]
gender = ["gender"]
# Sorted string values of age_band; non-string entries (e.g. missing values) are skipped.
age_band_categories = sorted(
    value for value in df["age_band"].unique() if isinstance(value, str)
)
comorb = [
    "heart_disease", "high_bp", "stroke", "circulation",
    "lung_disease", "diabetes", "kidney_disease", "nervous_system",
    "liver_disease", "cancer", "depression", "arthritis",
]
# NOTE(review): "t0_previous_surgery" is listed in both `boolean` and `categorical`.
boolean = ["t0_assisted", "t0_previous_surgery", "t0_disability"]
eq5d = [
    "t0_mobility",
    "t0_self_care",
    "t0_activity",
    "t0_discomfort",
    "t0_anxiety",
]
eq_vas = ["t0_eq_vas"]
categorical = [
    "t0_symptom_period",
    "t0_previous_surgery",
    "t0_living_arrangements",
]
oks_score = ["oks_t0_score"]

# add number of comorbidities as extra feature
# The sentinel value 9 in the comorbidity columns is recoded to 0 before counting.
impute_comorb = SimpleImputer(missing_values=9, strategy="constant", fill_value=0)
df.loc[:, comorb] = impute_comorb.fit_transform(df[comorb])
# Sum across the comorbidity columns (axis=1) to get one count per row.
# Without axis=1 pandas sums down each column and returns a Series indexed by
# column name; assigning that to a row-indexed column aligns on no labels and
# leaves n_comorb filled with NaN.
df["n_comorb"] = df.loc[:, comorb].sum(axis=1)


# define outcome Y
# A total at or above its cut-off is labelled "good".
CUT_OFF_PAIN = 4
CUT_OFF_FUNCTIONING = 26

for t in ("t0", "t1"):
    pain_col = f"oks_{t}_pain_total"
    functioning_col = f"oks_{t}_functioning_total"

    # pain total = the two pain items
    df[pain_col] = df[f"oks_{t}_pain"] + df[f"oks_{t}_night_pain"]

    # functioning total = row-wise sum of every oks question without "pain" in its name
    non_pain_items = [col for col in oks_questions(t) if "pain" not in col]
    df[functioning_col] = df.loc[:, non_pain_items].sum(axis=1)

    # boolean outcome flags per measurement moment
    df[f"y_{t}_pain_good"] = df[pain_col] >= CUT_OFF_PAIN
    df[f"y_{t}_functioning_good"] = df[functioning_col] >= CUT_OFF_FUNCTIONING

# define binary outcome parameter: good on both pain and functioning at t1
df["y_binary"] = df.y_t1_pain_good & df.y_t1_functioning_good

# Only using 1 split for stratified sampling, more folds are used later on in cross-validation
split = StratifiedShuffleSplit(n_splits=1, test_size=0.3, random_state=42)
# split() yields positional row numbers, so rows are selected with .iloc;
# .loc would treat the numbers as index labels, which only gives the same rows
# when df happens to carry a default 0..n-1 index.
train_index, test_index = next(split.split(df, df["y_binary"]))
df_train = df.iloc[train_index]
df_test = df.iloc[test_index]

# Training targets. The functioning target gets its own name; previously it
# overwrote y_train_pain_good, so the pain target was lost.
y_train_pain_good = df_train.y_t1_pain_good
y_train_functioning_good = df_train.y_t1_functioning_good
y_train_binary = df_train.y_binary

INFO:numexpr.utils:NumExpr defaulting to 2 threads.


In [17]:
# same pipeline as lecture 3
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer


# Preprocessing pipelines for specific columns. Both first replace NaN with the
# column's most frequent value; age band is then ordinal-encoded using
# age_band_categories, gender is one-hot encoded.
def _nan_to_most_frequent():
    """Build a fresh imputer that replaces NaN with the most frequent value."""
    return SimpleImputer(missing_values=np.nan, strategy="most_frequent")


age_band_pipe = Pipeline(
    steps=[
        ("impute", _nan_to_most_frequent()),
        ("ordinal", OrdinalEncoder(categories=[age_band_categories])),
    ]
)
gender_pipe = Pipeline(
    steps=[("impute", _nan_to_most_frequent()), ("onehot", OneHotEncoder())]
)

# ColumnTransformer on all included columns.
# Note columns that are not specified are dropped by default.
# Each entry is a (name, transformer, columns) triple.
# NOTE(review): "t0_previous_surgery" appears in both `boolean` and `categorical`,
# so it is selected twice in the "categorical" entry.
# NOTE(review): oks_questions('t0') is evaluated here, after oks_t0_pain_total and
# oks_t0_functioning_total were added to df, so those derived columns are passed
# through as features as well - confirm this is intended.
transformers = dict(
    age=("age", age_band_pipe, age_band),
    gender=("gender", gender_pipe, gender),
    comorb=("comorb", 'passthrough', comorb),
    categorical=(
        "categorical",
        SimpleImputer(missing_values=9, strategy="most_frequent"),
        boolean + eq5d + categorical,
    ),
    oks=("oks", 'passthrough', oks_questions('t0')),
    eq_vas=("eqvas", SimpleImputer(missing_values=999, strategy="median"), eq_vas),
)
prep = ColumnTransformer(transformers=list(transformers.values()))

# Fit the preprocessing on the training split only, then apply the already
# fitted imputers/encoders to the test split. Calling fit_transform on df_test
# would re-learn the imputation values and encodings from the test data, so the
# test features would no longer be encoded the way the model was trained.
X_train = prep.fit_transform(df_train)
X_test = prep.transform(df_test)

In [19]:
# Fit an auto-sklearn classifier with default settings on the preprocessed
# training data, using the combined binary outcome as target.
# NOTE(review): the repr printed by this cell shows time_left_for_this_task=3600
# and per_run_time_limit=360 - presumably seconds, so expect a long-running cell.
automl = autosklearn.classification.AutoSklearnClassifier()
automl.fit(X_train, y_train_binary)



AutoSklearnClassifier(delete_output_folder_after_terminate=True,
                      delete_tmp_folder_after_terminate=True,
                      disable_evaluator_output=False,
                      ensemble_memory_limit=1024, ensemble_nbest=50,
                      ensemble_size=50, exclude_estimators=None,
                      exclude_preprocessors=None, get_smac_object_callback=None,
                      include_estimators=None, include_preprocessors=None,
                      initial_configurations_via_metalearning=25,
                      logging_config=None, max_models_on_disc=50,
                      metadata_directory=None, metric=None,
                      ml_memory_limit=3072, n_jobs=None, output_folder=None,
                      per_run_time_limit=360, resampling_strategy='holdout',
                      resampling_strategy_arguments=None, seed=1,
                      shared_mode=False, smac_scenario_args=None,
                      time_left_for_this_task=3600,

In [21]:
# Import the submodule explicitly: a bare `import sklearn` does not guarantee
# that sklearn.metrics has been loaded.
import sklearn.metrics

# Accuracy on the rows the model was fitted on - an optimistic figure.
y_hat = automl.predict(X_train)
print("Accuracy score", sklearn.metrics.accuracy_score(y_train_binary, y_hat))

# Accuracy on the held-out split, which the model has not seen during fitting.
y_hat_test = automl.predict(X_test)
print("Accuracy score (test)", sklearn.metrics.accuracy_score(df_test.y_binary, y_hat_test))

Accuracy score 0.741250705381419
