In [268]:
# import necessary modules and libraries
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA

# in order to supress the warnings
import warnings
warnings.filterwarnings('ignore')

In [269]:
# load the dataset
from scipy.io import arff
data = arff.loadarff('bone-marrow.arff')
df = pd.DataFrame(data[0])
df.drop(columns=['Disease'], inplace=True)

# convert all columns to numeric, coerce errors to null values
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors='coerce')
    
# make sure binary columns are encoded as 0 and 1
for c in df.columns[df.nunique()==2]:
    df[c] = (df[c]==1)*1.0
    
# inspect the data
df.head()

Unnamed: 0,Recipientgender,Stemcellsource,Donorage,Donorage35,IIIV,Gendermatch,DonorABO,RecipientABO,RecipientRh,ABOmatch,...,extcGvHD,CD34kgx10d6,CD3dCD34,CD3dkgx10d8,Rbodymass,ANCrecovery,PLTrecovery,time_to_aGvHD_III_IV,survival_time,survival_status
0,1.0,1.0,22.830137,0.0,1.0,0.0,1,1.0,1.0,0.0,...,1.0,7.2,1.33876,5.38,35.0,19.0,51.0,32.0,999.0,0.0
1,1.0,0.0,23.342466,0.0,1.0,0.0,-1,-1.0,1.0,0.0,...,1.0,4.5,11.078295,0.41,20.6,16.0,37.0,1000000.0,163.0,1.0
2,1.0,0.0,26.394521,0.0,1.0,0.0,-1,-1.0,1.0,0.0,...,1.0,7.94,19.01323,0.42,23.4,23.0,20.0,1000000.0,435.0,1.0
3,0.0,0.0,39.684932,1.0,1.0,0.0,1,2.0,1.0,1.0,...,0.0,4.25,29.481647,0.14,50.0,23.0,29.0,19.0,53.0,1.0
4,0.0,1.0,33.358904,0.0,0.0,0.0,1,2.0,0.0,1.0,...,1.0,51.85,3.972255,13.05,9.0,14.0,14.0,1000000.0,2043.0,0.0


In [270]:
# 1. Calculate the number of unique values for each column
print('Count of unique values in each column:')
print(df.nunique())

Count of unique values in each column:
Recipientgender           2
Stemcellsource            2
Donorage                187
Donorage35                2
IIIV                      2
Gendermatch               2
DonorABO                  4
RecipientABO              4
RecipientRh               2
ABOmatch                  2
CMVstatus                 4
DonorCMV                  2
RecipientCMV              2
Riskgroup                 2
Txpostrelapse             2
Diseasegroup              2
HLAmatch                  4
HLAmismatch               2
Antigen                   4
Alel                      5
HLAgrI                    7
Recipientage            125
Recipientage10            2
Recipientageint           3
Relapse                   2
aGvHDIIIIV                2
extcGvHD                  2
CD34kgx10d6             183
CD3dCD34                182
CD3dkgx10d8             163
Rbodymass               130
ANCrecovery              18
PLTrecovery              50
time_to_aGvHD_III_IV     28
survival_

In [271]:
# 2. Set target, survival_status, as y; features (dropping survival status and time) as X
y = df.survival_status
X = df.drop(columns=['survival_status'])

# 3. Define lists of numeric and categorical columns based on number of unique values
num_cols = X.columns[X.nunique() > 7]
cat_cols = X.columns[X.nunique() <= 7]

# 4. Check to see what, if any, columns in X have missing values and print them.
missing_cols = X.columns[X.isnull().any()]
print('Columns with missing values:')
print(missing_cols)

Columns with missing values:
Index(['RecipientABO', 'CMVstatus', 'Antigen', 'Alel', 'CD3dCD34',
       'CD3dkgx10d8', 'Rbodymass'],
      dtype='object')


In [272]:
# 5. Split data into train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# 6. Create categorical preprocessing pipeline
# Using mode to fill in missing values and OHE
cat_vals = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ("ohe", OneHotEncoder(sparse_output=False, drop="first", handle_unknown='ignore'))
])

# 7. Create numerical preprocessing pipeline
# Using mean to fill in missing values and standard scaling of features
num_vals = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ("scaler", StandardScaler())
])

In [273]:
# 8. Create column transformer that will preprocess the numerical and categorical features separately
preprocess = ColumnTransformer(
    transformers=[
    ("cat", cat_vals, cat_cols),
    ("num", num_vals, num_cols)
]
)

# 9. Create a pipeline with preprocess, PCA, and a logistic regression model
pipeline = Pipeline([
    ("preprocessor", preprocess),
    ("pca", PCA()),
    ("clf", LogisticRegression())
])

# 10. Fit the pipeline on the training data and predict it on the test data
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)
print(f"Pipeline Accuracy Test Set: {pipeline.score(X_test, y_test)}")

Pipeline Accuracy Test Set: 0.868421052631579


In [274]:
# 11. Define search space of hyperparameters
search_space = [
    {
        "clf__C": np.logspace(-4, 2, 7),
        "pca__n_components": np.arange(5, 37).astype(int),
    }
]

# 12. Search over hyperparameters above to optimize pipeline and fit
gs = GridSearchCV(pipeline, search_space, cv=5)
gs.fit(X_train, y_train)

In [275]:
# 13. Save the best estimator from the grid search
best_model = gs.best_estimator_

# 14. Print attributes of best_model
print(f"Best model: {best_model.named_steps["clf"]}")
print(f"Best hyperparameter: {best_model.named_steps["clf"].get_params()["C"]}")
print(f"The number of components: {best_model.named_steps["pca"].n_components}")

Best model: LogisticRegression(C=np.float64(1.0))
Best hyperparameter: 1.0
The number of components: 12


In [276]:
# 15. Print final accuracy score 
print('Best Model Accuracy Test Set:')
print(best_model.score(X_test,y_test))

Best Model Accuracy Test Set:
0.868421052631579
