In [27]:
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from scipy.io import arff

data = arff.loadarff('bone-marrow.arff')
df = pd.DataFrame(data[0])
df.drop(columns=['Disease'], inplace=True)

#Convert all columns to numeric, coerce errors to null values
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors='coerce')

#Make sure binary columns are encoded as 0 and 1
for c in df.columns[df.nunique()==2]:
    df[c] = (df[c]==1)*1.0


In [28]:
df.describe()

Unnamed: 0,Recipientgender,Stemcellsource,Donorage,Donorage35,IIIV,Gendermatch,DonorABO,RecipientABO,RecipientRh,ABOmatch,...,extcGvHD,CD34kgx10d6,CD3dCD34,CD3dkgx10d8,Rbodymass,ANCrecovery,PLTrecovery,time_to_aGvHD_III_IV,survival_time,survival_status
count,187.0,187.0,187.0,187.0,187.0,187.0,187.0,186.0,187.0,187.0,...,187.0,187.0,182.0,182.0,185.0,187.0,187.0,187.0,187.0,187.0
mean,0.59893,0.775401,33.472068,0.44385,0.59893,0.171123,0.390374,0.274194,0.84492,0.716578,...,0.684492,11.891781,5.385096,4.745714,35.801081,26752.86631,90937.919786,775408.042781,938.743316,0.454545
std,0.491431,0.418438,8.271826,0.498171,0.491431,0.377627,0.837632,0.938706,0.362953,0.45187,...,0.465965,9.914386,9.598716,3.859128,19.650922,161747.200525,288242.407688,418425.252689,849.589495,0.499266
min,0.0,0.0,18.646575,0.0,0.0,0.0,-1.0,-1.0,0.0,0.0,...,0.0,0.79,0.204132,0.04,6.0,9.0,9.0,10.0,6.0,0.0
25%,0.0,1.0,27.039726,0.0,0.0,0.0,0.0,-1.0,1.0,0.0,...,0.0,5.35,1.786683,1.6875,19.0,13.0,16.0,1000000.0,168.5,0.0
50%,1.0,1.0,33.550685,0.0,1.0,0.0,0.0,0.0,1.0,1.0,...,1.0,9.72,2.734462,4.325,33.0,15.0,21.0,1000000.0,676.0,0.0
75%,1.0,1.0,40.117809,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,15.415,5.823565,6.785,50.6,17.0,37.0,1000000.0,1604.0,1.0
max,1.0,1.0,55.553425,1.0,1.0,1.0,2.0,2.0,1.0,1.0,...,1.0,57.78,99.56097,20.02,103.4,1000000.0,1000000.0,1000000.0,3364.0,1.0


In [29]:
df.head()

Unnamed: 0,Recipientgender,Stemcellsource,Donorage,Donorage35,IIIV,Gendermatch,DonorABO,RecipientABO,RecipientRh,ABOmatch,...,extcGvHD,CD34kgx10d6,CD3dCD34,CD3dkgx10d8,Rbodymass,ANCrecovery,PLTrecovery,time_to_aGvHD_III_IV,survival_time,survival_status
0,1.0,1.0,22.830137,0.0,1.0,0.0,1,1.0,1.0,0.0,...,1.0,7.2,1.33876,5.38,35.0,19.0,51.0,32.0,999.0,0.0
1,1.0,0.0,23.342466,0.0,1.0,0.0,-1,-1.0,1.0,0.0,...,1.0,4.5,11.078295,0.41,20.6,16.0,37.0,1000000.0,163.0,1.0
2,1.0,0.0,26.394521,0.0,1.0,0.0,-1,-1.0,1.0,0.0,...,1.0,7.94,19.01323,0.42,23.4,23.0,20.0,1000000.0,435.0,1.0
3,0.0,0.0,39.684932,1.0,1.0,0.0,1,2.0,1.0,1.0,...,0.0,4.25,29.481647,0.14,50.0,23.0,29.0,19.0,53.0,1.0
4,0.0,1.0,33.358904,0.0,0.0,0.0,1,2.0,0.0,1.0,...,1.0,51.85,3.972255,13.05,9.0,14.0,14.0,1000000.0,2043.0,0.0


In [30]:
# 1. Calculate the number of unique values for each column
print('Count of unique values in each column:')
print(df.nunique())

Count of unique values in each column:
Recipientgender           2
Stemcellsource            2
Donorage                187
Donorage35                2
IIIV                      2
Gendermatch               2
DonorABO                  4
RecipientABO              4
RecipientRh               2
ABOmatch                  2
CMVstatus                 4
DonorCMV                  2
RecipientCMV              2
Riskgroup                 2
Txpostrelapse             2
Diseasegroup              2
HLAmatch                  4
HLAmismatch               2
Antigen                   4
Alel                      5
HLAgrI                    7
Recipientage            125
Recipientage10            2
Recipientageint           3
Relapse                   2
aGvHDIIIIV                2
extcGvHD                  2
CD34kgx10d6             183
CD3dCD34                182
CD3dkgx10d8             163
Rbodymass               130
ANCrecovery              18
PLTrecovery              50
time_to_aGvHD_III_IV     28
survival_

In [31]:
# 2. Set target, survival_status,as y; features (dropping survival status and time) as X
y = df.survival_status
X= df.drop(columns=['survival_time','survival_status'])

In [32]:
# 3. Define lists of numeric and categorical columns based on number of unique values
num_cols = X.columns[X.nunique()>7]
cat_cols = X.columns[X.nunique()<=7]

In [33]:
# 4. Print columns with missing values
print('Columns with missing values:')
print(X.columns[X.isnull().sum()>0])

Columns with missing values:
Index(['RecipientABO', 'CMVstatus', 'Antigen', 'Alel', 'CD3dCD34',
       'CD3dkgx10d8', 'Rbodymass'],
      dtype='object')


In [34]:
# 5. Split data into train/test split
x_train, x_test, y_train, y_test = train_test_split(X,y, random_state=0, test_size=0.2)

In [35]:
cat_vals = Pipeline([("imputer",SimpleImputer(strategy='most_frequent')), ("ohe",OneHotEncoder(sparse_output=False, drop='first', handle_unknown = 'ignore'))])

In [36]:
# 7. Create numerical preprocessing pipeline
# Using mean to fill in missing values and standard scaling of features
num_vals = Pipeline([("imputer",SimpleImputer(strategy='mean')), ("scale",StandardScaler())])

In [37]:
# 8. Create column transformer that will preprocess the numerical and categorical features separately
preprocess = ColumnTransformer(
    transformers=[
        ("cat_process", cat_vals, cat_cols),
        ("num_process", num_vals, num_cols)
    ]
)

In [38]:
# 9. Create a pipeline with preprocess, PCA, and a logistic regresssion model
pipeline = Pipeline([("preprocess",preprocess),
                     ("pca", PCA()),
                     ("clf",LogisticRegression())])


In [39]:
# 10. Fit the pipeline on the training data
pipeline.fit(x_train, y_train)
#Predict the pipeline on the test data
print('Pipeline Accuracy Test Set:')
print(pipeline.score(x_test,y_test))

Pipeline Accuracy Test Set:
0.7894736842105263


In [40]:
# 11. Define search space of hyperparameters
search_space = [{'clf':[LogisticRegression()],
                     'clf__C': np.logspace(-4, 2, 10),
                'pca__n_components':np.linspace(30,37,3).astype(int)},
                   ]

In [41]:
#12. Search over hyperparameters abolve to optimize pipeline and fit
gs = GridSearchCV(pipeline, search_space, cv=5)
gs.fit(x_train, y_train)



In [42]:
# 13. Save the best estimator from the gridsearch and print attributes and final accuracy on test set
best_model = gs.best_estimator_

In [43]:
# 14. Print attributes of best_model
print('The best classification model is:')
print(best_model.named_steps['clf'])
print('The hyperparameters of the best classification model are:')
print(best_model.named_steps['clf'].get_params())
print('The number of components selected in the PCA step are:')
print(best_model.named_steps['pca'].n_components)

The best classification model is:
LogisticRegression(C=np.float64(1.0))
The hyperparameters of the best classification model are:
{'C': np.float64(1.0), 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
The number of components selected in the PCA step are:
37


In [44]:
# 15. Print final accuracy score
print('Best Model Accuracy Test Set:')
print(best_model.score(x_test,y_test))

Best Model Accuracy Test Set:
0.8157894736842105
