## we will be using a dataset containing bone marrow transplantation characteristics for pediatric patients from UCI’s Machine Learning Repository.
### Steps for creating the Machine Learning Model includes
    1. importing the necessary libraries
    2. Loading and explore the dataset
    3. Model building 
    4. Communicating results
    

In [1]:
# importing the necessary libraries
import numpy as np
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA


from sklearn.metrics import confusion_matrix

from scipy.io import arff

### 2. Explore

In [5]:
data = arff.loadarff('bone-marrow.arff')
df = pd.DataFrame(data[0])

In [14]:
df.head(10)

Unnamed: 0,Recipientgender,Stemcellsource,Donorage,Donorage35,IIIV,Gendermatch,DonorABO,RecipientABO,RecipientRh,ABOmatch,...,extcGvHD,CD34kgx10d6,CD3dCD34,CD3dkgx10d8,Rbodymass,ANCrecovery,PLTrecovery,time_to_aGvHD_III_IV,survival_time,survival_status
0,1.0,1.0,22.830137,0.0,1.0,0.0,1,1.0,1.0,0.0,...,1.0,7.2,1.33876,5.38,35.0,19.0,51.0,32.0,999.0,0.0
1,1.0,0.0,23.342466,0.0,1.0,0.0,-1,-1.0,1.0,0.0,...,1.0,4.5,11.078295,0.41,20.6,16.0,37.0,1000000.0,163.0,1.0
2,1.0,0.0,26.394521,0.0,1.0,0.0,-1,-1.0,1.0,0.0,...,1.0,7.94,19.01323,0.42,23.4,23.0,20.0,1000000.0,435.0,1.0
3,0.0,0.0,39.684932,1.0,1.0,0.0,1,2.0,1.0,1.0,...,0.0,4.25,29.481647,0.14,50.0,23.0,29.0,19.0,53.0,1.0
4,0.0,1.0,33.358904,0.0,0.0,0.0,1,2.0,0.0,1.0,...,1.0,51.85,3.972255,13.05,9.0,14.0,14.0,1000000.0,2043.0,0.0
5,1.0,0.0,27.391781,0.0,0.0,0.0,2,0.0,1.0,1.0,...,1.0,3.27,8.412758,0.39,40.0,16.0,70.0,1000000.0,2800.0,0.0
6,0.0,1.0,34.520548,0.0,1.0,0.0,0,1.0,0.0,1.0,...,0.0,17.78,2.406248,7.39,51.0,17.0,29.0,18.0,41.0,1.0
7,1.0,0.0,21.435616,0.0,1.0,0.0,0,1.0,1.0,1.0,...,0.0,6.41,,,56.0,22.0,58.0,22.0,45.0,1.0
8,1.0,1.0,32.641096,0.0,0.0,0.0,2,0.0,1.0,1.0,...,1.0,23.54,3.772555,6.24,20.5,15.0,14.0,1000000.0,671.0,0.0
9,1.0,1.0,28.783562,0.0,1.0,1.0,1,0.0,1.0,1.0,...,1.0,7.69,1.035244,7.43,16.5,16.0,17.0,1000000.0,676.0,0.0


In [9]:
df.shape

(187, 36)

In [8]:
df.drop(columns=['Disease'], inplace=True)

In [12]:
#Convert all columns to numeric, coerce errors to null values
for c in df.columns:
    df[c] = pd.to_numeric(df[c], errors='coerce')

In [13]:
#Make sure binary columns are encoded as 0 and 1
for c in df.columns[df.nunique()==2]:
    df[c] = (df[c]==1)*1.0

In [16]:
df.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 36 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Recipientgender       187 non-null    float64
 1   Stemcellsource        187 non-null    float64
 2   Donorage              187 non-null    float64
 3   Donorage35            187 non-null    float64
 4   IIIV                  187 non-null    float64
 5   Gendermatch           187 non-null    float64
 6   DonorABO              187 non-null    int64  
 7   RecipientABO          186 non-null    float64
 8   RecipientRh           187 non-null    float64
 9   ABOmatch              187 non-null    float64
 10  CMVstatus             171 non-null    float64
 11  DonorCMV              187 non-null    float64
 12  RecipientCMV          187 non-null    float64
 13  Riskgroup             187 non-null    float64
 14  Txpostrelapse         187 non-null    float64
 15  Diseasegroup          1

In [17]:
df.isnull().sum()

Recipientgender          0
Stemcellsource           0
Donorage                 0
Donorage35               0
IIIV                     0
Gendermatch              0
DonorABO                 0
RecipientABO             1
RecipientRh              0
ABOmatch                 0
CMVstatus               16
DonorCMV                 0
RecipientCMV             0
Riskgroup                0
Txpostrelapse            0
Diseasegroup             0
HLAmatch                 0
HLAmismatch              0
Antigen                  1
Alel                     1
HLAgrI                   0
Recipientage             0
Recipientage10           0
Recipientageint          0
Relapse                  0
aGvHDIIIIV               0
extcGvHD                 0
CD34kgx10d6              0
CD3dCD34                 5
CD3dkgx10d8              5
Rbodymass                2
ANCrecovery              0
PLTrecovery              0
time_to_aGvHD_III_IV     0
survival_time            0
survival_status          0
dtype: int64

In [18]:
# Unique values for each column 
for c in df.columns:
    print(c, df[c].nunique())

Recipientgender 2
Stemcellsource 2
Donorage 187
Donorage35 2
IIIV 2
Gendermatch 2
DonorABO 4
RecipientABO 4
RecipientRh 2
ABOmatch 2
CMVstatus 4
DonorCMV 2
RecipientCMV 2
Riskgroup 2
Txpostrelapse 2
Diseasegroup 2
HLAmatch 4
HLAmismatch 2
Antigen 4
Alel 5
HLAgrI 7
Recipientage 125
Recipientage10 2
Recipientageint 3
Relapse 2
aGvHDIIIIV 2
extcGvHD 2
CD34kgx10d6 183
CD3dCD34 182
CD3dkgx10d8 163
Rbodymass 130
ANCrecovery 18
PLTrecovery 50
time_to_aGvHD_III_IV 28
survival_time 174
survival_status 2


In [19]:
# Features and target 
y = df.survival_status
X= df.drop(columns=['survival_time','survival_status'])

In [20]:
# Define lists of numeric and categorical columns based on number of unique values
num_cols = X.columns[X.nunique()>5].tolist()
cat_cols = X.columns[X.nunique()<=5].tolist()

In [22]:
# Split data into train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [23]:
# Categorical preprocessing pipeline
# Using mode to fill in missing values and OHE
cat_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [24]:
# Numerical preprocessing pipeline
# Using mean to fill in missing values and standard scaling of features
num_preprocessor = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

In [25]:
#Column transformer that will preprocess the numerical and categorical features separately
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_preprocessor, num_cols),
        ('cat', cat_preprocessor, cat_cols)
    ])

In [26]:
# Pipeline with preprocess, PCA, and a logistic regresssion model
pipe = Pipeline(steps=[('preprocessor', preprocessor),
                       ('pca', PCA()),
                       ('classifier', LogisticRegression())])

In [27]:
# Fit the pipeline on the training data
pipe.fit(X_train, y_train)

In [28]:
#Predict the pipeline on the test data
y_pred = pipe.predict(X_test)

In [34]:
pipe.score(X_test,y_test)

0.7105263157894737

In [29]:
# Search space of hyperparameters
param_grid = {
    'pca__n_components': [5, 10, 15, 20],
    'classifier__C': np.logspace(-4, 4, 4),
    'classifier__penalty': ['l1', 'l2']
}

In [35]:
#Searching over hyperparameters abolve to optimize pipeline and fit
gs = GridSearchCV(pipe, param_grid, cv=5)
gs.fit(X_train, y_train)


80 fits failed out of a total of 160.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
80 fits failed with the following error:
Traceback (most recent call last):
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1389, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/pipeline.py", line 660, in fit
    self._final_estimator.fit(Xt, y, **last_step_params["fit"])
  File "/opt/anaconda3/lib/python3.11/site-packages/sklearn/base.py

In [None]:
# 13. Save the best estimator from the gridsearch and print attributes and final accuracy on test set
best_model = gs.best_estimator_

In [40]:
# Print attributes of best_model
print(best_model.named_steps['classifier'])
print(best_model.named_steps['pca'])
print(best_model.named_steps['classifier'].get_params())


LogisticRegression(C=0.046415888336127774)
PCA(n_components=20)
{'C': 0.046415888336127774, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'deprecated', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}


In [38]:
# Print final accuracy score 
print('Test Accuracy: ', best_model.score(X_test, y_test))


Test Accuracy:  0.631578947368421
