<a href="https://colab.research.google.com/github/Annabelle2915/Machine-Learning-Projects/blob/main/BoneMarrowTransplantSuccessRateClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Group 5: ITS61504 Assessment 3 August 2022**

Data Mining Coursework: This project aims to predict the success of bone
marrow transplants for patients, measured by their survival status.


In [1]:
# Enable line wrapping for <pre> output in Google Colaboratory results

from IPython.display import HTML, display

def set_css(info=None):
  """Inject CSS so that long ``<pre>`` output lines wrap.

  Registered below as a ``pre_run_cell`` callback, so it runs before every
  cell execution.

  Args:
    info: Optional execution-info object that IPython may pass to
      ``pre_run_cell`` callbacks. It is accepted so the callback works
      whether or not IPython supplies it, and is otherwise ignored.
  """
  display(HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  '''))
get_ipython().events.register('pre_run_cell', set_css)

In [None]:
# Mount Google Drive at /content/gdrive (force_remount=True remounts it if it is already mounted)
# NOTE(review): io and requests are not used in the visible cells — confirm before removing
import io
import requests
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

### **Q1 Data Preprocessing**





**Data Understanding**

In [None]:
# Import necessary libraries for data understanding
import pandas as pd # pandas libraries to manipulate or analyse data
import numpy as np # numpy libraries for mathematical arrays functions

In [None]:
# Location of the bone_marrow.csv dataset on the mounted Google Drive
path = "/content/gdrive/MyDrive/Colab Notebooks/bone_marrow.csv"

# Load the dataset from `path` into the dataframe named ori_df
ori_df = pd.read_csv (path)


In [None]:
# Show basic information about the dataset: number of entries, columns, dtypes and non-null counts
ori_df.info ()

In [None]:
# Show the first 30 rows of the dataset with their corresponding columns
ori_df.head(30)

In [None]:
# Show summary statistics (count, mean, std, quartiles, ...) of the numerical columns
ori_df.describe()

In [None]:
# Show summary statistics (count, unique, top, freq) of the object-typed (categorical) columns
ori_df.describe(include='object')



```
# This is formatted as code
```

**Data Cleaning (Missing Value Identification)**

In [None]:
# True if at least one row of the dataframe is an exact duplicate of an earlier row
ori_df.duplicated().any()

In [None]:
# Count the standard (NaN/blank) missing values per column using pandas
ori_df.isnull().sum()

In [None]:
# pandas detects no standard (blank) missing values, but the dataset contains
# non-standard missing records marked with '?' (e.g. row 5 of the donor_CMV column),
# so the number of those placeholders has to be counted separately.

# Count the '?' placeholders in each column, in descending order
print("Number of non standard '?' missing value in the dataset: ")
# np.nan is used because the np.NaN alias was removed in NumPy 2.0
df_replaced = ori_df.replace(['?'], np.nan)
print(df_replaced.isna().sum().sort_values(ascending=False))


In [None]:
# For every column that has missing values, report the count and the share of rows affected (in %)
missing_counts = df_replaced.isna().sum()
total_rows = df_replaced.shape[0]
for column, missing in missing_counts[missing_counts != 0].items():
  portion = (missing / total_rows) * 100
  print(f"'{column}': Number of missing values '{missing}' ==>'{portion:.3f}%'")

In [None]:
# Turn every non-standard '?' placeholder into a real missing value (NaN)
df_replaced = ori_df.replace('?', np.nan)

# Preview the first 30 rows after the replacement;
# the '?' record at row 5 of the donor_CMV column now shows as NaN
df_replaced.head(30)

**Data Cleaning (Data Imputation)**

In [None]:
# Impute the missing values: column mean for numerical columns,
# most frequent value (mode) for categorical columns.

# Numerical columns: cast from string to float first (needed to compute a mean),
# then fill the remaining NaNs with the column mean.
mean_imputed_columns = ['CD3_x1e8_per_kg', 'recipient_body_mass', 'CMV_status',
                        'antigen', 'allel', 'CD3_to_CD34_ratio']
for name in mean_imputed_columns:
  as_float = df_replaced[name].astype(float)
  df_replaced[name] = as_float.fillna(as_float.mean())

# Categorical columns: fill NaNs with the most frequent value of the column.
mode_imputed_columns = ['donor_CMV', 'recipient_ABO', 'recipient_rh',
                        'recipient_CMV', 'ABO_match', 'extensive_chronic_GvHD']
for name in mode_imputed_columns:
  df_replaced[name] = df_replaced[name].fillna(df_replaced[name].mode()[0])

In [None]:
# Preview the first 10 rows of df_replaced
df_replaced.head(10)

In [None]:
# Inspect rows at positions 50 to 79 of df_replaced
df_replaced[50:80]

In [None]:
# Verify that no missing values (standard, or '?' converted to NaN) are left after the replacement
print("Number of missing values by column after cleaning missing values:\n")
print (df_replaced.isna().sum())

In [None]:
# Summary statistics of df_replaced
df_replaced.describe()

In [None]:
import seaborn as sbn
# Remove outliers from the recipient_body_mass column using the 1.5 * IQR rule

# Visualise the column with a boxplot
sbn.boxplot(data = df_replaced['recipient_body_mass'])

# Interquartile range (IQR) of the column
column_values = df_replaced["recipient_body_mass"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Keep only the rows inside the fences
print(f"ori_df shape: {df_replaced.shape}\n")
df1 = df_replaced[~flagged]
print(f"df_replaced shape after removed outliers : {df1.shape}\n")

In [None]:
# Remove outliers from the allel column using the 1.5 * IQR rule

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['allel'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["allel"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df1)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df1["allel"]
df2 = df1[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df2.shape}\n")

In [None]:
# Remove outliers from the CD34_x1e6_per_kg column using the 1.5 * IQR rule

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['CD34_x1e6_per_kg'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["CD34_x1e6_per_kg"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df2)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df2["CD34_x1e6_per_kg"]
df3 = df2[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df3.shape}\n")

In [None]:
# Remove outliers from the CD3_x1e8_per_kg column using the 1.5 * IQR rule

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['CD3_x1e8_per_kg'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["CD3_x1e8_per_kg"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df3)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df3["CD3_x1e8_per_kg"]
df4 = df3[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df4.shape}\n")

In [None]:
# Remove outliers from the CD3_to_CD34_ratio column using the 1.5 * IQR rule

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['CD3_to_CD34_ratio'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["CD3_to_CD34_ratio"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df4)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df4["CD3_to_CD34_ratio"]
df5 = df4[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df5.shape}\n")

In [None]:
# Remove outliers from the ANC_recovery column using the 1.5 * IQR rule

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['ANC_recovery'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["ANC_recovery"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df5)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df5["ANC_recovery"]
df6 = df5[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df6.shape}\n")

In [None]:
# Remove outliers from the PLT_recovery column using the 1.5 * IQR rule

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['PLT_recovery'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["PLT_recovery"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df6)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df6["PLT_recovery"]
df7 = df6[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df7.shape}\n")

In [None]:
# Remove outliers from the time_to_acute_GvHD_III_IV column using the 1.5 * IQR rule

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['time_to_acute_GvHD_III_IV'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["time_to_acute_GvHD_III_IV"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df7)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df7["time_to_acute_GvHD_III_IV"]
df8 = df7[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df8.shape}\n")

In [None]:
# Remove outliers from the time_to_acute_GvHD_III_IV column using the 1.5 * IQR rule
# NOTE(review): this cell repeats the preceding time_to_acute_GvHD_III_IV cell;
# it rebuilds df8 from df7 with the same fences, so re-running it is harmless.

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['time_to_acute_GvHD_III_IV'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["time_to_acute_GvHD_III_IV"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df7)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df7["time_to_acute_GvHD_III_IV"]
df8 = df7[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df8.shape}\n")

In [None]:
# Remove outliers from the time_to_acute_GvHD_III_IV column using the 1.5 * IQR rule
# NOTE(review): this cell repeats the earlier time_to_acute_GvHD_III_IV cells;
# it rebuilds df8 from df7 with the same fences, so re-running it is harmless.

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['time_to_acute_GvHD_III_IV'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["time_to_acute_GvHD_III_IV"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df7)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df7["time_to_acute_GvHD_III_IV"]
df8 = df7[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df8.shape}\n")

In [None]:
# Remove outliers from the survival_time column using the 1.5 * IQR rule

# Visualise the column with a boxplot
sbn.boxplot(df_replaced['survival_time'])

# Interquartile range (IQR), computed on df_replaced
column_values = df_replaced["survival_time"]
Q1, Q3 = column_values.quantile(0.25), column_values.quantile(0.75)
IQR = Q3 - Q1
print ("IQR: %.2f" %IQR)

# Fences: values further than 1.5 * IQR from the quartiles count as outliers
Lower_Fence = Q1 - (1.5 * IQR)
print ("Lower_Fence: %.2f" %Lower_Fence)
Upper_Fence = Q3 + (1.5 * IQR)
print ("Upper_Fence: %.2f" %Upper_Fence)

# Show the rows of df_replaced flagged as outliers
flagged = column_values.lt(Lower_Fence) | column_values.gt(Upper_Fence)
print("Data Outliers: \n")
print (df_replaced[flagged])

# Drop the out-of-fence rows from the result of the previous filtering step (df8)
print(f"ori_df shape: {df_replaced.shape}\n")
remaining = df8["survival_time"]
df9 = df8[~(remaining.lt(Lower_Fence) | remaining.gt(Upper_Fence))]
print(f"df_replaced shape after removed outliers : {df9.shape}\n")

### **Q2 Exploratory Data Analysis**

In [None]:
# Plotting libraries used for the visualisations
import matplotlib.pyplot as plt
import seaborn as sns

# One boxplot per non-object column (id and survival_status excluded),
# split by survival status
numeric_features = (
    df9.select_dtypes(exclude='object')
       .drop(columns=['id', 'survival_status'])
       .columns
)
for feature in numeric_features:
    sns.boxplot(data=df9[[feature, 'survival_status']], x='survival_status', y=feature)
    plt.title(feature)
    plt.show()

In [None]:
# Bar chart of value counts per survival status for each non-float column
# (the [1:-1] slice skips the first and last of the selected columns)
non_float_features = df9.select_dtypes(exclude='float').columns[1:-1]
for feature in non_float_features:
    counts = (
        pd.crosstab(df9['survival_status'], df9[feature])
          .unstack()
          .reset_index()
    )
    # After reset_index the counts live in the column labelled 0
    sns.barplot(data=counts, x=feature, y=0, hue='survival_status')
    plt.title(feature)
    plt.show()

**Feature Selection**

In [None]:
# Correlation heatmap used to check which variables correlate with the
# target variable for feature selection
import seaborn as sbn
plt.figure(figsize=(12,10))
# Select the numeric/bool columns explicitly: from pandas 2.0 DataFrame.corr()
# no longer drops non-numeric columns silently and raises on them instead
cor = df9.select_dtypes(include=['number', 'bool']).corr()
sbn.heatmap(cor, xticklabels=cor.columns, yticklabels=cor.columns, annot=True)
plt.show()

In [None]:
# Show the variable pairs whose Pearson correlation has a magnitude of at least 0.3
# `cor` already is the correlation matrix; calling .corr() on it again would
# correlate the correlation coefficients themselves rather than the variables.
matrix = cor.unstack()
matrix = matrix[abs(matrix) >= 0.3]
print(matrix. to_string())

In [None]:
# Drop the attributes not involved in the EDA and preview the attributes that remain
# (no plt.figure() here: nothing is plotted in this cell, so it only produced an empty figure)
eda = df9.drop(columns=[
    "id", "donor_age", "donor_age_below_35", "donor_ABO", "donor_CMV",
    "recipient_age_below_10", "recipient_age_int", "recipient_gender",
    "recipient_ABO", "recipient_rh", "recipient_CMV",
    "disease_group", "gender_match", "ABO_match", "CMV_status", "HLA_match",
    "HLA_mismatch", "HLA_group_1", "risk_group",
    "stem_cell_source", "tx_post_relapse", "acute_GvHD_II_III_IV",
    "acute_GvHD_III_IV", "extensive_chronic_GvHD", "time_to_acute_GvHD_III_IV",
])

eda.head(20)

In [None]:
# Calculate the Pearson r values of the EDA attributes and plot them as a heatmap
plt.figure(figsize=(12,10))
# Restrict to numeric/bool columns so that any non-numeric column still present
# in eda does not make DataFrame.corr() raise on pandas >= 2.0
correlation = eda.select_dtypes(include=['number', 'bool']).corr(method='pearson')
sbn.heatmap(correlation, xticklabels=correlation.columns, yticklabels=correlation.columns, annot=True)

In [None]:
# Show the variable pairs whose Pearson correlation has a magnitude of at least 0.7
# Restrict to numeric/bool columns so that any non-numeric column still present
# in eda does not make DataFrame.corr() raise on pandas >= 2.0
matrix = eda.select_dtypes(include=['number', 'bool']).corr()
matrix = matrix.unstack()
matrix = matrix[abs(matrix) >= 0.7]
print(matrix)


In [None]:
# Plot pairwise relationships between the attributes kept in eda
import seaborn as sbn

sbn.pairplot(eda)

In [None]:
# Scatter plots coloured by survival status: recipient_age against
# recipient_body_mass first, then recipient_body_mass against each other attribute
sbn.relplot(x="recipient_age", y="recipient_body_mass", hue="survival_status", data=eda)
for y_attribute in ["CD34_x1e6_per_kg", "CD3_x1e8_per_kg", "CD3_to_CD34_ratio",
                    "ANC_recovery", "PLT_recovery", "survival_time"]:
    sbn.relplot(x="recipient_body_mass", y=y_attribute, hue="survival_status", data=eda)
# Kept outside the loop so it stays the cell's final expression
sbn.relplot(x="recipient_body_mass", y="recipient_age", hue="survival_status", data=eda)


**Scatterplots indicate whether two variables have a positive, negative, or no relationship, according to the following criteria:**
1) If the value of y increases with the value of x, the variables have a positive correlation. (Ans: recipient_body_mass and recipient_age, ‘CD34_x1e6_per_kg’ and ‘CD3_x1e8_per_kg’)

2) If the value of y decreases as the value of x increases, the variables have a negative correlation. (Ans: CD34_x1e6_per_kg)

3) If the value of y changes randomly, independently of x, the variables have zero correlation. (Ans: ANC_recovery)

In [None]:
# Plot the histogram and distribution (KDE) curve of the recipient age
# histplot with kde=True replaces distplot, which seaborn deprecated in v0.11;
# stat="density" and cut=3 keep the density-normalised look distplot produced
sbn.histplot(eda["recipient_age"], kde=True, stat="density", kde_kws=dict(cut=3))

In [None]:
# Plot the histogram and distribution (KDE) curve of the recipient body mass
# histplot with kde=True replaces distplot, which seaborn deprecated in v0.11;
# stat="density" and cut=3 keep the density-normalised look distplot produced
sbn.histplot(eda["recipient_body_mass"], kde=True, stat="density", kde_kws=dict(cut=3))

### **Q3 Classification Models**

**Ensure balanced data for the dependent variable (survival status)**

In [None]:
# Install/upgrade imbalanced-learn in the notebook environment
# NOTE(review): no visible cell imports imblearn (balancing uses sklearn.utils.resample) — confirm this install is still needed
pip install -U imbalanced-learn

In [None]:
# Class balance of the target in the original, unprocessed dataset
# y holds the survival_status attribute; x holds every other attribute
y = ori_df["survival_status"]
x = ori_df.drop(columns=["survival_status"])

# Number of rows in each of the two survival_status classes
class_counts = y.value_counts()
print(class_counts)

# Pie chart of the class shares, labelled with percentages
class_counts.plot.pie(autopct="%.2f")

In [None]:
# Class balance of the target in the preprocessed dataset (df9)
# y holds the survival_status attribute; x holds every other attribute
y = df9["survival_status"]
x = df9.drop(columns=["survival_status"])

# Number of rows in each of the two survival_status classes
class_counts = y.value_counts()
print(class_counts)

# Pie chart of the class shares, labelled with percentages
class_counts.plot.pie(autopct="%.2f")

In [None]:
from sklearn.utils import resample
# Split the preprocessed data into the two survival_status classes
# (class 0 is treated as the majority and class 1 as the minority)
df_majority = df9[(df9['survival_status']==0)]
df_minority = df9[(df9['survival_status']==1)]
# Upsample the minority class to the size of the majority class; the target
# size is read from the data instead of being hard-coded, so it stays correct
# if the preprocessing above changes the number of remaining rows
df_minority_upsampled = resample(df_minority,
                                 replace=True,    # sample with replacement
                                 n_samples=len(df_majority), # match majority class
                                 random_state=42)  # reproducible results
# Combine majority class with upsampled minority class
df_upsampled = pd.concat([df_minority_upsampled, df_majority])
# NOTE(review): upsampling happens before the train/test split further down, so
# copies of the same row can end up in both sets — consider splitting first

In [None]:
# Class balance of the target after upsampling
# y holds the survival_status attribute; x holds every other attribute
y = df_upsampled["survival_status"]
x = df_upsampled.drop(columns=["survival_status"])

# Number of rows in each of the two survival_status classes
class_counts = y.value_counts()
print(class_counts)

# Pie chart of the class shares, labelled with percentages
class_counts.plot.pie(autopct="%.2f")

In [None]:
# Print the upsampled dataframe, in which both survival_status classes are balanced
print(df_upsampled)


In [None]:
# Remove the id column before building the classification models
no_id_bonemarrowDF = df_upsampled.drop(columns='id')
print('[Data frame without id column:-]')
print (no_id_bonemarrowDF.head())

**Data Transformation (Label Encoding)**

In [None]:
# LabelEncoder turns category labels into integer codes
from sklearn.preprocessing import LabelEncoder

# Columns whose dtype is not numeric are the ones that need encoding
categorical_col = no_id_bonemarrowDF.select_dtypes(exclude='number').columns
print (categorical_col)
print (no_id_bonemarrowDF[categorical_col].head())

# Re-fit the encoder on each categorical column and overwrite the column with its codes
label_encoder = LabelEncoder()
for column_name in categorical_col:
  encoded = label_encoder.fit_transform(no_id_bonemarrowDF[column_name])
  no_id_bonemarrowDF[column_name] = encoded

print("\nLabel Encoder Data:")
print(no_id_bonemarrowDF.head(20))

**Data Transformation (MinMax Scaler)**

In [None]:
# Min-max scaling of every column of the encoded dataframe
from sklearn import preprocessing

scaler = preprocessing.MinMaxScaler()
minmax_scale = scaler.fit_transform(no_id_bonemarrowDF)
# fit_transform returns a plain array, so rebuild a dataframe with the original column names
scaled_frame = pd.DataFrame(data=minmax_scale, columns=no_id_bonemarrowDF.columns)
print (scaled_frame.head())

**Data Shuffling**

In [None]:

# Check the number of levels in the dependent variable
# (Series.nunique replaces the top-level pd.value_counts, deprecated in pandas 2.1)
levels = scaled_frame['survival_status'].nunique()
print ('There are {} levels in the survival status column'.format (levels))

# Shuffle the rows before splitting the data into features (x) and outcome (y)
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split

df_shuffled = shuffle (scaled_frame, random_state=42)

# The survival status attribute is the dependent variable (y);
# all other attributes are the independent variables (x)
DV = 'survival_status'

x = df_shuffled.drop (DV, axis=1)
y = df_shuffled [DV]

# Split the data in an 80:20 ratio: 80% for training, 20% for testing
# random_state = 42 makes the random selection of the test rows reproducible
x_train, x_test, y_train, y_test = train_test_split (x, y, test_size=0.20, random_state=42)

print (y_train.head ())


## Logistic Regression

In [None]:
# Import LogisticRegression from sklearn's linear_model module
from sklearn.linear_model import LogisticRegression

# Create the logistic regression model and store it in LRmodel
LRmodel=LogisticRegression()

# Train the model on the training split (x_train, y_train)
LRmodel.fit(x_train, y_train)

In [None]:
# Predict the survival status with the logistic regression model for both the train and the test set
lr_pred_train = LRmodel.predict(x_train)
lr_pred_test = LRmodel.predict(x_test)

In [None]:
# Print the logistic regression predictions for the test set
print(lr_pred_test)

In [None]:
# Accuracy of the logistic regression model on the train and test sets
from sklearn.metrics import accuracy_score
lr_train_accuracy = accuracy_score(y_train, lr_pred_train)
lr_test_accuracy = accuracy_score(y_test, lr_pred_test)
print("Accuracy for train set: {0:0.4f}".format(lr_train_accuracy))
print("Accuracy for test set: {0:0.4f}".format(lr_test_accuracy))

In [None]:
from sklearn.metrics import accuracy_score, precision_score,  recall_score

# Test-set accuracy, precision and recall of the logistic regression model
lr_accuracy = accuracy_score(y_test, lr_pred_test)
lr_precision = precision_score(y_test, lr_pred_test)
lr_recall = recall_score(y_test, lr_pred_test)
print('Accuracy: %0.4f' % lr_accuracy)
print('Precision: %0.4f' % lr_precision)
print('Recall: %0.4f' % lr_recall)

In [None]:
# Performance evaluation of the logistic regression model
# via confusion matrix and classification report.

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = pd.DataFrame (confusion_matrix(y_test, lr_pred_test))
# Row totals as an extra column
cm['Total'] = cm.sum(axis=1)
# Column totals as an extra row; DataFrame.append was removed in pandas 2.0,
# so the totals row is attached with pd.concat instead
cm = pd.concat([cm, cm.sum(axis=0).to_frame().T], ignore_index=True)
cm.columns = ['Predicted Survive', 'Predicted Did not survive', 'Total']
cm = cm.set_index([['Actual Survive', 'Actual Did not survive', 'Total']])
print("Confusion matrix:")
print (cm)

# Print classification report
from sklearn.metrics import classification_report
print("\nClassification report:")
print(classification_report (y_test, lr_pred_test))

In [None]:
# Confusion matrix of the logistic regression test-set predictions, drawn as a labelled grid
logisticRegression_cm = confusion_matrix(y_test, lr_pred_test)
logisticRegression_cm_disp = ConfusionMatrixDisplay(confusion_matrix=logisticRegression_cm, display_labels=["Survive", "Did Not Survive"])
logisticRegression_cm_disp.plot()
# The title previously read "Decision Tree Model", copied from the decision tree cell
logisticRegression_cm_disp.ax_.set_title("Logistic Regression Model")

In [None]:
# Import roc library function
from sklearn.metrics import  roc_curve, auc
lr_pred_train = LRmodel.predict(x_train)
lr_pred_test = LRmodel.predict(x_test)

# A ROC curve is traced by sweeping a threshold over continuous scores, so use
# the predicted probability of the positive class; hard 0/1 predictions would
# collapse the curve to a single operating point
lr_score_train = LRmodel.predict_proba(x_train)[:, 1]
lr_score_test = LRmodel.predict_proba(x_test)[:, 1]

# Get AUC and ROC
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, lr_score_train)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, lr_score_test)

train_roc_auc = auc(train_fpr, train_tpr)
test_roc_auc = auc(test_fpr, test_tpr)

print('AUC for train set: %0.4f' % train_roc_auc)
print('AUC for test set: %0.4f' % test_roc_auc)

In [None]:
# ROC curves of the test and train sets, with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(7, 5), dpi=80)
roc_series = [
    (test_fpr, test_tpr, 'tomato',
     'ROC curve for test set (area = %0.4f)' % test_roc_auc),
    (train_fpr, train_tpr, 'dodgerblue',
     'ROC curve for train set (area = %0.4f)' % train_roc_auc),
]
for fpr, tpr, colour, label in roc_series:
    ax.plot(fpr, tpr, color=colour, label=label)

# Dashed diagonal from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=14)
ax.set_ylabel('True Positive Rate', fontsize=14)
ax.set_title('ROC Curve', fontsize=16)
ax.legend(loc="lower right")

plt.show()

## Decision Tree

In [None]:
# Import decision tree model from sklearn library to perform decision tree classification
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Train the decision tree classifier on the training split (x_train, y_train);
# random_state=42 fixes the estimator's random seed
DecisionTree = DecisionTreeClassifier(random_state =42)
DecisionTree.fit(x_train,y_train)

In [None]:
# Predict the survival status with the decision tree for both the train and the test set
DecisionTree_pred_train = DecisionTree.predict(x_train)
DecisionTree_pred_test = DecisionTree.predict(x_test)

In [None]:
# Print the decision tree predictions for the test set
print(DecisionTree_pred_test)

In [None]:
# Accuracy of the decision tree model on the train and test sets
from sklearn.metrics import accuracy_score
dt_train_accuracy = accuracy_score(y_train, DecisionTree_pred_train)
dt_test_accuracy = accuracy_score(y_test, DecisionTree_pred_test)
print("Accuracy for train set: {0:0.4f}".format(dt_train_accuracy))
print("Accuracy for test set: {0:0.4f}".format(dt_test_accuracy))

In [None]:
from sklearn.metrics import accuracy_score, precision_score,  recall_score, roc_curve, auc

# Test-set accuracy, precision and recall of the decision tree model
dt_accuracy = accuracy_score(y_test, DecisionTree_pred_test)
dt_precision = precision_score(y_test, DecisionTree_pred_test)
dt_recall = recall_score(y_test, DecisionTree_pred_test)
print('Accuracy: %0.4f' % dt_accuracy)
print('Precision: %0.4f' % dt_precision)
print('Recall: %0.4f' % dt_recall)

In [None]:
# Confusion matrix of the decision tree test-set predictions, drawn as a labelled grid
decisionTree_cm = confusion_matrix(y_test, DecisionTree_pred_test)
# .plot() draws the matrix and returns the display object itself
decisionTree_cm_disp = ConfusionMatrixDisplay(
    confusion_matrix=decisionTree_cm,
    display_labels=["Survive", "Did Not Survive"],
).plot()
decisionTree_cm_disp.ax_.set_title("Decision Tree Model")


In [None]:
# Performance evaluation of the decision tree model
# via confusion matrix and classification report.

from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
cm = pd.DataFrame (confusion_matrix(y_test, DecisionTree_pred_test))
# Row totals as an extra column
cm['Total'] = cm.sum(axis=1)
# Column totals as an extra row; DataFrame.append was removed in pandas 2.0,
# so the totals row is attached with pd.concat instead
cm = pd.concat([cm, cm.sum(axis=0).to_frame().T], ignore_index=True)
cm.columns = ['Predicted Survive', 'Predicted Did not survive', 'Total']
cm = cm.set_index([['Actual Survive', 'Actual Did not survive', 'Total']])
print("Confusion matrix:")
print (cm)

# Print classification report
from sklearn.metrics import classification_report
print("\nClassification report:")
print(classification_report (y_test, DecisionTree_pred_test))

In [None]:
DecisionTree_pred_train = DecisionTree.predict(x_train)
DecisionTree_pred_test = DecisionTree.predict(x_test)

# A ROC curve is traced by sweeping a threshold over continuous scores, so use
# the predicted probability of the positive class rather than hard 0/1 predictions
DecisionTree_score_train = DecisionTree.predict_proba(x_train)[:, 1]
DecisionTree_score_test = DecisionTree.predict_proba(x_test)[:, 1]

# Get AUC and ROC
train_fpr, train_tpr, train_thresholds = roc_curve(y_train, DecisionTree_score_train)
test_fpr, test_tpr, test_thresholds = roc_curve(y_test, DecisionTree_score_test)

train_roc_auc = auc(train_fpr, train_tpr)
test_roc_auc = auc(test_fpr, test_tpr)

print('AUC for train set: %0.4f' % train_roc_auc)
print('AUC for test set: %0.4f' % test_roc_auc)

In [None]:
# ROC curves of the test and train sets, with the chance diagonal for reference
fig, ax = plt.subplots(figsize=(7, 5), dpi=80)
roc_series = [
    (test_fpr, test_tpr, 'tomato',
     'ROC curve for test set (area = %0.4f)' % test_roc_auc),
    (train_fpr, train_tpr, 'dodgerblue',
     'ROC curve for train set (area = %0.4f)' % train_roc_auc),
]
for fpr, tpr, colour, label in roc_series:
    ax.plot(fpr, tpr, color=colour, label=label)

# Dashed diagonal from (0, 0) to (1, 1)
ax.plot([0, 1], [0, 1], color='gray', lw=1, linestyle='--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate', fontsize=14)
ax.set_ylabel('True Positive Rate', fontsize=14)
ax.set_title('ROC Curve', fontsize=16)
ax.legend(loc="lower right")

plt.show()

**Model Comparison**

In [None]:
from sklearn.metrics import roc_curve
from sklearn.metrics import auc

# ROC curves are traced by sweeping a threshold over continuous scores, so use
# each model's predicted probability of the positive class on the test set
# rather than its hard 0/1 predictions
logm_test_scores = LRmodel.predict_proba(x_test)[:, 1]
dt_test_scores = DecisionTree.predict_proba(x_test)[:, 1]

# Compute false positive rates, true positive rates and thresholds
# for the logistic regression and decision tree models.
logm_test_fpr, logm_test_tpr, logm_test_thresholds = roc_curve(y_test, logm_test_scores)
dt_test_fpr, dt_test_tpr, dt_test_thresholds = roc_curve(y_test, dt_test_scores)


# Compute area under curve for both prediction models.
logm_auc = auc(logm_test_fpr, logm_test_tpr)
dt_auc = auc(dt_test_fpr, dt_test_tpr)


In [None]:
# Figure size for the comparison plot (set through seaborn's rc settings)
sbn.set(rc={"figure.figsize": (7, 6)})

# One ROC curve per model, drawn from its test-set false/true positive rates
model_curves = [
    (logm_test_fpr, logm_test_tpr,
     "Logistic regression model (AUC): {0:0.4f}".format(logm_auc)),
    (dt_test_fpr, dt_test_tpr,
     "Decision Tree model (AUC): {0:0.4f}".format(dt_auc)),
]
for fpr, tpr, label in model_curves:
    plt.plot(fpr, tpr, label=label, marker=".")

# Diagonal reference line (0.5 AUC, i.e. random prediction)
plt.plot([0, 1], [0, 1], color='grey', lw=1, linestyle='--')

# Axis labels, title and legend
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.title("ROC Curve")
plt.legend(loc="lower right")