In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

## Read Dataset

In [2]:
# Load the EHR dataset; the relative path is resolved against the kernel's working directory.
df = pd.read_csv("../notebook/data/EHR.csv")

In [3]:
df.head(5)

Unnamed: 0,Patient_ID,Age,Gender,Tumor_Size(cm),Tumor_Type,Biopsy_Result,Treatment,Response_to_Treatment,Survival_Status
0,c044501a-43ca-4a0c-8b8b-991439ba1b6a,52,Female,5.08,Benign,Positive,Surgery,No Response,Survived
1,b8900c4c-1232-4084-9432-5d02eba74d20,32,Female,0.8,Benign,Negative,Surgery,Complete Response,Survived
2,3004e2bc-8037-49cb-a542-d5612b73beab,70,Female,9.56,Benign,Positive,Radiation Therapy,Complete Response,Deceased
3,1df86af7-6745-4dea-b127-cbc9915079fc,21,Female,3.07,Malignant,Negative,Surgery,Partial Response,Survived
4,128e00c3-72e3-4031-a7f4-1165d7199cce,62,Male,7.17,Malignant,Positive,Radiation Therapy,Complete Response,Deceased


In [4]:
df.columns

Index(['Patient_ID', 'Age', 'Gender', 'Tumor_Size(cm)', 'Tumor_Type',
       'Biopsy_Result', 'Treatment', 'Response_to_Treatment',
       'Survival_Status'],
      dtype='object')

### Independent and target feature selection

In [5]:
# Feature matrix: every column except the patient identifier and the target.
X = df.drop(columns=["Patient_ID", "Survival_Status"])

In [6]:
X.head(5)

Unnamed: 0,Age,Gender,Tumor_Size(cm),Tumor_Type,Biopsy_Result,Treatment,Response_to_Treatment
0,52,Female,5.08,Benign,Positive,Surgery,No Response
1,32,Female,0.8,Benign,Negative,Surgery,Complete Response
2,70,Female,9.56,Benign,Positive,Radiation Therapy,Complete Response
3,21,Female,3.07,Malignant,Negative,Surgery,Partial Response
4,62,Male,7.17,Malignant,Positive,Radiation Therapy,Complete Response


### Preprocessing the data for fitting

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

# Split the feature columns by dtype: object columns are handled as categorical,
# all remaining columns as numeric.
cat_features = X.select_dtypes(include="object").columns
num_features = X.select_dtypes(exclude="object").columns

# Stand-alone label encoder; it is not wired into the ColumnTransformer below.
lbl_transformer = LabelEncoder()
oh_transformer = OneHotEncoder()
numeric_transformer = StandardScaler()

# One-hot encode the categorical columns and standardise the numeric ones.
preprocessor = ColumnTransformer(
    transformers=[
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [8]:
# Transform the raw feature columns taken straight from `df` instead of from `X`.
# Feeding `X` back into itself would fail on a re-run of this cell, because `X`
# is a NumPy array (no named columns) after the first execution.
# NOTE: the encoder and scaler are fitted on every row before any train/test
# split, so later hold-out evaluations are scaled with statistics that include
# their own test rows.
X = preprocessor.fit_transform(df.drop(columns=["Patient_ID", "Survival_Status"]))

In [9]:
X.shape

(20000, 14)

In [35]:
X

array([[ 1.        ,  0.        ,  1.        , ...,  0.        ,
         0.11623573, -0.06106263],
       [ 1.        ,  0.        ,  1.        , ...,  0.        ,
        -1.02590568, -1.62490211],
       [ 1.        ,  0.        ,  1.        , ...,  0.        ,
         1.144163  ,  1.57585346],
       ...,
       [ 1.        ,  0.        ,  1.        , ...,  1.        ,
         0.34466401,  0.43220449],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
        -0.62615619,  0.30432042],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.68730644,  0.80854904]])

In [10]:
# Encode the target column as integer class labels.
# NOTE(review): presumably "Deceased" -> 0 and "Survived" -> 1 given the values
# visible in the preview; confirm with lbl_transformer.classes_.
y = lbl_transformer.fit_transform(df["Survival_Status"])

In [11]:
y.shape

(20000,)

## Searching for the best hyperparameters for each model


In [12]:
from sklearn.model_selection import GridSearchCV


### For Logistic Regression

In [13]:
from sklearn.linear_model import LogisticRegression

# Base estimator shared by both searches below.
lsgtc_model = LogisticRegression(max_iter=1000)

# Grid for the newton-cg and lbfgs solvers, searched with the L2 penalty only.
param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['newton-cg', 'lbfgs'],
    penalty=['l2'],
)

# Grid for the liblinear solver, searched over both L2 and L1 penalties.
bin_param_grid = dict(
    C=[0.01, 0.1, 1, 10, 100],
    solver=['liblinear'],
    penalty=['l2', "l1"],
)

# 50-fold search over the first grid.
grid_search = GridSearchCV(lsgtc_model, param_grid, cv=50, n_jobs=-1, verbose=2)
grid_search.fit(X, y)

# 10-fold search over the liblinear grid.
bin_gs = GridSearchCV(lsgtc_model, bin_param_grid, cv=10, n_jobs=-1, verbose=2)
bin_gs.fit(X, y)


Fitting 50 folds for each of 10 candidates, totalling 500 fits
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, solver=newton-cg; total time=   0.0s
[CV] END ...............C=0.01, penalty=l2, so

In [14]:
grid_search.best_params_

{'C': 0.01, 'penalty': 'l2', 'solver': 'lbfgs'}

In [15]:
bin_gs.best_params_

{'C': 0.1, 'penalty': 'l1', 'solver': 'liblinear'}

In [16]:
grid_search.best_score_

np.float64(0.501)

In [17]:
bin_gs.best_score_

np.float64(0.50415)

In [18]:
grid_search.best_estimator_

### For Decision Tree

In [19]:
from sklearn.tree import DecisionTreeClassifier

# Base tree whose hyperparameters are searched; its seed is fixed at 41.
dt_model = DecisionTreeClassifier(random_state=41)

# 3 criteria x 2 splitters x 7 depths x 3 max_features settings = 126 candidates.
param_grid = dict(
    criterion=["gini", "entropy", "log_loss"],
    splitter=["best", "random"],
    max_depth=[None, 2, 5, 10, 20, 30, 40],
    # Number of features to consider when looking for the best split
    max_features=[None, 'sqrt', 'log2'],
)



In [20]:
cvs = [5, 10, 15, 20, 25, 30, 40]
insight = {}

# Run the same decision-tree grid search once per fold count and record the
# winning score, estimator and parameters under that fold count.
for cv in cvs:
    dt_gscv = GridSearchCV(dt_model, param_grid, cv=cv, n_jobs=-1, verbose=2)
    dt_gscv.fit(X, y)
    insight[cv] = dict(
        Score=dt_gscv.best_score_,
        model=dt_gscv.best_estimator_,
        params=dt_gscv.best_params_,
    )
        

Fitting 5 folds for each of 126 candidates, totalling 630 fits
[CV] END criterion=gini, max_depth=None, max_features=None, splitter=best; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, splitter=best; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, splitter=random; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, splitter=best; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, splitter=random; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, splitter=best; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, splitter=best; total time=   0.1s
[CV] END criterion=gini, max_depth=None, max_features=None, splitter=random; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=None, splitter=random; total time=   0.0s
[CV] END criterion=gini, max_depth=None, max_features=sqrt, splitter=best; total ti

In [21]:
# One column per cv value, with rows "Score", "model" and "params".
insight_df= pd.DataFrame(insight)

In [22]:
insight_df

Unnamed: 0,5,10,15,20,25,30,40
Score,0.5102,0.50755,0.507801,0.50935,0.5077,0.505503,0.50705
model,"DecisionTreeClassifier(criterion='entropy', ma...","DecisionTreeClassifier(criterion='entropy', ma...","DecisionTreeClassifier(max_depth=5, random_sta...","DecisionTreeClassifier(max_depth=10, random_st...","DecisionTreeClassifier(criterion='entropy', ma...","DecisionTreeClassifier(max_depth=30, random_st...","DecisionTreeClassifier(max_depth=2, random_sta..."
params,"{'criterion': 'entropy', 'max_depth': None, 'm...","{'criterion': 'entropy', 'max_depth': 20, 'max...","{'criterion': 'gini', 'max_depth': 5, 'max_fea...","{'criterion': 'gini', 'max_depth': 10, 'max_fe...","{'criterion': 'entropy', 'max_depth': 30, 'max...","{'criterion': 'gini', 'max_depth': 30, 'max_fe...","{'criterion': 'gini', 'max_depth': 2, 'max_fea..."


In [23]:
# NOTE: dt_gscv is the search left behind by the last iteration of the cv loop
# (cv=40), not the best search overall; compare the "Score" row of insight_df for that.
dt_gscv.best_params_

{'criterion': 'gini',
 'max_depth': 2,
 'max_features': None,
 'splitter': 'random'}

In [24]:
# Best score of the final (cv=40) search only; the other cv values are in insight_df.
dt_gscv.best_score_

np.float64(0.50705)

### Training Models

In [25]:
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [26]:
# Candidate classifiers to compare, keyed by the display name stored in the results.
models = {
    "Logistic Regression": LogisticRegression(
        max_iter=10000,
        random_state=42,
    ),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    # K-Nearest Neighbors is currently excluded from the comparison:
    # "K-Nearest Neighbors": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
}


In [27]:
def model_eval(test_size, model, folds, random_state=None):
    """Estimate train/test accuracy of ``model`` over repeated random hold-out splits.

    Each of the ``folds`` rounds draws a fresh ``train_test_split`` of the
    notebook-level ``X`` and ``y``, calls ``model.fit`` on the training part and
    scores the predictions on both parts with ``accuracy_score``. These are
    independent random splits, not k-fold cross-validation, so the test sets
    of different rounds may overlap.

    Parameters
    ----------
    test_size : float or int
        Forwarded unchanged to ``train_test_split``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``; ``fit`` is called on
        this same object in every round.
    folds : int
        Number of random splits to evaluate; must be at least 1.
    random_state : int, optional
        Base seed for the splits. Round ``i`` uses ``random_state + i`` so the
        rounds differ from each other but the whole run is repeatable. The
        default ``None`` leaves the splits unseeded.

    Returns
    -------
    tuple
        ``(training mean, training max, training std,
        testing mean, testing max, testing std)`` of the per-round accuracies.

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1 (there would be no accuracies to summarise).
    """
    if folds < 1:
        raise ValueError(f"folds must be at least 1, got {folds}")

    training_acc_list = []
    testing_acc_list = []
    for i in range(folds):
        # A distinct seed per round keeps rounds different while staying reproducible.
        split_seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=split_seed
        )
        model.fit(X_train, y_train)
        training_acc_list.append(accuracy_score(y_train, model.predict(X_train)))
        testing_acc_list.append(accuracy_score(y_test, model.predict(X_test)))

    return (
        np.mean(training_acc_list),
        np.max(training_acc_list),
        np.std(training_acc_list),
        np.mean(testing_acc_list),
        np.max(testing_acc_list),
        np.std(testing_acc_list),
    )


In [28]:
# Numbers of repeated random train/test splits to run per configuration
# (each value is passed to model_eval as its `folds` argument).
cv_folds = [10,25,35,50,65,80,100]
# Test-set sizes to try (each value is passed to model_eval as `test_size`).
test_size_list = [.10,.25,.33,.40]

In [29]:
# Column name -> list of values; each column starts out as its own empty list.
result = {
    column: []
    for column in (
        "model",
        "No of Cross Validations",
        "Test Size",
        "Training Mean Accuracy",
        "Training Max Accuracy",
        "Testing Mean Accuracy",
        "Testing Max Accuracy",
        "Training Standard Deviation",
        "Testing Standard Deviation",
    )
}

In [30]:
# Empty every column first so that re-running this cell does not append a
# second copy of each row to `result`.
for column_values in result.values():
    column_values.clear()

# Evaluate every (model, number of splits, test size) combination, print the
# summary statistics and store them rounded to 4 decimals in `result`.
for model_name, model in models.items():
    for cv_fold in cv_folds:
        for test_size in test_size_list:
            # model_eval returns (train mean, train max, train std, test mean, test max, test std).
            (
                training_mean_accuracy,
                training_max_accuracy,
                training_std_deviation,
                testing_mean_accuracy,
                testing_max_accuracy,
                testing_std_deviation,
            ) = model_eval(model=model, test_size=test_size, folds=cv_fold)

            print(f"Model : {model_name} cross_val_folds : {cv_fold} test_size : {test_size}")
            print("Training Mean Accuracy",training_mean_accuracy)
            print("Training Max Accuracy", training_max_accuracy)
            print("Training Standard Deviation",training_std_deviation)
            print("Testing Mean Accuracy", testing_mean_accuracy)
            print("Testing Max Accuracy", testing_max_accuracy)
            print("Testing Standard Deviation",testing_std_deviation)
            print("-"*80)

            result["model"].append(model_name)
            result["No of Cross Validations"].append(cv_fold)
            result["Test Size"].append(test_size)
            result["Training Mean Accuracy"].append(round(training_mean_accuracy, 4))
            result["Training Max Accuracy"].append(round(training_max_accuracy, 4))
            result["Training Standard Deviation"].append(round(training_std_deviation, 4))
            result["Testing Mean Accuracy"].append(round(testing_mean_accuracy, 4))
            result["Testing Max Accuracy"].append(round(testing_max_accuracy, 4))
            result["Testing Standard Deviation"].append(round(testing_std_deviation, 4))

Model : Logistic Regression cross_val_folds : 10 test_size : 0.1
Training Mean Accuracy 0.5090333333333332
Training Max Accuracy 0.5105
Training Standard Deviation 0.0014290202116947256
Testing Mean Accuracy 0.4967
Testing Max Accuracy 0.505
Testing Standard Deviation 0.005339475629684992
--------------------------------------------------------------------------------
Model : Logistic Regression cross_val_folds : 10 test_size : 0.25
Training Mean Accuracy 0.5094800000000002
Training Max Accuracy 0.514
Training Standard Deviation 0.00296330146214583
Testing Mean Accuracy 0.5024
Testing Max Accuracy 0.515
Testing Standard Deviation 0.006736764802188071
--------------------------------------------------------------------------------
Model : Logistic Regression cross_val_folds : 10 test_size : 0.33
Training Mean Accuracy 0.5111417910447761
Training Max Accuracy 0.5131343283582089
Training Standard Deviation 0.0011049061290539256
Testing Mean Accuracy 0.4998030303030303
Testing Max Accuracy

In [31]:
# One row per (model, number of splits, test size) combination collected in `result`.
result_df = pd.DataFrame(result)

In [32]:
result_df.shape

(84, 9)

In [33]:
result_df.tail(35)

Unnamed: 0,model,No of Cross Validations,Test Size,Training Mean Accuracy,Training Max Accuracy,Testing Mean Accuracy,Testing Max Accuracy,Training Standard Deviation,Testing Standard Deviation
49,Decision Tree,80,0.25,0.9988,0.9993,0.4986,0.5208,0.0002,0.0085
50,Decision Tree,80,0.33,0.9989,0.9996,0.4998,0.5161,0.0002,0.0068
51,Decision Tree,80,0.4,0.999,0.9995,0.4997,0.5186,0.0002,0.0056
52,Decision Tree,100,0.1,0.9985,0.9989,0.4996,0.522,0.0001,0.0105
53,Decision Tree,100,0.25,0.9988,0.9994,0.4991,0.5144,0.0002,0.0065
54,Decision Tree,100,0.33,0.9989,0.9996,0.4994,0.5168,0.0002,0.0055
55,Decision Tree,100,0.4,0.999,0.9994,0.4986,0.5144,0.0002,0.0058
56,Naive Bayes,10,0.1,0.5087,0.5121,0.5041,0.5235,0.0023,0.0104
57,Naive Bayes,10,0.25,0.5095,0.5161,0.5036,0.5148,0.0028,0.0065
58,Naive Bayes,10,0.33,0.5116,0.5181,0.5004,0.5071,0.0028,0.0043


In [34]:
from pathlib import Path

# Create the output directory if needed; to_csv does not create missing
# parent directories and would raise otherwise.
results_path = Path("../notebook/results/results.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(results_path, index=False)