In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
%matplotlib inline

## Read Dataset

In [None]:
# Load the EHR table; the path is relative to the notebook's working directory.
df = pd.read_csv("../notebook/data/EHR.csv")

In [None]:
df.head(5)

In [None]:
df.columns

### Independent and target feature selection

In [None]:
# Feature frame: every column of `df` except the identifier and the target.
X = df.drop(columns=["Patient_ID", "Survival_Status"])

In [None]:
X.head(5)

### Preprocessing the data for fitting

In [None]:
# Split the feature columns by dtype: object columns are one-hot encoded,
# all remaining columns are standardised.
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

from sklearn.preprocessing import OneHotEncoder, StandardScaler,LabelEncoder
from sklearn.compose import ColumnTransformer

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()
# Not part of the ColumnTransformer: this encoder is applied to the target column.
lbl_transformer = LabelEncoder()
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ],
    # OneHotEncoder produces sparse output by default, and ColumnTransformer
    # keeps the stacked result sparse whenever its density is below the default
    # threshold of 0.3. GaussianNB, which is trained on this matrix later in the
    # notebook, rejects sparse input, so always return a dense array.
    sparse_threshold=0,
)

In [None]:
# Fit the preprocessor on the raw feature columns and keep the encoded/scaled
# matrix as `X` for all model fitting below.
# The input is rebuilt from `df` instead of being read from `X` so that this
# cell is safe to re-run: feeding the already-transformed matrix back into the
# name-based ColumnTransformer fails because it no longer has named columns.
X = preprocessor.fit_transform(df.drop(columns=["Patient_ID", "Survival_Status"]))

In [None]:
X.shape

In [None]:
# Encode the target column as integer class labels.
y = lbl_transformer.fit_transform(df["Survival_Status"])

In [None]:
y.shape

## Searching for the best hyperparameters for each model


In [None]:
from sklearn.model_selection import GridSearchCV


### For Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# Both searches use the same number of folds so that their `best_score_`
# values are mean accuracies measured under one protocol and can be compared
# directly when they are displayed side by side below.
LR_CV_FOLDS = 10

# `random_state` only matters for solvers that shuffle the data (liblinear
# here); fixing it makes the second search repeatable.
lsgtc_model = LogisticRegression(max_iter=1000, random_state=42)

# Search 1: newton-cg / lbfgs solvers, L2 penalty only.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['newton-cg', 'lbfgs'],
    'penalty': ['l2'],
}
# Search 2: liblinear solver, tried with both L2 and L1 penalties.
bin_param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear'],
    'penalty': ['l2', "l1"],
}

grid_search = GridSearchCV(
    estimator=lsgtc_model,
    param_grid=param_grid,
    cv=LR_CV_FOLDS,
    n_jobs=-1,
    verbose=2,
)
grid_search.fit(X, y)

bin_gs = GridSearchCV(
    estimator=lsgtc_model,
    param_grid=bin_param_grid,
    cv=LR_CV_FOLDS,
    n_jobs=-1,
    verbose=2,
)
bin_gs.fit(X, y)


In [None]:
grid_search.best_params_

In [None]:
bin_gs.best_params_

In [None]:
grid_search.best_score_

In [None]:
bin_gs.best_score_

In [None]:
grid_search.best_estimator_

### For Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Base estimator handed to the grid search; the fixed seed keeps the
# randomised parts of tree construction repeatable.
dt_model = DecisionTreeClassifier(random_state=41)

# Hyper-parameter grid for the decision tree.
# "log_loss" is not listed under `criterion`: scikit-learn documents it as the
# same Shannon-information-gain criterion as "entropy", so listing both only
# repeated identical fits (a third of the whole grid) for every fold count.
param_grid = {
    "criterion": ["gini", "entropy"],
    "splitter": ["best", "random"],
    "max_depth": [None, 2, 5, 10, 20, 30, 40],
    # Number of features to consider when looking for the best split
    "max_features": [None, "sqrt", "log2"],
}



In [None]:
# Repeat the decision-tree grid search once per fold count and remember the
# winning score, fitted estimator and parameters of each run, keyed by `cv`.
cvs = [5, 10, 15, 20, 25, 30, 40]
insight = {}
for cv in cvs:
    # `fit` returns the search object itself, so construction and fitting chain.
    dt_gscv = GridSearchCV(
        estimator=dt_model,
        param_grid=param_grid,
        cv=cv,
        n_jobs=-1,
        verbose=2,
    ).fit(X, y)
    insight[cv] = dict(
        Score=dt_gscv.best_score_,
        model=dt_gscv.best_estimator_,
        params=dt_gscv.best_params_,
    )
        

In [None]:
# Dict-of-dicts -> DataFrame: the outer keys of `insight` (the cv values) become
# the columns, and "Score", "model" and "params" become the rows.
insight_df= pd.DataFrame(insight)

In [None]:
insight_df

In [None]:
# NOTE: `dt_gscv` is the object left over from the final loop iteration, so this
# shows the cv=40 search only; the other fold counts are recorded in `insight`.
dt_gscv.best_params_

In [None]:
# Best mean cross-validated score of the last search run in the loop (cv=40).
dt_gscv.best_score_

### Training Models

In [None]:
from sklearn.model_selection import cross_val_score,train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [None]:
# Candidate classifiers, keyed by the display name used in the results table.
# Insertion order is the order in which they are evaluated.
models = {}
models["Logistic Regression"] = LogisticRegression(max_iter=10000, random_state=42)
models["Decision Tree"] = DecisionTreeClassifier(random_state=42)
# models["K-Nearest Neighbors"] = KNeighborsClassifier()  # currently disabled
models["Naive Bayes"] = GaussianNB()


In [None]:
def model_eval(test_size, model, folds, random_state=42):
    """Estimate train/test accuracy of ``model`` over repeated random splits.

    The notebook-level feature matrix ``X`` and label vector ``y`` are split
    ``folds`` times with ``train_test_split``. On every repetition the model is
    refit on the training part and scored with ``accuracy_score`` on both the
    training and the test part.

    Parameters
    ----------
    test_size : float or int
        Forwarded unchanged to ``train_test_split`` as ``test_size``.
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. It is refit in place
        on every repetition.
    folds : int
        Number of random splits to evaluate; must be at least 1. Despite the
        name, these are repeated hold-out splits, not k-fold partitions.
    random_state : int or None, default 42
        Base seed for the splits. Repetition ``i`` uses ``random_state + i`` so
        every repetition gets a different but repeatable split. Pass ``None``
        to get unseeded splits.

    Returns
    -------
    tuple
        ``(training_mean_accuracy, training_max_accuracy,
        training_std_deviation, testing_mean_accuracy, testing_max_accuracy,
        testing_std_deviation)``

    Raises
    ------
    ValueError
        If ``folds`` is smaller than 1.
    """
    # Fail early with a clear message instead of letting np.max raise on an
    # empty list after the loop has silently done nothing.
    if folds < 1:
        raise ValueError(f"folds must be at least 1, got {folds!r}")

    training_acc_list = []
    testing_acc_list = []
    for i in range(folds):
        # A distinct seed per repetition: reproducible, yet not the same split
        # every time.
        split_seed = None if random_state is None else random_state + i
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=split_seed
        )
        model.fit(X_train, y_train)
        training_acc_list.append(accuracy_score(y_train, model.predict(X_train)))
        testing_acc_list.append(accuracy_score(y_test, model.predict(X_test)))

    return (
        np.mean(training_acc_list),
        np.max(training_acc_list),
        np.std(training_acc_list),
        np.mean(testing_acc_list),
        np.max(testing_acc_list),
        np.std(testing_acc_list),
    )


In [None]:
# Values passed to model_eval as `folds`: how many random train/test splits are
# averaged for each configuration.
cv_folds = [10,25,35,50,65,80,100]
# Values passed to model_eval as `test_size` (forwarded to train_test_split).
test_size_list = [.10,.25,.33,.40]

In [None]:
# Column-oriented accumulator for the evaluation sweep: one independent, empty
# list per output column, in the column order of the final table.
result = {
    column: []
    for column in (
        "model",
        "No of Cross Validations",
        "Test Size",
        "Training Mean Accuracy",
        "Training Max Accuracy",
        "Testing Mean Accuracy",
        "Testing Max Accuracy",
        "Training Standard Deviation",
        "Testing Standard Deviation",
    )
}

In [None]:
# Start from empty columns every time this cell runs, so that re-executing it
# never appends a second copy of every row to `result`.
result = {
    "model": [],
    "No of Cross Validations": [],
    "Test Size": [],
    "Training Mean Accuracy": [],
    "Training Max Accuracy": [],
    "Testing Mean Accuracy": [],
    "Testing Max Accuracy": [],
    "Training Standard Deviation": [],
    "Testing Standard Deviation": [],
}

# Evaluate every (model, number of repetitions, test size) combination, print
# the full-precision metrics and record them rounded to 4 decimals.
for model_name, model in models.items():
    for cv_fold in cv_folds:
        for test_size in test_size_list:
            (
                training_mean_accuracy,
                training_max_accuracy,
                training_std_deviation,
                testing_mean_accuracy,
                testing_max_accuracy,
                testing_std_deviation,
            ) = model_eval(model=model, test_size=test_size, folds=cv_fold)

            print(f"Model : {model_name} cross_val_folds : {cv_fold} test_size : {test_size}")
            print("Training Mean Accuracy",training_mean_accuracy)
            print("Training Max Accuracy", training_max_accuracy)
            print("Training Standard Deviation",training_std_deviation)
            print("Testing Mean Accuracy", testing_mean_accuracy)
            print("Testing Max Accuracy", testing_max_accuracy)
            print("Testing Standard Deviation",testing_std_deviation)
            print("-"*80)

            result["model"].append(model_name)
            result["No of Cross Validations"].append(cv_fold)
            result["Test Size"].append(test_size)
            result["Training Mean Accuracy"].append(round(training_mean_accuracy, 4))
            result["Training Max Accuracy"].append(round(training_max_accuracy, 4))
            result["Training Standard Deviation"].append(round(training_std_deviation, 4))
            result["Testing Mean Accuracy"].append(round(testing_mean_accuracy, 4))
            result["Testing Max Accuracy"].append(round(testing_max_accuracy, 4))
            result["Testing Standard Deviation"].append(round(testing_std_deviation, 4))

In [None]:
result_df = pd.DataFrame(result)

In [None]:
result_df.shape

In [None]:
result_df.tail(35)

In [None]:
# Persist the summary table. The target directory is created first because
# `to_csv` does not create missing parent directories and would otherwise fail
# on a fresh checkout.
results_path = Path("../notebook/results/results.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
result_df.to_csv(results_path, index=False)