In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
import warnings
warnings.filterwarnings('ignore')

In [2]:
df=pd.read_csv(r'E:\DiseasesDiagnosis\notebook\final.csv')

In [3]:
df.head()

Unnamed: 0.1,Unnamed: 0,Patient_ID,Gender,Date_of_Birth,Blood Pressure (systolic),Blood Pressure (diastolic),Heart Rate,Respiratory Rate,Temperature (Celsius),Oxygen Saturation,Glucose Level,Cholesterol Level,Diagnosis Code,Diagnosis,Age
0,0,1,Male,2006-05-05,144,72,106,26,37.3,86%,90,237,51881,Acute Respiratory Failure,18
1,1,2,Female,1949-08-02,95,65,102,26,37.4,93%,141,176,4019,Hypertension,75
2,2,3,Male,1996-08-18,173,112,60,20,36.8,94%,139,226,4100,Coronary Artery Disease,28
3,3,4,Male,1982-06-10,139,69,54,19,37.0,98%,150,180,4100,Coronary Artery Disease,42
4,4,5,Male,1956-01-14,173,85,83,19,36.7,97%,109,291,41401,Chronic Ischemic Heart Disease,69


In [4]:
df.head()

Unnamed: 0.1,Unnamed: 0,Patient_ID,Gender,Date_of_Birth,Blood Pressure (systolic),Blood Pressure (diastolic),Heart Rate,Respiratory Rate,Temperature (Celsius),Oxygen Saturation,Glucose Level,Cholesterol Level,Diagnosis Code,Diagnosis,Age
0,0,1,Male,2006-05-05,144,72,106,26,37.3,86%,90,237,51881,Acute Respiratory Failure,18
1,1,2,Female,1949-08-02,95,65,102,26,37.4,93%,141,176,4019,Hypertension,75
2,2,3,Male,1996-08-18,173,112,60,20,36.8,94%,139,226,4100,Coronary Artery Disease,28
3,3,4,Male,1982-06-10,139,69,54,19,37.0,98%,150,180,4100,Coronary Artery Disease,42
4,4,5,Male,1956-01-14,173,85,83,19,36.7,97%,109,291,41401,Chronic Ischemic Heart Disease,69


In [5]:
df['Oxygen Saturation'] = df['Oxygen Saturation'].str.replace('%', '').astype(int)

In [6]:
df.columns = df.columns.str.replace(' ', '_') \
                    .str.replace('(', '', regex=False) \
                    .str.replace(')', '', regex=False)


In [7]:
df.drop(['Unnamed:_0'],axis=1,inplace=True)

In [8]:
df.to_csv(r'E:\DiseasesDiagnosis\notebook\data\final_model_data.csv')

In [9]:
df.head()

Unnamed: 0,Patient_ID,Gender,Date_of_Birth,Blood_Pressure_systolic,Blood_Pressure_diastolic,Heart_Rate,Respiratory_Rate,Temperature_Celsius,Oxygen_Saturation,Glucose_Level,Cholesterol_Level,Diagnosis_Code,Diagnosis,Age
0,1,Male,2006-05-05,144,72,106,26,37.3,86,90,237,51881,Acute Respiratory Failure,18
1,2,Female,1949-08-02,95,65,102,26,37.4,93,141,176,4019,Hypertension,75
2,3,Male,1996-08-18,173,112,60,20,36.8,94,139,226,4100,Coronary Artery Disease,28
3,4,Male,1982-06-10,139,69,54,19,37.0,98,150,180,4100,Coronary Artery Disease,42
4,5,Male,1956-01-14,173,85,83,19,36.7,97,109,291,41401,Chronic Ischemic Heart Disease,69


In [10]:
df.drop(['Patient_ID','Date_of_Birth'],axis=1,inplace=True)

In [11]:
X=df.drop(['Diagnosis'],axis=1)
Y=df['Diagnosis']

In [12]:
categorical_column=X.select_dtypes(include='O').columns.tolist()
numerical_column=X.select_dtypes(exclude='O').columns.tolist()

In [13]:
numerical_column=['Blood_Pressure_systolic',
 'Blood_Pressure_diastolic',
 'Heart_Rate',
 'Respiratory_Rate',
 'Temperature_Celsius',
 'Glucose_Level',
 'Cholesterol_Level',
 'Diagnosis_Code',
 'Age',
 'Oxygen_Saturation']

In [14]:
categorical_column=['Gender']

In [15]:
## Numerical pipeline:
num_pipeline=Pipeline(
    steps=[
        ('Imputer',SimpleImputer(strategy='median')),
        ('Scaler',StandardScaler())
    ]
)

## Categorical Pipeline:
cat_pipeline = Pipeline(
    steps=[
        ('Imputer', SimpleImputer(strategy='most_frequent')),
        ('Encoder', OneHotEncoder(handle_unknown='ignore')),
        ('Scaler', StandardScaler(with_mean=False))  # with_mean=False because one-hot encoded data is sparse
    ]
)


In [16]:
preprocessor=ColumnTransformer([
    ('numerical_pipeline',num_pipeline,numerical_column),
    ('categorical_pipeline',cat_pipeline,categorical_column)
])

In [17]:
preprocessor

- Train -Test Split

In [18]:
X_train,X_test,y_train,y_test=train_test_split(X,Y,test_size=0.30,random_state=42)

In [19]:
X_train.dtypes

Gender                       object
Blood_Pressure_systolic       int64
Blood_Pressure_diastolic      int64
Heart_Rate                    int64
Respiratory_Rate              int64
Temperature_Celsius         float64
Oxygen_Saturation             int32
Glucose_Level                 int64
Cholesterol_Level             int64
Diagnosis_Code                int64
Age                           int64
dtype: object

In [20]:
X_train.shape

(700000, 11)

In [21]:
X_test.shape

(300000, 11)

In [22]:
X_train=pd.DataFrame(preprocessor.fit_transform(X_train),columns=preprocessor.get_feature_names_out())
X_test=pd.DataFrame(preprocessor.transform(X_test),columns=preprocessor.get_feature_names_out())

In [23]:
X_train.head()

Unnamed: 0,numerical_pipeline__Blood_Pressure_systolic,numerical_pipeline__Blood_Pressure_diastolic,numerical_pipeline__Heart_Rate,numerical_pipeline__Respiratory_Rate,numerical_pipeline__Temperature_Celsius,numerical_pipeline__Glucose_Level,numerical_pipeline__Cholesterol_Level,numerical_pipeline__Diagnosis_Code,numerical_pipeline__Age,numerical_pipeline__Oxygen_Saturation,categorical_pipeline__Gender_Female,categorical_pipeline__Gender_Male
0,1.557441,-1.530953,-0.864419,-0.484337,0.198563,1.504239,0.774303,-0.4286,0.700281,-0.464166,0.0,2.0
1,1.249671,0.143938,-0.814918,0.480141,0.198563,0.120319,-0.81958,1.662153,1.289203,-1.158707,2.0,0.0
2,1.326614,0.028429,0.323595,0.480141,0.598077,-0.385344,0.658804,-0.4286,0.942778,0.461889,0.0,2.0
3,-0.327648,-0.722385,1.412607,1.251723,-0.001194,-0.278889,-0.172787,2.278638,-0.477563,-0.001138,0.0,2.0
4,-1.097072,1.068017,-0.270412,1.251723,-0.600465,-0.571641,0.959101,-0.518485,0.111359,-1.621735,0.0,2.0


In [24]:
X_train.shape

(700000, 12)

In [25]:
len(df['Diagnosis'].unique().tolist())

10

In [26]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score

In [27]:
models = {
    "Logistic Regression": LogisticRegression(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Support Vector Classifier": SVC(),
    "Naive Bayes (Gaussian)": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "AdaBoost": AdaBoostClassifier(),
    "Bagging Classifier": BaggingClassifier(),
    "XGBoost Classifier": XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

In [28]:
params = {
    "Logistic Regression": {
        'penalty': ['l1', 'l2', 'elasticnet', None],
        'C': [0.01, 0.1, 1, 10, 100],
        'solver': ['liblinear', 'saga', 'lbfgs', 'newton-cg'],
        'max_iter': [100, 200, 500]
    },

    "K-Nearest Neighbors": {
        'n_neighbors': [3, 5, 7, 9],
        'weights': ['uniform', 'distance'],
        'metric': ['euclidean', 'manhattan', 'minkowski']
    },

    "Decision Tree": {
        'criterion': ['gini', 'entropy', 'log_loss'],
        'splitter': ['best', 'random'],
        'max_depth': [None, 5, 10, 20],
        'max_features': ['sqrt', 'log2', None]
    },

    "Random Forest": {
        'n_estimators': [50, 100, 200],
        'criterion': ['gini', 'entropy', 'log_loss'],
        'max_features': ['sqrt', 'log2', None],
        'max_depth': [None, 5, 10, 20]
    },

    "Support Vector Classifier": {
        'C': [0.1, 1, 10, 100],
        'kernel': ['linear', 'rbf', 'poly', 'sigmoid'],
        'gamma': ['scale', 'auto']
    },

    "Naive Bayes (Gaussian)": {
        'var_smoothing': [1e-9, 1e-8, 1e-7]
    },

    "Gradient Boosting": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'subsample': [0.6, 0.8, 1.0],
        'max_depth': [3, 5, 7]
    },
    
    "AdaBoost": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 1.0]
    },

    "Bagging Classifier": {
        'n_estimators': [10, 50, 100],
        'max_samples': [0.5, 0.7, 1.0],
        'max_features': [0.5, 0.7, 1.0]
    },

    "XGBoost Classifier": {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 7],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0]
    }
}


In [29]:
from sklearn.model_selection import GridSearchCV
def get_best_model(X_train,y_train,X_test,y_test,model_dict,params_,base_accuracy=0.7):
    models = model_dict.keys()
    for model in models:
        print(f'Initial base accuracy is {base_accuracy}')
        print(20*'*',"Estimator: ",model,20*'*')
        gs = GridSearchCV(estimator=model_dict[model],param_grid=params_[model],cv=3)
        gs.fit(X_train,y_train)
        print(f'Best parameters are :{gs.best_params_} With Accuracy score:{gs.best_score_}')
        
        Classifier = model_dict[model].set_params(**gs.best_params_)
        Classifier.fit(X_train,y_train)

        y_train_pred = Classifier.predict(X_train)
        y_test_pred = Classifier.predict(X_test)
        
        train_model_score = accuracy_score(y_train, y_train_pred)

        test_model_score = accuracy_score(y_test, y_test_pred)
        
        if test_model_score >= base_accuracy:
            print('Acceptabel model found!')
            base_accuracy = test_model_score
            best_model = {model:Classifier}
        else:
            print('Model rejected\n',30*'__')

    print(f"Best model is:{list(best_model.keys())} with accuracy Score: {round(test_model_score,2)}")        
    return best_model

In [30]:
get_best_model(X_train,y_train,X_test,y_test,model_dict=models,params_=params)

In [35]:
model=RandomForestClassifier(n_estimators=50)

In [36]:
model.fit(X_train,y_train)

In [37]:
y_pred=model.predict(X_test)

In [38]:
accuracy_score(y_test,y_pred)

1.0