In [46]:
import pickle

with open('cleaned_data.pkl', 'rb') as f:
    df = pickle.load(f)


In [47]:
#Required Libraries
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

#Preprocessing
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score, confusion_matrix

#Logisitc Models
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier


In [48]:
df.head()


Unnamed: 0,Fever_Yes,Cough_Yes,Fatigue_Yes,Difficulty Breathing_Yes,Gender_Female,Gender_Male,Blood Pressure_High,Blood Pressure_Low,Blood Pressure_Normal,Cholesterol Level_High,Cholesterol Level_Low,Cholesterol Level_Normal,Outcome Variable_Negative,Outcome Variable_Positive,Disease
0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1.0,56
1,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,24
2,0.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,37
3,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,6
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,1.0,37


In [49]:
df = pd.read_csv('D:\money\data analytics _ml\AI powered doctor self\AI-powered-doctor-assistant\dataset\Disease_symptom_and_patient_profile_dataset.csv')

In [50]:
df.head()

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


In [51]:
NUMERICAL_FEATURES = []
CATEGORICAL_FEATURES = []

for name in df.columns:
    if df[name].dtype == 'object':
        CATEGORICAL_FEATURES.append(name)
    else:
        NUMERICAL_FEATURES.append(name)

CATEGORICAL_FEATURES.remove('Outcome Variable')
FEATURES = NUMERICAL_FEATURES + CATEGORICAL_FEATURES
print(FEATURES)


['Age', 'Disease', 'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing', 'Gender', 'Blood Pressure', 'Cholesterol Level']


In [52]:
#Creating a preprocessor
numeric_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

categorical_transformer = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

preprocessor = ColumnTransformer(
    transformers = [
        ('num', numeric_transformer, NUMERICAL_FEATURES),
        ('cat', categorical_transformer, CATEGORICAL_FEATURES)
    ]
)

In [53]:
X = df[FEATURES]
y = df['Outcome Variable']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [54]:
#Loading models
models = {
    'Standard Logistic Regression': LogisticRegression(max_iter=1000, random_state=42),
    'L1 Regularized Logistic Regression': LogisticRegression(penalty='l1', solver='liblinear', max_iter=1000, random_state=42),
    'ElasticNet': LogisticRegression(penalty='elasticnet', solver='saga', l1_ratio=0.5, random_state=42),
    'SGD': SGDClassifier(loss='modified_huber', random_state=42),
    'Random Forest Classifier': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting Classifier': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'SVM': SVC(kernel='rbf', probability=True, random_state=42),
}

In [55]:
#Training and Evaluating the models
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])

    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)
    try:
        y_pred_proba = pipeline.predict_proba(X_test)
        auc = roc_auc_score(y_test, y_pred_proba)
    except:
        auc = None

    print(f"\nResults for {name}:")
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))
    if auc:
        print(f"ROC-AUC Score: {auc:.4f}")
    print('----------------------------------------------------------')


Results for Standard Logistic Regression:

Confusion Matrix:
[[17 13]
 [15 25]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.53      0.57      0.55        30
    Positive       0.66      0.62      0.64        40

    accuracy                           0.60        70
   macro avg       0.59      0.60      0.59        70
weighted avg       0.60      0.60      0.60        70

----------------------------------------------------------

Results for L1 Regularized Logistic Regression:

Confusion Matrix:
[[16 14]
 [16 24]]

Classification Report:
              precision    recall  f1-score   support

    Negative       0.50      0.53      0.52        30
    Positive       0.63      0.60      0.62        40

    accuracy                           0.57        70
   macro avg       0.57      0.57      0.57        70
weighted avg       0.58      0.57      0.57        70

----------------------------------------------------------

Results for