In [3]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 

In [4]:
df = pd.read_csv('improved_disease_dataset.csv')
df.head()

Unnamed: 0,fever,headache,nausea,vomiting,fatigue,joint_pain,skin_rash,cough,weight_loss,yellow_eyes,disease
0,1,0,1,0,1,0,0,1,1,0,Paralysis (brain hemorrhage)
1,0,0,1,0,0,0,1,1,0,0,Paralysis (brain hemorrhage)
2,0,0,1,0,1,0,1,1,0,0,Paralysis (brain hemorrhage)
3,0,0,1,1,1,0,1,1,1,1,Paralysis (brain hemorrhage)
4,1,0,1,0,1,0,0,0,1,1,Paralysis (brain hemorrhage)


In [5]:
df.isnull().sum()

fever          0
headache       0
nausea         0
vomiting       0
fatigue        0
joint_pain     0
skin_rash      0
cough          0
weight_loss    0
yellow_eyes    0
disease        0
dtype: int64

In [6]:
df.dtypes

fever           int64
headache        int64
nausea          int64
vomiting        int64
fatigue         int64
joint_pain      int64
skin_rash       int64
cough           int64
weight_loss     int64
yellow_eyes     int64
disease        object
dtype: object

In [7]:
df.disease.unique()

array(['Paralysis (brain hemorrhage)', 'Hypertension', 'Hepatitis B',
       'Impetigo', 'Chronic cholestasis', 'Hepatitis C', 'Typhoid',
       'Dimorphic hemorrhoids(piles)',
       'Vertigo (Benign paroxysmal Positional Vertigo)',
       'Cervical spondylosis', 'Tuberculosis', 'Hyperthyroidism',
       'Malaria', 'Gastroenteritis', 'Osteoarthritis', 'Heart attack',
       'Dengue', 'Pneumonia', 'Urinary tract infection', 'Hypoglycemia',
       'Bronchial Asthma', 'Arthritis', 'Hepatitis D', 'Hypothyroidism',
       'Acne', 'GERD', 'Peptic ulcer disease', 'Psoriasis',
       'Drug Reaction', 'Diabetes', 'Varicose veins', 'Hepatitis A',
       'Hepatitis E', 'Migraine', 'Allergy', 'Jaundice', 'AIDS',
       'Alcoholic hepatitis'], dtype=object)

In [8]:
df['disease'].value_counts()

Paralysis (brain hemorrhage)                      90
Hepatitis B                                       80
Impetigo                                          80
Hypertension                                      80
Vertigo (Benign paroxysmal Positional Vertigo)    70
Hyperthyroidism                                   70
Tuberculosis                                      70
Cervical spondylosis                              70
Dimorphic hemorrhoids(piles)                      70
Typhoid                                           70
Hepatitis C                                       70
Chronic cholestasis                               70
Malaria                                           60
Gastroenteritis                                   60
Osteoarthritis                                    60
Heart attack                                      60
Arthritis                                         50
Bronchial Asthma                                  50
Hypoglycemia                                  

In [9]:
## Getting All Different Types OF Features

num_feature = [feature for feature in df.columns if df[feature].dtype != 'o']
num_feature

['fever',
 'headache',
 'nausea',
 'vomiting',
 'fatigue',
 'joint_pain',
 'skin_rash',
 'cough',
 'weight_loss',
 'yellow_eyes',
 'disease']

In [10]:
cat_feature = [feature for feature in df.columns if df[feature].dtype == 'O']
cat_feature

['disease']

In [11]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df['disease'] = le.fit_transform(df['disease'])

In [12]:
## Indpendent and dependent features

from sklearn.model_selection import train_test_split
x = df.drop(['disease'], axis=1)
y = df['disease']

In [13]:
x.head()

Unnamed: 0,fever,headache,nausea,vomiting,fatigue,joint_pain,skin_rash,cough,weight_loss,yellow_eyes
0,1,0,1,0,1,0,0,1,1,0
1,0,0,1,0,0,0,1,1,0,0
2,0,0,1,0,1,0,1,1,0,0
3,0,0,1,1,1,0,1,1,1,1
4,1,0,1,0,1,0,0,0,1,1


In [14]:
y.head()

0    29
1    29
2    29
3    29
4    29
Name: disease, dtype: int32

In [15]:
from sklearn.model_selection import train_test_split
xtrain, xtest, ytrain, ytest = train_test_split(x, y, random_state=42, test_size=0.2)
xtrain.shape, x.shape

((1600, 10), (2000, 10))

In [16]:
xtrain

Unnamed: 0,fever,headache,nausea,vomiting,fatigue,joint_pain,skin_rash,cough,weight_loss,yellow_eyes
968,0,1,0,1,1,0,0,1,1,0
240,1,1,0,1,1,0,0,1,0,0
819,0,1,0,1,1,0,1,1,1,0
692,1,0,1,0,0,0,0,0,1,0
420,0,1,1,1,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...
1130,0,0,0,1,0,0,0,1,1,0
1294,0,1,1,1,1,1,0,1,0,0
860,0,0,0,1,0,1,0,1,0,0
1459,0,1,0,0,1,0,1,1,0,1


## Model Training And Model Selection

In [17]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluation function for classification
def evaluate_model(true, predicted):
    accuracy = accuracy_score(true, predicted)
    precision = precision_score(true, predicted, average='weighted')
    recall = recall_score(true, predicted, average='weighted')
    f1 = f1_score(true, predicted, average='weighted')
    return accuracy, precision, recall, f1

# Define classification models
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest Classifier": RandomForestClassifier()
}

# Train and evaluate each model
for name, model in models.items():
    model.fit(xtrain, ytrain)
    y_train_pred = model.predict(xtrain)
    y_test_pred = model.predict(xtest)
    
    train_acc, train_prec, train_rec, train_f1 = evaluate_model(ytrain, y_train_pred)
    test_acc, test_prec, test_rec, test_f1 = evaluate_model(ytest, y_test_pred)
    
    print(f"{name}")
    print("Model performance for Training set")
    print(f"- Accuracy: {train_acc:.4f}")
    print(f"- Precision: {train_prec:.4f}")
    print(f"- Recall: {train_rec:.4f}")
    print(f"- F1 Score: {train_f1:.4f}")
    print("----------------------------------")
    print("Model performance for Test set")
    print(f"- Accuracy: {test_acc:.4f}")
    print(f"- Precision: {test_prec:.4f}")
    print(f"- Recall: {test_rec:.4f}")
    print(f"- F1 Score: {test_f1:.4f}")
    print("=" * 35, "\n")


Logistic Regression
Model performance for Training set
- Accuracy: 0.4481
- Precision: 0.4486
- Recall: 0.4481
- F1 Score: 0.4415
----------------------------------
Model performance for Test set
- Accuracy: 0.4375
- Precision: 0.4647
- Recall: 0.4375
- F1 Score: 0.4385

K-Neighbors Classifier
Model performance for Training set
- Accuracy: 0.5000
- Precision: 0.5278
- Recall: 0.5000
- F1 Score: 0.4976
----------------------------------
Model performance for Test set
- Accuracy: 0.3475
- Precision: 0.3996
- Recall: 0.3475
- F1 Score: 0.3542

Decision Tree
Model performance for Training set
- Accuracy: 0.6531
- Precision: 0.6796
- Recall: 0.6531
- F1 Score: 0.6484
----------------------------------
Model performance for Test set
- Accuracy: 0.3200
- Precision: 0.3516
- Recall: 0.3200
- F1 Score: 0.3210

Random Forest Classifier
Model performance for Training set
- Accuracy: 0.6531
- Precision: 0.6615
- Recall: 0.6531
- F1 Score: 0.6535
----------------------------------
Model performance

In [18]:

# Hyperparameter Tuning for K-Nearest Neighbors (KNN)
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# KNN
param_grid_knn = {
    'n_neighbors': [3, 5, 7, 9],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan']
}

knn = KNeighborsClassifier()
grid_knn = GridSearchCV(knn, param_grid_knn, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_knn.fit(xtrain, ytrain)
best_knn = grid_knn.best_estimator_
y_pred_knn = best_knn.predict(xtest)
print("Tuned KNN Accuracy:", accuracy_score(ytest, y_pred_knn))

# Decision Tree
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

dt = DecisionTreeClassifier(random_state=42)
grid_dt = GridSearchCV(dt, param_grid_dt, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_dt.fit(xtrain, ytrain)
best_dt = grid_dt.best_estimator_
y_pred_dt = best_dt.predict(xtest)
print("Tuned Decision Tree Accuracy:", accuracy_score(ytest, y_pred_dt))

# Logistic Regression
param_grid_lr = {
    'C': [0.1, 1, 10, 100],
    'solver': ['liblinear', 'lbfgs'],
    'penalty': ['l2']
}

lr = LogisticRegression(max_iter=1000, random_state=42)
grid_lr = GridSearchCV(lr, param_grid_lr, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_lr.fit(xtrain, ytrain)
best_lr = grid_lr.best_estimator_
y_pred_lr = best_lr.predict(xtest)
print("Tuned Logistic Regression Accuracy:", accuracy_score(ytest, y_pred_lr))

# Random Forest
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

rf = RandomForestClassifier(random_state=42)
grid_rf = GridSearchCV(estimator=rf, param_grid=param_grid_rf, cv=5, n_jobs=-1, verbose=2, scoring='accuracy')
grid_rf.fit(xtrain, ytrain)
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(xtest)
print("Tuned Random Forest Accuracy:", accuracy_score(ytest, y_pred_rf))


Fitting 5 folds for each of 16 candidates, totalling 80 fits
Tuned KNN Accuracy: 0.3775
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Tuned Decision Tree Accuracy: 0.3425
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Tuned Logistic Regression Accuracy: 0.4375
Fitting 5 folds for each of 24 candidates, totalling 120 fits
Tuned Random Forest Accuracy: 0.4025


In [19]:
import pickle
pickle.dump(lr,open('Logistic Regression.pkl','wb'))