## Importing Necessary Libraries

In [1]:
# for dataframe manipulation and analysis
import pandas as pd
import numpy as np

# for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# for warning filtering
import warnings
warnings.filterwarnings('ignore')

# for data preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

## Loading Dataset

In [2]:
# Read the disease / symptom / patient-profile records from the CSV next to the notebook
df = pd.read_csv(filepath_or_buffer='Disease_symptom_and_patient_profile_dataset.csv')
# Preview the first five rows
df.head(n=5)

Unnamed: 0,Disease,Fever,Cough,Fatigue,Difficulty Breathing,Age,Gender,Blood Pressure,Cholesterol Level,Outcome Variable
0,Influenza,Yes,No,Yes,Yes,19,Female,Low,Normal,Positive
1,Common Cold,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
2,Eczema,No,Yes,Yes,No,25,Female,Normal,Normal,Negative
3,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive
4,Asthma,Yes,Yes,No,Yes,25,Male,Normal,Normal,Positive


## Exploratory Data Analysis

In [3]:
# Count rows that exactly repeat an earlier row (the first occurrence itself is not flagged)
df.duplicated(keep='first').sum()

49

In [4]:
# Keep the first occurrence of every repeated row; surviving rows keep their original index labels.
# The result is rebound to `df` rather than using inplace=True, so the cell reads as a plain
# "new frame from old frame" step and can be chained with further transforms.
df = df.drop_duplicates()

In [5]:
# Summarise the de-duplicated frame: index range, per-column non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 300 entries, 0 to 347
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Disease               300 non-null    object
 1   Fever                 300 non-null    object
 2   Cough                 300 non-null    object
 3   Fatigue               300 non-null    object
 4   Difficulty Breathing  300 non-null    object
 5   Age                   300 non-null    int64 
 6   Gender                300 non-null    object
 7   Blood Pressure        300 non-null    object
 8   Cholesterol Level     300 non-null    object
 9   Outcome Variable      300 non-null    object
dtypes: int64(1), object(9)
memory usage: 25.8+ KB


In [6]:
# Frequency of each cholesterol category, most frequent first
df['Cholesterol Level'].value_counts(sort=True, ascending=False)

High      140
Normal    130
Low        30
Name: Cholesterol Level, dtype: int64

## Data preprocessing

In [8]:
# Integer-encode every column except 'Disease' (one-hot encoded separately) and 'Age'
# (already numeric). This includes the target column 'Outcome Variable'.
#
# A fresh encoder is fitted per column and stored in `encoders`, so each column's
# label -> code mapping stays available afterwards (e.g. encoders[col].classes_ or
# encoders[col].inverse_transform(...)). Refitting a single shared encoder would
# discard every mapping except the last one.
# NOTE(review): LabelEncoder assigns codes in sorted label order, so levels such as
# 'High' / 'Low' / 'Normal' are not encoded in their natural order - confirm this is
# acceptable for the models trained on these features.
encoders = {}
for col in df.columns.drop(['Disease', 'Age']):
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le
# As before, `le` is left holding the encoder fitted on the last processed column.

In [9]:
# One-hot encode the disease name: the 'Disease' column is replaced by one indicator column per distinct value
df = pd.get_dummies(data=df, columns=['Disease'])

## Creating Training, Validation and Test Sets

In [10]:
# Target vector: the encoded outcome label
y = df['Outcome Variable']
# Feature matrix: every remaining column
X = df.drop(columns=['Outcome Variable'])

In [11]:
# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=5,
)

## Model Selection

### Training on Different Models

In [12]:
# ml models
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier,GradientBoostingClassifier
from xgboost import XGBClassifier

# for model evaluation
from sklearn.metrics import accuracy_score,confusion_matrix,f1_score,precision_score

# cross validation
from sklearn.model_selection import cross_val_score,StratifiedKFold

In [13]:
def model_builder(estimators, train=None, test=None):
    """Fit each estimator and print its accuracy, weighted F1 score and confusion matrix.

    Parameters
    ----------
    estimators : iterable
        Estimator objects exposing ``fit``, ``score`` and ``predict``.
        Each one is fitted in place.
    train : tuple, optional
        ``(features, labels)`` used for fitting and for the training accuracy.
        Defaults to the notebook-level ``(X_train, y_train)``.
    test : tuple, optional
        ``(features, labels)`` used for the held-out metrics.
        Defaults to the notebook-level ``(X_test, y_test)``.

    Returns
    -------
    None
        Results are only printed, one report per estimator.
    """
    # Resolve the data once; falling back to the notebook globals keeps existing
    # one-argument calls working while allowing other splits to be passed in.
    fit_X, fit_y = (X_train, y_train) if train is None else train
    eval_X, eval_y = (X_test, y_test) if test is None else test

    for estimator in estimators:
        estimator.fit(fit_X, fit_y)

        # model evaluation: accuracy on the fitting data, then metrics on the held-out data
        training_accuracy = estimator.score(fit_X, fit_y)
        y_pred = estimator.predict(eval_X)
        validation_accuracy = accuracy_score(eval_y, y_pred)
        conf_matrix = confusion_matrix(eval_y, y_pred)
        # 'weighted' averages the per-class F1 scores by class support
        f1score = f1_score(eval_y, y_pred, average = 'weighted')

        print('MODEL NAME:',type(estimator).__name__)
        print('\nTraining Accuracy:',round(training_accuracy,4),end = '\t')
        print('Validation Accuracy:',round(validation_accuracy,4),end = '\t')
        print('f1 score:',round(f1score,4))
        print('confusion matrix:\n',conf_matrix)
        print('\n')

In [14]:
# Compare the candidate classifiers with their default hyper-parameters.
# random_state is pinned on the ensemble models (same seed as the train/test split)
# so the reported metrics are repeatable across kernel restarts.
model_builder([
    LogisticRegression(),
    MultinomialNB(),
    SVC(),
    RandomForestClassifier(random_state = 5),
    GradientBoostingClassifier(random_state = 5),
    XGBClassifier(random_state = 5),
])

MODEL NAME: LogisticRegression

Training Accuracy: 0.763	Validation Accuracy: 0.6	f1 score: 0.6
confusion matrix:
 [[ 8  6]
 [ 6 10]]


MODEL NAME: MultinomialNB

Training Accuracy: 0.7519	Validation Accuracy: 0.5667	f1 score: 0.5652
confusion matrix:
 [[9 5]
 [8 8]]


MODEL NAME: SVC

Training Accuracy: 0.5222	Validation Accuracy: 0.5333	f1 score: 0.371
confusion matrix:
 [[ 0 14]
 [ 0 16]]


MODEL NAME: RandomForestClassifier

Training Accuracy: 1.0	Validation Accuracy: 0.6667	f1 score: 0.6637
confusion matrix:
 [[ 8  6]
 [ 4 12]]


MODEL NAME: GradientBoostingClassifier

Training Accuracy: 0.8963	Validation Accuracy: 0.7667	f1 score: 0.7659
confusion matrix:
 [[12  2]
 [ 5 11]]


MODEL NAME: XGBClassifier

Training Accuracy: 0.9889	Validation Accuracy: 0.6333	f1 score: 0.6321
confusion matrix:
 [[ 8  6]
 [ 5 11]]


