# Human illness prediction

In [42]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [9]:
# Read the raw CSV into a DataFrame and preview its first five rows.
data = pd.read_csv("toy_dataset.csv")
data.head()

Unnamed: 0,Number,City,Gender,Age,Income,Illness
0,1,Dallas,Male,41,40367.0,No
1,2,Dallas,Male,54,45084.0,No
2,3,Dallas,Male,42,52483.0,No
3,4,Dallas,Male,40,40941.0,No
4,5,Dallas,Male,46,50289.0,No


In [10]:
# Dimensions of the loaded frame as (rows, columns).
data.shape

(150000, 6)

In [11]:
# Class balance of the target: number of rows per Illness label.
data["Illness"].value_counts()

No     137861
Yes     12139
Name: Illness, dtype: int64

In [12]:
# Missing-value count for every column.
data.isna().sum()

Number     0
City       0
Gender     0
Age        0
Income     0
Illness    0
dtype: int64

In [14]:
# Remove the "Number" column before modelling (in the preview it simply counts
# up row by row, so it looks like a row id rather than a feature).
# errors="ignore" keeps this cell safe to re-run: without it a second execution
# raises KeyError because the column has already been dropped.
data = data.drop(columns="Number", errors="ignore")

In [23]:
# Inspect the column dtypes to see which columns hold text (object) values.
data.dtypes

City        object
Gender      object
Age          int64
Income     float64
Illness     object
dtype: object

# Model fitting

In [29]:
from sklearn.metrics import confusion_matrix,classification_report,f1_score,accuracy_score,precision_score,recall_score

In [30]:
from sklearn.preprocessing import LabelEncoder

# Replace every text (object) column with integer codes.
# Each column gets its own fitted encoder, stored in `encoders`, so the codes
# can be mapped back to the original labels later with
# encoders[column].inverse_transform(...). Re-fitting one shared encoder would
# discard every mapping except the one for the last column.
le = LabelEncoder()  # stays defined even when no column needs encoding
encoders = {}
text_columns = [name for name in data.columns if data[name].dtype == "object"]
for col in text_columns:
    # `le` ends up bound to the encoder of the last encoded column.
    le = encoders[col] = LabelEncoder()
    data[col] = le.fit_transform(data[col])

In [31]:
# Features are all columns except the last; the last column is the target.
x, y = data.iloc[:, :-1], data.iloc[:, -1]

In [32]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=101
)
# Show the shape of each of the four pieces.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

((120000, 4), (30000, 4), (120000,), (30000,))

In [33]:
# Preview the frame after label encoding (text columns now hold integer codes).
data.head()

Unnamed: 0,City,Gender,Age,Income,Illness
0,2,1,41,40367.0,0
1,2,1,54,45084.0,0
2,2,1,42,52483.0,0
3,2,1,40,40941.0,0
4,2,1,46,50289.0,0


# Logistic Regression

In [34]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with scikit-learn's default settings.
lg=LogisticRegression()

In [35]:
# Fit the logistic regression on the training split.
lg.fit(x_train,y_train)

In [36]:
# Predicted class labels for the held-out test split, displayed for a quick look.
p1=lg.predict(x_test)
p1

array([0, 0, 0, ..., 0, 0, 0])

In [37]:
# Per-class probabilities for the test split (not referenced again in the cells shown).
prob=lg.predict_proba(x_test)
#prob

In [38]:
from sklearn.metrics import confusion_matrix

# Confusion matrix for logistic regression: rows are true labels, columns are predictions.
lg_cf = confusion_matrix(y_true=y_test, y_pred=p1)
lg_cf

array([[27571,     0],
       [ 2429,     0]], dtype=int64)

In [39]:
# Accuracy in percent: correct predictions (the matrix trace) over all predictions.
lg_ac = lg_cf.trace() / lg_cf.sum() * 100
lg_ac

91.90333333333334

In [43]:
# Positive-class (label 1) F1, precision and recall for logistic regression.
# This model predicts no positive cases on the test split, so precision (and
# therefore F1) is a 0/0 division. zero_division=0 states the resulting 0.0
# explicitly instead of relying on a globally suppressed UndefinedMetricWarning.
a = f1_score(y_test, p1, zero_division=0)
b = precision_score(y_test, p1, zero_division=0)
c = recall_score(y_test, p1, zero_division=0)

In [44]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for logistic regression.
# Class 1 is never predicted, so its precision is undefined; zero_division=0
# reports it as 0.00 explicitly rather than through a suppressed warning.
print(classification_report(y_test, p1, zero_division=0))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     27571
           1       0.00      0.00      0.00      2429

    accuracy                           0.92     30000
   macro avg       0.46      0.50      0.48     30000
weighted avg       0.84      0.92      0.88     30000



# Decision Tree

In [45]:
# The original import statement named nothing after `import`, which is a
# SyntaxError; the class used on the next line is DecisionTreeClassifier.
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyper-parameters. random_state fixes the
# estimator's internal randomness so a re-run builds the same tree.
dt = DecisionTreeClassifier(random_state=101)

In [46]:
# Fit the decision tree on the training split.
dt.fit(x_train,y_train)

In [48]:
# Decision-tree class predictions for the test split.
dt_p=dt.predict(x_test)
dt_p

array([0, 0, 0, ..., 1, 0, 0])

In [49]:
# Confusion matrix for the decision tree: rows are true labels, columns are predictions.
dt_c = confusion_matrix(y_true=y_test, y_pred=dt_p)
dt_c

array([[25126,  2445],
       [ 2228,   201]], dtype=int64)

In [50]:
# Accuracy in percent: correct predictions (the matrix trace) over all predictions.
dt_ac = dt_c.trace() / dt_c.sum() * 100
dt_ac

84.42333333333333

In [51]:
# Positive-class F1, precision and recall for the decision tree, in that order.
a1, b1, c1 = (
    score(y_test, dt_p) for score in (f1_score, precision_score, recall_score)
)

In [52]:
# Per-class precision / recall / F1 for the decision tree.
print(classification_report(y_test,dt_p))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91     27571
           1       0.08      0.08      0.08      2429

    accuracy                           0.84     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.85      0.84      0.85     30000



# Random Forest

In [53]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters. The forest uses random
# bootstrapping and feature sampling, so random_state is fixed to make the
# fitted model (and every score derived from it) repeatable across runs.
rf = RandomForestClassifier(random_state=101)

In [54]:
# Fit the random forest on the training split.
rf.fit(x_train,y_train)

In [55]:
# Random-forest class predictions for the test split.
rf_p=rf.predict(x_test)

In [56]:
# Confusion matrix for the random forest: rows are true labels, columns are predictions.
rf_c = confusion_matrix(y_true=y_test, y_pred=rf_p)
rf_c

array([[25990,  1581],
       [ 2281,   148]], dtype=int64)

In [57]:
# Accuracy in percent: correct predictions (the matrix trace) over all predictions.
rf_ac = rf_c.trace() / rf_c.sum() * 100
rf_ac

87.12666666666667

In [58]:
# Positive-class F1, precision and recall for the random forest, in that order.
a2, b2, c2 = (
    score(y_test, rf_p) for score in (f1_score, precision_score, recall_score)
)

In [59]:
# Per-class precision / recall / F1 for the random forest.
print(classification_report(y_test,rf_p))

              precision    recall  f1-score   support

           0       0.92      0.94      0.93     27571
           1       0.09      0.06      0.07      2429

    accuracy                           0.87     30000
   macro avg       0.50      0.50      0.50     30000
weighted avg       0.85      0.87      0.86     30000



# K-Nearest Neighbors (KNN)

In [60]:
from sklearn.neighbors import KNeighborsClassifier
# K-nearest-neighbours classifier with scikit-learn's default settings.
# NOTE(review): the features are not scaled anywhere in the cells shown, so the
# large-valued Income column presumably dominates the distance — confirm.
knn=KNeighborsClassifier()

In [61]:
# Fit the KNN classifier on the training split.
knn.fit(x_train,y_train)

In [63]:
# KNN class predictions for the test split.
knn_p=knn.predict(x_test)

In [64]:
# Confusion matrix for KNN: rows are true labels, columns are predictions.
knn_c1 = confusion_matrix(y_true=y_test, y_pred=knn_p)
knn_c1

array([[27427,   144],
       [ 2412,    17]], dtype=int64)

In [65]:
# Accuracy in percent: correct predictions (the matrix trace) over all predictions.
knn_ac1 = knn_c1.trace() / knn_c1.sum() * 100
knn_ac1

91.47999999999999

In [66]:
# Positive-class F1, precision and recall for KNN, in that order.
a3, b3, c3 = (
    score(y_test, knn_p) for score in (f1_score, precision_score, recall_score)
)

In [67]:
# Per-class precision / recall / F1 for KNN.
print(classification_report(y_test,knn_p))

              precision    recall  f1-score   support

           0       0.92      0.99      0.96     27571
           1       0.11      0.01      0.01      2429

    accuracy                           0.91     30000
   macro avg       0.51      0.50      0.48     30000
weighted avg       0.85      0.91      0.88     30000



# XGBoost

In [68]:
import xgboost
from xgboost import XGBClassifier
# XGBoost classifier with the library's default settings.
xgb=XGBClassifier()

In [69]:
# Fit the XGBoost classifier on the training split.
xgb.fit(x_train,y_train)

In [70]:
# XGBoost class predictions for the test split.
xgb_p=xgb.predict(x_test)

In [71]:
# Confusion matrix for XGBoost: rows are true labels, columns are predictions.
xgb_cf = confusion_matrix(y_true=y_test, y_pred=xgb_p)
xgb_cf

array([[27570,     1],
       [ 2429,     0]], dtype=int64)

In [72]:
# Accuracy in percent: correct predictions (the matrix trace) over all predictions.
xgb_ac = xgb_cf.trace() / xgb_cf.sum() * 100
xgb_ac

91.9

In [73]:
# Positive-class F1, precision and recall for XGBoost, in that order.
a4, b4, c4 = (
    score(y_test, xgb_p) for score in (f1_score, precision_score, recall_score)
)

In [74]:
# Per-class precision / recall / F1 for XGBoost.
print(classification_report(y_test,xgb_p))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     27571
           1       0.00      0.00      0.00      2429

    accuracy                           0.92     30000
   macro avg       0.46      0.50      0.48     30000
weighted avg       0.84      0.92      0.88     30000



# Naive Bayes

In [75]:
from sklearn.naive_bayes import MultinomialNB
# Multinomial naive Bayes with scikit-learn's default settings.
# NOTE(review): MultinomialNB is documented for count-like, non-negative
# features, while Age and Income are continuous here; GaussianNB may be the
# more appropriate variant — confirm before relying on these scores.
nb=MultinomialNB()

In [76]:
# Fit the naive Bayes classifier on the training split.
nb.fit(x_train,y_train)

In [77]:
# Naive Bayes class predictions for the test split.
nb_p=nb.predict(x_test)
nb_p

array([0, 0, 0, ..., 0, 0, 0])

In [78]:
# Confusion matrix for naive Bayes: rows are true labels, columns are predictions.
nb_cf = confusion_matrix(y_true=y_test, y_pred=nb_p)
nb_cf

array([[27571,     0],
       [ 2429,     0]], dtype=int64)

In [79]:
# Accuracy in percent: correct predictions (the matrix trace) over all predictions.
nb_acc = nb_cf.trace() / nb_cf.sum() * 100
nb_acc

91.90333333333334

In [80]:
# Positive-class (label 1) F1, precision and recall for naive Bayes.
# This model predicts no positive cases on the test split, so precision (and
# therefore F1) is a 0/0 division. zero_division=0 states the resulting 0.0
# explicitly instead of relying on a globally suppressed UndefinedMetricWarning.
a5 = f1_score(y_test, nb_p, zero_division=0)
b5 = precision_score(y_test, nb_p, zero_division=0)
c5 = recall_score(y_test, nb_p, zero_division=0)

In [81]:
# Per-class precision / recall / F1 for naive Bayes.
# Class 1 is never predicted, so its precision is undefined; zero_division=0
# reports it as 0.00 explicitly rather than through a suppressed warning.
print(classification_report(y_test, nb_p, zero_division=0))

              precision    recall  f1-score   support

           0       0.92      1.00      0.96     27571
           1       0.00      0.00      0.00      2429

    accuracy                           0.92     30000
   macro avg       0.46      0.50      0.48     30000
weighted avg       0.84      0.92      0.88     30000



In [82]:
# Gather every model's results in one fixed order so the lists line up
# position by position: logistic, tree, forest, KNN, XGBoost, naive Bayes.
algo = [
    "logistic",
    "decision tree",
    "Random Forest",
    "KNN",
    "XGB",
    "Naive Bayes",
]
accuracy = [lg_ac, dt_ac, rf_ac, knn_ac1, xgb_ac, nb_acc]  # values are in percent
f1 = [a, a1, a2, a3, a4, a5]
precision = [b, b1, b2, b3, b4, b5]
recall = [c, c1, c2, c3, c4, c5]

In [83]:
# Side-by-side comparison table, one row per model.
# Accuracy is expressed in percent; the remaining scores are fractions.
basic = pd.DataFrame(
    data={
        "Algorithm": algo,
        "Accuracy": accuracy,
        "F1-score": f1,
        "Precision": precision,
        "Recall": recall,
    }
)
basic

Unnamed: 0,Algorithm,Accuracy,F1-score,Precision,Recall
0,logistic,91.903333,0.0,0.0,0.0
1,decision tree,84.423333,0.079212,0.075964,0.08275
2,Random Forest,87.126667,0.071188,0.085599,0.06093
3,KNN,91.48,0.013127,0.10559,0.006999
4,XGB,91.9,0.0,0.0,0.0
5,Naive Bayes,91.903333,0.0,0.0,0.0
