In [66]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import RobustScaler
import warnings
warnings.filterwarnings('ignore')


In [67]:
df = pd.read_csv(r"datasets/heart.csv")
df.shape 
df.head() #viewing first 5


Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.0,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.0,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.0,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.0,2.7,116,0,0,8,1


In [68]:
#renaming some columns
col_names = ['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine',
             'serum_sodium', 'sex', 'smoking', 'time', 'DEATH_EVENT']

df.columns = col_names
df.columns

Index(['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes',
       'ejection_fraction', 'high_blood_pressure', 'platelets',
       'serum_creatinine', 'serum_sodium', 'sex', 'smoking', 'time',
       'DEATH_EVENT'],
      dtype='object')

In [69]:

df.head()   #review after renaming
df.info()   #about the dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 299 entries, 0 to 298
Data columns (total 13 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   age                       299 non-null    float64
 1   anaemia                   299 non-null    int64  
 2   creatinine_phosphokinase  299 non-null    int64  
 3   diabetes                  299 non-null    int64  
 4   ejection_fraction         299 non-null    int64  
 5   high_blood_pressure       299 non-null    int64  
 6   platelets                 299 non-null    float64
 7   serum_creatinine          299 non-null    float64
 8   serum_sodium              299 non-null    int64  
 9   sex                       299 non-null    int64  
 10  smoking                   299 non-null    int64  
 11  time                      299 non-null    int64  
 12  DEATH_EVENT               299 non-null    int64  
dtypes: float64(3), int64(10)
memory usage: 30.5 KB


In [70]:
# find categorical variables
categorical = [var for var in df.columns if df[var].dtype=='O']
print('The categorical variables are :\n', categorical)
# view the categorical variables
df[categorical].head()

The categorical variables are :
 []


0
1
2
3
4


In [71]:
X = df.drop(['smoking'], axis=1)
y = df['smoking']

In [72]:
# ii. Split the dataset into training and testing sets Training =80% and testing = 20%

# split X and y into training and testing sets

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)


In [73]:
# encode remaining variables with one-hot encoding
import category_encoders as ce
encoder = ce.OneHotEncoder(cols=['age', 'anaemia', 'creatinine_phosphokinase', 'diabetes', 'ejection_fraction', 'high_blood_pressure', 'platelets', 'serum_creatinine',
                                 'serum_sodium', 'sex', 'time', 'DEATH_EVENT'])

X_train = encoder.fit_transform(X_train)
X_test = encoder.transform(X_test)

In [74]:
# i. Print first 5 rows of the dataset after one-hot encoding
X_train.head()
print(X_train.head())
X_train.shape
X_test.head()
X_test.shape

     age_1  age_2  age_3  age_4  age_5  age_6  age_7  age_8  age_9  age_10  \
134      1      0      0      0      0      0      0      0      0       0   
145      0      1      0      0      0      0      0      0      0       0   
63       0      0      1      0      0      0      0      0      0       0   
292      0      0      0      1      0      0      0      0      0       0   
284      0      1      0      0      0      0      0      0      0       0   

     ...  time_124  time_125  time_126  time_127  time_128  time_129  \
134  ...         0         0         0         0         0         0   
145  ...         0         0         0         0         0         0   
63   ...         0         0         0         0         0         0   
292  ...         0         0         0         0         0         0   
284  ...         0         0         0         0         0         0   

     time_130  time_131  DEATH_EVENT_1  DEATH_EVENT_2  
134         0         0              1    

(60, 585)

In [75]:
cols = X_train.columns
from sklearn.preprocessing import RobustScaler
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
X_train = pd.DataFrame(X_train, columns=[cols])
X_test = pd.DataFrame(X_test, columns=[cols])
X_train.head()

Unnamed: 0,age_1,age_2,age_3,age_4,age_5,age_6,age_7,age_8,age_9,age_10,...,time_124,time_125,time_126,time_127,time_128,time_129,time_130,time_131,DEATH_EVENT_1,DEATH_EVENT_2
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,1.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
# iii. Train Naïve bayes classifier using the training set; Validate the trained model on the test set.
model=GaussianNB()
model.fit(X_train,y_train)

GaussianNB()

In [77]:
# iv. Confusion Matrix
y_pred=model.predict(X_test)
print(y_pred)
print(y_test)
print(confusion_matrix(y_test,y_pred))


[0 0 0 0 0 0 0 1 1 0 1 1 0 1 0 1 0 0 1 0 0 0 1 1 1 0 1 1 0 1 0 1 1 1 0 0 0
 0 1 0 0 0 0 0 1 0 0 1 1 1 1 1 1 1 1 0 0 0 1 1]
206    0
188    0
12     0
219    0
237    0
136    0
228    0
205    0
52     0
108    1
240    0
15     0
184    1
246    0
22     1
74     1
269    0
90     1
227    0
255    1
190    0
103    1
124    0
129    0
220    0
116    0
194    0
285    0
294    1
249    0
92     0
66     0
8      0
122    0
217    0
150    0
176    1
182    1
298    1
5      1
263    0
89     0
81     1
34     0
55     0
139    0
234    1
64     0
7      1
45     1
73     1
291    0
173    0
106    0
59     1
230    0
168    0
26     0
283    1
153    1
Name: smoking, dtype: int64
[[20 20]
 [12  8]]


In [78]:
# v. Accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(y_test,y_pred))


0.4666666666666667
