## DATASET
Roberto Pichardo Mier 

In [8]:
%pip install ucimlrepo

Note: you may need to restart the kernel to use updated packages.


In [37]:
from ucimlrepo import fetch_ucirepo
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.metrics import confusion_matrix

In [10]:
heart_disease = fetch_ucirepo(id=45)

In [11]:
X= heart_disease.data.features
y=heart_disease.data.targets

In [12]:
print(heart_disease.metadata)

{'uci_id': 45, 'name': 'Heart Disease', 'repository_url': 'https://archive.ics.uci.edu/dataset/45/heart+disease', 'data_url': 'https://archive.ics.uci.edu/static/public/45/data.csv', 'abstract': '4 databases: Cleveland, Hungary, Switzerland, and the VA Long Beach', 'area': 'Health and Medicine', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 303, 'num_features': 13, 'feature_types': ['Categorical', 'Integer', 'Real'], 'demographics': ['Age', 'Sex'], 'target_col': ['num'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1989, 'last_updated': 'Fri Nov 03 2023', 'dataset_doi': '10.24432/C52P4X', 'creators': ['Andras Janosi', 'William Steinbrunn', 'Matthias Pfisterer', 'Robert Detrano'], 'intro_paper': {'title': 'International application of a new probability algorithm for the diagnosis of coronary artery disease.', 'authors': 'R. Detrano, A. Jánosi, W. Steinbrunn, M. Pfisterer, J. Schmid, S. Sa

In [13]:
print(heart_disease.variables)

        name     role  ...  units missing_values
0        age  Feature  ...  years             no
1        sex  Feature  ...   None             no
2         cp  Feature  ...   None             no
3   trestbps  Feature  ...  mm Hg             no
4       chol  Feature  ...  mg/dl             no
5        fbs  Feature  ...   None             no
6    restecg  Feature  ...   None             no
7    thalach  Feature  ...   None             no
8      exang  Feature  ...   None             no
9    oldpeak  Feature  ...   None             no
10     slope  Feature  ...   None             no
11        ca  Feature  ...   None            yes
12      thal  Feature  ...   None            yes
13       num   Target  ...   None             no

[14 rows x 7 columns]


In [23]:
pd.set_option('display.max_columns', None)
df = pd.concat([X, y], axis=1)

In [24]:
print(df)

     age  sex  cp  trestbps  chol  fbs  restecg  thalach  exang  oldpeak  \
0     63    1   1       145   233    1        2      150      0      2.3   
1     67    1   4       160   286    0        2      108      1      1.5   
2     67    1   4       120   229    0        2      129      1      2.6   
3     37    1   3       130   250    0        0      187      0      3.5   
4     41    0   2       130   204    0        2      172      0      1.4   
..   ...  ...  ..       ...   ...  ...      ...      ...    ...      ...   
298   45    1   1       110   264    0        0      132      0      1.2   
299   68    1   4       144   193    1        0      141      0      3.4   
300   57    1   4       130   131    0        0      115      1      1.2   
301   57    0   2       130   236    0        2      174      0      0.0   
302   38    1   3       138   175    0        0      173      0      0.0   

     slope   ca  thal  num  
0        3  0.0   6.0    0  
1        2  3.0   3.0    2  


In [26]:
categorias = X.select_dtypes(include=['object']).columns

In [27]:
X_encoded = pd.get_dummies(X, columns=categorias)

In [28]:
remp = X_encoded.fillna(0, inplace=True)


In [29]:
X_encoded

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,1,145,233,1,2,150,0,2.3,3,0.0,6.0
1,67,1,4,160,286,0,2,108,1,1.5,2,3.0,3.0
2,67,1,4,120,229,0,2,129,1,2.6,2,2.0,7.0
3,37,1,3,130,250,0,0,187,0,3.5,3,0.0,3.0
4,41,0,2,130,204,0,2,172,0,1.4,1,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,45,1,1,110,264,0,0,132,0,1.2,2,0.0,7.0
299,68,1,4,144,193,1,0,141,0,3.4,2,2.0,7.0
300,57,1,4,130,131,0,0,115,1,1.2,2,1.0,7.0
301,57,0,2,130,236,0,2,174,0,0.0,2,1.0,3.0


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

In [31]:
modelo = GaussianNB()

In [32]:
modelo.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


In [33]:
y_pred = modelo.predict(X_test)

In [34]:
y_pred

array([2, 1, 2, 1, 4, 3, 4, 4, 0, 4, 0, 0, 4, 4, 2, 0, 0, 3, 4, 0, 4, 0,
       4, 0, 3, 0, 0, 1, 4, 0, 0, 1, 0, 0, 0, 4, 4, 0, 4, 0, 4, 0, 4, 0,
       0, 4, 0, 0, 4, 4, 0, 0, 0, 0, 3, 0, 0, 4, 4, 2, 0], dtype=int64)

In [35]:
accuracy_score(y_test, y_pred)

0.4918032786885246

In [38]:
confusion_matrix(y_test, y_pred)

array([[25,  0,  1,  0,  3],
       [ 3,  1,  1,  2,  5],
       [ 0,  2,  1,  1,  5],
       [ 1,  1,  1,  0,  4],
       [ 0,  0,  0,  1,  3]], dtype=int64)