# Data information

In [1]:
#referencias
# Dataset: https://www.kaggle.com/datasets/mexwell/heart-disease-dataset/data

## Heart Disease Dataset Attribute Description



| S.No. | Attribute                | Code given | Unit          | Data type |
|-------|--------------------------|------------|---------------|-----------|
| 1     | age                      | Age | in years | Numeric      |
| 2     | sex                      | Sex        | 1, 0          | Binary    |
| 3     | chest pain type          | chest pain type | 1, 2, 3, 4 | Nominal   |
| 4     | resting blood pressure  | resting bp | mm Hg         | Numeric   |
| 5     | serum cholesterol       | cholesterol | mg/dl         | Numeric   |
| 6     | fasting blood sugar     | fasting blood sugar | 1, 0 > 120 mg/dl | Binary |
| 7     | resting electrocardiogram results | resting ecg | 0, 1, 2 | Nominal |
| 8     | maximum heart rate achieved | max heart rate | 71–202 | Numeric |
| 9     | exercise induced angina | exercise angina | 0, 1 | Binary |
| 10    | oldpeak = ST depression | oldpeak | depression | Numeric |
| 11    | the slope of the peak exercise ST segment | ST slope | 0, 1, 2 | Nominal |
| 12    | class                    | target     | 0, 1          | Binary    |

**Description of Nominal Attributes**

- **Sex:** 1 = male, 0 = female;
- **Chest Pain Type:**
  - Value 1: typical angina
  - Value 2: atypical angina
  - Value 3: non-anginal pain
  - Value 4: asymptomatic
- **Fasting Blood Sugar (fasting blood sugar > 120 mg/dl):** (1 = true; 0 = false)
- **Resting Electrocardiogram Results:**
  - Value 0: normal
  - Value 1: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV)
  - Value 2: showing probable or definite left ventricular hypertrophy by Estes' criteria
- **Exercise Induced Angina:** 1 = yes; 0 = no
- **The Slope of the Peak Exercise ST Segment:**
  - Value 1: upsloping
  - Value 2: flat
  - Value 3: downsloping
- **Class:** 1 = heart disease, 0 = Normal


# importing and loading

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm import tqdm

In [31]:
df = pd.read_csv('/content/heart_statlog_cleveland_hungary_final.csv')

In [None]:
(df.head(2)

In [None]:
df.describe()

### Data cleaning

In [6]:
#verificando se tem dados nulos
df.isnull().sum()

age                    0
sex                    0
chest pain type        0
resting bp s           0
cholesterol            0
fasting blood sugar    0
resting ecg            0
max heart rate         0
exercise angina        0
oldpeak                0
ST slope               0
target                 0
dtype: int64

In [7]:
# Normalização Min-Max
df_normalizado = (df - df.min()) / (df.max() - df.min())

In [8]:
# Calcule o primeiro e terceiro quartis
Q1 = df_normalizado.quantile(0.25)
Q3 = df_normalizado.quantile(0.75)

# Calcule o intervalo interquartil (IQR)
IQR = Q3 - Q1

# Defina os limites superior e inferior
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Filtre os outliers
df_no_outliers = df_normalizado[(df_normalizado >= lower_bound) & (df_normalizado <= upper_bound)].dropna()

In [9]:
len(df_no_outliers)

552

In [10]:
# Detectar outliers em cada coluna
outliers_mask = ((df_normalizado < lower_bound) | (df_normalizado > upper_bound))

# Verificar quais colunas têm pelo menos um outlier
colunas_com_outliers = outliers_mask.any()

# Agora, 'colunas_com_outliers' é uma série booleana indicando quais colunas têm outliers
# Você pode imprimi-la para ver quais colunas têm outliers
print("Colunas com outliers:")
print(colunas_com_outliers[colunas_com_outliers])

# Se você quiser obter uma lista das colunas com outliers, você pode fazer assim:
colunas_com_outliers_lista = colunas_com_outliers[colunas_com_outliers].index.tolist()
print("Lista de colunas com outliers:", colunas_com_outliers_lista)

# Contar outliers em cada coluna
outliers_por_coluna = outliers_mask.sum()

# Imprimir o número de outliers em cada coluna
print("Número de outliers em cada coluna:")
print(outliers_por_coluna)

Colunas com outliers:
sex                    True
chest pain type        True
resting bp s           True
cholesterol            True
fasting blood sugar    True
max heart rate         True
oldpeak                True
dtype: bool
Lista de colunas com outliers: ['sex', 'chest pain type', 'resting bp s', 'cholesterol', 'fasting blood sugar', 'max heart rate', 'oldpeak']
Número de outliers em cada coluna:
age                      0
sex                    281
chest pain type         66
resting bp s            53
cholesterol            193
fasting blood sugar    254
resting ecg              0
max heart rate           1
exercise angina          0
oldpeak                 11
ST slope                 0
target                   0
dtype: int64


In [11]:
# Colunas que quero tirar os outliers
colunas_selecionadas = ['cholesterol', 'sex']

# Filtre os outliers apenas nas colunas selecionadas
df_no_outliers = df_normalizado.copy()  # Crie uma cópia do DataFrame original
for coluna in colunas_selecionadas:
    outliers_mask = ((df_normalizado[coluna] < lower_bound[coluna]) | (df_normalizado[coluna] > upper_bound[coluna]))
    df_no_outliers[coluna] = df_normalizado[coluna][~outliers_mask]  # Remova os outliers e substitua pela entrada NaN

In [12]:
df_no_outliers.isnull().sum()

age                      0
sex                    281
chest pain type          0
resting bp s             0
cholesterol            193
fasting blood sugar      0
resting ecg              0
max heart rate           0
exercise angina          0
oldpeak                  0
ST slope                 0
target                   0
dtype: int64

In [13]:
df_no_outliers = df_no_outliers.dropna()

In [14]:
len(df_no_outliers[coluna])

740

Pt-Br: "Em casos como este, é importante considerar manter os outliers, pois podem desempenhar um papel significativo na identificação de doenças cardíacas."

EN: "In cases like this, it's important to consider keeping the outliers, as they may play a significant role in identifying cardiac diseases."

# Data manipulation

## Spliting

In [15]:
# Split the data into train and temp sets
train_df, test_df = train_test_split(df,test_size=0.3,random_state=42)
test_df, valid_df = train_test_split(test_df,test_size=0.5,random_state=42)

In [16]:
print(len(train_df)/len(df))
print(len(test_df)/len(df))
print(len(valid_df)/len(df))

0.7
0.1495798319327731
0.15042016806722688


## Separating features

In [17]:
target_column = 'target'

#targeting Train data
X_train = train_df.drop(columns=[target_column])
Y_train = train_df[target_column]
#targeting test data
X_test = test_df.drop(columns=[target_column])
Y_test = test_df[target_column]
#targeting valid data
X_valid = valid_df.drop(columns=[target_column])
Y_valid = valid_df[target_column]

# Applying model

### DecisionTreeClassifier 🌳  <!-- Tree icon -->

In [18]:
from sklearn.tree import DecisionTreeClassifier

# Define the number of iterations
num_iterations = 100

best_accuracy = 0
best_dtc = None

for i in tqdm(range(num_iterations)):
      # Initialize the Decision Tree classifier
    dtc = DecisionTreeClassifier()

    # Train the classifier on the training data
    dtc.fit(X_train, Y_train)

    # Make predictions on the testing data
    y_pred = dtc.predict(X_test)

    # Evaluate the classifier's performance
    accuracy = accuracy_score(Y_test, y_pred)
    #print(f"Iteration {i+1}: Accuracy test - {accuracy}")

    # Update best accuracy and classifier if current accuracy is better
    if accuracy > best_accuracy:
        best_accuracy_dtc = accuracy
        best_dtc = dtc

print()
print("Best accuracy dtc :", best_accuracy_dtc)

# Make predictions on the valid data
y_pred = best_dtc.predict(X_valid)

# Evaluate the classifier's performance
accuracy_dtc = accuracy_score(Y_valid, y_pred)
print("Accuracy valid dtc:", accuracy_dtc)

100%|██████████| 100/100 [00:01<00:00, 93.30it/s]


Best accuracy dtc : 0.898876404494382
Accuracy valid dtc: 0.8324022346368715





### kNN

In [19]:
from sklearn.neighbors import KNeighborsClassifier

# Define the number of iterations
num_iterations = 100

best_accuracy = 0
best_knn = None

for i in tqdm(range(num_iterations)):
    # Initialize the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=8)

    # Train the classifier on the training data
    knn.fit(X_train, Y_train)

    # Make predictions on the testing data
    y_pred = knn.predict(X_test)

    # Evaluate the classifier's performance
    accuracy = accuracy_score(Y_test, y_pred)
    #print(f"Iteration {i+1}: Accuracy test - {accuracy}")

    # Update best accuracy and classifier if current accuracy is better
    if accuracy > best_accuracy:
        best_accuracy_knn = accuracy
        best_knn = knn

print()
print("Best accuracy knn:", best_accuracy_knn)

# Make predictions on the valid data
y_pred = best_knn.predict(X_valid)

# Evaluate the classifier's performance
accuracy_knn = accuracy_score(Y_valid, y_pred)
print("Accuracy valid knn:", accuracy_knn)

100%|██████████| 100/100 [00:04<00:00, 22.91it/s]


Best accuracy knn: 0.7696629213483146
Accuracy valid knn: 0.6871508379888268





### Random Forest 🌲🌲🌲🌲🌲🌲



In [20]:
from sklearn.ensemble import RandomForestClassifier

# Define the number of iterations
num_iterations = 100

best_accuracy = 0
best_knn = None

for i in tqdm(range(num_iterations)):
    # Initialize the KNN classifier
    RFC = RandomForestClassifier()

    # Train the classifier on the training data
    RFC.fit(X_train, Y_train)

    # Make predictions on the testing data
    y_pred = RFC.predict(X_test)

    # Evaluate the classifier's performance
    accuracy = accuracy_score(Y_test, y_pred)
    #print(f"Iteration {i+1}: Accuracy test - {accuracy}")

    # Update best accuracy and classifier if current accuracy is better
    if accuracy > best_accuracy:
        best_accuracy_RFC = accuracy
        best_RFC = RFC

print()
print("Best accuracy RFC:", best_accuracy_RFC)

# Make predictions on the valid data
y_pred = best_RFC.predict(X_valid)

# Evaluate the classifier's performance
accuracy_RFC = accuracy_score(Y_valid, y_pred)
print("Accuracy valid RFC:", accuracy_RFC)

100%|██████████| 100/100 [00:37<00:00,  2.65it/s]


Best accuracy RFC: 0.9550561797752809
Accuracy valid RFC: 0.9050279329608939





# Final Result

In [21]:
print("Best accuracy DTC: ", best_accuracy_dtc)
print("Best accuracy KNN: ", best_accuracy_knn)
print("Best accuracy RFC: ", best_accuracy_RFC)

Best accuracy DTC:  0.898876404494382
Best accuracy KNN:  0.7696629213483146
Best accuracy RFC:  0.9550561797752809


In [22]:
# Dicionário para mapear os nomes dos modelos para suas precisões
modelos = {'Decision Tree Classifier': best_accuracy_dtc,
           'K-Nearest Neighbors': best_accuracy_knn,
           'Random Forest Classifier': best_accuracy_RFC}

# Encontre o nome do modelo com a melhor precisão
melhor_modelo = max(modelos, key=modelos.get)

# Obtenha a precisão do melhor modelo
melhor_precisao = modelos[melhor_modelo]

# Imprima a mensagem
print("O melhor modelo é", melhor_modelo, "com precisão: ", melhor_precisao)

O melhor modelo é Random Forest Classifier com precisão:  0.9550561797752809


# Extra Testing and implementation

In [38]:
# Criar um novo DataFrame chamado df_paciente com as mesmas colunas do DataFrame existente
df_paciente = pd.DataFrame(columns=df.columns)
df_paciente = df_paciente.drop(columns=['target'])

In [60]:
def analisarS2(best_RFC,df_paciente):
    # Criar um DataFrame vazio com as colunas especificadas
  df_paciente = pd.DataFrame(columns=[
      'age', 'sex', 'chest pain type', 'resting bp s', 'cholesterol',
      'fasting blood sugar', 'resting ecg', 'max heart rate', 'exercise angina',
      'oldpeak', 'ST slope'
  ])

  # Pedir ao usuário para inserir os valores para cada coluna
  for coluna in df_paciente.columns:
      valor = input("Insira o valor para a coluna '" + coluna + "': ")
      df_paciente[coluna] = valor

  # Adicionar os valores ao DataFrame df_paciente
  df_paciente.loc[len(df_paciente)] = valores

  # Mostrar o DataFrame df_paciente
  print("Ficha do paciente está atualizado:")

  y_paciente = best_RFC.predict(df_paciente)

  if y_paciente == 0:
    print('ele ta bem.')
  elif y_paciente == 1:
    print('corre que ta morrendo!')
  else:
    print('ta errado isso ai!')

In [None]:
analisarS2(best_RFC,df_paciente)