In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

## 2. Carregando o dataset
 
Conjunto de dados **Cleveland** do repositório da UCI sobre doenças cardiovasculares.

In [2]:
# Location of the processed Cleveland heart-disease file (comma-separated).
url = (
    "https://archive.ics.uci.edu/ml/machine-learning-databases/"
    "heart-disease/processed.cleveland.data"
)

# Column labels, in file order; they are supplied here and passed to read_csv.
column_names = (
    "age sex cp trestbps chol fbs "
    "restecg thalach exang oldpeak "
    "slope ca thal target"
).split()

# Read every line as data (no header row) and label the columns explicitly.
df = pd.read_csv(url, header=None, names=column_names)

# Preview the first five rows.
df.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63.0,1.0,1.0,145.0,233.0,1.0,2.0,150.0,0.0,2.3,3.0,0.0,6.0,0
1,67.0,1.0,4.0,160.0,286.0,0.0,2.0,108.0,1.0,1.5,2.0,3.0,3.0,2
2,67.0,1.0,4.0,120.0,229.0,0.0,2.0,129.0,1.0,2.6,2.0,2.0,7.0,1
3,37.0,1.0,3.0,130.0,250.0,0.0,0.0,187.0,0.0,3.5,3.0,0.0,3.0,0
4,41.0,0.0,2.0,130.0,204.0,0.0,2.0,172.0,0.0,1.4,1.0,0.0,3.0,0


In [3]:
# Report the (rows, columns) shape of the loaded frame.
print('A forma dos dados é ', df.shape)

A forma dos dados é  (303, 14)


In [4]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       303 non-null    float64
 1   sex       303 non-null    float64
 2   cp        303 non-null    float64
 3   trestbps  303 non-null    float64
 4   chol      303 non-null    float64
 5   fbs       303 non-null    float64
 6   restecg   303 non-null    float64
 7   thalach   303 non-null    float64
 8   exang     303 non-null    float64
 9   oldpeak   303 non-null    float64
 10  slope     303 non-null    float64
 11  ca        303 non-null    object 
 12  thal      303 non-null    object 
 13  target    303 non-null    int64  
dtypes: float64(11), int64(1), object(2)
memory usage: 33.3+ KB


In [5]:
# Summary statistics, transposed so that each feature is shown as a row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.438944,9.038662,29.0,48.0,56.0,61.0,77.0
sex,303.0,0.679868,0.467299,0.0,0.0,1.0,1.0,1.0
cp,303.0,3.158416,0.960126,1.0,3.0,3.0,4.0,4.0
trestbps,303.0,131.689769,17.599748,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.693069,51.776918,126.0,211.0,241.0,275.0,564.0
fbs,303.0,0.148515,0.356198,0.0,0.0,0.0,0.0,1.0
restecg,303.0,0.990099,0.994971,0.0,0.0,1.0,2.0,2.0
thalach,303.0,149.607261,22.875003,71.0,133.5,153.0,166.0,202.0
exang,303.0,0.326733,0.469794,0.0,0.0,0.0,1.0,1.0
oldpeak,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2


In [6]:
# Summary statistics in the default layout (one column per feature).
# The object-typed columns ('ca' and 'thal' at this point) do not appear in the result.
df.describe()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,target
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,54.438944,0.679868,3.158416,131.689769,246.693069,0.148515,0.990099,149.607261,0.326733,1.039604,1.60066,0.937294
std,9.038662,0.467299,0.960126,17.599748,51.776918,0.356198,0.994971,22.875003,0.469794,1.161075,0.616226,1.228536
min,29.0,0.0,1.0,94.0,126.0,0.0,0.0,71.0,0.0,0.0,1.0,0.0
25%,48.0,0.0,3.0,120.0,211.0,0.0,0.0,133.5,0.0,0.0,1.0,0.0
50%,56.0,1.0,3.0,130.0,241.0,0.0,1.0,153.0,0.0,0.8,2.0,0.0
75%,61.0,1.0,4.0,140.0,275.0,0.0,2.0,166.0,1.0,1.6,2.0,2.0
max,77.0,1.0,4.0,200.0,564.0,1.0,2.0,202.0,1.0,6.2,3.0,4.0


## 3. Pré-processamento dos dados

Substituímos valores ausentes representados por `"?"`, removemos linhas incompletas e transformamos o alvo em binário.

In [7]:
# Clean the raw frame in a single pass:
#   1. turn the '?' placeholders into NaN,
#   2. cast 'ca' and 'thal' to float64,
#   3. drop every row that still has a missing value,
#   4. collapse 'target' to binary: 0 = no disease, 1 = disease (any value > 0).
df = (
    df.replace('?', np.nan)
      .astype({'ca': 'float64', 'thal': 'float64'})
      .dropna()
      .assign(target=lambda frame: frame['target'].gt(0).astype('int64'))
)

# Feature matrix (every column except the label) and label vector.
y = df['target']
X = df.drop(columns='target')

# Standardise the features using statistics computed from all remaining rows.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

## 4. Separação dos dados

Vamos dividir os dados em treino e teste (80% treino / 20% teste).

In [8]:
# Fraction of the samples held out for testing; the rest is used for training.
t_size = 0.2

# Split the unscaled features so the scaler below never sees the test rows.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=t_size, random_state=42
)

# Fit the scaler on the training rows only, then apply it to both splits.
# Fitting it on the full dataset would let test-set means and variances leak
# into the training features and make the evaluation optimistic.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

## 5. Treinando o modelo de Regressão Logística

In [9]:
# Create a logistic-regression classifier with the iteration cap set to 1000,
# then fit it on the training split.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

## 6. Avaliação do modelo

Utilizamos métricas como *accuracy*, *precision*, *recall*, *f1-score* e a matriz de confusão.

In [10]:
# Predicted labels for the held-out test split.
y_pred = model.predict(X_test)

# Print each heading followed by the metric computed from (y_test, y_pred).
for heading, metric in (
    ("Relatório de Classificação:\n", classification_report),
    ("Matriz de Confusão:\n", confusion_matrix),
):
    print(heading)
    print(metric(y_test, y_pred))

Relatório de Classificação:

              precision    recall  f1-score   support

           0       0.89      0.89      0.89        36
           1       0.83      0.83      0.83        24

    accuracy                           0.87        60
   macro avg       0.86      0.86      0.86        60
weighted avg       0.87      0.87      0.87        60

Matriz de Confusão:

[[32  4]
 [ 4 20]]


## 7. Salvando o modelo e o scaler

Salvamos o modelo e o scaler em uma pasta nomeada de acordo com a porcentagem dos dados reservada para teste (`t_size`).

In [11]:
# Test-split percentage as a whole number, used in the folder and file names.
# round() is used rather than bare int() because float products such as
# 0.29 * 100 evaluate to 28.999999999999996, which int() would truncate to 28.
convert_percent_to_integer = int(round(t_size * 100))

# Build every path with os.path.join so the separators are consistent on all
# platforms (no hard-coded "/" mixed with the OS separator).
folder_name = os.path.join("modelo", f"{convert_percent_to_integer}_percent")
os.makedirs(folder_name, exist_ok=True)

model_file_name = os.path.join(
    folder_name, f"heart_disease_model_{convert_percent_to_integer}_percent.pkl"
)
scaler_file_name = os.path.join(folder_name, "scaler.pkl")

# Persist the fitted model and the scaler side by side; predict_new_patient
# loads both from these paths.
joblib.dump(model, model_file_name)
joblib.dump(scaler, scaler_file_name)

['modelo/20_percent\\scaler.pkl']

## 8. Interface para testar novos pacientes manualmente

In [16]:
def predict_new_patient(new_patient_data=None, scaler=None, model=None):
    """Predict heart-disease risk for a single patient and print the verdict.

    Parameters
    ----------
    new_patient_data : dict, optional
        Mapping of feature name to value for one patient. When omitted, the
        built-in example patient defined below is used.
    scaler : object, optional
        Fitted scaler exposing ``transform``. When omitted, it is loaded with
        joblib from the notebook-level ``scaler_file_name``.
    model : object, optional
        Fitted classifier exposing ``predict``. When omitted, it is loaded
        with joblib from the notebook-level ``model_file_name``.

    Returns
    -------
    None
        The verdict is printed rather than returned.
    """
    if new_patient_data is None:
        # Built-in example patient; edit these values to try another case.
        # NOTE(review): slope=0 and thal=1 fall outside the values shown for
        # the training data earlier in this notebook (slope ranges 1-3; thal
        # appears as 3/6/7 in the preview) - confirm the intended encoding.
        new_patient_data = {
            "age": 63,
            "sex": 1,
            "cp": 3,
            "trestbps": 145,
            "chol": 233,
            "fbs": 1,
            "restecg": 0,
            "thalach": 150,
            "exang": 0,
            "oldpeak": 2.3,
            "slope": 0,
            "ca": 0,
            "thal": 1,
        }

    # A one-row DataFrame keeps the feature names attached to the values.
    data_df = pd.DataFrame([new_patient_data])

    # Load the persisted artifacts only when the caller did not supply them.
    if scaler is None:
        scaler = joblib.load(scaler_file_name)
    if model is None:
        model = joblib.load(model_file_name)

    # If the scaler recorded its training feature order and the supplied keys
    # are exactly that set, reorder the columns so dict key order is
    # irrelevant. Any other mismatch is left for the scaler itself to report.
    expected = getattr(scaler, "feature_names_in_", None)
    if expected is not None and set(expected) == set(data_df.columns):
        data_df = data_df[list(expected)]

    # Scale with the fitted scaler, then classify the single row.
    data_scaled = scaler.transform(data_df)
    result = model.predict(data_scaled)[0]

    if result == 1:
        print("O paciente tem risco de doença cardíaca.")
    else:
        print("O paciente não apresenta risco de doença cardíaca.")

In [17]:
# Run the prediction for the example patient built into predict_new_patient.
predict_new_patient()

O paciente não apresenta risco de doença cardíaca.
