In [61]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import joblib
import os

## 2. Carregando o dataset
 
Conjunto de dados **Cleveland** do repositório da UCI sobre doenças cardiovasculares.

In [None]:
# URL do dataset original (formato CSV)
url = "./db/processed_all_db.csv"

column_names = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs',
    'restecg', 'thalach', 'exang', 'oldpeak',
    'slope', 'ca', 'thal', 'target'
]

# Lendo o dataset
df = pd.read_csv(url, names=column_names)

# Visualizando as 5 primeiras linhas
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,28,1,2,130,132,0,2,185,0,0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0,?,?,?,0


In [63]:
print('A forma dos dados é ', df.shape)

A forma dos dados é  (720, 14)


In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 720 entries, 0 to 719
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   age       720 non-null    int64 
 1   sex       720 non-null    int64 
 2   cp        720 non-null    int64 
 3   trestbps  720 non-null    object
 4   chol      720 non-null    object
 5   fbs       720 non-null    object
 6   restecg   720 non-null    object
 7   thalach   720 non-null    object
 8   exang     720 non-null    object
 9   oldpeak   720 non-null    object
 10  slope     720 non-null    object
 11  ca        720 non-null    object
 12  thal      720 non-null    object
 13  target    720 non-null    int64 
dtypes: int64(4), object(10)
memory usage: 78.9+ KB


In [65]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,720.0,51.888889,9.19372,28.0,45.0,53.0,58.0,77.0
sex,720.0,0.738889,0.439546,0.0,0.0,1.0,1.0,1.0
cp,720.0,3.179167,0.953646,1.0,2.0,4.0,4.0,4.0
target,720.0,0.85,1.076878,0.0,0.0,0.5,1.0,4.0


In [66]:
df.describe()

Unnamed: 0,age,sex,cp,target
count,720.0,720.0,720.0,720.0
mean,51.888889,0.738889,3.179167,0.85
std,9.19372,0.439546,0.953646,1.076878
min,28.0,0.0,1.0,0.0
25%,45.0,0.0,2.0,0.0
50%,53.0,1.0,4.0,0.5
75%,58.0,1.0,4.0,1.0
max,77.0,1.0,4.0,4.0


## 3. Pré-processamento dos dados

Substituímos valores ausentes representados por `"?"`, removemos linhas incompletas e transformamos o alvo em binário.

In [67]:

# Substitui '?' por NaN
df.replace('?', np.nan, inplace=True)

# Converte colunas para tipo numérico
df[['ca', 'thal']] = df[['ca', 'thal']].astype('float64')

# Remove linhas com valores ausentes
df.dropna(inplace=True)

# Converte a coluna 'target' em binária: 0 = sem doença, 1 = com doença
df['target'] = df['target'].apply(lambda x: 1 if int(x) > 0 else 0)  

# Separando X e y
X = df.drop('target', axis=1)
y = df['target']

# Escalando os dados
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [68]:
t_size=0.3

X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=t_size, random_state=42
)

## 5. Treinando o modelo de Regressão Logística

In [69]:
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

## 6. Avaliação do modelo

Utilizamos métricas como *accuracy*, *precision*, *recall*, *f1-score* e a matriz de confusão.

In [70]:
y_pred = model.predict(X_test)

print("Relatório de Classificação:\n")
print(classification_report(y_test, y_pred))

print("Matriz de Confusão:\n")
print(confusion_matrix(y_test, y_pred))

Relatório de Classificação:

              precision    recall  f1-score   support

           0       0.83      0.83      0.83        42
           1       0.85      0.85      0.85        48

    accuracy                           0.84        90
   macro avg       0.84      0.84      0.84        90
weighted avg       0.84      0.84      0.84        90

Matriz de Confusão:

[[35  7]
 [ 7 41]]


## 7. Salvando o modelo e o scaler

Salvar modelo com base na porcentagem de treinamento.

In [71]:
convert_percent_to_integer = int(t_size * 100)

folder_name = f"modelo/{convert_percent_to_integer}_percent"
os.makedirs(folder_name, exist_ok=True)

model_file_name = os.path.join(folder_name, f"heart_disease_model_{convert_percent_to_integer}_percent.pkl")
scaler_file_name = os.path.join(folder_name, "scaler.pkl")

joblib.dump(model, model_file_name)
joblib.dump(scaler, scaler_file_name)

['modelo/30_percent\\scaler.pkl']

## 8. Interface para testar novos pacientes manualmente

In [76]:
def predict_new_patient():
    db_file = './db/processed.cleveland.csv'

    new_patient_data = pd.read_csv(db_file, names=column_names).head(20)

    # Substitui '?' por NaN
    new_patient_data.replace('?', np.nan, inplace=True)

    # Converte colunas para tipo numérico
    new_patient_data[['ca', 'thal']] = new_patient_data[['ca', 'thal']].astype('float64')

    # Remove linhas com valores ausentes
    new_patient_data.dropna(inplace=True)

    # Converte a coluna 'target' em binária: 0 = sem doença, 1 = com doença
    new_patient_data['target'] = new_patient_data['target'].apply(lambda x: 1 if int(x) > 0 else 0)  

    # Transformar os dados em DataFrame para manter as colunas
    # data_df = pd.DataFrame([new_patient_data])

    # Carregar o scaler e o modelo
    scaler = joblib.load(scaler_file_name)
    model = joblib.load(model_file_name)

    # Escalar os dados com o scaler treinado
    data_scaled = scaler.transform(new_patient_data)

    # Fazer a previsão
    result = model.predict(data_scaled)

    print(result)

    # if result == 1:
    #     print("O paciente tem risco de doença cardíaca.")
    # else:
    #     print("O paciente não apresenta risco de doença cardíaca.")

In [77]:
predict_new_patient()

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- target
