# 2.0 Limpeza dos Dados

**Objetivo:** Realizar limpeza dos dados identificando e tratando problemas de qualidade

## Importação de Bibliotecas

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

sns.set_theme(style="whitegrid")
pd.set_option('display.max_columns', None)

import warnings
warnings.filterwarnings('ignore')

## Carregamento dos Dados

In [2]:
data_path = Path('../data/raw/Cardiovascular_Disease_Dataset.csv')
df = pd.read_csv(data_path)

print(f"Dataset original: {df.shape[0]} linhas x {df.shape[1]} colunas")
df_clean = df.copy()

Dataset original: 1000 linhas x 14 colunas


## Retirando linahs com dados faltantes 

In [3]:
# features com valor 0 que deveriam ser NaN
cols_com_zero_invalido = ['serumcholestrol'] 

# Substituir 0 por NaN 
df_clean[cols_com_zero_invalido] = df_clean[cols_com_zero_invalido].replace(0, np.nan)

total_linhas_com_nan = df_clean.isnull().any(axis=1).sum()

print(f"Total de linhas com dados faltantes: {total_linhas_com_nan}")

Total de linhas com dados faltantes: 53


In [4]:
# retirando as linhas com valores faltantes
df_clean = df_clean.dropna().reset_index(drop=True)

## Verificação Final

In [5]:
df_clean.head(10)

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,119250,40,1,0,94,229.0,0,1,115,0,3.7,1,1,0
1,119372,49,1,2,133,142.0,0,0,202,1,5.0,1,0,0
2,132514,43,1,0,138,295.0,1,1,153,0,3.2,2,2,1
3,168686,79,1,2,130,240.0,0,2,157,0,2.5,2,1,1
4,170498,52,1,0,127,345.0,0,0,192,1,4.9,1,0,0
5,188225,62,1,0,121,357.0,0,1,138,0,2.8,0,0,0
6,192523,61,0,0,190,181.0,0,1,150,0,2.9,2,0,1
7,201030,59,0,1,190,529.0,1,1,151,1,3.2,2,2,1
8,208877,58,1,2,192,409.0,1,0,138,0,2.3,3,1,1
9,223295,27,1,0,129,135.0,0,1,192,1,1.0,0,0,0


In [6]:
duplicates = df_clean.duplicated().sum()

if duplicates > 0:
    print(f"\nRED FLAG: {duplicates} linhas duplicadas encontradas")
    print("\nVisualizando primeiras duplicatas:")
    print(df_clean[df_clean.duplicated(keep=False)].head(10))
else:
    print("OK: Nenhuma duplicata encontrada")

OK: Nenhuma duplicata encontrada


In [7]:
numeric_cols = df_clean.select_dtypes(include=[np.number]).columns.drop('patientid')

outlier_summary = []

for col in numeric_cols:
    Q1 = df_clean[col].quantile(0.25)
    Q3 = df_clean[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    
    outliers = df_clean[(df_clean[col] < lower_bound) | (df_clean[col] > upper_bound)]
    n_outliers = len(outliers)
    pct_outliers = (n_outliers / len(df_clean)) * 100
    
    outlier_summary.append({
        'Variavel': col,
        'N_Outliers': n_outliers,
        'Pct_Outliers': round(pct_outliers, 2),
        'Min': df_clean[col].min(),
        'Max': df_clean[col].max()
    })

outlier_df_clean = pd.DataFrame(outlier_summary)
print(outlier_df_clean.to_string(index=False))

         Variavel  N_Outliers  Pct_Outliers  Min   Max
              age           0          0.00 20.0  80.0
           gender         223         23.55  0.0   1.0
        chestpain           0          0.00  0.0   3.0
        restingBP           0          0.00 94.0 200.0
  serumcholestrol           0          0.00 85.0 602.0
fastingbloodsugar           0          0.00  0.0   1.0
  restingrelectro           0          0.00  0.0   2.0
     maxheartrate           0          0.00 71.0 202.0
    exerciseangia           0          0.00  0.0   1.0
          oldpeak           0          0.00  0.0   6.2
            slope           0          0.00  0.0   3.0
 noofmajorvessels           0          0.00  0.0   3.0
           target           0          0.00  0.0   1.0


## Salvamento dos Dados Limpos

In [8]:
output_path = Path('../data/interim/Cardiovascular_Disease_Dataset_Clean.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

df_clean.to_csv(output_path, index=False, sep=';')

print(f"Dados limpos salvos em: {output_path}")
print(f"Total de linhas salvas: {len(df_clean)}")

Dados limpos salvos em: ..\data\interim\Cardiovascular_Disease_Dataset_Clean.csv
Total de linhas salvas: 947
