# 2.1 Limpeza dos Dados

**Objetivo:** Realizar a limpeza dos dados, identificando e tratando problemas de qualidade.

## Importação de Bibliotecas

In [1]:
# Standard library
import warnings
from pathlib import Path

# Third-party
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration: seaborn grid theme, and no cap on the
# number of DataFrame columns pandas will render.
sns.set_theme(style="whitegrid")
pd.set_option('display.max_columns', None)

# Suppresses every warning for the rest of the session.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the
# libraries above — consider narrowing it to a specific category.
warnings.filterwarnings('ignore')

## Carregamento dos Dados

In [2]:
# Raw input file; the path is relative, so it resolves against the current
# working directory of the kernel.
data_path = Path('../data/raw/Cardiovascular_Disease_Dataset.csv')
df = pd.read_csv(data_path)

# Every cleaning step below operates on this copy, leaving `df` as loaded.
df_clean = df.copy()

print(f"Dataset original: {df.shape[0]} linhas x {df.shape[1]} colunas")

Dataset original: 1000 linhas x 14 colunas


## Tratamento de Valores Nulos

In [3]:
# Per-column count of missing values in the working copy.
null_counts = df_clean.isnull().sum()

print("Valores nulos antes do tratamento:")
print(null_counts)

Valores nulos antes do tratamento:
patientid            0
age                  0
gender               0
chestpain            0
restingBP            0
serumcholestrol      0
fastingbloodsugar    0
restingrelectro      0
maxheartrate         0
exerciseangia        0
oldpeak              0
slope                0
noofmajorvessels     0
target               0
dtype: int64


## Remoção de Duplicatas

In [5]:
# Number of rows flagged as repeats of an earlier row (all columns compared).
duplicatas_antes = df_clean.duplicated().sum()
print(f"Duplicatas antes: {duplicatas_antes}")

if duplicatas_antes == 0:
    # Nothing to drop: df_clean is left exactly as it was.
    print("Nenhuma duplicata encontrada. Nenhuma remoção realizada.")
else:
    # Drop the repeated rows, then recount to confirm none remain.
    df_clean = df_clean.drop_duplicates()
    duplicatas_depois = df_clean.duplicated().sum()
    print(f"Duplicatas depois: {duplicatas_depois}")
    print(f"Linhas removidas: {duplicatas_antes - duplicatas_depois}")

print(f"Dataset após remoção: {df_clean.shape[0]} linhas")

Duplicatas antes: 0
Nenhuma duplicata encontrada. Nenhuma remoção realizada.
Dataset após remoção: 1000 linhas


## Tratamento de Blood Pressure

In [6]:
# Split a combined "systolic/diastolic" text column into two numeric columns.
# The step only applies when the frame has a 'Blood Pressure' column; the
# column listing printed earlier in this notebook shows `restingBP` instead,
# so for that input the else-branch below reports that nothing was done.
if 'Blood Pressure' in df_clean.columns:
    print("Separando Blood Pressure em Sistólica e Diastólica...")

    # n=1 caps the split at two pieces, and reindex guarantees that both
    # pieces exist even when no value contains '/'. Without these, assigning
    # the split result to two columns raises ValueError whenever the split
    # yields one column or more than two.
    bp_parts = (
        df_clean['Blood Pressure']
        .str.split('/', n=1, expand=True)
        .reindex(columns=[0, 1])
    )

    # Unparseable pieces become NaN instead of raising.
    df_clean['Systolic_BP'] = pd.to_numeric(bp_parts[0], errors='coerce')
    df_clean['Diastolic_BP'] = pd.to_numeric(bp_parts[1], errors='coerce')

    # Surface what errors='coerce' would otherwise hide.
    n_invalid = int(df_clean[['Systolic_BP', 'Diastolic_BP']].isna().any(axis=1).sum())
    if n_invalid > 0:
        print(f"Aviso: {n_invalid} linha(s) sem valor numérico válido em Blood Pressure (NaN)")

    df_clean = df_clean.drop(columns=['Blood Pressure'])

    print("Colunas criadas: Systolic_BP, Diastolic_BP")
    print(f"Dataset: {df_clean.shape[0]} linhas x {df_clean.shape[1]} colunas")
else:
    # Report the no-op explicitly, as the duplicate-removal cell does.
    print("Coluna 'Blood Pressure' não encontrada. Nenhuma separação realizada.")

## Verificação Final

In [17]:
# Final structural check of the cleaned frame: columns, non-null counts, dtypes.
# NOTE(review): the stored output below reports 374 rows with sleep-health
# columns, while the load cell reports 1000 rows x 14 columns — the output
# looks stale; confirm with Restart Kernel -> Run All.
df_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 14 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Person ID                374 non-null    int64  
 1   Gender                   374 non-null    object 
 2   Age                      374 non-null    int64  
 3   Occupation               374 non-null    object 
 4   Sleep Duration           374 non-null    float64
 5   Quality of Sleep         374 non-null    int64  
 6   Physical Activity Level  374 non-null    int64  
 7   Stress Level             374 non-null    int64  
 8   BMI Category             374 non-null    object 
 9   Heart Rate               374 non-null    int64  
 10  Daily Steps              374 non-null    int64  
 11  Sleep Disorder           374 non-null    object 
 12  Systolic_BP              374 non-null    int64  
 13  Diastolic_BP             374 non-null    int64  
dtypes: float64(1), int64(9), object(4)

In [10]:
# Preview the first rows of the cleaned frame.
# NOTE(review): this cell's execution count (10) is lower than the previous
# cell's (17), so the stored output may predate later edits — re-run to confirm.
df_clean.head()

Unnamed: 0,Person ID,Gender,Age,Occupation,Sleep Duration,Quality of Sleep,Physical Activity Level,Stress Level,BMI Category,Heart Rate,Daily Steps,Sleep Disorder,Systolic_BP,Diastolic_BP
0,1,Male,27,Software Engineer,6.1,6,42,6,Overweight,77,4200,,126,83
1,2,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
2,3,Male,28,Doctor,6.2,6,60,8,Normal,75,10000,,125,80
3,4,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90
4,5,Male,28,Sales Representative,5.9,4,30,8,Obese,85,3000,Sleep Apnea,140,90


## Salvamento dos Dados Limpos

In [18]:
# Destination of the cleaned table.
# NOTE(review): the file name says "sleep_health", but the frame was loaded
# from Cardiovascular_Disease_Dataset.csv — confirm the intended name with
# whatever reads this file before renaming it.
output_path = Path('../data/interim/sleep_health_clean.csv')

# Create the target folder when missing; no error if it already exists.
output_path.parent.mkdir(parents=True, exist_ok=True)

# Written semicolon-separated and without the index column.
df_clean.to_csv(output_path, sep=';', index=False)

print(f"Dados limpos salvos em: {output_path}")
print(f"Total de linhas salvas: {len(df_clean)}")

Dados limpos salvos em: ..\data\interim\sleep_health_clean.csv
Total de linhas salvas: 374
