# 1.0 Carregamento dos Dados

**Objetivo:** Carregar o dataset de doenças cardiovasculares do Kaggle e realizar a primeira inspeção dos dados.

## Importação de Bibliotecas

In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import kagglehub

# Plot styling: use seaborn's "whitegrid" theme for every figure
sns.set_theme(style="whitegrid")
# Never truncate columns when a DataFrame is displayed
pd.options.display.max_columns = None

import warnings
# Silence every Python warning for the rest of the session
warnings.filterwarnings(action="ignore")

## Carregamento dos Dados

In [30]:
# Fetch the latest published version of the dataset through kagglehub
dataset_handle = "jocelyndumlao/cardiovascular-disease-dataset"
path = kagglehub.dataset_download(dataset_handle)

# Report where the dataset files are located on disk
print(f"Path to dataset files: {path}")

Path to dataset files: C:\Users\Eduar\.cache\kagglehub\datasets\jocelyndumlao\cardiovascular-disease-dataset\versions\1


In [31]:
# Stage the dataset CSV in data/raw
import os  # no longer used in this cell; kept in case later cells rely on `os` being imported
import shutil

# The CSV sits in a subdirectory of the location returned by kagglehub
dataset_dir = Path(path) / 'Cardiovascular_Disease_Dataset'
csv_file = 'Cardiovascular_Disease_Dataset.csv'
source_file = dataset_dir / csv_file

# Destination directory (relative to the notebook's working directory)
raw_data_dir = Path('../data/raw')
raw_data_dir.mkdir(parents=True, exist_ok=True)
dest_file = raw_data_dir / csv_file

# Copy instead of move: `path` points inside the kagglehub cache, and moving
# the file out would leave that cache without the CSV. A later run with
# data/raw emptied would then presumably find no source file to stage
# (NOTE(review): confirm kagglehub does not re-download in that situation).
if dest_file.exists():
    print(f"Dataset já existe em: {dest_file}")
elif source_file.is_file():
    shutil.copy2(source_file, dest_file)
    print(f"Dataset copiado para: {dest_file}")
else:
    # Fail with an explicit message rather than an opaque error from shutil
    raise FileNotFoundError(f"Arquivo de origem não encontrado: {source_file}")

Dataset movido para: ..\data\raw\Cardiovascular_Disease_Dataset.csv


In [32]:
# Read the staged CSV into a DataFrame
df = pd.read_csv(dest_file)

# Report the number of rows and columns that were loaded
n_rows, n_cols = df.shape
print("\nDataset carregado com sucesso!")
print(f"Dimensoes: {n_rows} linhas x {n_cols} colunas")


Dataset carregado com sucesso!
Dimensoes: 1000 linhas x 14 colunas


## Visualização Inicial

In [24]:
# Preview the first five rows of the dataset
df.head(n=5)

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
0,103368,53,1,2,171,0,0,1,147,0,5.3,3,3,1
1,119250,40,1,0,94,229,0,1,115,0,3.7,1,1,0
2,119372,49,1,2,133,142,0,0,202,1,5.0,1,0,0
3,132514,43,1,0,138,295,1,1,153,0,3.2,2,2,1
4,146211,31,1,1,199,0,0,2,136,0,5.3,3,2,1


In [19]:
# Preview the last five rows of the dataset
df.tail(n=5)

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
995,9949544,48,1,2,139,349,0,2,183,1,5.6,2,2,1
996,9953423,47,1,3,143,258,1,1,98,1,5.7,1,0,0
997,9965859,69,1,0,156,434,1,0,196,0,1.4,3,1,1
998,9988507,45,1,1,186,417,0,1,117,1,5.9,3,2,1
999,9990855,25,1,0,158,270,0,0,143,1,4.7,0,0,0


## Informações do Dataset

In [26]:
# Print column names, non-null counts, dtypes and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   patientid          1000 non-null   int64  
 1   age                1000 non-null   int64  
 2   gender             1000 non-null   int64  
 3   chestpain          1000 non-null   int64  
 4   restingBP          1000 non-null   int64  
 5   serumcholestrol    1000 non-null   int64  
 6   fastingbloodsugar  1000 non-null   int64  
 7   restingrelectro    1000 non-null   int64  
 8   maxheartrate       1000 non-null   int64  
 9   exerciseangia      1000 non-null   int64  
 10  oldpeak            1000 non-null   float64
 11  slope              1000 non-null   int64  
 12  noofmajorvessels   1000 non-null   int64  
 13  target             1000 non-null   int64  
dtypes: float64(1), int64(13)
memory usage: 109.5 KB


In [21]:
# Summary statistics (count, mean, std, min, quartiles, max) for each column
# NOTE(review): the recorded output shows a minimum of 0.0 for serumcholestrol —
# possibly a placeholder for missing values; confirm before modelling
df.describe()

Unnamed: 0,patientid,age,gender,chestpain,restingBP,serumcholestrol,fastingbloodsugar,restingrelectro,maxheartrate,exerciseangia,oldpeak,slope,noofmajorvessels,target
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,5048704.0,49.242,0.765,0.98,151.747,311.447,0.296,0.748,145.477,0.498,2.7077,1.54,1.222,0.58
std,2895905.0,17.86473,0.424211,0.953157,29.965228,132.443801,0.456719,0.770123,34.190268,0.500246,1.720753,1.003697,0.977585,0.493805
min,103368.0,20.0,0.0,0.0,94.0,0.0,0.0,0.0,71.0,0.0,0.0,0.0,0.0,0.0
25%,2536440.0,34.0,1.0,0.0,129.0,235.75,0.0,0.0,119.75,0.0,1.3,1.0,0.0,0.0
50%,4952508.0,49.0,1.0,1.0,147.0,318.0,0.0,1.0,146.0,0.0,2.4,2.0,1.0,1.0
75%,7681877.0,64.25,1.0,2.0,181.0,404.25,1.0,1.0,175.0,1.0,4.1,2.0,2.0,1.0
max,9990855.0,80.0,1.0,3.0,200.0,602.0,1.0,2.0,202.0,1.0,6.2,3.0,3.0,1.0
