### El objetivo de este proyecto es realizar un análisis exploratorio de datos (EDA) sobre el dataset Statlog Heart Disease, con el fin de comprender las características de los pacientes y detectar patrones que puedan estar relacionados con la presencia de enfermedades cardíacas.

In [None]:
# Importamos las librerías esenciales
import pandas as pd  # Para manejo de datos
import numpy as np   # Para operaciones numéricas
import plotly as pl  # Para visualizaciones interactivas
import seaborn as sns # Para gráficos estadísticos

### 1. Cargar el dataset.
###### He optado por leer el dataset directamente desde la URL y guardar una versión CSV en el proyecto, solo como medida de seguridad: así, si hay alguna inestabilidad en la red o cambios en el repositorio, el proyecto no se ve comprometido.

In [None]:
import os

# Create the project data folders if they do not exist yet.
os.makedirs("../data/raw", exist_ok=True)
os.makedirs("../data/processed", exist_ok=True)

# Remote location of the raw file and the names assigned to its columns.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/heart/heart.dat"
columns_raw = ["age","sex","cp","trestbps","chol","fbs","restecg","thalach","exang",
               "oldpeak","slope","ca","thal","target"]
raw_csv_path = "../data/raw/statlog_heart_disease.csv"

try:
    # Parsed as space-separated values, with the column names supplied above.
    df = pd.read_csv(url, sep=" ", names=columns_raw)
except OSError:
    # The download failed (urllib's URLError/HTTPError are OSError subclasses).
    # Fall back to the local copy written by a previous successful run; if no
    # such copy exists there is nothing to fall back to, so re-raise.
    if not os.path.exists(raw_csv_path):
        raise
    df = pd.read_csv(raw_csv_path)
else:
    # Keep an untouched local copy so later runs do not depend on the network.
    df.to_csv(raw_csv_path, index=False)


In [None]:
# Make sure we are working with a DataFrame, then preview the first rows
# (up to 20).
df = pd.DataFrame(data=df)
df.head(n=20)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1
5,65.0,1.0,4.0,120.0,177.0,0.0,0.0,140.0,0.0,0.4,1.0,0.0,7.0,1
6,56.0,1.0,3.0,130.0,256.0,1.0,2.0,142.0,1.0,0.6,2.0,1.0,6.0,2
7,59.0,1.0,4.0,110.0,239.0,0.0,2.0,142.0,1.0,1.2,2.0,1.0,7.0,2
8,60.0,1.0,4.0,140.0,293.0,0.0,2.0,170.0,0.0,1.2,2.0,2.0,7.0,2
9,63.0,0.0,4.0,150.0,407.0,0.0,2.0,154.0,0.0,4.0,2.0,3.0,7.0,2


### 2. Crear una copia del dataset para hacer la limpieza y modificaciones necesarias. 
###### Aquí decidí renombrar las columnas para que sea más fácil entender cada variable.

In [None]:
# Descriptive column labels, applied positionally to the existing columns.
columns_clean = [
    "age",
    "sex",
    "chest_pain",
    "rest_bp",
    "serum_chol",
    "blood_sugar",
    "rest_ecg",
    "max_heart_rate",
    "exercise_ang",
    "oldpeak",
    "slope",
    "major_vessels",
    "thal",
    "heart_disease",
]

# Build the working frame as a relabelled copy; `df` keeps its original labels.
df_clean = df.set_axis(columns_clean, axis="columns")

# Save the renamed dataset in the processed folder.
processed_csv_path = "../data/processed/heart_disease_clean.csv"
df_clean.to_csv(processed_csv_path, index=False)

# Preview the table with the renamed columns.
df_clean.head(20)

Unnamed: 0,age,sex,chest_pain,rest_bp,serum_chol,blood_sugar,rest_ecg,max_heart_rate,exercise_ang,oldpeak,slope,major_vessels,thal,heart_disease
0,70.0,1.0,4.0,130.0,322.0,0.0,2.0,109.0,0.0,2.4,2.0,3.0,3.0,2
1,67.0,0.0,3.0,115.0,564.0,0.0,2.0,160.0,0.0,1.6,2.0,0.0,7.0,1
2,57.0,1.0,2.0,124.0,261.0,0.0,0.0,141.0,0.0,0.3,1.0,0.0,7.0,2
3,64.0,1.0,4.0,128.0,263.0,0.0,0.0,105.0,1.0,0.2,2.0,1.0,7.0,1
4,74.0,0.0,2.0,120.0,269.0,0.0,2.0,121.0,1.0,0.2,1.0,1.0,3.0,1
5,65.0,1.0,4.0,120.0,177.0,0.0,0.0,140.0,0.0,0.4,1.0,0.0,7.0,1
6,56.0,1.0,3.0,130.0,256.0,1.0,2.0,142.0,1.0,0.6,2.0,1.0,6.0,2
7,59.0,1.0,4.0,110.0,239.0,0.0,2.0,142.0,1.0,1.2,2.0,1.0,7.0,2
8,60.0,1.0,4.0,140.0,293.0,0.0,2.0,170.0,0.0,1.2,2.0,2.0,7.0,2
9,63.0,0.0,4.0,150.0,407.0,0.0,2.0,154.0,0.0,4.0,2.0,3.0,7.0,2


### 3. Visualizar la forma/tamaño del dataset.

In [None]:
# Show the dataset's dimensions as a (rows, columns) tuple.
df_clean.shape

(270, 14)

### 4. Visualizar info general del dataset.
###### Entiendo que la columna Dtype está más relacionada con el tipo de dato (int, str, float...), pero he decidido modificar el Dtype de las variables categóricas para visualizar mejor la distribución de las variables numéricas frente a las categóricas.

In [None]:
# General overview of the dataset: column dtypes, non-null counts and memory usage.
df_clean.info()

<class 'pandas.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             270 non-null    float64
 1   sex             270 non-null    float64
 2   chest_pain      270 non-null    float64
 3   rest_bp         270 non-null    float64
 4   serum_chol      270 non-null    float64
 5   blood_sugar     270 non-null    float64
 6   rest_ecg        270 non-null    float64
 7   max_heart_rate  270 non-null    float64
 8   exercise_ang    270 non-null    float64
 9   oldpeak         270 non-null    float64
 10  slope           270 non-null    float64
 11  major_vessels   270 non-null    float64
 12  thal            270 non-null    float64
 13  heart_disease   270 non-null    int64  
dtypes: float64(13), int64(1)
memory usage: 29.7 KB


In [None]:
# Cast the categorical variables to the "category" dtype so that later
# summaries can treat them separately from the numeric variables.
categorical_cols = [
    "sex", "chest_pain", "blood_sugar", "rest_ecg",
    "exercise_ang", "slope", "thal", "heart_disease",
]

# One astype call with a column -> dtype mapping converts all of them at once.
category_dtypes = {name: "category" for name in categorical_cols}
df_clean = df_clean.astype(category_dtypes)

# Show the overview again to confirm the new dtypes.
df_clean.info()

<class 'pandas.DataFrame'>
RangeIndex: 270 entries, 0 to 269
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype   
---  ------          --------------  -----   
 0   age             270 non-null    float64 
 1   sex             270 non-null    category
 2   chest_pain      270 non-null    category
 3   rest_bp         270 non-null    float64 
 4   serum_chol      270 non-null    float64 
 5   blood_sugar     270 non-null    category
 6   rest_ecg        270 non-null    category
 7   max_heart_rate  270 non-null    float64 
 8   exercise_ang    270 non-null    category
 9   oldpeak         270 non-null    float64 
 10  slope           270 non-null    category
 11  major_vessels   270 non-null    float64 
 12  thal            270 non-null    category
 13  heart_disease   270 non-null    category
dtypes: category(8), float64(6)
memory usage: 15.1 KB


### 5. Visualizar e identificar los valores nulos y duplicados, si aplica.
###### Aunque el análisis descrito arriba ya muestra que no hay valores nulos en este dataset, he optado por hacer la búsqueda y el conteo individualmente otra vez, solo como confirmación.

In [None]:
# Count the non-null values in each column.
df_clean.count()

age               270
sex               270
chest_pain        270
rest_bp           270
serum_chol        270
blood_sugar       270
rest_ecg          270
max_heart_rate    270
exercise_ang      270
oldpeak           270
slope             270
major_vessels     270
thal              270
heart_disease     270
dtype: int64

In [None]:
# Count missing values per column, sorted from most to fewest.
# Runs on df_clean rather than the raw df, so the check covers the same frame
# as the preceding df_clean.count() cell and reports the renamed columns.
df_clean.isna().sum().sort_values(ascending=False)

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [None]:
# Count fully duplicated rows in df_clean, the frame being analysed,
# rather than in the raw df.
df_clean.duplicated().sum()

np.int64(0)

### 6. Visualizar en tablas la estadística descriptiva de las variables numéricas y categóricas, por separado.

##### Variables numéricas descritas con: media, desviación estándar, mín/máx y percentiles.

In [None]:
# Quick check of the descriptive statistics of the numeric columns.
df_clean.describe(include="number")

Unnamed: 0,age,rest_bp,serum_chol,max_heart_rate,oldpeak,major_vessels
count,270.0,270.0,270.0,270.0,270.0,270.0
mean,54.433333,131.344444,249.659259,149.677778,1.05,0.67037
std,9.109067,17.861608,51.686237,23.165717,1.14521,0.943896
min,29.0,94.0,126.0,71.0,0.0,0.0
25%,48.0,120.0,213.0,133.0,0.0,0.0
50%,55.0,130.0,245.0,153.5,0.8,0.0
75%,61.0,140.0,280.0,166.0,1.6,1.0
max,77.0,200.0,564.0,202.0,6.2,3.0


##### Variables categóricas descritas con: count = número de valores no nulos, unique = número de categorías distintas, top = el valor que más se repite, freq = cuántas veces se repite el top.
###### Esa función no informa porcentajes (%).


In [None]:
# Quick check of the descriptive statistics of the categorical columns
# (those with the "category" dtype).
df_clean.describe(include="category")

Unnamed: 0,sex,chest_pain,blood_sugar,rest_ecg,exercise_ang,slope,thal,heart_disease
count,270.0,270.0,270.0,270.0,270.0,270.0,270.0,270
unique,2.0,4.0,2.0,3.0,2.0,3.0,3.0,2
top,1.0,4.0,0.0,2.0,0.0,1.0,3.0,1
freq,183.0,129.0,230.0,137.0,181.0,130.0,152.0,150
