In [None]:
!wget https://raw.githubusercontent.com/FernandoV17/IA_VERANOS/refs/heads/main/ACT2/DATA/diabetes.csv

In [None]:
# If you cloned the repository: dataset location inside the clone under /content.
# NOTE: the next cell also assigns data_path, so when every cell is run in
# order, that later assignment is the one that takes effect.
data_path = '/content/IA_VERANOS/ACT2/DATA/diabetes.csv'

In [None]:
# If you are only using Google Colab (dataset fetched with wget).
# Pick the first dataset location that actually exists, so a top-to-bottom run
# works whether the repository was cloned or the CSV was downloaded directly.
# Assumes the notebook's working directory is /content, as the hard-coded
# paths already did — adjust the candidates when running elsewhere.
import os  # local import: this cell runs before the notebook's import cell

_candidate_paths = [
    '/content/IA_VERANOS/ACT2/DATA/diabetes.csv',  # repository was cloned
    '/content/diabetes.csv',                       # file fetched with wget
]
# Fall back to the wget location when nothing is found, so read_csv reports
# the path this cell has always pointed at.
data_path = next(
    (path for path in _candidate_paths if os.path.exists(path)),
    '/content/diabetes.csv',
)

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler

Part 1

In [None]:
# Load the CSV found at data_path (assigned in an earlier cell) into the raw
# frame; later cells copy from df rather than modifying it.
df = pd.read_csv(data_path)

In [None]:
# Table 1: styled preview of the first rows of the raw frame.
# Float columns are rendered with two decimals; every cell is centred.
float_cols = df.select_dtypes(include=['float']).columns
display(
    df.head()
    .style
    .format("{:.2f}", subset=float_cols)
    .set_properties(**{'text-align': 'center'})
    .set_caption("Tabla 1: Muestra inicial de datos")
)

In [None]:
# Independent copy of the raw frame.
# NOTE(review): no visible cell modifies df_processed after this point, so it
# still holds raw (not cleaned, not normalized) values wherever it is used
# later — confirm that is intended.
df_processed = df.copy()

In [None]:
# Missing-value audit of the raw frame: explicit nulls first, then zeros,
# which in this dataset may stand in for missing measurements.
print("\nAnálisis de valores nulos en el dataset:")
null_analysis = df.isnull().sum().to_frame('Conteo Nulos')
display(null_analysis.style.set_caption("Conteo de valores nulos por columna"))

# 'number' selects every numeric dtype. The previous ['int', 'float'] selector
# resolves 'int' to the platform default integer, which can differ from the
# int64 columns read_csv produces (e.g. Windows with NumPy < 2).
key_columns = df.select_dtypes(include='number').columns.tolist()
print("\nValores cero en columnas clave:")
zero_analysis = df[key_columns].isin([0]).sum().to_frame('Conteo Ceros')
display(zero_analysis.style.set_caption("Conteo de valores cero en columnas numéricas"))

# One row per patient, one column per feature; highlighted cells are 0 or NaN.
plt.figure(figsize=(10, 6))
sns.heatmap(df[key_columns].isin([0, np.nan]), cmap='viridis', cbar=False)
plt.title("Mapa de calor de valores cero/nulos en columnas médicas")
plt.show()

# Columns where 0 is a legitimate value rather than a missing reading:
# 'Outcome' is the 0/1 diabetes label. 'Pregnancies' is assumed from the
# standard Pima diabetes schema — confirm against the CSV; names that are not
# in the frame are simply never matched.
zero_is_valid = {'Outcome', 'Pregnancies'}

print("\nFilas con valores nulos o ceros problemáticos:")
for col in key_columns:
    suspect_mask = df[col].isna()
    if col not in zero_is_valid:
        # Only treat zeros as suspicious where they cannot be real values.
        suspect_mask = suspect_mask | df[col].eq(0)
    zero_or_null = df[suspect_mask]
    if not zero_or_null.empty:
        print(f"\n--- {col} ---")
        display(zero_or_null.head())

Part 2

In [None]:
# Replace zeros that represent missing measurements with the column median.
# 'number' selects every numeric dtype; 'int' alone maps to the platform
# default integer and can miss int64 columns on some platforms.
numeric_cols = df.select_dtypes(include='number').columns.tolist()
print("Columnas numéricas identificadas:", numeric_cols)

df_clean = df.copy()

# Zero is a real value in these columns and must NOT be imputed. 'Outcome' is
# the 0/1 diabetes label: replacing its zeros with the median of the non-zero
# labels would turn every patient into a positive case. 'Pregnancies' is
# assumed from the standard Pima diabetes schema — confirm against the CSV.
zero_is_valid = {'Outcome', 'Pregnancies'}
cols_to_impute = [col for col in numeric_cols if col not in zero_is_valid]

for col in cols_to_impute:
    # Median of the observed (non-zero) values only, so the placeholder zeros
    # do not drag the estimate down.
    col_median = df_clean.loc[df_clean[col] != 0, col].median()
    if pd.isna(col_median):
        # Column has no non-zero values: there is nothing to impute from, and
        # replacing with NaN would only introduce new missing values.
        print(f"Skipped {col}: no non-zero values to compute a median from")
        continue

    # Replace zeros with median
    df_clean[col] = df_clean[col].replace(0, col_median)

    print(f"Replaced zeros in {col} with median value: {col_median:.2f}")

In [None]:
# Post-cleaning checks: remaining zeros, summary statistics, and a per-column
# comparison of the raw and cleaned distributions.

# Columns listed here are left out of the checks below (none by default).
columnas_a_excluir = []

# 'number' selects every numeric dtype; 'int' alone maps to the platform
# default integer and can miss int64 columns on some platforms.
numeric_cols_clean = [col for col in df_clean.select_dtypes(include='number').columns
                      if col not in columnas_a_excluir]


print("VERIFICACIÓN DE CEROS RESTANTES")
# Series.to_string(name=...) is a boolean flag, not a label: name the Series
# itself and ask for the name to be printed.
ceros_restantes = (df_clean[numeric_cols_clean] == 0).sum().rename("Ceros remanentes")
print(ceros_restantes.to_string(name=True))
print(f"\nTotal de ceros restantes: {ceros_restantes.sum()}")

print("ESTADÍSTICAS DESCRIPTIVAS")
display(df_clean[numeric_cols_clean].describe().round(2))

print("DISTRIBUCIÓN DE VARIABLES")
if numeric_cols_clean:
    n_panels = len(numeric_cols_clean)
    # squeeze=False keeps `axes` two-dimensional even with a single column.
    fig, axes = plt.subplots(n_panels, 1, figsize=(15, n_panels * 4), squeeze=False)
    for ax, col in zip(axes[:, 0], numeric_cols_clean):
        # Raw vs. cleaned values overlaid on the same axes.
        sns.histplot(df[col], color='red', kde=True, alpha=0.3, label='Original', bins=30, ax=ax)
        sns.histplot(df_clean[col], color='blue', kde=True, alpha=0.3, label='Limpio', bins=30, ax=ax)
        ax.set_title(f'Distribución de {col}', pad=20)
        ax.legend()
        ax.set_xlabel('')
    fig.tight_layout()
    plt.show()

In [None]:
# Min-max scale the numeric columns of df_clean to the [0, 1] range, in place.
# columnas_a_excluir comes from an earlier cell; columns listed there are left
# unscaled.
# 'number' selects every numeric dtype; 'int' alone maps to the platform
# default integer and can miss int64 columns on some platforms.
numeric_cols_clean = [col for col in df_clean.select_dtypes(include='number').columns
                      if col not in columnas_a_excluir]

print("COLUMNAS NUMÉRICAS IDENTIFICADAS")
print(numeric_cols_clean)

scaler = MinMaxScaler()
df_clean[numeric_cols_clean] = scaler.fit_transform(df_clean[numeric_cols_clean])

print("PRIMERAS FILAS POST-NORMALIZACIÓN")
# Format only the scaled columns: applying "{:.4f}" to every column raises as
# soon as the frame contains a non-numeric column.
display(
    df_clean.head()
    .style
    .set_caption("Dataset Normalizado")
    .format("{:.4f}", subset=numeric_cols_clean)
)

print("ESTADÍSTICAS DESCRIPTIVAS POST-NORMALIZACIÓN")
display(df_clean[numeric_cols_clean].describe().round(4).style.set_caption("Resumen Estadístico"))

Part 3

In [None]:
# Histogram: Glucose before (raw df) and after (df_clean, cleaned and min-max
# normalized). df_processed is only an untouched copy of df, so plotting it as
# "Normalizado" would draw the same distribution twice.
# The raw and normalized values live on different scales, so each series gets
# its own panel instead of sharing one x-axis.
fig, (ax_raw, ax_norm) = plt.subplots(1, 2, figsize=(12, 4))

sns.histplot(df['Glucose'], kde=True, color='blue', label='Original', alpha=0.5, ax=ax_raw)
ax_raw.set_xlabel('Glucose')
ax_raw.set_ylabel('Frecuencia')
ax_raw.legend()

sns.histplot(df_clean['Glucose'], kde=True, color='red', label='Normalizado', alpha=0.5, ax=ax_norm)
ax_norm.set_xlabel('Glucose')
ax_norm.set_ylabel('Frecuencia')
ax_norm.legend()

fig.suptitle('Distribución de Glucose (Antes/Después)')
fig.tight_layout()
plt.show()

In [None]:
# Boxplot of the normalized BloodPressure values.
# df_clean holds the cleaned, min-max scaled data; df_processed is an
# untouched copy of the raw frame and would contradict the "(0-1)" axis label.
# A dedicated figure replaces plt.subplot(2, 2, 2), which drew a single panel
# in one quadrant of an otherwise empty grid.
fig, ax = plt.subplots(figsize=(5, 4))
sns.boxplot(y=df_clean['BloodPressure'], color='orange', ax=ax)
ax.set_title('Distribución de BloodPressure (Normalizado)')
ax.set_ylabel('BloodPressure (0-1)')
fig.tight_layout()
plt.show()

In [None]:
# Scatter plot of normalized Glucose against normalized BMI, coloured by class.
# The coordinates come from df_clean (cleaned and min-max scaled) so they match
# the "(Normalizado)" axis labels; df_processed is an untouched raw copy.
# The hue uses the labels from the raw frame, which shares its index with
# df_clean, so the colouring never depends on how Outcome was cleaned/scaled.
fig, ax = plt.subplots(figsize=(6, 4))
sns.scatterplot(x=df_clean['Glucose'], y=df_clean['BMI'], hue=df['Outcome'], alpha=0.6, ax=ax)
ax.set_title('Relación entre Glucose y BMI')
ax.set_xlabel('Glucose (Normalizado)')
ax.set_ylabel('BMI (Normalizado)')
fig.tight_layout()
plt.show()

In [None]:
# Bar chart: number of patients per Outcome class (0 = no diabetes, 1 = diabetes).
# reindex pins the order to [0, 1] so each bar always lines up with its label
# and colour, whichever class value_counts() ranks first; a class with no
# patients shows up as an explicit zero-height bar.
outcome_counts = df['Outcome'].value_counts().reindex([0, 1], fill_value=0)

# Plain matplotlib bars: recent seaborn releases warn when `palette` is passed
# to barplot without `hue`.
fig, ax = plt.subplots(figsize=(6, 4))
ax.bar(['No Diabetes', 'Diabetes'], outcome_counts.values, color=['green', 'red'])
ax.set_title('Distribución de Diabetes (Outcome)')
ax.set_xlabel('Diabetes (0=No, 1=Sí)')
ax.set_ylabel('Número de Pacientes')
fig.tight_layout()
plt.show()