# <center><b>Preprocesamiento de Datos</b></center>

## **Librerías**

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split

## **Datos**

In [2]:
# Load the raw hypertension dataset and preview its first rows.
ruta_datos = '../0_data/hypertension_dataset.csv'
df = pd.read_csv(ruta_datos)
df.head()

Unnamed: 0,Age,Salt_Intake,Stress_Score,BP_History,Sleep_Duration,BMI,Medication,Family_History,Exercise_Level,Smoking_Status,Has_Hypertension
0,69,8.0,9,Normal,6.4,25.8,,Yes,Low,Non-Smoker,Yes
1,32,11.7,10,Normal,5.4,23.4,,No,Low,Non-Smoker,No
2,78,9.5,3,Normal,7.1,18.7,,No,Moderate,Non-Smoker,No
3,38,10.0,10,Hypertension,4.2,22.1,ACE Inhibitor,No,Low,Non-Smoker,Yes
4,41,9.8,1,Prehypertension,5.8,16.2,Other,No,Moderate,Non-Smoker,No


## **Preprocesamiento**

### **Tratamiento de Valores Nulos**

In [3]:
# Missing values in "Medication" become an explicit "Not specified" label;
# no other column is filled here.
df = df.fillna({"Medication": "Not specified"})

# Remaining null count per column.
df.isna().sum()

Age                 0
Salt_Intake         0
Stress_Score        0
BP_History          0
Sleep_Duration      0
BMI                 0
Medication          0
Family_History      0
Exercise_Level      0
Smoking_Status      0
Has_Hypertension    0
dtype: int64

### **Cambiar Tipo de Datos**

In [4]:
# Build one dtype plan and apply it in a single pass:
#   int64  -> float64   (all numeric features share one dtype)
#   object -> category  (text columns)
plan_tipos = {col: "float64" for col in df.select_dtypes(include="int64").columns}
plan_tipos.update(
    {col: "category" for col in df.select_dtypes(include="object").columns}
)
df = df.astype(plan_tipos)

df.dtypes

Age                  float64
Salt_Intake          float64
Stress_Score         float64
BP_History          category
Sleep_Duration       float64
BMI                  float64
Medication          category
Family_History      category
Exercise_Level      category
Smoking_Status      category
Has_Hypertension    category
dtype: object

### **Escalamiento de Datos**

In [5]:
# Every float64 column is treated as a numeric feature to be rescaled.
numerical_cols = df.select_dtypes(include='float64').columns

# NOTE(review): the scaler is fitted on every row of df, before the train/test
# split performed later in this notebook, so test rows contribute to the
# scaling statistics. Consider fitting on the training rows only.
scaler = RobustScaler().fit(df[numerical_cols])
df[numerical_cols] = scaler.transform(df[numerical_cols])

# Preview the rescaled numeric columns.
df[numerical_cols].head(3)

Unnamed: 0,Age,Salt_Intake,Stress_Score,Sleep_Duration,BMI
0,0.575758,-0.185185,0.666667,-0.047619,-0.016393
1,-0.545455,1.185185,0.833333,-0.52381,-0.409836
2,0.848485,0.37037,-0.333333,0.285714,-1.180328


### **Tratamiento de Datos Categóricos**

In [6]:
# Integer codes for each label-encoded column, applied in the order listed.
codigos_por_columna = {
    # 1. Ordinal: codes follow the natural order of the levels.
    "BP_History": {"Normal": 0, "Prehypertension": 1, "Hypertension": 2},
    "Exercise_Level": {"Low": 0, "Moderate": 1, "High": 2},
    # 2. Binary: 1 marks the "Yes" / "Smoker" level.
    "Family_History": {"Yes": 1, "No": 0},
    "Smoking_Status": {"Smoker": 1, "Non-Smoker": 0},
    "Has_Hypertension": {"Yes": 1, "No": 0},
}

for columna, codigos in codigos_por_columna.items():
    # Series.map turns any label that is missing from the dict into NaN without
    # warning. Reject unexpected labels up front so a typo in the data, or a
    # second run of this cell on already-encoded values, fails loudly instead
    # of silently writing NaN into the column.
    inesperados = set(df[columna].dropna().unique()) - set(codigos)
    if inesperados:
        raise ValueError(
            f"Unexpected labels in column {columna!r}: "
            f"{sorted(map(str, inesperados))}; expected {list(codigos)}"
        )
    df[columna] = df[columna].map(codigos)

# 3. One-hot encoding for the nominal variable. drop_first=True omits the
#    first Medication level, which then acts as the reference category.
df = pd.get_dummies(df, columns=["Medication"], drop_first=True)

df.head()

Unnamed: 0,Age,Salt_Intake,Stress_Score,BP_History,Sleep_Duration,BMI,Family_History,Exercise_Level,Smoking_Status,Has_Hypertension,Medication_Beta Blocker,Medication_Diuretic,Medication_Not specified,Medication_Other
0,0.575758,-0.185185,0.666667,0,-0.047619,-0.016393,1,0,0,1,False,False,True,False
1,-0.545455,1.185185,0.833333,0,-0.52381,-0.409836,0,0,0,0,False,False,True,False
2,0.848485,0.37037,-0.333333,0,0.285714,-1.180328,0,1,0,0,False,False,True,False
3,-0.363636,0.555556,0.833333,2,-1.095238,-0.622951,0,0,0,1,False,False,False,False
4,-0.272727,0.481481,-0.666667,1,-0.333333,-1.590164,0,1,0,0,False,False,False,True


### **Datos Atípicos**

In [7]:
def _resumen_iqr(serie, total_filas):
    """Summarise the 1.5 * IQR outlier fences for one numeric column.

    Parameters
    ----------
    serie : pandas.Series
        Values of the column being inspected.
    total_filas : int
        Row count used as the denominator of the outlier percentage.

    Returns
    -------
    dict
        Quartiles, IQR, lower/upper fences, number of values strictly outside
        the fences, and that number as a percentage rounded to 2 decimals.
    """
    q1 = serie.quantile(0.25)
    q3 = serie.quantile(0.75)
    iqr = q3 - q1
    limite_inferior = q1 - 1.5 * iqr
    limite_superior = q3 + 1.5 * iqr

    # A value is an outlier only when it lies strictly outside the fences.
    fuera_de_rango = (serie < limite_inferior) | (serie > limite_superior)
    cantidad = int(fuera_de_rango.sum())

    return {
        "Q1": q1,
        "Q3": q3,
        "IQR": iqr,
        "Límite inferior": limite_inferior,
        "Límite superior": limite_superior,
        "Cantidad de outliers": cantidad,
        "Porcentaje": round(100 * cantidad / total_filas, 2),
    }


# df[numerical_cols] was rescaled in place earlier in this notebook, so the
# quartiles and fences reported here are in scaled units, not original ones.
outliers_detectados = {
    col: _resumen_iqr(df[col], df.shape[0]) for col in numerical_cols
}

# One row per column, most affected columns first.
pd.DataFrame(outliers_detectados).T.sort_values("Porcentaje", ascending=False)

Unnamed: 0,Q1,Q3,IQR,Límite inferior,Límite superior,Cantidad de outliers,Porcentaje
Salt_Intake,-0.481481,0.518519,1.0,-1.981481,2.018519,17.0,0.86
BMI,-0.47541,0.52459,1.0,-1.97541,2.02459,16.0,0.81
Sleep_Duration,-0.52381,0.47619,1.0,-2.02381,1.97619,12.0,0.6
Age,-0.484848,0.515152,1.0,-1.984848,2.015152,0.0,0.0
Stress_Score,-0.5,0.5,1.0,-2.0,2.0,0.0,0.0


Al ser muy pocos los datos atípicos (menos del 1 % en cada variable), se mantendrán sin tratamiento en el conjunto de datos.

### **Separación de Datos**

In [8]:
# Features are every column except the target; the target is Has_Hypertension.
y = df['Has_Hypertension']
X = df.drop(columns='Has_Hypertension')

# 75 % / 25 % split, stratified on the target so both partitions keep the same
# class proportions; the fixed seed makes the split reproducible.
opciones_split = dict(test_size=0.25, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **opciones_split)

### **Exportación y Almacenamiento**

In [10]:
# Re-attach the target as the last column of each partition. assign() returns
# a new frame, so X_train / X_test are left untouched.
df_train = X_train.assign(Has_Hypertension=y_train)
df_test = X_test.assign(Has_Hypertension=y_test)

# Persist both partitions for the modelling stage, without the row index.
df_train.to_csv("../3_model/df_train.csv", index=False)
df_test.to_csv("../3_model/df_test.csv", index=False)