# 1.1 Data Cleaning — R (paso a paso)
# Objetivo:
# - Renombrar variables
# - Remover valores faltantes
# - Crear dummies para variables categóricas
# - Crear variable binaria y: 1 si hay enfermedad cardiaca, 0 caso contrario
# 0) Cargar librerías

In [6]:
install.packages("fastDummies")

Installing package into 'C:/Rlibs'
(as 'lib' is unspecified)

also installing the dependency 'data.table'





  There are binary versions available but the source versions are later:
            binary source needs_compilation
data.table  1.14.8 1.17.8              TRUE
fastDummies  1.6.3  1.7.5             FALSE

  Binaries will be installed
package 'data.table' successfully unpacked and MD5 sums checked

The downloaded binary packages are in
	C:\Users\ARIANA\AppData\Local\Temp\RtmpiWNP53\downloaded_packages


installing the source package 'fastDummies'




In [8]:
library(dplyr)
library(readr)
library(fastDummies)


# 1) Cargar dataset
# Cambia la ruta al archivo donde guardaste processed.cleveland.data

In [29]:
df <- read_csv("processed.cleveland.data", col_names = FALSE)

[1mRows: [22m[34m303[39m [1mColumns: [22m[34m14[39m
[36m--[39m [1mColumn specification[22m [36m------------------------------------------------------------------------------------------------[39m
[1mDelimiter:[22m ","
[31mchr[39m  (2): X12, X13
[32mdbl[39m (12): X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X14

[36mi[39m Use `spec()` to retrieve the full column specification for this data.
[36mi[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [31]:
# Reemplazar "?" por NA
df[df == "?"] <- NA

# Convertir todas las columnas posibles a numéricas
df <- df %>% mutate(across(everything(), ~ suppressWarnings(as.numeric(.))))


# 2) Renombrar columnas

In [33]:
cols_order <- c(
  "age", "sex", "cp", "restbp", "chol", "fbs", "restecg",
  "thalach", "exang", "oldpeak", "slope", "ca", "thal", "hd"
)
colnames(df) <- cols_order

# 3) Reemplazar "?" por NA y convertir a numérico

In [35]:
df[df == "?"] <- NA

# Convierte todas las columnas posibles a numéricas

In [37]:
df <- df %>%
  mutate(across(everything(), ~ suppressWarnings(as.numeric(.))))

# 4) Eliminar filas con NA

In [41]:
library(dplyr)
library(readr)
library(fastDummies)
library(tidyr)   # <- agrega esta línea

# ... resto del código igual ...
df <- df %>% drop_na()


# 5) Definir variables categóricas

In [43]:
cat_cols <- c("sex", "cp", "fbs", "restecg", "exang", "slope", "ca", "thal")

# 6) Crear variable binaria y

In [45]:
df <- df %>%
  mutate(y = ifelse(hd > 0, 1, 0))

# 7) Crear variables dummy

In [47]:
df_dum <- fastDummies::dummy_cols(df,
                                  select_columns = cat_cols,
                                  remove_selected_columns = TRUE,
                                  remove_first_dummy = TRUE)

# 8) Separar X e y

In [49]:
X <- df_dum %>% select(-hd, -y)
y <- df_dum$y

# 9) Guardar archivos limpios

In [51]:
write_csv(df_dum, "output/heart_clean_with_dummies.csv")
write_csv(X, "output/X_features.csv")
write_csv(data.frame(y = y), "output/y_binary.csv")

# 10) Resumen

In [53]:
cat("Filas finales:", nrow(df_dum), "\n")
cat("Columnas finales:", ncol(df_dum), "\n")
cat("Proporción de y=1:", round(mean(y), 3), "\n")
cat("Primeras columnas de X:\n")
print(head(colnames(X), 10))

Filas finales: 297 
Columnas finales: 22 
Proporción de y=1: 0.461 
Primeras columnas de X:
 [1] "age"     "restbp"  "chol"    "thalach" "oldpeak" "sex_1"   "cp_2"   
 [8] "cp_3"    "cp_4"    "fbs_1"  


In [55]:
str(df)
summary(df)


tibble [297 x 15] (S3: tbl_df/tbl/data.frame)
 $ age    : num [1:297] 63 67 67 37 41 56 62 57 63 53 ...
 $ sex    : num [1:297] 1 1 1 1 0 1 0 0 1 1 ...
 $ cp     : num [1:297] 1 4 4 3 2 2 4 4 4 4 ...
 $ restbp : num [1:297] 145 160 120 130 130 120 140 120 130 140 ...
 $ chol   : num [1:297] 233 286 229 250 204 236 268 354 254 203 ...
 $ fbs    : num [1:297] 1 0 0 0 0 0 0 0 0 1 ...
 $ restecg: num [1:297] 2 2 2 0 2 0 2 0 2 2 ...
 $ thalach: num [1:297] 150 108 129 187 172 178 160 163 147 155 ...
 $ exang  : num [1:297] 0 1 1 0 0 0 0 1 0 1 ...
 $ oldpeak: num [1:297] 2.3 1.5 2.6 3.5 1.4 0.8 3.6 0.6 1.4 3.1 ...
 $ slope  : num [1:297] 3 2 2 3 1 1 3 1 2 3 ...
 $ ca     : num [1:297] 0 3 2 0 0 0 2 0 1 0 ...
 $ thal   : num [1:297] 6 3 7 3 3 3 3 3 7 7 ...
 $ hd     : num [1:297] 0 2 1 0 0 0 3 0 2 1 ...
 $ y      : num [1:297] 0 1 1 0 0 0 1 0 1 1 ...


      age             sex               cp            restbp     
 Min.   :29.00   Min.   :0.0000   Min.   :1.000   Min.   : 94.0  
 1st Qu.:48.00   1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:120.0  
 Median :56.00   Median :1.0000   Median :3.000   Median :130.0  
 Mean   :54.54   Mean   :0.6768   Mean   :3.158   Mean   :131.7  
 3rd Qu.:61.00   3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:140.0  
 Max.   :77.00   Max.   :1.0000   Max.   :4.000   Max.   :200.0  
      chol            fbs            restecg          thalach     
 Min.   :126.0   Min.   :0.0000   Min.   :0.0000   Min.   : 71.0  
 1st Qu.:211.0   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:133.0  
 Median :243.0   Median :0.0000   Median :1.0000   Median :153.0  
 Mean   :247.4   Mean   :0.1448   Mean   :0.9966   Mean   :149.6  
 3rd Qu.:276.0   3rd Qu.:0.0000   3rd Qu.:2.0000   3rd Qu.:166.0  
 Max.   :564.0   Max.   :1.0000   Max.   :2.0000   Max.   :202.0  
     exang           oldpeak          slope             ca        
 M