**En este notebook se muestra cómo se elimina un porcentaje de los datos en tres columnas**

In [17]:
import pandas as pd
import numpy as np
import random


In [18]:
# Se cargan los datos originales a una variable
df = pd.read_csv("dataset\heart_2020_original.csv")


In [19]:
# También se pueden cargar desde la url de git
url = "https://raw.githubusercontent.com/ArleyF/proyecto_enfermedades_cardiacas/master/dataset/heart_2020_original.csv"
df = pd.read_csv(url)


In [20]:
# Se imprimen la cantidad de filas y columnas además de los 5 primeros datos
print(df.shape)
df.head(5)


(319795, 18)


Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [21]:
# Se verifica si hay datos faltantes
pd.isnull(df).sum()


HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [22]:
# Se verifica qué tanto se repiten los datos en cada columna
df.nunique()


HeartDisease           2
BMI                 3604
Smoking                2
AlcoholDrinking        2
Stroke                 2
PhysicalHealth        31
MentalHealth          31
DiffWalking            2
Sex                    2
AgeCategory           13
Race                   6
Diabetic               4
PhysicalActivity       2
GenHealth              5
SleepTime             24
Asthma                 2
KidneyDisease          2
SkinCancer             2
dtype: int64

In [23]:
# Se observan los tipos de datos
df.dtypes


HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [24]:
# Se eligen las columnas "BMI, Race y SleepTime" y se elimina aproximadamente el 8% de los datos

num_datos = int(df.shape[0])-1
porc_datos = int(num_datos*8/100)

cols = ["BMI", "Race", "SleepTime"]

for col in cols:
    contador = 0
    while contador < porc_datos:
        index = random.randint(0, num_datos)
        if df[col][index] != np.NaN:
            df[col][index] = np.NaN
            contador += 1


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col][index] = np.NaN


In [25]:
# Se verifica nuevamente los datos faltantes
pd.isnull(df).sum()


HeartDisease            0
BMI                 24588
Smoking                 0
AlcoholDrinking         0
Stroke                  0
PhysicalHealth          0
MentalHealth            0
DiffWalking             0
Sex                     0
AgeCategory             0
Race                24580
Diabetic                0
PhysicalActivity        0
GenHealth               0
SleepTime           24594
Asthma                  0
KidneyDisease           0
SkinCancer              0
dtype: int64

In [10]:
# Se guardan los cambios en un nuevo documento .csv
df.to_csv("dataset\heart_2020_modified.csv", index=False)
