<a href="https://colab.research.google.com/github/Catherine-Nguyen88/project_chd/blob/main/cleaning_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Cleaning data

In [2]:
# Clone the project repository into ./project_chd so its CSV files can be read locally.
! git clone https://github.com/Catherine-Nguyen88/project_chd

Cloning into 'project_chd'...
remote: Enumerating objects: 41, done.[K
remote: Counting objects: 100% (39/39), done.[K
remote: Compressing objects: 100% (25/25), done.[K
remote: Total 41 (delta 22), reused 27 (delta 14), pack-reused 2[K
Receiving objects: 100% (41/41), 821.15 KiB | 6.32 MiB/s, done.
Resolving deltas: 100% (22/22), done.


In [3]:
import pandas as pd
import numpy as np

In [4]:
# Load the train and test CSVs shipped in the cloned repository, in that order.
train_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('./project_chd/fhs_train.csv', './project_chd/fhs_test.csv')
)

### **Explore training and testing data and clean**

**Training data**

In [5]:
# Preview the training set: first rows, then its (rows, columns) shape, then column dtypes.
# `end=' \n\n'` leaves a blank line after the shape and after the heading.
print(train_data.head(), train_data.shape, sep='\n', end=' \n\n')  # shape reports 3180 rows
print("Data types for variables in training dataset", end=' \n\n')
print(train_data.dtypes)

   Unnamed: 0  sex  age  education  currentSmoker  cigsPerDay  BPMeds  \
0        1267    1   58        1.0              0         0.0     0.0   
1        1209    0   40        1.0              1        15.0     0.0   
2        2050    0   52        1.0              0         0.0     0.0   
3        1183    1   38        2.0              1        43.0     0.0   
4        3225    0   43        1.0              0         0.0     0.0   

   prevalentStroke  prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  \
0                0             0         0    220.0  143.0  104.0  29.85   
1                0             0         0    199.0  122.0   82.0  22.16   
2                0             0         0    275.0  112.0   71.0  25.68   
3                0             1         0    170.0  130.0   94.0  23.90   
4                0             0         0    202.0  124.0   92.0  21.26   

   heartRate  glucose  TenYearCHD  
0         75     87.0           1  
1         85     77.0           

In [6]:
# Tally the missing (NaN) entries per column of the training set and print them under a heading.
num_nans = train_data.isna().sum()
print("Number of NaNs for each variable:", num_nans, sep='\n')

Number of NaNs for each variable:
Unnamed: 0           0
sex                  0
age                  0
education           85
currentSmoker        0
cigsPerDay          24
BPMeds              37
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             39
sysBP                0
diaBP                0
BMI                 15
heartRate            0
glucose            285
TenYearCHD           0
dtype: int64


In [16]:
# list the column labels of train_data (displayed as the cell's output)
train_data.columns

Index(['Unnamed: 0', 'sex', 'age', 'education', 'currentSmoker', 'cigsPerDay',
       'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol',
       'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [8]:
# count how often each value occurs in the 'Unnamed: 0' column;
# if every count is 1, the column holds a distinct value per row
train_data['Unnamed: 0'].value_counts()

1267    1
322     1
2301    1
1143    1
3915    1
       ..
4214    1
2066    1
3695    1
1716    1
691     1
Name: Unnamed: 0, Length: 3180, dtype: int64

In [9]:
# Remove the 'Unnamed: 0' column: it is not listed in the data dictionary and serves no purpose.
# drop() returns a new frame, so train_data itself is left untouched.
train_data1 = train_data.drop(columns=['Unnamed: 0'])
train_data1.columns

Index(['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

**Testing data**

In [10]:
# Preview the testing set: first rows, (rows, columns) shape, then column dtypes.
print(test_data.head())
print(test_data.shape, '\n') # there are 1060 rows
print("Data types for variables in testing dataset", '\n')
print(test_data.dtypes)

   Unnamed: 0  sex  age  education  currentSmoker  cigsPerDay  BPMeds  \
0         674    0   58        1.0              1        20.0     0.0   
1        4070    0   51        3.0              0         0.0     0.0   
2        3150    0   44        2.0              1         9.0     0.0   
3        1695    0   40        2.0              1        20.0     0.0   
4        2692    1   58        2.0              1        20.0     0.0   

   prevalentStroke  prevalentHyp  diabetes  totChol  sysBP  diaBP    BMI  \
0                0             0         0      NaN  126.0   77.0  30.08   
1                0             0         0    264.0  135.0   83.0  26.68   
2                0             1         0      NaN  147.5   96.0  30.57   
3                0             0         0    271.0  138.5   88.0  27.24   
4                0             0         0    207.0  110.0   80.0  23.55   

   heartRate  glucose  TenYearCHD  
0       78.0      NaN           0  
1       60.0     74.0           

In [11]:
# Tally the missing (NaN) entries per column of the testing set and print them under a heading.
# Note: this rebinds num_nans, which previously held the training-set counts.
num_nans = test_data.isna().sum()
print("Number of NaNs for each variable:", num_nans, sep='\n')

Number of NaNs for each variable:
Unnamed: 0           0
sex                  0
age                  0
education           20
currentSmoker        0
cigsPerDay           5
BPMeds              16
prevalentStroke      0
prevalentHyp         0
diabetes             0
totChol             11
sysBP                0
diaBP                0
BMI                  4
heartRate            1
glucose            103
TenYearCHD           0
dtype: int64


In [13]:
# list the column labels of test_data (displayed as the cell's output)
test_data.columns

Index(['Unnamed: 0', 'sex', 'age', 'education', 'currentSmoker', 'cigsPerDay',
       'BPMeds', 'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol',
       'sysBP', 'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')

In [14]:
# count how often each value occurs in the 'Unnamed: 0' column;
# if every count is 1, the column holds a distinct value per row
test_data['Unnamed: 0'].value_counts()

674     1
2344    1
938     1
2975    1
2928    1
       ..
2563    1
2258    1
2864    1
1907    1
115     1
Name: Unnamed: 0, Length: 1060, dtype: int64

In [15]:
# Remove the 'Unnamed: 0' column: it is not listed in the data dictionary and serves no purpose.
# The cleaned test frame must be derived from test_data. It was previously built from
# train_data by mistake, which made test_data1 a copy of the training rows.
# drop() returns a new frame, so test_data itself is left untouched.
test_data1 = test_data.drop(columns=['Unnamed: 0'])
test_data1.columns

Index(['sex', 'age', 'education', 'currentSmoker', 'cigsPerDay', 'BPMeds',
       'prevalentStroke', 'prevalentHyp', 'diabetes', 'totChol', 'sysBP',
       'diaBP', 'BMI', 'heartRate', 'glucose', 'TenYearCHD'],
      dtype='object')