In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import numpy as np

In [2]:
# Mount Google Drive at /content/drive/ so the notebook can read and write files stored there.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [3]:
# Load the raw heart-disease CSV from the mounted Drive folder into a DataFrame.
df = pd.read_csv(
  filepath_or_buffer="drive/MyDrive/SD201/heart_2020.csv",
)

In [4]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,No,16.6,Yes,No,No,3.0,30.0,No,Female,55-59,White,Yes,Yes,Very good,5.0,Yes,No,Yes
1,No,20.34,No,No,Yes,0.0,0.0,No,Female,80 or older,White,No,Yes,Very good,7.0,No,No,No
2,No,26.58,Yes,No,No,20.0,30.0,No,Male,65-69,White,Yes,Yes,Fair,8.0,Yes,No,No
3,No,24.21,No,No,No,0.0,0.0,No,Female,75-79,White,No,No,Good,6.0,No,No,Yes
4,No,23.71,No,No,No,28.0,0.0,Yes,Female,40-44,White,No,Yes,Very good,8.0,No,No,No


In [5]:
# Show the column labels of the loaded frame.
df.columns

Index(['HeartDisease', 'BMI', 'Smoking', 'AlcoholDrinking', 'Stroke',
       'PhysicalHealth', 'MentalHealth', 'DiffWalking', 'Sex', 'AgeCategory',
       'Race', 'Diabetic', 'PhysicalActivity', 'GenHealth', 'SleepTime',
       'Asthma', 'KidneyDisease', 'SkinCancer'],
      dtype='object')

In [6]:
# Count missing values per column (summing the boolean mask down the rows).
df.isna().sum(axis=0)

HeartDisease        0
BMI                 0
Smoking             0
AlcoholDrinking     0
Stroke              0
PhysicalHealth      0
MentalHealth        0
DiffWalking         0
Sex                 0
AgeCategory         0
Race                0
Diabetic            0
PhysicalActivity    0
GenHealth           0
SleepTime           0
Asthma              0
KidneyDisease       0
SkinCancer          0
dtype: int64

In [7]:
# Show the dtype of every column; the cells below split the columns by dtype.
df.dtypes

HeartDisease         object
BMI                 float64
Smoking              object
AlcoholDrinking      object
Stroke               object
PhysicalHealth      float64
MentalHealth        float64
DiffWalking          object
Sex                  object
AgeCategory          object
Race                 object
Diabetic             object
PhysicalActivity     object
GenHealth            object
SleepTime           float64
Asthma               object
KidneyDisease        object
SkinCancer           object
dtype: object

In [8]:
# Columns that are not of a numeric dtype are the categorical ones to label-encode.
# Excluding "number" (instead of only "float64") keeps integer and other numeric
# columns out of this list, so numeric data is never treated as labels and
# re-running this cell after the encoding step does not select the encoded
# columns a second time.
non_numeric_columns = df.select_dtypes(exclude="number").columns.tolist()
non_numeric_columns

['HeartDisease',
 'Smoking',
 'AlcoholDrinking',
 'Stroke',
 'DiffWalking',
 'Sex',
 'AgeCategory',
 'Race',
 'Diabetic',
 'PhysicalActivity',
 'GenHealth',
 'Asthma',
 'KidneyDisease',
 'SkinCancer']

In [9]:
# Collect the names of the float64 columns; these are the ones that get min-max scaled.
numeric_columns = list(df.select_dtypes(include=["float64"]).columns)
numeric_columns

['BMI', 'PhysicalHealth', 'MentalHealth', 'SleepTime']

In [10]:
def encode_column(col, frame=None, categories=None):
  """Replace a categorical column with integer codes and print the mapping.

  Args:
    col: Name of the column to encode.
    frame: DataFrame to modify in place. Defaults to the notebook-level ``df``.
    categories: Optional list of labels in the order they should be numbered
      (first label -> 0). When omitted, the distinct labels are numbered in
      sorted order, which for an ordinal feature is alphabetical rather than
      its natural order; pass the natural order explicitly to keep it.

  Returns:
    dict mapping each label to its integer code, so the encoding can be
    stored or inverted later.

  Raises:
    ValueError: if the column holds missing values or a label that is not
      listed in ``categories``.
  """
  target = df if frame is None else frame
  encoded = pd.Categorical(target[col], categories=categories)
  # pandas marks missing/unlisted labels with code -1; fail loudly instead of
  # writing that sentinel into the data.
  if (encoded.codes == -1).any():
    raise ValueError(col + " contains missing values or labels not listed in categories")
  # Plain Python ints keep the printed mapping readable on every NumPy version.
  le_name_mapping = {label: code for code, label in enumerate(encoded.categories)}
  target[col] = encoded.codes.astype("int64")
  print(col+" mapping: "+str(le_name_mapping))
  return le_name_mapping

In [11]:
def normalize_column(col):
  """Min-max scale ``df[col]`` in place and print the fitted minimum and maximum."""
  # Reshape the column into a single-column 2-D array before handing it to the scaler.
  column_values = np.reshape(np.array(df[col]), (-1, 1))
  scaler = MinMaxScaler().fit(column_values)
  df[col] = scaler.transform(column_values)
  lowest = scaler.data_min_[0]
  highest = scaler.data_max_[0]
  print(col+ " min: "+ str(lowest)+ ", max: "+ str(highest))
  

In [12]:
# Label-encode every categorical column in turn.
for categorical_name in non_numeric_columns:
  encode_column(categorical_name)

HeartDisease mapping: {'No': 0, 'Yes': 1}
Smoking mapping: {'No': 0, 'Yes': 1}
AlcoholDrinking mapping: {'No': 0, 'Yes': 1}
Stroke mapping: {'No': 0, 'Yes': 1}
DiffWalking mapping: {'No': 0, 'Yes': 1}
Sex mapping: {'Female': 0, 'Male': 1}
AgeCategory mapping: {'18-24': 0, '25-29': 1, '30-34': 2, '35-39': 3, '40-44': 4, '45-49': 5, '50-54': 6, '55-59': 7, '60-64': 8, '65-69': 9, '70-74': 10, '75-79': 11, '80 or older': 12}
Race mapping: {'American Indian/Alaskan Native': 0, 'Asian': 1, 'Black': 2, 'Hispanic': 3, 'Other': 4, 'White': 5}
Diabetic mapping: {'No': 0, 'No, borderline diabetes': 1, 'Yes': 2, 'Yes (during pregnancy)': 3}
PhysicalActivity mapping: {'No': 0, 'Yes': 1}
GenHealth mapping: {'Excellent': 0, 'Fair': 1, 'Good': 2, 'Poor': 3, 'Very good': 4}
Asthma mapping: {'No': 0, 'Yes': 1}
KidneyDisease mapping: {'No': 0, 'Yes': 1}
SkinCancer mapping: {'No': 0, 'Yes': 1}


In [13]:
# Min-max scale every float column in turn.
for numeric_name in numeric_columns:
  normalize_column(numeric_name)

BMI min: 12.02, max: 94.85
PhysicalHealth min: 0.0, max: 30.0
MentalHealth min: 0.0, max: 30.0
SleepTime min: 1.0, max: 24.0


In [14]:
# Display the frame after the encoding and scaling cells have run.
df

Unnamed: 0,HeartDisease,BMI,Smoking,AlcoholDrinking,Stroke,PhysicalHealth,MentalHealth,DiffWalking,Sex,AgeCategory,Race,Diabetic,PhysicalActivity,GenHealth,SleepTime,Asthma,KidneyDisease,SkinCancer
0,0,0.055294,1,0,0,0.100000,1.0,0,0,7,5,2,1,4,0.173913,1,0,1
1,0,0.100447,0,0,1,0.000000,0.0,0,0,12,5,0,1,4,0.260870,0,0,0
2,0,0.175782,1,0,0,0.666667,1.0,0,1,9,5,2,1,1,0.304348,1,0,0
3,0,0.147169,0,0,0,0.000000,0.0,0,0,11,5,0,0,2,0.217391,0,0,1
4,0,0.141132,0,0,0,0.933333,0.0,1,0,4,5,0,1,4,0.304348,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
319790,1,0.185802,1,0,0,0.233333,0.0,1,1,8,3,2,0,1,0.217391,1,0,0
319791,0,0.215139,1,0,0,0.000000,0.0,0,1,3,3,0,1,4,0.173913,1,0,0
319792,0,0.147531,0,0,0,0.000000,0.0,0,0,5,3,0,1,2,0.217391,0,0,0
319793,0,0.250996,0,0,0,0.000000,0.0,0,0,1,3,0,0,2,0.478261,0,0,0


In [15]:
# Write the encoded and scaled frame to Drive, leaving the row index out of the file.
df.to_csv(
  path_or_buf="drive/MyDrive/SD201/heart_2020_cleaned.csv",
  index=False,
)