# Imports

In [19]:
import pandas as pd

# Load Cleaned Data

In [20]:
df = pd.read_csv("./Data/cleaned_data.csv")
df.head()

Unnamed: 0,id,sex,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
2,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
3,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1
4,56669,Male,81.0,0,0,Yes,Private,Urban,186.21,29.0,formerly smoked,1


# Set memberships

In [21]:
# Check if id is unique across all rows
len(df.id.unique()) == df.shape[0]

True

In [22]:
# Check gender
df.sex.unique()

array(['Male', 'Female', 'Other'], dtype=object)

In [23]:
# Check ever_married
df.ever_married.unique()

array(['Yes', 'No'], dtype=object)

In [24]:
# Check work_type
df.work_type.unique()

array(['Private', 'Self-employed', 'Govt_job', 'children', 'Never_worked'],
      dtype=object)

In [25]:
# Check Residence_type
df.Residence_type.unique()

array(['Urban', 'Rural'], dtype=object)

In [26]:
# Check smoking status
df.smoking_status.unique()

array(['formerly smoked', 'never smoked', 'smokes', 'Unknown'],
      dtype=object)

In [27]:
# Check occurence of stroke
df.stroke.unique()

array([1, 0])

In [28]:
# Check occurence of hypertension
df.hypertension.unique()

array([0, 1])

In [29]:
# Check occurence of heart_disease
df.heart_disease.unique()

array([1, 0])

# Cross field validation

In [30]:
# All children must be less than age 18
df[(df.work_type == "children")]['age'].sort_values()

1529     0.08
3137     0.08
3790     0.16
3453     0.16
3843     0.16
        ...  
1379    16.00
2405    16.00
4413    16.00
4682    16.00
418     16.00
Name: age, Length: 671, dtype: float64

In [31]:
# All married people should be greater than age 17
df[(df.ever_married=="Yes")]['age'].sort_values()

1106    18.0
787     19.0
924     19.0
711     20.0
3994    20.0
        ... 
1008    82.0
2222    82.0
36      82.0
1276    82.0
4322    82.0
Name: age, Length: 3202, dtype: float64

In [32]:
# On average, women should have a larger BMI 
df.groupby("sex")['bmi'].mean()

sex
Female    29.048860
Male      28.560508
Other     22.400000
Name: bmi, dtype: float64

# Feature creation

In [33]:
# Create an age group variable
df['age_group'] = pd.cut(df.age, bins=4) # Create 4 age groups
df.age_group.unique()

[(61.52, 82.0], (41.04, 61.52], (20.56, 41.04], (-0.00192, 20.56]]
Categories (4, interval[float64]): [(-0.00192, 20.56] < (20.56, 41.04] < (41.04, 61.52] < (61.52, 82.0]]

In [34]:
# Create a BMI group variable
df['bmi_group'] = pd.cut(df.bmi, bins = 4) # Create 4 bmi groups
df.bmi_group.unique()

[(24.425, 38.55], (10.244, 24.425], (38.55, 52.675], (52.675, 66.8]]
Categories (4, interval[float64]): [(10.244, 24.425] < (24.425, 38.55] < (38.55, 52.675] < (52.675, 66.8]]

# Save feature created data

In [35]:
df.to_csv("./Data/trainingData.csv", index=False)