In [12]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [13]:
# Load the medical examination records into the notebook-level DataFrame `df`.
# The path is relative, so the CSV must sit in the notebook's working directory.
# Every later cell reads from (and some write to) this same `df`.
df = pd.read_csv('medical_examination.csv')

In [14]:
# Preview the first rows of df to check the column names and how the values are encoded
df.head()

Unnamed: 0,id,age,sex,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [15]:
# Derive the 'overweight' flag from the body-mass index (BMI).
# BMI = weight in kilograms / (height in meters) squared; the 'height' column
# is divided by 100 first, i.e. it is treated as centimeters.
# A BMI above 25 is encoded as 1 (overweight); anything else is encoded as 0.
bmi = df['weight'] / (df['height'] / 100) ** 2
# The boolean comparison is cast to uint8 so the flag is stored as a compact 0/1 integer
df['overweight'] = bmi.gt(25).astype('uint8')

In [16]:
# Normalize data by making 0 always good and 1 always bad
# If the value of 'cholesterol' or 'gluc' is 1, make the value 0
# If the value is more than 1, make the value 1
# Define a function to normalize the values
def normalize_value(value):
    """Collapse a graded reading into a binary good (0) / bad (1) flag.

    Returns 0 when ``value`` equals 1, 1 when ``value`` is greater than 1,
    and ``value`` itself otherwise (for example 0 or a negative number).

    Note: the mapping is not idempotent. A value that was already
    normalized to 1 is turned into 0 if it is passed through again.
    """
    if value > 1:
        return 1
    return 0 if value == 1 else value
# Rewrite the 'cholesterol' and 'gluc' columns in place with their normalized 0/1 values.
# NOTE: this cell is not idempotent. normalize_value maps 1 -> 0, so running the
# cell a second time turns the already-normalized 1s into 0s; reload the CSV
# before re-running it.
for lab_column in ('cholesterol', 'gluc'):
    df[lab_column] = df[lab_column].map(normalize_value)
print(df.head())

   id    age  sex  height  weight  ap_hi  ap_lo  cholesterol  gluc  smoke  \
0   0  18393    2     168    62.0    110     80            0     0      0   
1   1  20228    1     156    85.0    140     90            1     0      0   
2   2  18857    1     165    64.0    130     70            1     0      0   
3   3  17623    2     169    82.0    150    100            0     0      0   
4   4  17474    1     156    56.0    100     60            0     0      0   

   alco  active  cardio  overweight  
0     0       1       0           0  
1     0       1       1           1  
2     0       0       1           0  
3     0       1       1           1  
4     0       0       0           0  


In [17]:
# Draw Categorical Plot
def draw_cat_plot():
    """Draw per-feature value counts as bar charts, one panel per 'cardio' value.

    Reads the notebook-level DataFrame ``df``, reshapes the six binary
    feature columns to long form, counts how often each value occurs for
    every (feature, cardio) combination and plots the counts with seaborn.
    The figure is also written to 'catplot.png'.

    Returns:
        The matplotlib Figure that backs the seaborn categorical plot.
    """
    feature_names = [
      'active',
      'alco',
      'cholesterol',
      'gluc',
      'overweight',
      'smoke'
    ]

    # Long form: one row per (person, feature) with columns 'cardio', 'variable', 'value'
    long_form = pd.melt(df, id_vars=["cardio"], value_vars=feature_names)

    # Count the rows in every (feature, cardio, value) group; the count column
    # must be called 'total' because the catplot uses it as its y axis.
    df_cat = (
        long_form
        .groupby(['variable', 'cardio', 'value'])
        .size()
        .reset_index(name='total')
    )

    # One bar-chart panel per 'cardio' value, bars split by the feature's 0/1 value
    grid = sns.catplot(
        data=df_cat,
        kind="bar",
        x="variable",
        y="total",
        hue="value",
        col="cardio")
    # The tests/consumers need the underlying matplotlib figure, not the grid object
    fig = grid.fig


    # Do not modify the next two lines
    fig.savefig('catplot.png')
    return fig

In [18]:
# Draw Heat Map
def draw_heat_map():
    """Draw a lower-triangle correlation heat map of the cleaned examination data.

    Reads the notebook-level DataFrame ``df``, keeps only the rows that pass
    the filters below, computes the pairwise correlation of all columns and
    renders it with seaborn. The figure is also written to 'heatmap.png'.

    Returns:
        The matplotlib Figure containing the heat map.
    """
    # Clean the data: a row is KEPT only when all of the following hold
    #   * the low pressure reading does not exceed the high one (ap_lo <= ap_hi)
    #   * height lies between the 2.5th and 97.5th percentiles (inclusive)
    #   * weight lies between the 2.5th and 97.5th percentiles (inclusive)
    # The percentiles are taken over the full, unfiltered df.
    df_heat = df[
      (df['ap_lo'] <= df['ap_hi'])
      & (df['height'] >= df['height'].quantile(0.025))
      & (df['height'] <= df['height'].quantile(0.975))
      & (df['weight'] >= df['weight'].quantile(0.025))
      & (df['weight'] <= df['weight'].quantile(0.975))
    ]

    # Calculate the correlation matrix
    corr = df_heat.corr()

    # Boolean mask that hides the upper triangle (diagonal included), since the
    # matrix is symmetric and the upper half only repeats the lower half.
    mask = np.triu(np.ones_like(corr, dtype=bool))

    # Set up the matplotlib figure with an explicit Axes so the heat map is
    # drawn on this figure regardless of what the current pyplot axes is.
    fig, ax = plt.subplots(figsize=(14, 7))

    # Draw the heatmap with 'sns.heatmap()'.
    # (The original called `ns.heatmap`, which raised NameError at call time.)
    sns.heatmap(corr, mask=mask,
                annot=True, fmt='.1f',
                center=0, vmin=-0.5, vmax=0.5,
                ax=ax)


    # Do not modify the next two lines
    fig.savefig('heatmap.png')
    return fig