# Lab 3: Exploratory Data Analysis (EDA) - Additional Visualizations

This notebook contains additional visualizations for the Diabetes Health Indicators dataset,
following the patterns from the reference CCSIT_ARTI308_Lab3.ipynb file.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's default theme so every figure below shares one consistent look
sns.set_theme()

In [None]:
import kagglehub
import pathlib

# Download the dataset; dataset_download returns the local directory holding the files.
path = kagglehub.dataset_download("alexteboul/diabetes-health-indicators-dataset")
path = pathlib.Path(path)

# Pick the 3-class file explicitly. Taking whichever file iterdir() yields first
# is order-dependent, and the download directory may hold other CSV variants
# that lack the 'Diabetes_012' column every later cell relies on.
csv_candidates = sorted(path.glob("diabetes_012*.csv"))
if not csv_candidates:
    raise FileNotFoundError(
        f"No 'diabetes_012*.csv' file found in {path}; "
        f"available files: {sorted(p.name for p in path.iterdir())}"
    )
csv = csv_candidates[0]
df = pd.read_csv(csv)

# Display first 5 rows
df.head()

## Univariate Analysis

Univariate analysis examines each variable individually to understand its distribution and characteristics.

### Distribution of BMI

BMI (Body Mass Index) is a key health indicator. Let's examine its distribution.

In [None]:
# Histogram of BMI with a KDE overlay and dashed mean/median reference lines.
bmi = df['BMI']
bmi_mean = bmi.mean()
bmi_median = bmi.median()

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(bmi, bins=30, kde=True, ax=ax)
ax.set_title("Distribution of BMI", fontsize=14)
ax.set_xlabel("BMI")
ax.set_ylabel("Count")
ax.axvline(bmi_mean, color='red', linestyle='--', label=f'Mean: {bmi_mean:.1f}')
ax.axvline(bmi_median, color='green', linestyle='--', label=f'Median: {bmi_median:.1f}')
ax.legend()
plt.show()

# Numeric summary printed beneath the figure.
bmi_summary = [
    ("Mean", bmi_mean),
    ("Median", bmi_median),
    ("Std", bmi.std()),
    ("Min", bmi.min()),
    ("Max", bmi.max()),
]
print("BMI Statistics:")
for stat_name, stat_value in bmi_summary:
    print(f"  {stat_name}: {stat_value:.2f}")

- The BMI distribution is right-skewed
- Most individuals have BMI between 20-35
- Some outliers with very high BMI values (up to 98)

### Distribution of Mental Health Days

MentHlth represents the number of days in the past 30 days when mental health was not good.

In [None]:
# Histogram (31 bins, KDE overlay) of reported poor-mental-health days.
ment_days = df['MentHlth']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(ment_days, bins=31, kde=True, ax=ax)
ax.set_title("Distribution of Mental Health Days (Past 30 Days)", fontsize=14)
ax.set_xlabel("Number of Days with Poor Mental Health")
ax.set_ylabel("Count")
plt.show()

print("Mental Health Days Statistics:")
print(f"  Mean: {ment_days.mean():.2f} days")
print(f"  Median: {ment_days.median():.2f} days")
# Count and share of respondents at the two extremes of the scale (0 and 30 days).
for n_days in (0, 30):
    at_value = ment_days == n_days
    print(f"  People with {n_days} poor mental health days: {at_value.sum()} ({at_value.mean()*100:.1f}%)")

- Highly right-skewed distribution
- Most people report 0 days of poor mental health
- A small but notable group reports 30 days (entire month)

### Distribution of Physical Health Days

PhysHlth represents the number of days in the past 30 days when physical health was not good.

In [None]:
# Histogram (31 bins, KDE overlay) of reported poor-physical-health days.
phys_days = df['PhysHlth']

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(phys_days, bins=31, kde=True, ax=ax)
ax.set_title("Distribution of Physical Health Days (Past 30 Days)", fontsize=14)
ax.set_xlabel("Number of Days with Poor Physical Health")
ax.set_ylabel("Count")
plt.show()

# Count and share of respondents reporting zero poor-health days.
no_poor_days = phys_days == 0
print("Physical Health Days Statistics:")
print(f"  Mean: {phys_days.mean():.2f} days")
print(f"  Median: {phys_days.median():.2f} days")
print(f"  People with 0 poor physical health days: {no_poor_days.sum()} ({no_poor_days.mean()*100:.1f}%)")

### Distribution of Diabetes Status

Diabetes_012: 0 = No diabetes, 1 = Prediabetes, 2 = Diabetes

In [None]:
# Attach a readable label column for the numeric Diabetes_012 codes.
status_order = ['No Diabetes', 'Prediabetes', 'Diabetes']
diabetes_labels = dict(zip([0.0, 1.0, 2.0], status_order))
df['Diabetes_Label'] = df['Diabetes_012'].map(diabetes_labels)

fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Diabetes_Label', order=status_order, ax=ax)
ax.set_title("Distribution of Diabetes Status", fontsize=14)
ax.set_xlabel("Diabetes Status")
ax.set_ylabel("Count")

# Write each bar's count just above it, centred horizontally on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_height):,}', (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=10)
plt.show()

# Absolute counts followed by percentage shares.
label_column = df['Diabetes_Label']
print("\nDiabetes Status Distribution:")
print(label_column.value_counts())
print("\nPercentages:")
print(label_column.value_counts(normalize=True) * 100)

### Distribution of High Blood Pressure

In [None]:
# Count plot of the binary HighBP indicator with the count written on each bar.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='HighBP', ax=ax)
ax.set_title("Distribution of High Blood Pressure", fontsize=14)
ax.set_xlabel("High Blood Pressure (0 = No, 1 = Yes)")
ax.set_ylabel("Count")

# Write each bar's count just above it, centred horizontally on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_height):,}', (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=10)
plt.show()

# The mean of a 0/1 column is the share of 1s.
high_bp_pct = df['HighBP'].mean() * 100
print("\nHigh Blood Pressure Distribution:")
print(df['HighBP'].value_counts())
print(f"\nPercentage with High BP: {high_bp_pct:.1f}%")

### Distribution of High Cholesterol

In [None]:
# Count plot of the binary HighChol indicator with the count written on each bar.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='HighChol', ax=ax)
ax.set_title("Distribution of High Cholesterol", fontsize=14)
ax.set_xlabel("High Cholesterol (0 = No, 1 = Yes)")
ax.set_ylabel("Count")

# Write each bar's count just above it, centred horizontally on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_height):,}', (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=10)
plt.show()

# The mean of a 0/1 column is the share of 1s.
high_chol_pct = df['HighChol'].mean() * 100
print("\nHigh Cholesterol Distribution:")
print(df['HighChol'].value_counts())
print(f"\nPercentage with High Cholesterol: {high_chol_pct:.1f}%")

### Distribution of Smoker Status

In [None]:
# Count plot of the binary Smoker indicator with the count written on each bar.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(data=df, x='Smoker', ax=ax)
ax.set_title("Distribution of Smoker Status", fontsize=14)
ax.set_xlabel("Smoker (0 = No, 1 = Yes)")
ax.set_ylabel("Count")

# Write each bar's count just above it, centred horizontally on the bar.
for bar in ax.patches:
    bar_height = bar.get_height()
    bar_centre = bar.get_x() + bar.get_width() / 2.
    ax.annotate(f'{int(bar_height):,}', (bar_centre, bar_height),
                ha='center', va='bottom', fontsize=10)
plt.show()

# The mean of a 0/1 column is the share of 1s.
smoker_pct = df['Smoker'].mean() * 100
print("\nSmoker Distribution:")
print(df['Smoker'].value_counts())
print(f"\nPercentage who are smokers: {smoker_pct:.1f}%")

## Bivariate Analysis

Bivariate analysis examines relationships between two variables.

### Diabetes Status by High Blood Pressure

Examining the relationship between diabetes status and high blood pressure.

In [None]:
# Percentage of respondents with high blood pressure within each diabetes category.
# groupby sorts the string labels alphabetically (Diabetes, No Diabetes, Prediabetes),
# so reindex to the logical order; otherwise the green/orange/red colours below
# would be attached to the wrong categories.
status_order = ['No Diabetes', 'Prediabetes', 'Diabetes']
diabetes_bp = (df.groupby('Diabetes_Label')['HighBP'].mean() * 100).reindex(status_order)

plt.figure(figsize=(10, 6))
ax = diabetes_bp.plot(kind='bar', color=['green', 'orange', 'red'])
plt.title("Percentage with High Blood Pressure by Diabetes Status", fontsize=14)
plt.xlabel("Diabetes Status")
plt.ylabel("Percentage with High BP (%)")
plt.xticks(rotation=0)

# Add percentage labels on bars (bar i sits at x = i; offset by 1 to clear the bar top)
for i, v in enumerate(diabetes_bp):
    ax.text(i, v + 1, f'{v:.1f}%', ha='center', fontsize=10)
plt.show()

print("\nHigh BP Rate by Diabetes Status:")
print(diabetes_bp)

- People with diabetes have significantly higher rates of high blood pressure
- There's a clear progression: No Diabetes < Prediabetes < Diabetes

### Diabetes Status by High Cholesterol

In [None]:
# Percentage of respondents with high cholesterol within each diabetes category.
# groupby sorts the string labels alphabetically (Diabetes, No Diabetes, Prediabetes),
# so reindex to the logical order; otherwise the green/orange/red colours below
# would be attached to the wrong categories.
status_order = ['No Diabetes', 'Prediabetes', 'Diabetes']
diabetes_chol = (df.groupby('Diabetes_Label')['HighChol'].mean() * 100).reindex(status_order)

plt.figure(figsize=(10, 6))
ax = diabetes_chol.plot(kind='bar', color=['green', 'orange', 'red'])
plt.title("Percentage with High Cholesterol by Diabetes Status", fontsize=14)
plt.xlabel("Diabetes Status")
plt.ylabel("Percentage with High Cholesterol (%)")
plt.xticks(rotation=0)

# Add percentage labels on bars (bar i sits at x = i; offset by 1 to clear the bar top)
for i, v in enumerate(diabetes_chol):
    ax.text(i, v + 1, f'{v:.1f}%', ha='center', fontsize=10)
plt.show()

print("\nHigh Cholesterol Rate by Diabetes Status:")
print(diabetes_chol)

### BMI Distribution by Diabetes Status

Comparing BMI distributions across diabetes categories using box plots.

In [None]:
# Box plot of BMI for each diabetes category, drawn in severity order.
status_order = ['No Diabetes', 'Prediabetes', 'Diabetes']

fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df, x='Diabetes_Label', y='BMI', order=status_order, ax=ax)
ax.set_title("BMI Distribution by Diabetes Status", fontsize=14)
ax.set_xlabel("Diabetes Status")
ax.set_ylabel("BMI")
plt.show()

# Per-category descriptive statistics to accompany the figure.
bmi_by_status = df.groupby('Diabetes_Label')['BMI']
print("\nBMI Statistics by Diabetes Status:")
print(bmi_by_status.describe())

- Higher BMI is associated with diabetes
- Median BMI increases from No Diabetes to Diabetes
- Outliers present in all groups

### BMI Distribution by Diabetes Status (Violin Plot)

Violin plots show the distribution shape in addition to the summary statistics.

In [None]:
# Violin plot of BMI for each diabetes category, drawn in severity order.
status_order = ['No Diabetes', 'Prediabetes', 'Diabetes']

fig, ax = plt.subplots(figsize=(10, 6))
sns.violinplot(data=df, x='Diabetes_Label', y='BMI', order=status_order, ax=ax)
ax.set_title("BMI Distribution by Diabetes Status (Violin Plot)", fontsize=14)
ax.set_xlabel("Diabetes Status")
ax.set_ylabel("BMI")
plt.show()

### General Health by Diabetes Status

GenHlth: 1 = Excellent, 2 = Very Good, 3 = Good, 4 = Fair, 5 = Poor

In [None]:
# Map general health to labels
genhlth_labels = {1.0: 'Excellent', 2.0: 'Very Good', 3.0: 'Good', 4.0: 'Fair', 5.0: 'Poor'}
df['GenHlth_Label'] = df['GenHlth'].map(genhlth_labels)

# Mean general health score by diabetes status.
# groupby sorts the string labels alphabetically (Diabetes, No Diabetes, Prediabetes),
# so reindex to the logical order; otherwise the green/orange/red colours below
# would be attached to the wrong categories.
status_order = ['No Diabetes', 'Prediabetes', 'Diabetes']
genhlth_by_diabetes = df.groupby('Diabetes_Label')['GenHlth'].mean().reindex(status_order)

plt.figure(figsize=(10, 6))
ax = genhlth_by_diabetes.plot(kind='bar', color=['green', 'orange', 'red'])
plt.title("Average General Health Score by Diabetes Status", fontsize=14)
plt.xlabel("Diabetes Status")
plt.ylabel("Average General Health Score (1=Excellent, 5=Poor)")
plt.xticks(rotation=0)

# Add value labels on bars (bar i sits at x = i; small offset to clear the bar top)
for i, v in enumerate(genhlth_by_diabetes):
    ax.text(i, v + 0.05, f'{v:.2f}', ha='center', fontsize=10)
plt.show()

print("\nAverage General Health Score by Diabetes Status:")
print(genhlth_by_diabetes)

### Physical Activity by Diabetes Status

In [None]:
# Percentage of respondents reporting physical activity within each diabetes category.
# groupby sorts the string labels alphabetically (Diabetes, No Diabetes, Prediabetes),
# so reindex to the logical order; otherwise the green/orange/red colours below
# would be attached to the wrong categories.
status_order = ['No Diabetes', 'Prediabetes', 'Diabetes']
activity_by_diabetes = (df.groupby('Diabetes_Label')['PhysActivity'].mean() * 100).reindex(status_order)

plt.figure(figsize=(10, 6))
ax = activity_by_diabetes.plot(kind='bar', color=['green', 'orange', 'red'])
plt.title("Percentage with Physical Activity by Diabetes Status", fontsize=14)
plt.xlabel("Diabetes Status")
plt.ylabel("Percentage with Physical Activity (%)")
plt.xticks(rotation=0)

# Add percentage labels on bars (bar i sits at x = i; offset by 1 to clear the bar top)
for i, v in enumerate(activity_by_diabetes):
    ax.text(i, v + 1, f'{v:.1f}%', ha='center', fontsize=10)
plt.show()

print("\nPhysical Activity Rate by Diabetes Status:")
print(activity_by_diabetes)

## Correlation Analysis

Examining correlations between variables to identify relationships.

### Correlation Matrix

A heatmap showing pairwise correlations between a selected set of key numerical variables.

In [None]:
# Columns included in the correlation analysis.
key_vars = [
    'Diabetes_012', 'HighBP', 'HighChol', 'BMI', 'Smoker',
    'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'GenHlth', 'MentHlth',
    'PhysHlth', 'DiffWalk', 'Age', 'Education', 'Income',
]

# Pairwise correlation matrix of the selected columns (DataFrame.corr defaults).
corr_matrix = df[key_vars].corr()

fig, ax = plt.subplots(figsize=(14, 10))
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    center=0,            # keep zero correlation at the midpoint of the diverging palette
    linewidths=0.5,
    cbar_kws={'shrink': 0.8},
    ax=ax,
)
ax.set_title("Correlation Matrix of Health Indicators", fontsize=14)
fig.tight_layout()
plt.show()

- **Diabetes_012** shows positive correlations with HighBP, HighChol, BMI, and Age
- **GenHlth** is positively correlated with PhysHlth and MentHlth
- **PhysActivity** shows negative correlation with diabetes (consistent with a protective effect, although correlation alone does not establish causation)
- **Income** and **Education** show negative correlations with diabetes

### Top Correlations with Diabetes

In [None]:
# Correlation of every other key variable with Diabetes_012,
# ordered by absolute strength (strongest first).
diabetes_corr = (
    corr_matrix['Diabetes_012']
    .drop('Diabetes_012')
    .sort_values(key=abs, ascending=False)
)

# Red marks positive correlations, green marks everything else.
bar_colors = ['red' if coef > 0 else 'green' for coef in diabetes_corr]

fig, ax = plt.subplots(figsize=(10, 6))
diabetes_corr.plot(kind='barh', color=bar_colors, ax=ax)
ax.set_title("Correlations with Diabetes Status", fontsize=14)
ax.set_xlabel("Correlation Coefficient")
ax.axvline(0, color='black', linestyle='-', linewidth=0.5)
fig.tight_layout()
plt.show()

print("\nTop Correlations with Diabetes:")
print(diabetes_corr)

## Age Analysis

Examining the relationship between age and diabetes.

### Age Distribution by Diabetes Status

Age is encoded as: 1 = 18-24, 2 = 25-29, 3 = 30-34, ..., 13 = 80+

In [None]:
# Readable labels for the numeric Age codes (1.0 .. 13.0).
age_brackets = ['18-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
                '55-59', '60-64', '65-69', '70-74', '75-79', '80+']
age_labels = {float(code): bracket for code, bracket in enumerate(age_brackets, start=1)}


def _pct_with_diabetes(status):
    """Return the percentage of entries in `status` equal to 2.0 (the diabetes code)."""
    return (status == 2.0).mean() * 100


# Diabetes rate within each age group, then swap the numeric codes for labels.
diabetes_by_age = df.groupby('Age')['Diabetes_012'].apply(_pct_with_diabetes)
diabetes_by_age.index = diabetes_by_age.index.map(age_labels)

fig, ax = plt.subplots(figsize=(12, 6))
diabetes_by_age.plot(kind='bar', color='coral', ax=ax)
ax.set_title("Diabetes Rate by Age Group", fontsize=14)
ax.set_xlabel("Age Group")
ax.set_ylabel("Diabetes Rate (%)")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

print("\nDiabetes Rate by Age Group:")
print(diabetes_by_age)

- Diabetes rate increases significantly with age
- The highest rates are in the 70-74 and 65-69 age groups
- This aligns with known medical patterns

## Income Analysis

Examining the relationship between income and health indicators.

### Diabetes Rate by Income Level

Income is encoded as: 1 = <$10k, 2 = $10-15k, 3 = $15-20k, ..., 8 = $75k+

In [None]:
# Readable labels for the numeric Income codes (1.0 .. 8.0).
income_brackets = ['<$10k', '$10-15k', '$15-20k', '$20-25k',
                   '$25-35k', '$35-50k', '$50-75k', '$75k+']
income_labels = {float(code): bracket for code, bracket in enumerate(income_brackets, start=1)}

# Share (in %) of each income group whose Diabetes_012 code is 2.0,
# then swap the numeric codes for labels.
status_by_income = df.groupby('Income')['Diabetes_012']
diabetes_by_income = status_by_income.apply(lambda status: (status == 2.0).mean() * 100)
diabetes_by_income.index = diabetes_by_income.index.map(income_labels)

fig, ax = plt.subplots(figsize=(12, 6))
diabetes_by_income.plot(kind='bar', color='teal', ax=ax)
ax.set_title("Diabetes Rate by Income Level", fontsize=14)
ax.set_xlabel("Income Level")
ax.set_ylabel("Diabetes Rate (%)")
plt.xticks(rotation=45)
fig.tight_layout()
plt.show()

print("\nDiabetes Rate by Income Level:")
print(diabetes_by_income)

- Higher income is associated with lower diabetes rates
- This may reflect access to healthcare, diet, and lifestyle factors

## Scatter Plot Analysis

Examining relationships between pairs of numeric variables (BMI and the 0-30 day counts for mental and physical health).

### BMI vs Physical Health Days

In [None]:
# Plot a fixed-seed random subset of 5000 rows; the full frame would overplot.
df_sample = df.sample(n=5000, random_state=42)

# One colour per diabetes category.
status_palette = {'No Diabetes': 'green', 'Prediabetes': 'orange', 'Diabetes': 'red'}

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_sample,
    x='BMI',
    y='PhysHlth',
    hue='Diabetes_Label',
    alpha=0.5,
    palette=status_palette,
    ax=ax,
)
ax.set_title("BMI vs Physical Health Days (Sample of 5000)", fontsize=14)
ax.set_xlabel("BMI")
ax.set_ylabel("Physical Health Days (Past 30)")
ax.legend(title='Diabetes Status')
plt.show()

### Mental Health vs Physical Health Days

In [None]:
# Scatter of the two day-count variables on the sampled rows, coloured by diabetes status.
status_palette = {'No Diabetes': 'green', 'Prediabetes': 'orange', 'Diabetes': 'red'}

fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    data=df_sample,
    x='MentHlth',
    y='PhysHlth',
    hue='Diabetes_Label',
    alpha=0.5,
    palette=status_palette,
    ax=ax,
)
ax.set_title("Mental Health vs Physical Health Days (Sample of 5000)", fontsize=14)
ax.set_xlabel("Mental Health Days (Past 30)")
ax.set_ylabel("Physical Health Days (Past 30)")
ax.legend(title='Diabetes Status')
plt.show()

## Pair Plot Analysis

Visualizing pairwise relationships between key variables.

In [None]:
# Select a subset of variables for the pair plot (taken from the 5000-row sample)
pairplot_vars = ['BMI', 'GenHlth', 'MentHlth', 'PhysHlth', 'Age', 'Diabetes_Label']
df_pairplot = df_sample[pairplot_vars]

# sns.pairplot is a figure-level function: it builds its own figure and grid of axes.
# A preceding plt.figure() call would only leave a stray empty figure in the output
# and would not affect the pair plot's size, so none is created here.
sns.pairplot(df_pairplot, hue='Diabetes_Label',
             palette={'No Diabetes': 'green', 'Prediabetes': 'orange', 'Diabetes': 'red'},
             diag_kind='hist', corner=False)
# The pair plot's figure is now the current figure; y > 1 places the title above the grid.
plt.suptitle("Pair Plot of Key Health Indicators", y=1.02, fontsize=14)
plt.show()

## Summary of Key Findings

### Health Indicators Associated with Diabetes:
1. **High Blood Pressure**: Strong positive association with diabetes
2. **High Cholesterol**: Strong positive association with diabetes
3. **BMI**: Higher BMI is associated with higher diabetes rates
4. **Age**: Diabetes risk increases significantly with age
5. **Physical Activity**: Lower diabetes rates among active individuals, consistent with a protective effect (an association, not proof of causation)
6. **Income**: Higher income associated with lower diabetes rates
7. **General Health**: Poorer self-reported health associated with diabetes

### Correlations:
- Diabetes shows moderate positive correlations with HighBP, HighChol, BMI, and Age
- Physical activity shows negative correlation with diabetes
- Mental and physical health days are positively correlated