# Exploratory Data Analysis

## Housekeeping

In [None]:
# Set working directory
import os
import re

try:
  # google.colab is only importable inside a Colab runtime, so a failed import
  # is the signal that we are running locally.
  from google.colab import drive
except ImportError:
  # Local run: if the current directory does not already end in 'ds1_nhanes',
  # step up one level (presumably the notebook sits in a subfolder of the
  # project root -- confirm).
  from pathlib import Path
  if not re.search(r'ds1_nhanes$', str(os.getcwd())):
    os.chdir(Path(os.getcwd()).parent)
else:
  # Colab run: mount Drive and move into the project folder. Errors here are
  # deliberately NOT caught, so a failed mount or a missing folder is reported
  # instead of silently falling through to the local branch.
  drive.mount('/content/drive')
  os.chdir('/content/drive/MyDrive/ds1_nhanes/')

print(os.getcwd())

In [None]:
# Load packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Read the cleaned NHANES CSV (path is relative to the working directory set
# above) and print a column/dtype/non-null overview.
df = pd.read_csv(filepath_or_buffer='data/clean/nhanes_2017_2023_clean.csv')
df.info()

In [None]:
# Summary statistics for the ID, the two ounce measures, and the PBP proportion
df.loc[:, ['SEQN', 'oz_pbp', 'pf_total_(oz_eq)', 'prop_pbp']].describe()

## PBP Consumption

A big caveat: we are comparing the ounces of PBPs consumed to the ounces of total proteins consumed, and these are almost certainly not the same thing. At some point we should pull grams of protein from the NHANES data rather than using the FPED as our basis of comparison for PBP consumption.

Set a common theme for our plots:

In [None]:
# Shared look for every figure below: tick-style axes, no top/right spines,
# and a default figure size of 6 x 5 inches.
sns.set_theme(
  style='ticks',
  rc={
    'figure.figsize': (6, 5),
    'axes.spines.top': False,
    'axes.spines.right': False,
  },
)

Proportion of PBP consumption out of total protein consumption by education and gender. Note that we are using the 2-day weights.

In [None]:
# prop pbp consumption by education, split by gender (weighted by weight_2d)
ax = sns.barplot(
  data=df,
  x='education',
  y='prop_pbp',
  hue='gender',
  # NOTE(review): `order` also decides which levels are drawn -- confirm this
  # list covers every education value present in the data.
  order=[
    "Don't know",
    'Less than 9th grade',
    'High school/GED',
    'Some college or AA',
    'College graduate or above'
  ],
  weights='weight_2d',
  errorbar=('ci', 95)
)
ax.set(
  xlabel = 'Education',
  ylabel = 'Prop plant-based protein-rich foods'
  # title = 'Proportion of protein-rich foods from plants by education'
)
ax.legend(title='Gender')

# Right-align the rotated labels and rotate them about their anchor point so
# the end of each label sits under its tick instead of being centred on it.
plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
plt.tight_layout()

# Save plot (must happen before plt.show(), which finishes the figure)
plt.savefig('outputs/checkin_1/pbp_by_education.png')

plt.show()

Something to double check - are there children in the NHANES survey? Splitting by education could have pretty different meanings if there are.

In [None]:
# pbp consumption by race, split by gender (weighted by weight_2d)
ax = sns.barplot(
  data=df,
  x='race',
  y='prop_pbp',
  hue='gender',
  weights='weight_2d',
  errorbar=('ci', 95)
)
ax.set(
  xlabel = 'Race',
  ylabel = 'Prop plant-based protein-rich food'
  # title = 'Proportion of protein-rich food from plants by race'
)
ax.legend(title='Gender')

# Right-align the rotated labels and rotate them about their anchor point so
# the end of each label sits under its tick instead of being centred on it.
plt.xticks(rotation=45, ha='right', rotation_mode='anchor')
plt.tight_layout()

# Save it (must happen before plt.show(), which finishes the figure)
plt.savefig('outputs/checkin_1/pbp_by_race.png')

# Show it
plt.show()

In [None]:
# Weighted prop_pbp for each income-to-poverty quintile, split by gender.
# Quintiles are listed explicitly so the bars run from lowest to highest.
ax = sns.barplot(
  data=df,
  x='income_ratio_qs',
  y='prop_pbp',
  hue='gender',
  order=['Lowest', 'Low', 'Medium', 'High', 'Highest'],
  errorbar=('ci', 95),
  weights='weight_2d'
)

# Axis labels (title left off: 'Proportion of protein-rich food from plants by income')
ax.set_xlabel('Quintiles of Income to Poverty Ratio')
ax.set_ylabel('Prop plant-based protein-rich food')
ax.legend(title='Gender')
plt.tight_layout()

# Write the figure to disk first, then render it in the notebook
plt.savefig('outputs/checkin_1/pbp_by_income.png')
plt.show()

This one is pretty interesting!

## Test a Table

Just figuring out how to make a LaTeX table

In [None]:
# Five-row, four-column sample of df to experiment with table export
small_df = df.head(5).loc[:, ['SEQN', 'weight_2d', 'gender', 'prop_pbp']]
print(small_df)

In [None]:
# Swap in underscore-free headers, since underscores break in LaTeX.
# Labels are positional: they replace the four existing columns in order.
small_df = small_df.set_axis(
  ['SEQN', 'Weight', 'Gender', 'Prop PBP'],
  axis='columns'
)
print(small_df)

In [None]:
# Write the toy frame out as a LaTeX table.
#   buf          -> output file path
#   index        -> False drops the row numbers
#   float_format -> floats rounded to 2 digits
#   label        -> how the table is referenced in the text
#   caption      -> text shown after the table number
#   position     -> 'h' is for 'here': LaTeX tries to place the table right
#                   after the reference, but may still move it so that it fits
small_df.to_latex(
  buf='outputs/checkin_1/test_table.tex',
  caption='This is a test table',
  label='test_table',
  position='h',
  index=False,
  float_format="%.2f"
)

## Summary Table

Maybe?

In [None]:
# Pick out the food-group columns: names that start with f, g, p or d followed
# by 'f' or '_', skipping any name that ends in 'calc'.
food_groups = df.filter(regex='^[fgpd][f_](?!.*calc$)')
categories = ['fruit', 'grain', 'protein', 'dairy']

# Start from the survey weight only. Demographic columns could be carried along too:
# groups_df = df.loc[:, ['weight_2d', 'gender', 'age', 'race', 'education', 'income_ratio_qs']]
groups_df = df.loc[:, ['weight_2d']]

# One row-wise total per category. A column belongs to a category when its name
# starts with the category's first letter.
# NOTE(review): if food_groups holds both a group total and its sub-components,
# this sum counts them twice -- confirm against the column list.
for cat in categories:
  in_category = food_groups.columns.str.startswith(cat[0])
  groups_df[cat] = food_groups.loc[:, in_category].sum(axis=1)

groups_df.head()

In [None]:
import statsmodels.api as sm

# Weighted mean and standard deviation of each food category, using the
# weight_2d column as weights. The weights are read from groups_df itself so
# values and weights always come from the same frame and share an index.
# (The previous `sm.add_constant(...)` result was never used and is dropped.)
for cat in categories:
  desc_stats = sm.stats.DescrStatsW(
    groups_df[cat],
    weights=groups_df['weight_2d']
  )
  # .std is computed with DescrStatsW's default ddof=0
  print(f'{cat} mean: {desc_stats.mean.round(3)}')
  print(f'{cat} std: {desc_stats.std.round(3)}')

In [None]:
# Total footprint of df in bytes (deep=True also inspects object-dtype
# contents), converted to mebibytes: 2 ** 20 bytes each.
memory_usage = df.memory_usage(deep=True).sum()
memory_usage_mb = memory_usage / 2 ** 20
print(memory_usage_mb)

In [None]:
# Fraction of missing values in each column
df.isnull().mean(axis=0)

In [None]:
# Age distribution, with each row contributing its weight_2d to the bin counts
sns.histplot(data=df, weights='weight_2d', x='age')