In [None]:
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import colors
from matplotlib.ticker import PercentFormatter
import seaborn as sns
from statistics import variance
from statistics import mean

# calculate outcome y
def get_outcome(df):
    """Add a binary outcome column ``y`` to ``df`` in place and return ``df``.

    ``y`` is 1 on every row where at least one of ``MIHx``, ``strokeHx``,
    ``CHDHx`` or ``CVDHx`` equals 1.0 and 0 on all other rows (a missing
    value never equals 1.0, so it does not set the flag).
    """
    df['y'] = 0
    history_cols = ['MIHx', 'strokeHx', 'CHDHx', 'CVDHx']
    any_history = df[history_cols].eq(1.0).any(axis=1)
    df.loc[any_history, 'y'] = 1
    return df

In [None]:
### read in data

In [None]:
# Load the two CSV inputs used by the cells below.
df = pd.read_csv('data/common_data_jhs.csv')
df_raw = pd.read_csv('data/jhs_data.csv')

# One frame per visit number (1, 2, 3), filtered from `df`.
df_v1, df_v2, df_v3 = (df[df["visit"] == visit_no] for visit_no in (1, 2, 3))

In [None]:
### availability of nses and outcome

In [None]:
# data of outcome
def _visit_outcome(visit, columns, outcome_name):
    """Return the selected columns of one visit with ``y`` renamed.

    Filters ``df_raw`` to rows whose ``visit`` equals ``visit``, derives ``y``
    with ``get_outcome`` on an explicit copy (so the new column is never
    written into a filtered slice of ``df_raw``), keeps ``columns`` and
    renames ``y`` to ``outcome_name``.
    """
    visit_rows = df_raw[df_raw["visit"] == visit].copy()
    return get_outcome(visit_rows)[columns].rename(columns={"y": outcome_name})


# Visit 1 additionally keeps nbSESpc2score next to the outcome.
df_y_v1 = _visit_outcome(1, ['subjid','nbSESpc2score','y'], "y1")
df_y_v2 = _visit_outcome(2, ['subjid','y'], "y2")
df_y_v3 = _visit_outcome(3, ['subjid','y'], "y3")

In [None]:
# Inner-join the three per-visit outcome frames on the subject id.
# The key is spelled out so the join cannot silently change if the frames
# ever gain another column with a shared name.
merge1 = pd.merge(df_y_v1, df_y_v2, on="subjid", how="inner")
merged = pd.merge(merge1, df_y_v3, on="subjid", how="inner")
merged

In [None]:
### availability of covariates

In [None]:
def CheckCovariate(covariate,df_v1=df_v1, df_v2=df_v2, df_v3=df_v3, iscategorical = True, na_threshold = 1000):
    """Print a per-visit availability summary for one covariate.

    Parameters
    ----------
    covariate : str
        Column name looked up in each of the three visit frames.
    df_v1, df_v2, df_v3 : DataFrame
        Visit frames; the defaults are the module-level frames that exist
        when this function is defined.
    iscategorical : bool
        If true, print each category's count divided by the number of rows
        in the visit frame. Otherwise print ``describe()`` for the first
        visit and the NaN-ignoring mean and variance for the other two.
    na_threshold : int
        A visit is reported as "not available" when its number of missing
        values is at least this large (default 1000).

    Returns
    -------
    None
        The function only prints.
    """
    print(covariate)

    visits = (("V1", df_v1), ("V2", df_v2), ("V3", df_v3))
    # Count missing values for every visit before printing any summary, so a
    # missing column raises before partial per-visit output is produced.
    missing = [frame[covariate].isnull().sum() for _, frame in visits]

    for position, ((label, frame), n_missing) in enumerate(zip(visits, missing)):
        is_last = position == len(visits) - 1
        prefix = "#na in " + label + " ="
        column = frame[covariate]

        if n_missing >= na_threshold:
            # The last visit's message carries a trailing newline as a separator.
            print(prefix, n_missing, "not available\n" if is_last else "not available")
            continue

        if iscategorical:
            fields = [prefix, n_missing, "\n", column.value_counts() / len(frame)]
        elif position == 0:
            # The first visit of a continuous covariate is reported via describe().
            print(column.describe())
            continue
        else:
            fields = [prefix, n_missing, "\n",
                      "mean =", np.nanmean(column), "\n",
                      "Variance =", np.nanvar(column)]

        if is_last:
            fields.append('\n')
        print(*fields)
        

In [None]:
# categorical
# Plain loop: CheckCovariate only prints and returns None, so wrapping the
# calls in list(map(...)) made the cell display a list of None values.
for covariate_name in ['nSES','sex','currentSmoker','Diabetes']:
    CheckCovariate(covariate_name)

In [None]:
# continuous
# CheckCovariate prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(CheckCovariate(...)) emitted.
for covariate_name in ['nbSESpc2score', 'age', 'sbp', 'totchol', 'hdl']:
    CheckCovariate(covariate_name, iscategorical = False)

In [None]:
# continuous nb features
# CheckCovariate prints its own report and returns None; calling it directly
# avoids the extra "None" line that print(CheckCovariate(...)) emitted.
for covariate_name in ['nbK3FavorFoodstore', 'nbK3paFacilities', 'nbpctResiden1mi']:
    CheckCovariate(covariate_name, iscategorical = False)

In [None]:
### plots ###

In [None]:
### thesis plots

In [None]:
# Rebind `df` to the contents of jhs_raw_complete.csv; later cells read this frame.
df = pd.read_csv('data/jhs_raw_complete.csv')

# Number of non-missing subject ids for each (y_tot, currentSmoker) pair.
counts = (
    df.groupby(by=['y_tot','currentSmoker'])['subjid']
      .count()
)
counts

In [None]:
# Grouped bar chart: the 'no' and 'yes' series side by side for each outcome group.
# The bar heights are literal counts typed into this cell.
outcome = ['no CVD','CVD']
no = [2756, 392]
yes = [319, 72]

plt.rcParams["figure.figsize"] = (5,5)

# x positions: one slot per outcome group; the second series is shifted by one bar width
barWidth = 0.1
r1 = np.arange(len(outcome))
r2 = r1+barWidth

# NOTE: this rebinds `colors`, hiding the `matplotlib.colors` module imported at the top
colors = ['#003F5B',
          '#A05195']

bar_style = dict(width=barWidth, edgecolor='white')
plt.bar(r1, height=no, color=colors[0], label='no', **bar_style)
plt.bar(r2, height=yes, color=colors[1], label='yes', **bar_style)

plt.legend()
plt.xticks(r1 + barWidth/2, outcome)
plt.ylabel('frequency')  # y-axis title
plt.title('Current Smoker')

plt.show()

In [None]:
# Box plot of sbp for each value of y_tot.
# The axes object is kept under a name that matches the plotted variable
# (the previous name `totchol` actually held the list of tick labels).
sbp_ax = sns.boxplot(x='y_tot', y='sbp', data=df)
sbp_ax.set_xticklabels(["no CVD","CVD"])
plt.title('sbp')
plt.show()

In [None]:
# plot for age

In [None]:
# Age histogram for each value of `y`, drawn on shared axes.
# The bin edges are passed as a keyword; the previous call indexed an
# undefined name (`bins[...]`) and could not run.
# NOTE(review): the edges stop at 50, so larger ages are not drawn -- confirm
# these are the intended bins. Also assumes df_v1 has a 'y' column -- verify.
df_v1.groupby('y')['age'].hist(bins=[10, 20, 30, 40, 50])
plt.show()

In [None]:
# `xyz` is just another alias of matplotlib.pyplot (the same module as `plt`);
# it is kept so existing `xyz.*` calls keep working.
import matplotlib.pyplot as xyz

age_dat = df_v1['age']
bins = [15,20,25,30,35,40,45,50,55,60,65,70,75,80,85,90,95]

# Draw the histogram once. Previously it was drawn twice on the same axes, so
# the second (default-coloured) bars covered the skyblue ones.
counts, edges, bars = plt.hist(age_dat, bins, histtype='bar', rwidth=0.9, color="skyblue")
plt.bar_label(bars)  # write each bin's count above its bar
plt.xlabel('age groups')
plt.ylabel('Number of people')
plt.title('JHS age distribution')
plt.show()

In [None]:
#plot for gender

In [None]:
# Bar chart of the number of visit-1 rows per value of `sex`.
# Uses the `plt` alias imported at the top of the notebook, so the cell does
# not depend on an alias imported in another cell.
df_v1['sex'].value_counts().plot(kind='bar')
plt.xlabel('gender groups')
plt.ylabel('Number of people')
plt.title('JHS gender distribution')
plt.show()

In [None]:
# One bar per value of `sex` in the visit-1 frame.
# The counts are computed here so the cell runs on a fresh kernel; it
# previously read `female`/`male`, which are only assigned in a later cell.
# NOTE(review): the bars are labelled with the raw `sex` values because the
# coding of that column is not visible here -- map to 'Female'/'Male' once confirmed.
sex_counts = df_v1['sex'].value_counts()
sex_palette = ['#003F5B', '#A05195']

for position, (sex_value, n_people) in enumerate(sex_counts.items()):
    plt.bar(position, height=n_people,
            color=sex_palette[position % len(sex_palette)],
            width=0.1, edgecolor='white', label=str(sex_value))

plt.xticks(range(len(sex_counts)), [str(value) for value in sex_counts.index])
plt.legend()
plt.ylabel('frequency')  # y-axis title
plt.title('gender distribution')

plt.show()

In [None]:
# Visit-1 rows of the current `df`, then subject counts per (y, sex) pair.
covar_v1 = df.loc[df["visit"] == 1]
select_df = covar_v1.loc[:, ['subjid','sex','y']]
count_gender = select_df.groupby(by=['y','sex'])['subjid'].count()
count_gender


In [None]:
# Grouped bar chart: female and male counts side by side for each outcome group.
# The bar heights are literal counts typed into this cell.
outcome = ['noCVD','CVD']
female = [2117, 172]
male = [1154, 125]

plt.rcParams["figure.figsize"] = (5,5)

# x positions: one slot per outcome group; the second series is shifted by one bar width
barWidth = 0.1
r1 = np.arange(len(outcome))
r2 = r1+barWidth

# NOTE: this rebinds `colors`, hiding the `matplotlib.colors` module imported at the top
colors = ['#003F5B',
          '#A05195']

# Legend label corrected from the misspelled 'Feamle'.
plt.bar(r1, height=female, color=colors[0], width=barWidth, edgecolor='white', label='Female')
plt.bar(r2, height=male, color=colors[1], width=barWidth, edgecolor='white', label='Male')

plt.legend()
plt.xticks(r1 + barWidth/2, outcome)
plt.ylabel('frequency')  # y-axis title
plt.title('gender distribution')

plt.show()

In [None]:
# Subject counts per (y, currentSmoker) pair among the visit-1 rows.
select_df = covar_v1.loc[:, ['subjid','currentSmoker','y']]
count_smoke = select_df.groupby(by=['y','currentSmoker'])['subjid'].count()
count_smoke


In [None]:
# Grouped bar chart of smoking status within each outcome group.
# The bar heights are literal counts typed into this cell.
outcome = ['noCVD','CVD']
non_smoker = [2896, 252]
smoker = [346, 45]

plt.rcParams["figure.figsize"] = (20,10)

barWidth = 0.3
r1 = np.arange(len(outcome))   # left bar of each pair
r2 = r1+barWidth               # right bar of each pair

# NOTE: this rebinds `colors`, hiding the `matplotlib.colors` module imported at the top
colors = ['#003F5B',
          '#A05195']

# Draw the two series with identical styling apart from colour and label.
for x_pos, heights, fill, legend_text in (
    (r1, non_smoker, colors[0], 'non_smoker'),
    (r2, smoker, colors[1], 'smoker'),
):
    plt.bar(x_pos, height=heights, color=fill, width=barWidth,
            edgecolor='white', label=legend_text)

plt.legend()
plt.xticks(r1 + barWidth/2, outcome)
plt.ylabel('frequency')  # y-axis title
plt.title('smoker distribution')

plt.show()

In [None]:
# Diabetes: subject counts per (y, Diabetes) pair among the visit-1 rows of `df`.
covar_v1 = df.loc[df["visit"] == 1]
select_df = covar_v1.loc[:, ['subjid','Diabetes','y']]
count_dbt = select_df.groupby(by=['y','Diabetes'])['subjid'].count()
count_dbt

In [None]:
# Grouped bar chart of diabetes status within each outcome group.
# The bar heights are literal counts typed into this cell.
outcome = ['noCVD','CVD']
non_dbt = [2625, 178]
dbt = [615, 117]

plt.rcParams["figure.figsize"] = (20,10)

barWidth = 0.3
r1 = np.arange(len(outcome))   # left bar of each pair
r2 = r1+barWidth               # right bar of each pair

# NOTE: this rebinds `colors`, hiding the `matplotlib.colors` module imported at the top
colors = ['#003F5B',
          '#A05195']

shared_bar_kwargs = {'width': barWidth, 'edgecolor': 'white'}
plt.bar(r1, height=non_dbt, color=colors[0], label='no diabetes', **shared_bar_kwargs)
plt.bar(r2, height=dbt, color=colors[1], label='diabetes', **shared_bar_kwargs)

plt.legend()
plt.xticks(r1 + barWidth/2, outcome)
plt.ylabel('frequency')  # y-axis title
plt.title('diabetes distribution')

plt.show()

In [None]:
outcome = ['noCVD','CVD']

# Rows of the visit-1 frame with y == 1.
subset = covar_v1[covar_v1['y'] == 1]

# KDE curve of age; sns.kdeplot replaces the deprecated sns.distplot(hist=False).
sns.kdeplot(subset['age'], linewidth=3, label='CVD case')

# Plot formatting: the title and x label now describe the plotted column
# (they previously read "Airlines" / "Delay (min)", left over from another example).
plt.legend(prop={'size': 16}, title = '')
plt.title('Age density among CVD cases')
plt.xlabel('age')
plt.ylabel('Density')
plt.show()

In [None]:
outcome = ['noCVD','CVD']

# Split the visit-1 frame by outcome flag; later cells reuse these two subsets.
subset_1 = covar_v1[covar_v1['y'] == 1]
subset_0 = covar_v1[covar_v1['y'] == 0]

# KDE curves of age; sns.kdeplot replaces the deprecated sns.distplot(hist=False).
sns.kdeplot(subset_1['age'], linewidth=3, label='CVD case')
sns.kdeplot(subset_0['age'], linewidth=3, label='no CVD')

plt.legend(prop={'size': 16}, title = '')

plt.show()

In [None]:
# KDE curves of sbp for the two subsets currently bound to subset_1 / subset_0.
# sns.kdeplot replaces the deprecated sns.distplot(hist=False).
sns.kdeplot(subset_1['sbp'], linewidth=3, label='CVD case')
sns.kdeplot(subset_0['sbp'], linewidth=3, label='no CVD')

plt.legend(prop={'size': 16}, title = '')

plt.show()

In [None]:
# KDE curves of hdl for the two subsets currently bound to subset_1 / subset_0.
# sns.kdeplot replaces the deprecated sns.distplot(hist=False).
sns.kdeplot(subset_1['hdl'], linewidth=3, label='CVD case')
sns.kdeplot(subset_0['hdl'], linewidth=3, label='no CVD')

plt.legend(prop={'size': 16}, title = '')

plt.show()

In [None]:
# totchol
# KDE curves for the two subsets currently bound to subset_1 / subset_0.
# sns.kdeplot replaces the deprecated sns.distplot(hist=False).
sns.kdeplot(subset_1['totchol'], linewidth=3, label='CVD case')
sns.kdeplot(subset_0['totchol'], linewidth=3, label='no CVD')

plt.legend(prop={'size': 16}, title = '')

plt.show()

In [None]:
# alcohol consumption
# Rebind `df` to common_data_jhs.csv and keep only the rows with visit == 1.
df = pd.read_csv('data/common_data_jhs.csv')
v1 = df.loc[df["visit"] == 1]
print(v1)

In [None]:
# Alcohol columns plus nSES for the visit-1 rows.
alc_v1 = v1[['subjid','alc', 'alcw', 'nSES']]

# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and then discarded, so these missing-value counts were never shown.
print(alc_v1.isna().sum())

# Split by the nSES flag (0 / 1).
alc_low_nses = alc_v1[alc_v1["nSES"] == 0]
alc_high_nses = alc_v1[alc_v1["nSES"] == 1]
alc_low_nses

In [None]:
# alc info missing in each subgroup
# Only the last expression of a cell is displayed, so the first summary is
# printed explicitly; it was previously computed and thrown away.
print(alc_low_nses.isnull().sum())
alc_high_nses.isnull().sum()

In [None]:
# replace na with 2
# Build a new frame instead of calling fillna(inplace=True) on a column pulled
# out of a filtered frame: that chained form raises SettingWithCopyWarning and
# leaves alc_v1 unchanged when pandas Copy-on-Write is active.
alc_v1 = alc_v1.assign(alc=alc_v1['alc'].fillna(value=2))

In [None]:
# alc
# Visit-1 rows whose `alc` value is missing.
alc_is_missing = v1.isnull()['alc']
na_alc = v1[alc_is_missing]

# Subject counts per (nSES, alc) pair.
select_df = alc_v1.loc[:, ['subjid','alc','nSES']]
count_alc = select_df.groupby(by=['nSES','alc'])['subjid'].count()
print(count_alc)


In [None]:
# Grouped bar chart: missing vs. present alcohol information per nSES subgroup.
# The bar heights are literal counts typed into this cell.
subgroup = ['low_nSES','high_nSES']
alc_na = [8, 7]
alc_response = [1981, 1572]

plt.rcParams["figure.figsize"] = (20,10)

barWidth = 0.3
r1 = np.arange(len(subgroup))   # left bar of each pair
r2 = r1+barWidth                # right bar of each pair

# NOTE: this rebinds `colors`, hiding the `matplotlib.colors` module imported at the top
colors = ['#003F5B',
          '#A05195']

for x_pos, heights, fill, legend_text in (
    (r1, alc_na, colors[0], 'alc missing'),
    (r2, alc_response, colors[1], 'alc response'),
):
    plt.bar(x_pos, height=heights, color=fill, width=barWidth,
            edgecolor='white', label=legend_text)

plt.legend()
plt.xticks(r1 + barWidth/2, subgroup)
plt.ylabel('frequency')  # y-axis title
plt.title('distribution of alcohol info missing')

plt.show()

In [None]:
# raw data
# Bind the frame to a variable of its own. The previous `df.raw = ...` set an
# attribute on the existing `df` object, which pandas warns about.
df_raw = pd.read_csv('data/jhs_data.csv')
v1_raw = df_raw[df_raw["visit"] == 1]

In [None]:
# Alcohol columns plus nSES for the visit-1 rows of the raw file.
alc_v1 = v1_raw[['subjid','alc', 'alcw', 'nSES']]

# Printed explicitly: a bare expression in the middle of a cell is evaluated
# and then discarded, so these missing-value counts were never shown.
print(alc_v1.isna().sum())

# Split by the nSES flag (0 / 1).
alc_low_nses = alc_v1[alc_v1["nSES"] == 0]
alc_high_nses = alc_v1[alc_v1["nSES"] == 1]
alc_low_nses

In [None]:
# alc info missing in each subgroup
# Only the last expression of a cell is displayed, so the first summary is
# printed explicitly; it was previously computed and thrown away.
print(alc_low_nses.isnull().sum())
alc_high_nses.isnull().sum()

In [None]:
# check which covariates are available across several visits

## visit 2
covar_v2 = df.loc[df["visit"] == 2]

In [None]:
# Display the visit-2 rows selected in the previous cell.
covar_v2

In [None]:
# Keep only the covariates whose availability is checked next.
covar_v2 = covar_v2.loc[:, ['age', 'currentSmoker', 'sbp', 'Diabetes', 'hdl', 'totchol']]

In [None]:
# Missing-value count per column at visit 2.
covar_v2.isna().sum()

In [None]:
## visit 3
# Rows with visit == 3, restricted to the same covariates as for visit 2.
covar_v3 = df.loc[
    df["visit"] == 3,
    ['age',  'currentSmoker', 'sbp', 'Diabetes', 'hdl', 'totchol'],
]
covar_v3

In [None]:
# Missing-value count per column at visit 3.
covar_v3.isna().sum()

In [None]:
### plot for nb features

In [None]:
## nbK3FavorFoodstore

# Split the visit-1 frame by the nSES flag (1 is plotted as high, 0 as low).
subset_1 = covar_v1[covar_v1['nSES'] == 1]
subset_0 = covar_v1[covar_v1['nSES'] == 0]

# KDE curves; sns.kdeplot replaces the deprecated sns.distplot(hist=False).
sns.kdeplot(subset_1['nbK3FavorFoodstore'], linewidth=3, label='high nSES')
sns.kdeplot(subset_0['nbK3FavorFoodstore'], linewidth=3, label='low nSES')

plt.legend(prop={'size': 16}, title = '')
plt.xlabel("Favorable Food Stores", size=20)
plt.ylabel("Density", size=20)
plt.tick_params(axis='both', which='major', labelsize=16)

plt.show()

In [None]:
## nbK3paFacilities

# Split the visit-1 frame by the nSES flag (1 is plotted as high, 0 as low).
subset_1 = covar_v1[covar_v1['nSES'] == 1]
subset_0 = covar_v1[covar_v1['nSES'] == 0]

# KDE curves; sns.kdeplot replaces the deprecated sns.distplot(hist=False).
sns.kdeplot(subset_1['nbK3paFacilities'], linewidth=3, label='high nSES')
sns.kdeplot(subset_0['nbK3paFacilities'], linewidth=3, label='low nSES')

plt.legend(prop={'size': 16}, title = '')
plt.xlabel("Physical Activity Facilities ", size=20)
plt.ylabel("Density", size=20)
plt.tick_params(axis='both', which='major', labelsize=16)

plt.show()

In [None]:
## nbpctResiden1mi

# Split the visit-1 frame by the nSES flag (1 is plotted as high, 0 as low).
subset_1 = covar_v1[covar_v1['nSES'] == 1]
subset_0 = covar_v1[covar_v1['nSES'] == 0]

# KDE curves; sns.kdeplot replaces the deprecated sns.distplot(hist=False).
sns.kdeplot(subset_1['nbpctResiden1mi'], linewidth=3, label='high nSES')
sns.kdeplot(subset_0['nbpctResiden1mi'], linewidth=3, label='low nSES')

plt.legend(prop={'size': 16}, title = '')
plt.xlabel("Percent Residential Land Use (per Square Mile)", size=20)
plt.ylabel("Density", size=20)
plt.tick_params(axis='both', which='major', labelsize=16)

plt.show()

In [None]:
# Table read by the per-feature counts in the next cell.
nb_df = pd.read_csv(filepath_or_buffer='data/jhs_nb_gcomputation.csv')

In [None]:
select_df = nb_df.loc[:, ['subjid','nSES','nFood','nFac','nRes']]


def _subjects_per(feature):
    """Count non-missing subject ids for each (nSES, feature) pair of select_df."""
    return select_df.groupby(by=['nSES', feature])['subjid'].count()


count_food = _subjects_per('nFood')
print(count_food)

count_fac = _subjects_per('nFac')
print(count_fac)

count_res = _subjects_per('nRes')
print(count_res)

In [None]:
# Grouped bar chart: the low_res and high_res series within each nSES group.
# The bar heights are literal counts typed into this cell.
outcome = ['low nSES','high nSES']
low_res = [746, 777]
high_res = [1243, 802]

plt.rcParams["figure.figsize"] = (5,5)

# x positions: one slot per nSES group; the second series is shifted by one bar width
barWidth = 0.1
r1 = np.arange(len(outcome))
r2 = r1+barWidth

# NOTE: this rebinds `colors`, hiding the `matplotlib.colors` module imported at the top
colors = ['#003F5B',
          '#A05195']

for x_pos, heights, fill, legend_text in (
    (r1, low_res, colors[0], 'low_res'),
    (r2, high_res, colors[1], 'high_res'),
):
    plt.bar(x_pos, height=heights, color=fill, width=barWidth,
            edgecolor='white', label=legend_text)

plt.legend()
plt.xticks(r1 + barWidth/2, outcome)
plt.ylabel('frequency')  # y-axis title
plt.title('nRes distribution by nSES')

plt.show()