In [None]:
# Import libraries
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import os

from lifelines.fitters.coxph_fitter import CoxPHFitter
from lifelines.statistics import proportional_hazard_test
from lifelines import KaplanMeierFitter

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
# Exam-1 extract with outcome and nSES (per the file name); feeds the descriptive plots below.
mesa_ori=pd.read_csv("../../data/exam_1_with_outcome_nses.csv")
# Preprocessed table (per the file name); feeds the Cox, Kaplan-Meier and g-computation sections below.
mesa=pd.read_csv("../../code/thesis_code/mesa_preprocess_entire.csv")

## Descriptive Statistics

### Gender

In [None]:
# Columns of the exam-1 table that the descriptive plots in this notebook use.
descriptive_cols = ['idno','site1c','gender1','race1c','age1c','cig1c','chol1','hdl1','sbp1c','diabet1','nSES','y']
# .copy() makes an independent frame, so adding race_2 below does not write into a
# slice of mesa_ori (which is what triggers pandas' SettingWithCopyWarning).
select_exam_1 = mesa_ori[descriptive_cols].copy()
# race_2 is the binary recode of race1c: 1 where race1c == 3, otherwise 0.
select_exam_1['race_2'] = np.where(select_exam_1['race1c'] == 3, 1, 0)

select_exam_gender = select_exam_1[['idno','site1c','gender1','y']]
# Flat list of counts ordered by (site1c, y, gender1). The plotting cell slices it with
# a stride of 2, which assumes every site/outcome/gender combination is present.
list_gender = select_exam_gender.groupby(['site1c','y','gender1'])['idno'].count().to_list()


In [None]:
# Plot for gender: counts per site, split by outcome group (y) and gender.

plt.rcParams["figure.figsize"] = (20,10)
# x positions of the first bar of every (site, y) pair; the second bar sits one bar width to the right.
barWidth = 0.45
r1 = np.array([1,2,5,6,9,10,13,14,17,18,21,22])
r2 = [x + barWidth for x in r1]

colors = ['#003F5B',
          '#A05195']

# list_gender is ordered (site1c, y, gender1): even entries are gender=0, odd entries gender=1.
# The legend label previously read "Feamle".
plt.bar(r1, height=list_gender[0:24:2], color=colors[0], width=barWidth, edgecolor='white',label = 'gender=0 Female')
plt.bar(r2, height=list_gender[1:24:2], color=colors[1], width=barWidth, edgecolor='white',label = 'gender=1 Male')
# Ticks sit midway between the two bars of each pair.
plt.xticks([r + barWidth*0.5 for r in r1], ['y=0', 'y=1\nWake Forest University            \nsite 3                ','y=0', 'y=1\nColumbia University             \nsite 4                ','y=0', 'y=1\nJohns Hopkins University            \nsite 5                ','y=0', 'y=1\nUniversity of Minnesota           \nsite 6                ','y=0', 'y=1\nNorthwestern University            \nsite 7                ','y=0', 'y=1\nUCLA               \nsite 8              '],fontsize=10)
plt.yticks(fontsize=16)
# Create legend & Show graphic
plt.xlabel('Control (y=0) and Incident (y=1) Group')
plt.ylabel('Count')
plt.title('Gender Distribution of different sites')
plt.legend(fontsize=13)
plt.show()

In [None]:
# Overall (not per-site) gender counts, flattened in (y, gender1) order.
select_exam_gender2 = select_exam_1.loc[:, ['idno','gender1','y']]
gender_counts_by_outcome = select_exam_gender2.groupby(['y','gender1'])['idno'].count()
list_gender2 = gender_counts_by_outcome.to_list()

In [None]:
# Plot for gender: overall counts by outcome group (y) and gender.

plt.rcParams["figure.figsize"] = (20,10)
# x positions of the first bar of each outcome group; the second bar sits one bar width to the right.
barWidth = 0.45
r1 = np.array([1,3])
r2 = [x + barWidth for x in r1]

colors = ['#003F5B',
          '#A05195']

# list_gender2 is ordered (y, gender1): even entries are gender=0, odd entries gender=1.
# The legend label previously read "Feamle".
plt.bar(r1, height=list_gender2[0:4:2], color=colors[0], width=barWidth, edgecolor='white',label = 'gender=0 Female')
plt.bar(r2, height=list_gender2[1:4:2], color=colors[1], width=barWidth, edgecolor='white',label = 'gender=1 Male')
plt.xticks([r + barWidth*0.5 for r in r1], ['y=0', 'y=1'],fontsize=10)
plt.yticks(fontsize=16)
# Create legend & Show graphic
plt.xlabel('Control (y=0) and Incident (y=1) Group')
plt.ylabel('Count')
plt.title('Gender Distribution')
# NOTE: the count annotations are hard-coded text, not read from list_gender2;
# they must be updated by hand if the input data changes.
plt.text(0.95, 2850, "2826",fontsize=13)
plt.text(1.4, 2350, "2331",fontsize=13)
plt.text(2.95, 430, "419",fontsize=13)
plt.text(3.4, 620, "613",fontsize=13)
plt.legend(fontsize=13)
plt.show()

### Race

In [None]:
# Count participants per (site1c, y, race1c) combination.
race1c_exam_1 = select_exam_1.loc[:, ['idno','site1c','race1c','y']]
count_race1c = race1c_exam_1.groupby(['site1c','y','race1c']).count()

# Full grid of site x outcome x race code (6 x 2 x 4 = 48 rows), so that combinations
# with no participants still get a row after the left join.
a = np.array([3,4,5,6,7,8])
df = pd.DataFrame({
    'site1c': a.repeat(8),
    'y': [0,0,0,0,1,1,1,1]*6,
    'race1c': [1,2,3,4]*12,
})

# Combinations missing from the counts come back as NaN and are set to 0.
join_race = pd.merge(df, count_race1c, how="left", on=['site1c','y','race1c']).fillna(value=0)

In [None]:

# Race distribution per site: four bars (one per race1c code) for every (site, y) pair.

plt.rcParams["figure.figsize"] = (20,10)
barWidth = 0.2
r1 = np.array([1,2,4,5,7,8,10,11,13,14,16,17])

colors = ['#003F5B', '#A05195', '#DDA0DD', '#D8BFD8']
race_labels = ['1: White, Caucasian', '2: Chinese American', '3: Black, African-American', '4: Hispanic']

# join_race rows are ordered (site1c, y, race1c), so a stride of 4 selects one race code.
race_counts = join_race['idno'].to_list()
bar_x = r1
for offset, (bar_color, bar_label) in enumerate(zip(colors, race_labels)):
    plt.bar(bar_x, height=race_counts[offset:48:4], color=bar_color, width=barWidth, edgecolor='white', label=bar_label)
    # Shift one bar width to the right for the next race code.
    bar_x = [x + barWidth for x in bar_x]

# Ticks are centred under each group of four bars.
plt.xticks([r + barWidth*1.5 for r in r1], ['y=0', 'y=1\nWake Forest University            \nsite 3                ','y=0', 'y=1\nColumbia University             \nsite 4                ','y=0', 'y=1\nJohns Hopkins University            \nsite 5                ','y=0', 'y=1\nUniversity of Minnesota           \nsite 6                ','y=0', 'y=1\nNorthwestern University            \nsite 7                ','y=0', 'y=1\nUCLA               \nsite 8              '],fontsize=10)
plt.yticks(fontsize=16)
plt.xlabel('Control (y=0) and Incident (y=1) Group',fontsize=12)
plt.ylabel('Count',fontsize=12)
plt.title('Race Distribution of different sites',fontsize=12)
plt.legend(fontsize=13)
plt.show()

In [None]:
# Overall race counts, flattened in (y, race1c) order.
race1c_exam_1_2 = select_exam_1.loc[:, ['idno','race1c','y']]
race_counts_by_outcome = race1c_exam_1_2.groupby(['y','race1c'])['idno'].count()
list_race1c2 = race_counts_by_outcome.to_list()

In [None]:
# Overall race distribution: four bars (one per race1c code) for each outcome group.

plt.rcParams["figure.figsize"] = (20,10)
barWidth = 0.2
r1 = np.array([1,3])

colors = ['#003F5B', '#A05195', '#DDA0DD', '#D8BFD8']
race_labels = ['1: White, Caucasian', '2: Chinese American', '3: Black, African-American', '4: Hispanic']

# list_race1c2 is ordered (y, race1c), so a stride of 4 selects one race code.
bar_x = r1
for offset, (bar_color, bar_label) in enumerate(zip(colors, race_labels)):
    plt.bar(bar_x, height=list_race1c2[offset:8:4], color=bar_color, width=barWidth, edgecolor='white', label=bar_label)
    # Shift one bar width to the right for the next race code.
    bar_x = [x + barWidth for x in bar_x]

plt.xticks([r + barWidth*1.5 for r in r1], ['y=0', 'y=1'],fontsize=13)
plt.yticks(fontsize=16)
plt.xlabel('Control (y=0) and Incident (y=1) Group',fontsize=12)
plt.ylabel('Count',fontsize=12)
plt.title('Race Distribution',fontsize=12)

# Hard-coded count annotations as (x, y, text); they are not read from list_race1c2.
count_annotations = [
    (0.95, 2015, "2004"),
    (1.15, 643, "624"),
    (1.35, 1435, "1430"),
    (1.55, 1103, "1099"),
    (2.95, 440, "427"),
    (3.17, 110, "105"),
    (3.35, 265, "263"),
    (3.55, 250, "237"),
]
for text_x, text_y, count_text in count_annotations:
    plt.text(text_x, text_y, count_text, fontsize=13)

plt.legend(fontsize=13)
plt.show()

### Race recoded

In [None]:
# Counts for the binary race recode (race_2), flattened in (y, race_2) order.
race1c_exam_1_3 = select_exam_1.loc[:, ['idno','site1c','race_2','y']]
race2_counts_by_outcome = race1c_exam_1_3.groupby(['y','race_2'])['idno'].count()
list_race1c3 = race2_counts_by_outcome.to_list()

In [None]:
# Plot for recoded race (race_2): overall counts by outcome group.

plt.rcParams["figure.figsize"] = (20,10)
# x positions of the first bar of each outcome group; the second bar sits one bar width to the right.
barWidth = 0.45
r1 = np.array([1,3])
r2 = [x + barWidth for x in r1]

colors = ['#003F5B',
          '#A05195']

# list_race1c3 is ordered (y, race_2): even entries are race_2=0, odd entries race_2=1.
# The legend previously said "gender=..." although the plotted variable is race_2.
plt.bar(r1, height=list_race1c3[0:4:2], color=colors[0], width=barWidth, edgecolor='white',label = 'race_2=0 Others')
plt.bar(r2, height=list_race1c3[1:4:2], color=colors[1], width=barWidth, edgecolor='white',label = 'race_2=1 Black Racial Group')
plt.xticks([r + barWidth*0.5 for r in r1], ['y=0', 'y=1'],fontsize=10)
plt.yticks(fontsize=16)
# Create legend & Show graphic
plt.xlabel('Control (y=0) and Incident (y=1) Group')
plt.ylabel('Count')
plt.title('Race Distribution')
# NOTE: the count annotations are hard-coded text, not read from list_race1c3.
plt.text(0.95, 3750, "3727",fontsize=13)
plt.text(1.4, 1440, "1430",fontsize=13)
plt.text(2.95, 780, "769",fontsize=13)
plt.text(3.4, 270, "263",fontsize=13)
plt.legend(fontsize=13)
plt.show()

In [None]:
# Count participants per (site1c, y, race_2) combination.
count_race1c4 = race1c_exam_1_3.groupby(['site1c','y','race_2']).count()

# Full grid of site x outcome x race_2 (6 x 2 x 2 = 24 rows), so that combinations
# with no participants still get a row after the left join.
a = np.array([3,4,5,6,7,8])
df = pd.DataFrame({
    'site1c': a.repeat(4),
    'y': [0,0,1,1]*6,
    'race_2': [0,1]*12,
})

# Combinations missing from the counts come back as NaN and are set to 0.
join_race2 = pd.merge(df, count_race1c4, how="left", on=['site1c','y','race_2']).fillna(value=0)

In [None]:
# Recoded race (race_2) distribution per site: two bars for every (site, y) pair.

plt.rcParams["figure.figsize"] = (20,10)
barWidth = 0.45
r1 = np.array([1,2,5,6,9,10,13,14,17,18,21,22])

colors = ['#003F5B', '#A05195']

# join_race2 rows are ordered (site1c, y, race_2), so a stride of 2 selects one race_2 value.
race2_counts = join_race2['idno'].to_list()
bar_x = r1
for offset, bar_label in enumerate(['0: Other', '1: Black, African-American']):
    plt.bar(bar_x, height=race2_counts[offset:24:2], color=colors[offset], width=barWidth, edgecolor='white', label=bar_label)
    # Shift one bar width to the right for the second series.
    bar_x = [x + barWidth for x in bar_x]

# Ticks sit midway between the two bars of each pair.
plt.xticks([r + barWidth*0.5 for r in r1], ['y=0', 'y=1\nWake Forest University            \nsite 3                ','y=0', 'y=1\nColumbia University             \nsite 4                ','y=0', 'y=1\nJohns Hopkins University            \nsite 5                ','y=0', 'y=1\nUniversity of Minnesota           \nsite 6                ','y=0', 'y=1\nNorthwestern University            \nsite 7                ','y=0', 'y=1\nUCLA               \nsite 8              '],fontsize=10)
plt.yticks(fontsize=16)
plt.xlabel('Control (y=0) and Incident (y=1) Group')
plt.ylabel('Count')
plt.title('Race Distribution of different sites')
plt.legend(fontsize=13)
plt.show()

### Age

In [None]:
# Split the cohort by outcome: y == 0 (control) and y == 1 (incident).
outcome_flag = select_exam_1['y']
age_0 = select_exam_1.loc[outcome_flag == 0]
age_1 = select_exam_1.loc[outcome_flag == 1]

In [None]:
# Kernel density of age (age1c) for the two outcome groups.
# `fill=True` replaces `shade=True`, which seaborn has deprecated in favour of `fill`.
sns.kdeplot(age_0['age1c'], fill=True, label = 'no CVD')
sns.kdeplot(age_1['age1c'], fill=True, label = 'CVD')
plt.xlabel('Age \n Control (y=0,no CVD) and Incident (y=1,CVD) Group',fontsize=12)
plt.ylabel('Density',fontsize=12)
plt.title('Age Distribution',fontsize=12)
plt.legend(fontsize=12)
plt.show()

### Chol1

In [None]:
# Split the cohort by outcome: y == 0 (control) and y == 1 (incident).
outcome_flag = select_exam_1['y']
chol_0 = select_exam_1.loc[outcome_flag == 0]
chol_1 = select_exam_1.loc[outcome_flag == 1]

In [None]:
# Kernel density of total cholesterol (chol1) for the two outcome groups.
# `fill=True` replaces `shade=True`, which seaborn has deprecated in favour of `fill`.
sns.kdeplot(chol_0['chol1'], fill=True, label = 'no CVD')
sns.kdeplot(chol_1['chol1'], fill=True, label = 'CVD')
plt.xlabel('Total Lipoprotein Cholesterol \n Control (y=0,no CVD) and Incident (y=1,CVD) Group',fontsize=12)
plt.ylabel('Density',fontsize=12)
plt.title('Total Lipoprotein Cholesterol Distribution',fontsize=12)
plt.legend(fontsize=12)
plt.show()

### hdl1

In [None]:
# Split the cohort by outcome: y == 0 (control) and y == 1 (incident).
outcome_flag = select_exam_1['y']
hdl_0 = select_exam_1.loc[outcome_flag == 0]
hdl_1 = select_exam_1.loc[outcome_flag == 1]

In [None]:
# Kernel density of HDL cholesterol (hdl1) for the two outcome groups.
# `fill=True` replaces `shade=True`, which seaborn has deprecated in favour of `fill`.
sns.kdeplot(hdl_0['hdl1'], fill=True, label = 'no CVD')
sns.kdeplot(hdl_1['hdl1'], fill=True, label = 'CVD')
plt.xlabel('High-Density Lipoprotein Cholesterol  \n Control (y=0,no CVD) and Incident (y=1,CVD) Group',fontsize=12)
plt.ylabel('Density',fontsize=12)
plt.title('High-Density Lipoprotein Cholesterol  Distribution',fontsize=12)
plt.legend(fontsize=12)
plt.show()

### sbp1c

In [None]:
# Split the cohort by outcome: y == 0 (control) and y == 1 (incident).
outcome_flag = select_exam_1['y']
sbp_0 = select_exam_1.loc[outcome_flag == 0]
sbp_1 = select_exam_1.loc[outcome_flag == 1]

In [None]:
# Kernel density of systolic blood pressure (sbp1c) for the two outcome groups.
# `fill=True` replaces `shade=True`, which seaborn has deprecated in favour of `fill`.
sns.kdeplot(sbp_0['sbp1c'], fill=True, label = 'no CVD')
sns.kdeplot(sbp_1['sbp1c'], fill=True, label = 'CVD')
plt.xlabel('Treated or Untreated Systolic Blood Pressure  \n Control (y=0,no CVD) and Incident (y=1,CVD) Group',fontsize=12)
plt.ylabel('Density',fontsize=12)
plt.title('Treated or Untreated Systolic Blood Pressure  Distribution',fontsize=12)
plt.legend(fontsize=12)
plt.show()

### cig1c

In [None]:
# Smoking status counts, flattened in (site1c, y, cig1c) order.
cig1c_exam_1 = select_exam_1.loc[:, ['idno','site1c','cig1c','y']]
cig_counts_by_site = cig1c_exam_1.groupby(['site1c','y','cig1c'])['idno'].count()
list_cig1c = cig_counts_by_site.to_list()

In [None]:
# Smoking status per site: three bars (one per cig1c code) for every (site, y) pair.

plt.rcParams["figure.figsize"] = (20,10)
barWidth = 0.28
r1 = np.array([1,2,4,5,7,8,10,11,13,14,16,17])

colors = ['#003F5B', '#A05195', '#DDA0DD']
cig_labels = ['cig1c=0 Never', 'cig1c=1 Former', 'cig1c=2 Current']

# list_cig1c is ordered (site1c, y, cig1c), so a stride of 3 selects one smoking code.
bar_x = r1
for offset, (bar_color, bar_label) in enumerate(zip(colors, cig_labels)):
    plt.bar(bar_x, height=list_cig1c[offset:36:3], color=bar_color, width=barWidth, edgecolor='white', label=bar_label)
    # Shift one bar width to the right for the next smoking code.
    bar_x = [x + barWidth for x in bar_x]

# Ticks are centred under the middle bar of each group of three.
plt.xticks([r + barWidth*1 for r in r1], ['y=0', 'y=1\nWake Forest University            \nsite 3                ','y=0', 'y=1\nColumbia University             \nsite 4                ','y=0', 'y=1\nJohns Hopkins University            \nsite 5                ','y=0', 'y=1\nUniversity of Minnesota           \nsite 6                ','y=0', 'y=1\nNorthwestern University            \nsite 7                ','y=0', 'y=1\nUCLA               \nsite 8              '],fontsize=10)
plt.yticks(fontsize=16)
plt.xlabel('Control (y=0) and Incident (y=1) Group',fontsize=12)
plt.ylabel('Count',fontsize=12)
plt.title('Cigarette Smoking Status Distribution of different sites',fontsize=12)
plt.legend(fontsize=13)
plt.show()

### cig1c recoded

In [None]:
# Collapse smoking status to a binary flag: code 2 is merged into 1, missing values become 0.
# .assign builds a new frame instead of writing into a column-subset of select_exam_1,
# which avoids pandas' SettingWithCopyWarning. Re-running the cell gives the same result.
cig1c_exam_1 = cig1c_exam_1.assign(
    cig1c=cig1c_exam_1['cig1c'].replace(2,1).fillna(value=0)
)
# NOTE: list_cig1c is rebound here to the recoded counts, ordered (site1c, y, cig1c).
# Cells that use it after this point see the two-category version.
list_cig1c = cig1c_exam_1.groupby(['site1c','y','cig1c'])['idno'].count().to_list()

In [None]:
# Recoded (binary) smoking status per site: two bars for every (site, y) pair.

plt.rcParams["figure.figsize"] = (20,10)
barWidth = 0.45
r1 = np.array([1,2,5,6,9,10,13,14,17,18,21,22])

colors = ['#003F5B', '#A05195']

# list_cig1c is ordered (site1c, y, cig1c), so a stride of 2 selects one recoded value.
bar_x = r1
for offset, bar_label in enumerate(['0: Other', '1: ever or currently smokes']):
    plt.bar(bar_x, height=list_cig1c[offset:24:2], color=colors[offset], width=barWidth, edgecolor='white', label=bar_label)
    # Shift one bar width to the right for the second series.
    bar_x = [x + barWidth for x in bar_x]

# Ticks sit midway between the two bars of each pair.
plt.xticks([r + barWidth*0.5 for r in r1], ['y=0', 'y=1\nWake Forest University            \nsite 3                ','y=0', 'y=1\nColumbia University             \nsite 4                ','y=0', 'y=1\nJohns Hopkins University            \nsite 5                ','y=0', 'y=1\nUniversity of Minnesota           \nsite 6                ','y=0', 'y=1\nNorthwestern University            \nsite 7                ','y=0', 'y=1\nUCLA               \nsite 8              '],fontsize=10)
plt.yticks(fontsize=16)
plt.xlabel('Control (y=0) and Incident (y=1) Group')
plt.ylabel('Count')
plt.title('Cigarette Smoking Status recoded Distribution of different sites')
plt.legend(fontsize=13)
plt.show()

### Diabetes

In [None]:
# Count participants per (site1c, y, diabet1) combination.
diabet1_exam_1 = select_exam_1.loc[:, ['idno','site1c','diabet1','y']]
count_diabet1 = diabet1_exam_1.groupby(['site1c','y','diabet1']).count()

In [None]:
# Full grid of site x outcome x diabetes code (6 x 2 x 3 = 36 rows), so that
# combinations with no participants still get a row after the left join.
a = np.array([3,4,5,6,7,8])
df_d = pd.DataFrame({
    'site1c': a.repeat(6),
    'y': [0,0,0,1,1,1]*6,
    'diabet1': [0,1,9]*12,
})

# Combinations missing from the counts come back as NaN and are set to 0.
join_diabet1 = pd.merge(df_d, count_diabet1, how="left", on=['site1c','y','diabet1']).fillna(value=0)


In [None]:
# Plot for diabetes: counts per site, split by outcome group (y) and diabet1 code.

plt.rcParams["figure.figsize"] = (20,10)
# x positions of the first bar of every (site, y) group; the other two bars follow at one bar width each.
barWidth = 0.28
r1 = np.array([1,2,4,5,7,8,10,11,13,14,16,17])
r2 = [x + barWidth for x in r1]
r3 = [x + barWidth for x in r2]


colors = ['#003F5B',
          '#A05195',
          '#DDA0DD']

# join_diabet1 rows are ordered (site1c, y, diabet1), so a stride of 3 selects one diabetes code.
diabetes_counts = join_diabet1['idno'].to_list()
plt.bar(r1, height=diabetes_counts[0:36:3], color=colors[0], width=barWidth, edgecolor='white',label = '0:No')
plt.bar(r2, height=diabetes_counts[1:36:3], color=colors[1], width=barWidth, edgecolor='white',label = '1:Yes')
plt.bar(r3, height=diabetes_counts[2:36:3], color=colors[2], width=barWidth, edgecolor='white',label = '9:Do not Know')
# With three bars per group the centre is the middle bar (r1 + barWidth). The previous
# offset of barWidth*0.5 put the ticks off-centre; the three-bar smoking plot uses barWidth*1.
plt.xticks([r + barWidth*1 for r in r1], ['y=0', 'y=1\nWake Forest University            \nsite 3                ','y=0', 'y=1\nColumbia University             \nsite 4                ','y=0', 'y=1\nJohns Hopkins University            \nsite 5                ','y=0', 'y=1\nUniversity of Minnesota           \nsite 6                ','y=0', 'y=1\nNorthwestern University            \nsite 7                ','y=0', 'y=1\nUCLA               \nsite 8              '],fontsize=10)
plt.yticks(fontsize=16)
# Create legend & Show graphic
plt.xlabel('Control (y=0) and Incident (y=1) Group',fontsize=12)
plt.ylabel('Count',fontsize=12)
plt.title('Diabetes Distribution of different sites',fontsize=12)
plt.legend(fontsize=13)
plt.show()

In [None]:
# Binary recode of diabet1: code 9 ("Do not Know") and missing values are both set to 0.
# .copy() makes an independent frame, so the column assignment below does not write into
# a slice of select_exam_1 (which triggers pandas' SettingWithCopyWarning).
diabet1_exam_1_2 = select_exam_1[['idno','site1c','diabet1','y']].copy()
diabet1_exam_1_2['diabet1'] = diabet1_exam_1_2['diabet1'].replace(9,0).fillna(value=0)
# Flat list of counts ordered by (site1c, y, diabet1).
list_diabet12 = diabet1_exam_1_2.groupby(['site1c','y','diabet1'])['idno'].count().to_list()


In [None]:
# Plot for recoded diabetes: counts per site, split by outcome group (y) and binary diabet1.

plt.rcParams["figure.figsize"] = (20,10)
# x positions of the first bar of every (site, y) pair; the second bar sits one bar width to the right.
barWidth = 0.45
r1 = np.array([1,2,5,6,9,10,13,14,17,18,21,22])
r2 = [x + barWidth for x in r1]

colors = ['#003F5B',
          '#A05195']

# list_diabet12 is ordered (site1c, y, diabet1): even entries are diabet1=0, odd entries diabet1=1.
# The legend labels were copied from the nSES plot; they now describe the diabetes recode,
# where 0 also contains the former "Do not Know" (9) and missing values.
plt.bar(r1, height=list_diabet12[0:24:2], color=colors[0], width=barWidth, edgecolor='white',label = 'diabet1=0 No or Do not Know')
plt.bar(r2, height=list_diabet12[1:24:2], color=colors[1], width=barWidth, edgecolor='white',label = 'diabet1=1 Yes')
plt.xticks([r + barWidth*0.5 for r in r1], ['y=0', 'y=1\nWake Forest University            \nsite 3                ','y=0', 'y=1\nColumbia University             \nsite 4                ','y=0', 'y=1\nJohns Hopkins University            \nsite 5                ','y=0', 'y=1\nUniversity of Minnesota           \nsite 6                ','y=0', 'y=1\nNorthwestern University            \nsite 7                ','y=0', 'y=1\nUCLA               \nsite 8              '],fontsize=10)
plt.yticks(fontsize=16)
# Create legend & Show graphic
plt.xlabel('Control (y=0) and Incident (y=1) Group')
plt.ylabel('Count')
plt.title('diabete recoded Distribution of different sites')
plt.legend(fontsize=13)
plt.show()

### nSES

In [None]:
# nSES counts, flattened in (site1c, y, nSES) order.
select_exam_nses = select_exam_1.loc[:, ['idno','site1c','nSES','y']]
nses_counts_by_site = select_exam_nses.groupby(['site1c','y','nSES'])['idno'].count()
list_nses = nses_counts_by_site.to_list()

In [None]:
# nSES distribution per site: two bars for every (site, y) pair.

plt.rcParams["figure.figsize"] = (20,10)
barWidth = 0.45
r1 = np.array([1,2,5,6,9,10,13,14,17,18,21,22])

colors = ['#003F5B', '#A05195']

# list_nses is ordered (site1c, y, nSES), so a stride of 2 selects one nSES value.
bar_x = r1
for offset, bar_label in enumerate(['nSES=0 Low nSES Group', 'nSES=1 High nSES Group']):
    plt.bar(bar_x, height=list_nses[offset:24:2], color=colors[offset], width=barWidth, edgecolor='white', label=bar_label)
    # Shift one bar width to the right for the second series.
    bar_x = [x + barWidth for x in bar_x]

# Ticks sit midway between the two bars of each pair.
plt.xticks([r + barWidth*0.5 for r in r1], ['y=0', 'y=1\nWake Forest University            \nsite 3                ','y=0', 'y=1\nColumbia University             \nsite 4                ','y=0', 'y=1\nJohns Hopkins University            \nsite 5                ','y=0', 'y=1\nUniversity of Minnesota           \nsite 6                ','y=0', 'y=1\nNorthwestern University            \nsite 7                ','y=0', 'y=1\nUCLA               \nsite 8              '],fontsize=10)
plt.yticks(fontsize=16)
plt.xlabel('Control (y=0) and Incident (y=1) Group')
plt.ylabel('Count')
plt.title('nSES Distribution of different sites')
plt.legend(fontsize=13)
plt.show()

In [None]:
# Overall (not per-site) nSES counts, flattened in (y, nSES) order.
list_nses2=select_exam_nses.groupby(['y','nSES'])['idno'].count().to_list()

In [None]:
# Overall nSES distribution: two bars for each outcome group.

plt.rcParams["figure.figsize"] = (20,10)
barWidth = 0.45
r1 = np.array([1,3])

colors = ['#003F5B', '#A05195']

# list_nses2 is ordered (y, nSES), so a stride of 2 selects one nSES value.
bar_x = r1
for offset, bar_label in enumerate(['nSES=0 Low nSES Group', 'nSES=1 High nSES Group']):
    plt.bar(bar_x, height=list_nses2[offset:4:2], color=colors[offset], width=barWidth, edgecolor='white', label=bar_label)
    # Shift one bar width to the right for the second series.
    bar_x = [x + barWidth for x in bar_x]

plt.xticks([r + barWidth*0.5 for r in r1], ['y=0', 'y=1'],fontsize=10)
plt.yticks(fontsize=16)
plt.xlabel('Control (y=0) and Incident (y=1) Group')
plt.ylabel('Count')
plt.title('nSES Distribution')

# Hard-coded count annotations as (x, y, text); they are not read from list_nses2.
count_annotations = [
    (0.95, 2620, "2600"),
    (1.4, 2580, "2557"),
    (2.95, 540, "515"),
    (3.4, 540, "517"),
]
for text_x, text_y, count_text in count_annotations:
    plt.text(text_x, text_y, count_text, fontsize=13)

plt.legend(fontsize=13)
plt.show()

In [None]:
# Combined 4x1 figure of the per-site distributions:
# gender, recoded race, recoded smoking status and recoded diabetes.

# The figure size must be set BEFORE the first plt.subplot call: that call creates the
# figure, and rcParams only apply to figures created afterwards. Previously the size was
# set after plt.subplot, so the figure was created with the size left over from an earlier cell.
plt.rcParams["figure.figsize"] = (58,66)

# Shared bar geometry: x position of the first bar of every (site, y) pair, second bar one width right.
barWidth = 0.45
r1 = np.array([1,2,5,6,9,10,13,14,17,18,21,22])
r2 = [x + barWidth for x in r1]

colors = ['#003F5B',
          '#A05195']

site_tick_labels = ['y=0', 'y=1\nWake Forest University            \nsite 3                ','y=0', 'y=1\nColumbia University             \nsite 4                ','y=0', 'y=1\nJohns Hopkins University            \nsite 5                ','y=0', 'y=1\nUniversity of Minnesota           \nsite 6                ','y=0', 'y=1\nNorthwestern University            \nsite 7                ','y=0', 'y=1\nUCLA               \nsite 8              ']

# One entry per panel: (flat counts ordered (site, y, category), label of category 0,
# label of category 1, panel title).
# list_cig1c must be the recoded (two-category) list, i.e. the recode cell has to run before this one.
# The gender label typo ("Feamle") and the diabetes labels copied from the nSES plot are corrected here.
panels = [
    (list_gender, 'gender=0 Female', 'gender=1 Male',
     'Plot 1: Gender Distribution of different sites'),
    (join_race2['idno'].to_list(), '0: Other', '1: Black, African-American',
     'Plot 2: Race Distribution of different sites'),
    (list_cig1c, '0: Other', '1: ever or currently smokes',
     'Plot 3: Cigarette Smoking Status Distribution of different sites'),
    (list_diabet12, 'diabet1=0 No or Do not Know', 'diabet1=1 Yes',
     'Plot 4: Diabetes Recoded Distribution of different sites'),
]

for panel_no, (counts, label_0, label_1, panel_title) in enumerate(panels, start=1):
    plt.subplot(4, 1, panel_no)
    # Even entries belong to category 0, odd entries to category 1.
    plt.bar(r1, height=counts[0:24:2], color=colors[0], width=barWidth, edgecolor='white', label=label_0)
    plt.bar(r2, height=counts[1:24:2], color=colors[1], width=barWidth, edgecolor='white', label=label_1)
    plt.xticks([r + barWidth*0.5 for r in r1], site_tick_labels, fontsize=24, fontweight='bold')
    # y-axis styling is now the same on all four panels (it differed on the fourth panel before).
    plt.yticks(fontsize=18)
    plt.xlabel('Control (y=0) and Incident (y=1) Group', fontsize=20, fontweight='bold')
    plt.ylabel('Count', fontsize=20, fontweight='bold')
    plt.title(panel_title, fontsize=32, fontweight='bold')
    plt.legend(fontsize=22)

plt.show()

## Cox Proportional Hazards Model Plots - Hazard Ratio Plots

In [None]:
# Four Cox proportional hazards fits on column subsets of mesa.
# In every model the duration column is 'fuptt' and the event column is 'y'.

# Model 1: baseline covariates only.
cox1 = mesa.loc[:, ['gender1','cig1c','diabet1','race_2','age1c','chol1','hdl1','sbp1c','fuptt','y']]
cph1 = CoxPHFitter(alpha=0.05)
cph1.fit(cox1, duration_col='fuptt', event_col='y')

# Model 2: baseline covariates plus nSES.
cox1_nSES = mesa.loc[:, ['gender1','cig1c','diabet1','race_2','age1c','nSES','chol1','hdl1','sbp1c','fuptt','y']]
cph2 = CoxPHFitter(alpha=0.05)
cph2.fit(cox1_nSES, duration_col='fuptt', event_col='y')

# Model 3: baseline covariates plus the gender_race column.
cox1_in = mesa.loc[:, ['gender1','cig1c','diabet1','race_2','age1c','chol1','hdl1','sbp1c','fuptt','gender_race','y']]
cph3 = CoxPHFitter(alpha=0.05)
cph3.fit(cox1_in, duration_col='fuptt', event_col='y')

# Model 4: baseline covariates plus nSES and the gender_race column.
cox1_in_nSES = mesa.loc[:, ['gender1','cig1c','diabet1','race_2','age1c','chol1','hdl1','nSES','sbp1c','fuptt','gender_race','y']]
cph4 = CoxPHFitter(alpha=0.05)
cph4.fit(cox1_in_nSES, duration_col='fuptt', event_col='y')

In [None]:
# 2x2 grid of hazard-ratio plots, one panel per fitted Cox model.

# Set the figure size BEFORE the first plt.subplot call: that call creates the figure,
# and rcParams only apply to figures created afterwards. Previously the size was set
# after plt.subplot, so the figure used the size left over from an earlier cell.
plt.rcParams["figure.figsize"] = (20,15)

for panel_no, fitted_model in enumerate([cph1, cph2, cph3, cph4], start=1):
    plt.subplot(2, 2, panel_no)
    # No axes are passed, so the plot is expected to land on the subplot just selected.
    fitted_model.plot(hazard_ratios=True)
    plt.title('Plot %d: Model %d Hazards Ratio' % (panel_no, panel_no), fontsize=18, fontweight='bold')

title = plt.suptitle("Cox proportional hazards models results Comparison",fontsize=24)
title.set_weight('bold')
plt.show()

## Kaplan-Meier Curve

In [None]:
# 2x2 grid of Kaplan-Meier survival curves, one panel per Cox model data frame.
# NOTE: each fit below is given only 'fuptt' and 'y', and all four frames are column
# subsets of mesa that share those two columns, so the four panels get the same input.

# Set the figure size BEFORE the first plt.subplot call: that call creates the figure,
# and rcParams only apply to figures created afterwards.
plt.rcParams["figure.figsize"] = (20,15)

# model 1: no interaction no nSES / model 2: no interaction with nSES
# model 3: with interaction no nSES / model 4: with interaction with nSES
km_frames = [cox1, cox1_nSES, cox1_in, cox1_in_nSES]

for panel_no, model_frame in enumerate(km_frames, start=1):
    plt.subplot(2, 2, panel_no)
    T = model_frame['fuptt']
    E = model_frame['y']
    kmf = KaplanMeierFitter()
    kmf.fit(T, event_observed=E)
    # The bare `kmf.survival_function_` statements were removed: in the middle of a
    # cell they displayed nothing. Make it the last line of its own cell to show the table.
    kmf.plot_survival_function()
    plt.title('Plot %d: Model %d Kaplan-Meier Curve' % (panel_no, panel_no), fontsize=18, fontweight='bold')

title = plt.suptitle("Kaplan-Meier Curve Comparison",fontsize=24)
title.set_weight('bold')
plt.show()

## G-Computation Error Bar

In [None]:
def get_causal_effect(treatment, formula, data):
    """
    Estimate the average causal effect of a binary treatment by g-computation.

    Two binomial GLMs with the same formula are fitted, one on the rows where
    `treatment` equals 1 and one on the rows where it equals 0. Both models then
    predict the outcome for every row of `data`, and the mean of the row-wise
    difference (treated prediction minus control prediction) is returned.

    Input: treatment - name of the treatment column in `data`
           formula - regression formula passed to statsmodels
           data - dataframe holding the treatment column and the formula variables
    Output: ate - average causal effect
    """
    binomial = sm.families.family.Binomial()

    treated_rows = data.loc[data[treatment] == 1]
    control_rows = data.loc[data[treatment] == 0]

    # One outcome model per treatment arm.
    model_treated = smf.glm(formula, treated_rows, family=binomial).fit()
    model_control = smf.glm(formula, control_rows, family=binomial).fit()

    # Predict for the whole data set under each arm's model and average the contrast.
    predicted_diff = model_treated.predict(data) - model_control.predict(data)
    return np.mean(predicted_diff)

In [None]:
def bootstrap_ci(treatment, formula, data, nb, ate):
    """
    Bootstrapping to get confidence intervals for the causal effect.

    Draws `nb` bootstrap samples of `data` (same size, with replacement), re-estimates
    the average causal effect on each with `get_causal_effect`, and summarises the
    resulting estimates in two ways.

    Input: treatment - name of the treatment column in `data`
           formula - regression formula passed to statsmodels
           data - dataframe holding the treatment column and the formula variables
           nb - number of bootstrap samples
           ate - point estimate used as the centre of the normal-approximation interval
    Output: ci_perc - 2.5th and 97.5th percentiles of the bootstrap estimates
            ci_approx - [ate - 1.96*se, ate + 1.96*se] rounded to 6 decimals, where se is
                        the sample standard deviation (ddof=1) of the bootstrap estimates
    """
    ate_rs = []
    for i in range(nb):
        # The seed depends on the draw index, so every call produces the same samples.
        d_star = data.sample(n=data.shape[0],  # Same size as input data
                             replace=True, random_state=29*i)  # Draw with replacement
        # Reuse the point estimator instead of repeating its two-model fit here.
        ate_rs.append(get_causal_effect(treatment, formula, d_star))

    ci_perc = np.percentile(ate_rs, q=[2.5, 97.5])
    ate_se = np.std(ate_rs, ddof=1)
    ci_approx = np.round([ate - 1.96*ate_se,
                          ate + 1.96*ate_se],6)
    return ci_perc, ci_approx

In [None]:
# G-computation on the full table: effect of nSES on y with 1000 bootstrap draws.
treatment = 'nSES'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) + C(race_2)"
data = mesa.loc[:, ['nSES','y','cig1c','diabet1','age1c','sbp1c','hdl1','chol1','gender1','race_2']].copy()

ate = get_causal_effect(treatment, formula, data)
ci_perc, ci_approx = bootstrap_ci(treatment, formula, data, nb=1000, ate=ate)

In [None]:
# G-computation restricted to site 3: effect of nSES on y with 1000 bootstrap draws.
treatment = 'nSES'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) + C(race_2)"
select_exam_2 = mesa[mesa['site1c'] == 3]
data = select_exam_2.loc[:, ['nSES','y','cig1c','diabet1','age1c','sbp1c','hdl1','chol1','gender1','race_2']].copy()

ate3 = get_causal_effect(treatment, formula, data)
ci_perc_3, ci_approx_3 = bootstrap_ci(treatment, formula, data, nb=1000, ate=ate3)

In [None]:
# G-computation restricted to site 4: effect of nSES on y with 1000 bootstrap draws.
treatment = 'nSES'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) + C(race_2)"
select_exam_2 = mesa[mesa['site1c'] == 4]
data = select_exam_2.loc[:, ['nSES','y','cig1c','diabet1','age1c','sbp1c','hdl1','chol1','gender1','race_2']].copy()

ate4 = get_causal_effect(treatment, formula, data)
ci_perc_4, ci_approx_4 = bootstrap_ci(treatment, formula, data, nb=1000, ate=ate4)

In [None]:
# G-computation restricted to site 5: effect of nSES on y with 1000 bootstrap draws.
treatment = 'nSES'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) + C(race_2)"
select_exam_2 = mesa[mesa['site1c'] == 5]
data = select_exam_2.loc[:, ['nSES','y','cig1c','diabet1','age1c','sbp1c','hdl1','chol1','gender1','race_2']].copy()

ate5 = get_causal_effect(treatment, formula, data)
ci_perc_5, ci_approx_5 = bootstrap_ci(treatment, formula, data, nb=1000, ate=ate5)

In [None]:
# G-computation restricted to site 7: effect of nSES on y with 1000 bootstrap draws.
treatment = 'nSES'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) + C(race_2)"
select_exam_2 = mesa[mesa['site1c'] == 7]
data = select_exam_2.loc[:, ['nSES','y','cig1c','diabet1','age1c','sbp1c','hdl1','chol1','gender1','race_2']].copy()

ate7 = get_causal_effect(treatment, formula, data)
ci_perc_7, ci_approx_7 = bootstrap_ci(treatment, formula, data, nb=1000, ate=ate7)

In [None]:
# G-computation restricted to site 8: effect of nSES on y with 1000 bootstrap draws.
treatment = 'nSES'
formula = "y ~  age1c + sbp1c + hdl1 + chol1 + C(cig1c) + C(diabet1) + C(gender1) + C(race_2)"
select_exam_2 = mesa[mesa['site1c'] == 8]
data = select_exam_2.loc[:, ['nSES','y','cig1c','diabet1','age1c','sbp1c','hdl1','chol1','gender1','race_2']].copy()

ate8 = get_causal_effect(treatment, formula, data)
ci_perc_8, ci_approx_8 = bootstrap_ci(treatment, formula, data, nb=1000, ate=ate8)

In [None]:
# Point estimates of the causal effect with their bootstrap percentile intervals
# (2.5th to 97.5th percentile), overall and per site.
plt.rcParams["figure.figsize"] = (10,6)

x = [1, 3, 5, 7, 9, 11]
y = [ate, ate3, ate4, ate5, ate7, ate8]
percentile_cis = [ci_perc, ci_perc_3, ci_perc_4, ci_perc_5, ci_perc_7, ci_perc_8]

# plt.errorbar draws each bar from y - lower to y + upper, so yerr must hold the
# distances from the point estimate to each interval end. Passing the full width
# (upper end minus lower end) as a symmetric error made every bar twice as long as
# the interval. Distances are floored at 0 because errorbar rejects negative values;
# that only matters if a point estimate lies outside its own percentile interval.
lower_dist = [max(estimate - ci[0], 0) for estimate, ci in zip(y, percentile_cis)]
upper_dist = [max(ci[1] - estimate, 0) for estimate, ci in zip(y, percentile_cis)]
errors = [lower_dist, upper_dist]

plt.figure()
plt.errorbar(x, y, yerr=errors, fmt = 'o', color = 'k')
plt.xticks((1, 3, 5, 7, 9, 11), ('Overall', 'site3', 'site4', 'site5', 'site7', 'site8'))
plt.ylabel('Estimated average causal effect of nSES on y')
plt.title('G-computation estimates with bootstrap percentile intervals')
plt.show()

In [None]:
# Point estimates of the causal effect with their normal-approximation intervals
# (estimate +/- 1.96 bootstrap standard errors), overall and per site.
plt.rcParams["figure.figsize"] = (10,6)

x = [1, 3, 5, 7, 9, 11]
y = [ate, ate3, ate4, ate5, ate7, ate8]
approx_cis = [ci_approx, ci_approx_3, ci_approx_4, ci_approx_5, ci_approx_7, ci_approx_8]

# plt.errorbar draws each bar from y - err to y + err, so err must be the HALF width
# of the interval. Passing the full width made every bar twice as long as the interval.
errors = [(ci[1] - ci[0]) / 2 for ci in approx_cis]

plt.figure()
plt.errorbar(x, y, yerr=errors, fmt = 'o', color = 'k')
plt.xticks((1, 3, 5, 7, 9, 11), ('Overall', 'site3', 'site4', 'site5', 'site7', 'site8'))
plt.ylabel('Estimated average causal effect of nSES on y')
plt.title('G-computation estimates with normal-approximation intervals')
plt.show()