In [1]:
import pandas as pd
import numpy as np
import os
import warnings
# Silence every Python warning for the rest of the session.
warnings.filterwarnings(action='ignore')
# Lift pandas' display truncation so wide/long frames render in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

#### Prepare data

In [2]:
# Tab-separated raw data file; the path is relative to the notebook's directory.
RAW_DATA_PATH = '../data/DS0001/04690-0001-Data.tsv'
data = pd.read_csv(RAW_DATA_PATH, sep='\t')
data.shape

(3617, 4564)

In [3]:
# Raw variable code -> readable column name. Insertion order matters:
# it determines the column order of the selected frame.
feature_map = {
    "V2102": "Race",
    "V103": "Sex",
    "V104": "Age",
    "V2007": "Education",
    "V2036": "Income",
    "V4942": "Smoking",
    "V2623": "BMI",
    "V2612": "Chronic medical conditions",
    "V13214": "Physical Activity Index",
    "V2203": "Depressive symptomatology",
    "V914": "General health",
    "V5859": "Weight",
}

In [4]:
# Select the relevant columns and give them readable names.
# `rename` returns a new frame, so nothing is modified in place on a frame
# derived from `data` (the in-place version raises SettingWithCopyWarning,
# which the global warnings filter was hiding). `list(feature_map)` passes an
# explicit list of column labels rather than a dict-keys view.
data_select_cols = data[list(feature_map)].rename(columns=feature_map)
data_select_cols.shape


(3617, 12)

In [5]:
data_select_cols.head()

Unnamed: 0,Race,Sex,Age,Education,Income,Smoking,BMI,Chronic medical conditions,Physical Activity Index,Depressive symptomatology,General health,Weight
0,2,2,69,3,2.5,5,31.6258,4,-99.0,-1.9058,1,5804.5
1,1,1,44,12,70.0,5,32.98301,0,0.910288,0.2119,5,90419.33
2,1,1,75,9,7.5,5,24.36586,1,-99.0,1.4984,1,24537.78
3,1,1,25,10,12.5,-99,21.92676,0,0.194457,-0.2067,5,-99.0
4,1,2,30,14,12.5,5,18.89454,0,-1.655802,-0.2433,5,41895.12


In [6]:
# Construct additional columns and translate data.
# Keep only rows whose Race code is 1 or 2 (1 = white, 2 = Black). The explicit
# .copy() makes data_sample an independent frame, so the column assignments
# below never write into a slice of data_select_cols.
is_white_or_black = data_select_cols["Race"].isin([1, 2])
data_sample = data_select_cols.loc[is_white_or_black].copy()

# Vectorized 0/1 indicators (same values as the former row-wise lambdas).
# NOTE(review): some columns use -99 as a placeholder (visible in the head()
# output). Any such value in Education, BMI or General health fails these
# conditions and is silently coded 0 -- confirm that is intended.
data_sample["Educational Attainment"] = (data_sample["Education"] >= 12).astype(int)
data_sample["Obese"] = (data_sample["BMI"] > 30).astype(int)
data_sample["General health binary"] = data_sample["General health"].isin([1, 2, 3]).astype(int)

## Exploring weighted statistics
On page 4, in paragraph 1 of "Statistical Analysis," they note that "Due to the complex survey design (due to the multistage sampling that involved clustering and stratification) of the ACL, Taylor series linearization was used
to estimate the standard errors using the sampling weights (due to stratification, clustering, and non-response). **All proportions have applied weights**. Mean and frequency tables were used to describe our sample."

Took me a while, but I eventually found the *final* weights variable (I think!):
https://www.icpsr.umich.edu/web/ICPSR/studies/04690/datasets/0001/variables/V5859?archive=icpsr

In [7]:
# Summary statistics of the weights, excluding rows that carry the -99.0 placeholder.
has_weight = data_sample['Weight'] != -99.0
data_sample.loc[has_weight, 'Weight'].describe()

count      2678.000000
mean      46968.375631
std       39885.337964
min        2755.920000
25%       18465.635000
50%       32795.110000
75%       67836.967500
max      289132.259998
Name: Weight, dtype: float64

In [8]:
# Hypothesis: "inapplicable weights" are stored as -99.0, and DescrStatsW
# does not treat those properly. So give those examples an explicit weight
# in order to get the results from the paper...
INAPPLICABLE_WEIGHT = -99.0
# Hand-picked replacement weight for the placeholder rows. *shrug*
IMPUTED_WEIGHT = 12900.0

weight_is_inapplicable = data_sample['Weight'] == INAPPLICABLE_WEIGHT
data_sample.loc[weight_is_inapplicable, 'Weight'] = IMPUTED_WEIGHT

# Imputing the mean weight was tried as well, but that ended up
# over-weighting things; the attempt is kept here for reference.
# NOTE: I checked, no weights were 0.0, so we are safe to do this
# inapp_size = len(data_sample.loc[data_sample['Weight'] == -99.0])
# mean_weight = sum(data_sample['Weight'])/(len(data_sample['Weight'])-inapp_size)

In [9]:
import statsmodels as sm
import statsmodels.stats.weightstats

np.set_printoptions(precision=3, suppress=True)

# Use ONE set of rows for the data and for the weights. Previously the CIs were
# computed on data_sample.dropna() with weights taken from the un-filtered
# frame, which fails with a length mismatch as soon as dropna() removes a row.
complete_rows = data_sample.dropna()
weighted_stats = sm.stats.weightstats.DescrStatsW(complete_rows, weights=complete_rows['Weight'])

means = np.around(weighted_stats.mean, 3)

# tconfint_mean() returns (lower bounds, upper bounds), one entry per column.
ci_lower, ci_upper = weighted_stats.tconfint_mean()
# Materialize the pairs as a list of (lower, upper) tuples; a bare zip object
# is a one-shot iterator and is fragile as DataFrame input.
CIs = list(zip(np.around(ci_lower, 3), np.around(ci_upper, 3)))

# NOTE(review): DescrStatsW treats weights as frequency weights (the effective
# sample size is the sum of the weights). With survey weights in the tens of
# thousands the intervals collapse to almost zero width, as the output shows.
# These are not the Taylor-linearized, design-based CIs the paper describes.
# NOTE(review): columns that still contain -99 placeholders (e.g. Smoking,
# Physical Activity Index) have those values averaged in -- confirm/clean first.

# Row 0: weighted means, row 1: (lower, upper) confidence-interval tuples.
weight_adjusted_means = pd.DataFrame(data=[list(means), CIs], columns=complete_rows.columns)
weight_adjusted_means

Unnamed: 0,Race,Sex,Age,Education,Income,Smoking,BMI,Chronic medical conditions,Physical Activity Index,Depressive symptomatology,General health,Weight,Educational Attainment,Obese,General health binary
0,1.135,1.53,47.759,12.526,31.204,-2.759,25.658,1.045,-35.112,0.028,4.671,76379.6,0.759,0.151,0.066
1,"(1.135, 1.135)","(1.53, 1.53)","(47.756, 47.762)","(12.525, 12.526)","(31.2, 31.209)","(-2.764, -2.755)","(25.658, 25.659)","(1.045, 1.045)","(-35.12, -35.104)","(0.027, 0.028)","(4.671, 4.672)","(76371.16, 76387.998)","(0.759, 0.759)","(0.151, 0.152)","(0.066, 0.066)"


#### Print descriptive statistics (table 1)

In [10]:
# Split the pooled sample by Race code: 1 -> white_sample, 2 -> black_sample.
race_code = data_sample["Race"]
white_sample = data_sample.loc[race_code == 1]
black_sample = data_sample.loc[race_code == 2]

In [11]:
def print_mean(col_name):
    """Print the mean of `col_name` for the pooled, white and Black samples.

    Reads the notebook-level frames `data_sample`, `white_sample` and
    `black_sample`. The mean is the plain (unweighted) `Series.mean()`,
    rounded to two decimals; the confidence interval is printed as a
    literal "?" placeholder.
    """
    def report(label, frame):
        # One tab-indented output line per group.
        group_mean = round(frame[col_name].mean(), 2)
        print(f"\t{label}: {group_mean} (mean), ?(95% CI)")

    report("Pooled sample", data_sample)
    report("White individuals", white_sample)
    report("Black individuals", black_sample)

In [12]:
def print_percent(col_name, value):
    """Print the percentage of rows where `col_name == value`, per group.

    Reads the notebook-level frames `data_sample`, `white_sample` and
    `black_sample`. The percentage is an unweighted row count divided by the
    total number of rows in the group, rounded to two decimals; the
    confidence interval is printed as a literal "?" placeholder.
    """
    def report(label, frame):
        # Rows matching the requested code, as a share of all rows in the group.
        matching_rows = frame[frame[col_name] == value].shape[0]
        share = round(matching_rows / frame.shape[0] * 100, 2)
        print(f"\t{label}: {share} (mean), ?(95% CI)")

    report("Pooled sample", data_sample)
    report("White individuals", white_sample)
    report("Black individuals", black_sample)

In [13]:
# Row counts of the pooled sample and of each race subgroup.
pooled_n = len(data_sample)
white_n = len(white_sample)
black_n = len(black_sample)
print("Sample size")
print(f"\tPooled sample size n = {pooled_n}")
print(f"\tWhite individuals n = {white_n}")
print(f"\tBlack individuals n = {black_n}")

Sample size
	Pooled sample size n = 3361
	White individuals n = 2205
	Black individuals n = 1156


In [14]:
# Table 1 row: unweighted mean of the "Age" column for each group.
print("Age (Years)")
print_mean("Age")

Age (Years)
	Pooled sample: 54.0 (mean), ?(95% CI)
	White individuals: 54.82 (mean), ?(95% CI)
	Black individuals: 52.42 (mean), ?(95% CI)


In [15]:
# Table 1 row: share of rows with Sex == 1 (reported here as "Men").
print("Gender: Men")
print_percent("Sex", 1)

Gender: Men
	Pooled sample: 37.34 (mean), ?(95% CI)
	White individuals: 39.14 (mean), ?(95% CI)
	Black individuals: 33.91 (mean), ?(95% CI)


In [16]:
# Table 1 row: share of rows with Sex == 2 (reported here as "Women").
print("Gender: Women")
print_percent("Sex", 2)

Gender: Women
	Pooled sample: 62.66 (mean), ?(95% CI)
	White individuals: 60.86 (mean), ?(95% CI)
	Black individuals: 66.09 (mean), ?(95% CI)


In [17]:
# Table 1 row: "Educational Attainment" is 0 when Education < 12 (derived column).
print("Education: 11 years or less")
print_percent("Educational Attainment", 0)

Education: 11 years or less
	Pooled sample: 36.57 (mean), ?(95% CI)
	White individuals: 28.48 (mean), ?(95% CI)
	Black individuals: 51.99 (mean), ?(95% CI)


In [18]:
# Table 1 row: "Educational Attainment" is 1 when Education >= 12 (derived column).
print("Education: 12 years or more")
print_percent("Educational Attainment", 1)

Education: 12 years or more
	Pooled sample: 63.43 (mean), ?(95% CI)
	White individuals: 71.52 (mean), ?(95% CI)
	Black individuals: 48.01 (mean), ?(95% CI)


In [19]:
# Table 1 row: unweighted mean of the raw "Education" column for each group.
print("Education")
print_mean("Education")

Education
	Pooled sample: 11.58 (mean), ?(95% CI)
	White individuals: 12.21 (mean), ?(95% CI)
	Black individuals: 10.36 (mean), ?(95% CI)


In [20]:
# Table 1 row: unweighted mean of the "Income" column for each group.
# The label previously lacked its closing parenthesis ("Income ($1000").
print("Income ($1000)")
print_mean("Income")

Income ($1000
	Pooled sample: 23.6 (mean), ?(95% CI)
	White individuals: 27.43 (mean), ?(95% CI)
	Black individuals: 16.3 (mean), ?(95% CI)


In [21]:
# Table 1 row: unweighted mean of the raw "General health" codes for each group.
print("Self-Rated Health")
print_mean("General health")

Self-Rated Health
	Pooled sample: 4.51 (mean), ?(95% CI)
	White individuals: 4.58 (mean), ?(95% CI)
	Black individuals: 4.37 (mean), ?(95% CI)


In [22]:
# Table 1 row: unweighted mean of "Chronic medical conditions" for each group.
print("Chronic Medical Conditions")
print_mean("Chronic medical conditions")

Chronic Medical Conditions
	Pooled sample: 1.42 (mean), ?(95% CI)
	White individuals: 1.32 (mean), ?(95% CI)
	Black individuals: 1.61 (mean), ?(95% CI)


In [23]:
# Table 1 row: "General health binary" is 1 when General health is 1, 2 or 3.
# NOTE(review): only ~11.5% of the pooled sample lands in this category --
# verify the direction of the General health coding against the codebook.
print("Self-Rated Health: Good or Excellent")
print_percent("General health binary", 1)

Self-Rated Health: Good or Excellent
	Pooled sample: 11.51 (mean), ?(95% CI)
	White individuals: 9.3 (mean), ?(95% CI)
	Black individuals: 15.74 (mean), ?(95% CI)


In [24]:
# Table 1 row: "General health binary" is 0 when General health is not 1, 2 or 3.
# NOTE(review): ~88.5% of the pooled sample lands in this category --
# verify the direction of the General health coding against the codebook.
print("Self-Rated Health: Poor or fair")
print_percent("General health binary", 0)

Self-Rated Health: Poor or fair
	Pooled sample: 88.49 (mean), ?(95% CI)
	White individuals: 90.7 (mean), ?(95% CI)
	Black individuals: 84.26 (mean), ?(95% CI)


In [25]:
# Table 1 row: share of rows with Smoking == 1.
# NOTE(review): the meaning of code 1 is not shown in this notebook, and the
# denominator includes rows holding the -99 placeholder -- confirm both.
print("Smoking")
print_percent("Smoking", 1)

Smoking
	Pooled sample: 19.67 (mean), ?(95% CI)
	White individuals: 18.59 (mean), ?(95% CI)
	Black individuals: 21.71 (mean), ?(95% CI)


In [26]:
# Table 1 row: "Obese" is 0 when BMI is not above 30 (derived column).
print("Obesity: No")
print_percent("Obese", 0)

Obesity: No
	Pooled sample: 82.33 (mean), ?(95% CI)
	White individuals: 86.08 (mean), ?(95% CI)
	Black individuals: 75.17 (mean), ?(95% CI)


In [27]:
# Table 1 row: "Obese" is 1 when BMI > 30 (derived column).
print("Obesity: Yes")
print_percent("Obese", 1)

Obesity: Yes
	Pooled sample: 17.67 (mean), ?(95% CI)
	White individuals: 13.92 (mean), ?(95% CI)
	Black individuals: 24.83 (mean), ?(95% CI)
