In [1]:
import os
import warnings

import numpy as np
import pandas as pd
from statsmodels.regression.mixed_linear_model import MixedLM
# Notebook-wide settings.
# NOTE(review): every warning category is silenced here, which also hides
# optimiser and pandas warnings raised by later cells.
warnings.simplefilter('ignore')

# Lift the pandas display limits so wide/long frames are printed in full.
for display_option in ('display.max_columns', 'display.max_rows'):
    pd.set_option(display_option, None)

In [2]:
# Read the tab-separated data file and show its (rows, columns) shape.
data_path = './data/DS0001/04690-0001-Data.tsv'
raw = pd.read_csv(data_path, sep='\t')
raw.shape

(3617, 4564)

## Dataset sampling

- Originally 3617 respondents
- Listwise deletion to 1640 participants (note: the pipeline in this notebook ends with 1585 rows)
- Use the `dropna()` function for listwise deletion
- All of these participants had a spouse, a child, and friends when responding to the survey

In [3]:
# Keep respondents who are married and report at least one friend and one child.
# V2060: Married == 1
# V2225: # of Friends > 0
# V2017: # of Children > 0
is_married = raw['V2060'] == 1
has_friends = raw['V2225'] > 0
has_children = raw['V2017'] > 0

# Later cells add and recode columns on `df`; taking an explicit copy keeps
# those edits from being treated as writes to a slice of `raw`.
df = raw[is_married & has_friends & has_children].copy()
print(len(df))

1768


In [4]:
# True if any cell of the filtered frame is NaN.
df.isna().to_numpy().any()

False

## Dependent Variables

Positive emotional state

- “In the past week: I felt happy” - H1E V1006
- “In the past week: I enjoyed life” - H1J V1010
- The two variables are combined to V2620


Negative emotional state
- “In the past week: I felt sad” - H1M V1012
- “In the past week: I felt depressed” - H1A V1002
- “In the past week: I felt lonely” - H1F V1007

In [5]:
# Survey codes that are treated as a missing answer.
missing_val = {-95: np.nan, -96: np.nan, -99: np.nan}

positive_items = ['V1006', 'V1010']            # felt happy, enjoyed life
negative_items = ['V1012', 'V1002', 'V1007']   # felt sad, depressed, lonely
emotion_items = positive_items + negative_items


def _min_max_scale(series):
    """Linearly rescale a Series so its minimum maps to 0 and its maximum to 1."""
    low, high = series.min(), series.max()
    return (series - low) / (high - low)


# Recode into a new frame instead of mutating the filtered frame in place,
# then drop every respondent with a missing answer on any emotion item.
df = df.replace({item: missing_val for item in emotion_items})
df = df.dropna(axis='index', subset=emotion_items, how='any')

# Each score is the mean of its items, normalised to the [0, 1] range.
df['positive_emotion'] = _min_max_scale(df[positive_items].mean(axis=1))
df['negative_emotion'] = _min_max_scale(df[negative_items].mean(axis=1))
print(len(df))

1740


## Independent Variables

Spouse
- Love and care: C2 V405
- Willing to listen: C4 V407
- Too many demands: C3 V406
- Critical of you: C5 V408

Child
- Love and care: C14 V431
- Willing to listen: C16 V433
- Too many demands: C15 V432
- Critical of you: C17 V434

Friends
- Love and care: C47 V535
- Willing to listen: C49 V537
- Too many demands: C48 V536
- Critical of you: C50 V538

In [7]:
# Copy the support/strain measures into descriptively named columns.
# NOTE(review): the markdown above lists item-level variables (V405, V406, ...)
# while this cell reads V2204-V2217 - presumably pre-built indices; confirm
# against the codebook.
relationship_measures = {
    'spouse_support': 'V2204',
    'spouse_strain': 'V2205',
    'child_support': 'V2207',
    'child_strain': 'V2208',
    'friend_support': 'V2216',
    'friend_strain': 'V2217',
}
for measure_name, survey_column in relationship_measures.items():
    df[measure_name] = df[survey_column]

In [8]:
# True if any cell is NaN. This only detects NaN values; survey missing codes
# that have not been recoded to NaN are not caught by this check.
df.isna().to_numpy().any()

False

In [9]:
# Number of respondents remaining at this stage.
len(df.index)

1740

## Control Variables

- Confidants: C54 V546
- Age: A0C1 V104
- Income: R27 V1730 (the code below reads V2020)
- Sex: AOB1 V103
- Education: R13 V1646 (the code below reads V2007)
    - Less than HS
    - HS diploma
    - College degree
    - 4-year degree
    - Graduate degree
- Retired: J1E V1105
- Age category
    - Age <45
    - Age 45-65
    - Age >65
- Number of Children
    - Children in HH: V105, V108, V111, V114, V117, V120, V123, V126, V129, V132, V135 -> Check each column 
    - Children elsewhere: V202



In [10]:
def age_categorize(row):
    """Map a row's ``age`` value to an age-band label.

    Returns 'under 45' for ages below 45, '45-65' for ages from 45 to 65
    inclusive, and 'over 65' for ages above 65. If no comparison holds
    (for example when ``age`` is NaN) the result is None.
    """
    years = row['age']
    if years < 45:
        return 'under 45'
    if 45 <= years <= 65:
        return '45-65'
    if years > 65:
        return 'over 65'
    return None
    
def education_categorize(row):
    """Map a row's ``education`` value to an education-level code ('1'-'5').

    '1' = less than high school (< 12), '2' = high school diploma (12-13),
    '3' = college degree (14-15), '4' = 4-year degree (16),
    '5' = graduate degree (>= 17). If no comparison holds (for example when
    ``education`` is NaN) the result is None.
    """
    years = row['education']
    # Exclusive upper bounds, checked in ascending order.
    upper_bounds = ((12, '1'), (14, '2'), (16, '3'), (17, '4'))
    for upper_bound, code in upper_bounds:
        if years < upper_bound:
            return code
    if years >= 17:
        return '5'
    return None

In [11]:
# Survey codes treated as missing, and the recode for the retirement check box.
missing_val = {-95: np.nan, -96: np.nan, -99: np.nan}
check_box = {1: 1, 5: 0}

# Survey column -> analysis column name.
control_columns = {
    'V546': 'confidants',
    'V104': 'age',
    'V2020': 'income',
    'V103': 'sex',
    'V2007': 'education',
    'V1105': 'retired',
    'V2017': 'num_child',
}

# Every control gets the missing-code recode except the retirement check box,
# which is mapped to 0/1 instead. Recode into a new frame rather than in place.
recode_rules = {survey_column: missing_val for survey_column in control_columns}
recode_rules['V1105'] = check_box
df = df.replace(recode_rules)

df = df.assign(**{name: df[survey_column]
                  for survey_column, name in control_columns.items()})

# Listwise deletion on the control variables (each column listed once).
df = df.dropna(axis='index', subset=list(control_columns.values()), how='any')

# The categorisers read a single field from each row, so hand them a
# one-column frame instead of materialising every full-width row.
df = df.assign(
    age_group=df[['age']].apply(age_categorize, axis=1),
    education_group=df[['education']].apply(education_categorize, axis=1),
)

In [12]:
# True if any cell is still NaN after the control-variable cleaning.
df.isna().to_numpy().any()

False

In [13]:
# Number of respondents remaining after listwise deletion on the controls.
len(df.index)

1585

In [14]:
# Build the modelling frame: outcomes, relationship measures, and controls.
outcome_columns = ['positive_emotion', 'negative_emotion']
relationship_columns = ['spouse_support', 'spouse_strain',
                        'child_support', 'child_strain',
                        'friend_support', 'friend_strain']
control_columns_out = ['confidants', 'age', 'age_group', 'income', 'sex',
                       'education', 'education_group', 'retired', 'num_child']

selected_columns = outcome_columns + relationship_columns + control_columns_out
data = df.loc[:, selected_columns].copy(deep=True)

print(data.shape)
data.head()

(1585, 17)


Unnamed: 0,positive_emotion,negative_emotion,spouse_support,spouse_strain,child_support,child_strain,friend_support,friend_strain,confidants,age,age_group,income,sex,education,education_group,retired,num_child
1,0.5,0.0,0.3309,1.3306,-0.4619,1.8144,-2.045698,0.3005,1.0,44.0,under 45,9,1,12,2,0,2
9,1.0,0.333333,0.3309,-1.3283,-0.1654,-0.442,0.1147,0.2565,4.0,73.0,over 65,2,1,9,1,1,3
11,0.5,0.166667,-0.4141,-0.7946,-0.1654,-0.4823,-0.4013,0.9053,3.0,47.0,45-65,7,1,10,1,0,5
12,1.0,0.0,0.8749,-0.2641,0.9599,-1.026199,1.243099,-0.9531,2.0,48.0,45-65,7,1,14,3,1,3
13,1.0,0.166667,0.3309,-0.7946,0.9599,-1.026199,-0.4978,1.4661,7.0,55.0,45-65,7,2,5,1,0,3


## Statistical Hybrid Model

In [16]:
# Number of rows in the modelling frame.
len(data.index)

1585

In [18]:
# Mixed linear model of positive emotion on the six relationship measures,
# grouped by age band. MixedLM is imported in the notebook's first cell.

# Variance components: each control variable enters as a categorical term
# without an intercept.
control_terms = ['confidants', 'age', 'education_group', 'income',
                 'sex', 'retired', 'num_child']
vc = {term: f'0 + C({term})' for term in control_terms}

model1 = MixedLM.from_formula(
    'positive_emotion ~ spouse_support + spouse_strain + child_support + child_strain + friend_support + friend_strain',
    vc_formula=vc, data=data, groups=data['age_group'])

result1 = model1.fit()

# Warnings are silenced globally in this notebook, so report a failed
# optimisation explicitly instead of relying on the summary table alone.
if not result1.converged:
    print('WARNING: model1 did not converge; interpret the estimates with caution.')

result1.summary()




0,1,2,3
Model:,MixedLM,Dependent Variable:,positive_emotion
No. Observations:,1585,Method:,REML
No. Groups:,3,Scale:,0.0508
Min. group size:,378,Log-Likelihood:,-25.7399
Max. group size:,621,Converged:,No
Mean group size:,528.3,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.847,0.120,7.049,0.000,0.612,1.083
spouse_support,0.033,0.007,5.082,0.000,0.020,0.046
spouse_strain,-0.010,0.007,-1.457,0.145,-0.024,0.003
child_support,0.015,0.005,2.981,0.003,0.005,0.026
child_strain,-0.015,0.005,-2.966,0.003,-0.025,-0.005
friend_support,0.022,0.007,3.060,0.002,0.008,0.036
friend_strain,-0.009,0.007,-1.256,0.209,-0.023,0.005
age Var,0.007,,,,,
confidants Var,0.003,,,,,


In [20]:
# Mixed linear model of negative emotion on the six relationship measures,
# grouped by age band. MixedLM is imported in the notebook's first cell.

# Variance components: each control variable enters as a categorical term
# without an intercept.
control_terms = ['confidants', 'age', 'education_group', 'income',
                 'sex', 'retired', 'num_child']
vc = {term: f'0 + C({term})' for term in control_terms}

model2 = MixedLM.from_formula(
    'negative_emotion ~ spouse_support + spouse_strain + child_support + child_strain + friend_support + friend_strain',
    vc_formula=vc, data=data, groups=data['age_group'])

result2 = model2.fit()

# Warnings are silenced globally in this notebook, so report a failed
# optimisation explicitly instead of relying on the summary table alone.
if not result2.converged:
    print('WARNING: model2 did not converge; interpret the estimates with caution.')

result2.summary()




0,1,2,3
Model:,MixedLM,Dependent Variable:,negative_emotion
No. Observations:,1585,Method:,REML
No. Groups:,3,Scale:,0.0377
Min. group size:,378,Log-Likelihood:,165.3430
Max. group size:,621,Converged:,No
Mean group size:,528.3,,

0,1,2,3,4,5,6
,Coef.,Std.Err.,z,P>|z|,[0.025,0.975]
Intercept,0.177,0.105,1.689,0.091,-0.028,0.383
spouse_support,-0.025,0.006,-4.508,0.000,-0.036,-0.014
spouse_strain,0.017,0.006,2.837,0.005,0.005,0.028
child_support,-0.010,0.004,-2.202,0.028,-0.018,-0.001
child_strain,0.009,0.004,2.158,0.031,0.001,0.018
friend_support,-0.011,0.006,-1.873,0.061,-0.023,0.001
friend_strain,0.021,0.006,3.465,0.001,0.009,0.033
age Var,0.076,,,,,
confidants Var,0.001,,,,,
