In [13]:
from __future__ import print_function, division

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display

import thinkstats2_mod
import thinkplot

import statsmodels.formula.api as smf

In [14]:
dct = thinkstats2_mod.ReadStataDct('GSS/GSS.dct')
df = dct.ReadFixedWidth('GSS/GSS.dat')
print('Total number of respondents: %i' %(df.shape[0] + 1))

Total number of respondents: 62467


In [15]:
# Filter out years without relevant data
df = df[df.year >= 1974]
print('Number of respondents left: %i' %(df.shape[0] + 1))

# Filter out invalid polviews
df = df[df.polviews <= 7]
df = df[df.polviews > 0]

# Center polviews likert scale around 0
df['polviews'] = np.subtract(df['polviews'], 4)
print('Number of respondents left: %i' %(df.shape[0] + 1))

# Filter out invalid age respondants
df = df[df.age <= 89]
print('Number of respondents left: %i' %(df.shape[0] + 1))

# Filter out marital status no answers
df = df[df.marital != 9]
print('Number of respondents left: %i' %(df.shape[0] + 1))

# Filter out highest education degree invalid
df = df[df.degree <= 4]

# Recode religion to make coding more consistent through GSS years
df['relig_recode'] = df.relig
#recode Christians who are not Catholic as '1' (replacing Protestant code)
df.relig_recode.replace([10, 11, 13], 1, inplace=True)
#recode Buddhism, Hinduism, Other eastern, Moslem/islam, Native american, Inter-nondenominational
df.relig_recode.replace([6, 7, 8, 9, 12], 5, inplace=True)
#remove don't know/don't answer respondants
df = df[df.relig_recode <= 5]

# Square income cause why not
df['coninc'] = df['coninc'] ** 3

# For more information about SEI scores http://gss.norc.org/Documents/reports/methodological-reports/MR074.pdf

# Maybe try to fill in values?

df_control = df.copy(deep=True)

Number of respondents left: 59350
Number of respondents left: 53082
Number of respondents left: 52916
Number of respondents left: 52905


In [16]:
formula = 'df_control.polviews ~ df_control.marital'
model = smf.ols(formula, data=df_control)
results = model.fit()
params_marital = results.params
results.summary()

0,1,2,3
Dep. Variable:,df_control.polviews,R-squared:,0.02
Model:,OLS,Adj. R-squared:,0.02
Method:,Least Squares,F-statistic:,1089.0
Date:,"Sun, 23 Apr 2017",Prob (F-statistic):,1.89e-236
Time:,12:24:24,Log-Likelihood:,-91036.0
No. Observations:,52664,AIC:,182100.0
Df Residuals:,52662,BIC:,182100.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3831,0.010,36.929,0.000,0.363,0.403
df_control.marital,-0.1208,0.004,-33.004,0.000,-0.128,-0.114

0,1,2,3
Omnibus:,380.059,Durbin-Watson:,1.912
Prob(Omnibus):,0.0,Jarque-Bera (JB):,276.011
Skew:,-0.069,Prob(JB):,1.16e-60
Kurtosis:,2.673,Cond. No.,5.38


In [17]:
formula = 'df_control.polviews ~ df_control.marital + df_control.age'
model = smf.ols(formula, data=df_control)
results = model.fit()
params_age = results.params
results.summary()

0,1,2,3
Dep. Variable:,df_control.polviews,R-squared:,0.026
Model:,OLS,Adj. R-squared:,0.026
Method:,Least Squares,F-statistic:,691.4
Date:,"Sun, 23 Apr 2017",Prob (F-statistic):,4.18e-297
Time:,12:24:24,Log-Likelihood:,-90892.0
No. Observations:,52664,AIC:,181800.0
Df Residuals:,52661,BIC:,181800.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0564,0.022,2.577,0.010,0.013,0.099
df_control.marital,-0.1002,0.004,-26.038,0.000,-0.108,-0.093
df_control.age,0.0061,0.000,16.957,0.000,0.005,0.007

0,1,2,3
Omnibus:,367.456,Durbin-Watson:,1.913
Prob(Omnibus):,0.0,Jarque-Bera (JB):,273.972
Skew:,-0.078,Prob(JB):,3.22e-60
Kurtosis:,2.683,Cond. No.,183.0


In [18]:
formula = 'df_control.polviews ~ df_control.marital + df_control.sex'
model = smf.ols(formula, data=df_control)
results = model.fit()
params_sex = results.params
results.summary()

0,1,2,3
Dep. Variable:,df_control.polviews,R-squared:,0.021
Model:,OLS,Adj. R-squared:,0.021
Method:,Least Squares,F-statistic:,573.1
Date:,"Sun, 23 Apr 2017",Prob (F-statistic):,6.070000000000001e-247
Time:,12:24:24,Log-Likelihood:,-91008.0
No. Observations:,52664,AIC:,182000.0
Df Residuals:,52661,BIC:,182000.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5225,0.021,24.478,0.000,0.481,0.564
df_control.marital,-0.1212,0.004,-33.118,0.000,-0.128,-0.114
df_control.sex,-0.0892,0.012,-7.470,0.000,-0.113,-0.066

0,1,2,3
Omnibus:,371.573,Durbin-Watson:,1.912
Prob(Omnibus):,0.0,Jarque-Bera (JB):,273.065
Skew:,-0.072,Prob(JB):,5.07e-60
Kurtosis:,2.678,Cond. No.,13.1


In [19]:
formula = 'df_control.polviews ~ df_control.marital + df_control.age + df_control.sex'
model = smf.ols(formula, data=df_control)
results = model.fit()
params_agesex = results.params
results.summary()

0,1,2,3
Dep. Variable:,df_control.polviews,R-squared:,0.027
Model:,OLS,Adj. R-squared:,0.027
Method:,Least Squares,F-statistic:,482.9
Date:,"Sun, 23 Apr 2017",Prob (F-statistic):,1.43e-309
Time:,12:24:24,Log-Likelihood:,-90860.0
No. Observations:,52664,AIC:,181700.0
Df Residuals:,52660,BIC:,181800.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.2008,0.028,7.090,0.000,0.145,0.256
df_control.marital,-0.1003,0.004,-26.078,0.000,-0.108,-0.093
df_control.age,0.0062,0.000,17.207,0.000,0.005,0.007
df_control.sex,-0.0955,0.012,-8.017,0.000,-0.119,-0.072

0,1,2,3
Omnibus:,360.36,Durbin-Watson:,1.913
Prob(Omnibus):,0.0,Jarque-Bera (JB):,272.322
Skew:,-0.083,Prob(JB):,7.35e-60
Kurtosis:,2.689,Cond. No.,245.0


In [20]:
formula = 'df_control.polviews ~ df_control.marital + df_control.age + df_control.sex + df_control.race'
model = smf.ols(formula, data=df_control)
results = model.fit()
params_agesexrace = results.params
results.summary()

0,1,2,3
Dep. Variable:,df_control.polviews,R-squared:,0.029
Model:,OLS,Adj. R-squared:,0.029
Method:,Least Squares,F-statistic:,398.5
Date:,"Sun, 23 Apr 2017",Prob (F-statistic):,0.0
Time:,12:24:24,Log-Likelihood:,-90789.0
No. Observations:,52664,AIC:,181600.0
Df Residuals:,52659,BIC:,181600.0
Df Model:,4,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3607,0.031,11.517,0.000,0.299,0.422
df_control.marital,-0.0946,0.004,-24.454,0.000,-0.102,-0.087
df_control.age,0.0058,0.000,16.308,0.000,0.005,0.007
df_control.sex,-0.0917,0.012,-7.709,0.000,-0.115,-0.068
df_control.race,-0.1324,0.011,-11.893,0.000,-0.154,-0.111

0,1,2,3
Omnibus:,336.458,Durbin-Watson:,1.916
Prob(Omnibus):,0.0,Jarque-Bera (JB):,254.752
Skew:,-0.077,Prob(JB):,4.8e-56
Kurtosis:,2.696,Cond. No.,270.0


In [21]:
formula = 'df_control.polviews ~ df_control.marital + df_control.age + df_control.sex + df_control.race + df_control.degree'
model = smf.ols(formula, data=df_control)
results = model.fit()
params_agesexrace = results.params
results.summary()

0,1,2,3
Dep. Variable:,df_control.polviews,R-squared:,0.031
Model:,OLS,Adj. R-squared:,0.031
Method:,Least Squares,F-statistic:,342.2
Date:,"Sun, 23 Apr 2017",Prob (F-statistic):,0.0
Time:,12:24:24,Log-Likelihood:,-90733.0
No. Observations:,52664,AIC:,181500.0
Df Residuals:,52658,BIC:,181500.0
Df Model:,5,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4710,0.033,14.292,0.000,0.406,0.536
df_control.marital,-0.0956,0.004,-24.722,0.000,-0.103,-0.088
df_control.age,0.0055,0.000,15.326,0.000,0.005,0.006
df_control.sex,-0.0966,0.012,-8.117,0.000,-0.120,-0.073
df_control.race,-0.1404,0.011,-12.594,0.000,-0.162,-0.119
df_control.degree,-0.0539,0.005,-10.658,0.000,-0.064,-0.044

0,1,2,3
Omnibus:,350.547,Durbin-Watson:,1.919
Prob(Omnibus):,0.0,Jarque-Bera (JB):,257.507
Skew:,-0.066,Prob(JB):,1.21e-56
Kurtosis:,2.684,Cond. No.,284.0


In [22]:
formula = 'df_control.polviews ~ df_control.marital + df_control.age + df_control.sex + df_control.race + df_control.degree + df_control.sei10'
model = smf.ols(formula, data=df_control)
results = model.fit()
params_agesexracesei = results.params
results.summary()

0,1,2,3
Dep. Variable:,df_control.polviews,R-squared:,0.032
Model:,OLS,Adj. R-squared:,0.031
Method:,Least Squares,F-statistic:,285.5
Date:,"Sun, 23 Apr 2017",Prob (F-statistic):,0.0
Time:,12:24:24,Log-Likelihood:,-90732.0
No. Observations:,52664,AIC:,181500.0
Df Residuals:,52657,BIC:,181500.0
Df Model:,6,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4568,0.034,13.255,0.000,0.389,0.524
df_control.marital,-0.0952,0.004,-24.555,0.000,-0.103,-0.088
df_control.age,0.0055,0.000,15.104,0.000,0.005,0.006
df_control.sex,-0.0944,0.012,-7.874,0.000,-0.118,-0.071
df_control.race,-0.1395,0.011,-12.497,0.000,-0.161,-0.118
df_control.degree,-0.0590,0.006,-9.472,0.000,-0.071,-0.047
df_control.sei10,0.0004,0.000,1.410,0.159,-0.000,0.001

0,1,2,3
Omnibus:,349.388,Durbin-Watson:,1.919
Prob(Omnibus):,0.0,Jarque-Bera (JB):,256.745
Skew:,-0.066,Prob(JB):,1.77e-56
Kurtosis:,2.684,Cond. No.,397.0


In [23]:
formula = 'df_control.polviews ~ df_control.marital + df_control.age + df_control.sex + df_control.race + df_control.degree + df_control.sei10 + df_control.coninc'
model = smf.ols(formula, data=df_control)
results = model.fit()
params_agesexraceseiincome = results.params
results.summary()

0,1,2,3
Dep. Variable:,df_control.polviews,R-squared:,0.028
Model:,OLS,Adj. R-squared:,0.028
Method:,Least Squares,F-statistic:,504.4
Date:,"Sun, 23 Apr 2017",Prob (F-statistic):,3.5e-323
Time:,12:24:24,Log-Likelihood:,-90829.0
No. Observations:,52664,AIC:,181700.0
Df Residuals:,52660,BIC:,181700.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0114,0.000,-29.553,0.000,-0.012,-0.011
df_control.marital,-0.0798,0.003,-27.297,0.000,-0.086,-0.074
df_control.age,0.0079,0.000,33.755,0.000,0.007,0.008
df_control.sex,-0.0223,0.001,-24.270,0.000,-0.024,-0.020
df_control.race,-0.0199,0.001,-30.852,0.000,-0.021,-0.019
df_control.degree,-0.0609,0.006,-9.842,0.000,-0.073,-0.049
df_control.sei10,0.0014,0.000,4.752,0.000,0.001,0.002
df_control.coninc,3.482e-17,7.4e-18,4.708,0.000,2.03e-17,4.93e-17

0,1,2,3
Omnibus:,364.065,Durbin-Watson:,1.914
Prob(Omnibus):,0.0,Jarque-Bera (JB):,270.579
Skew:,-0.076,Prob(JB):,1.7600000000000001e-59
Kurtosis:,2.683,Cond. No.,5460000000000000.0


In [24]:
formula = 'df_control.polviews ~ df_control.marital + df_control.age + df_control.sex + df_control.race + df_control.degree + df_control.sei10 + df_control.coninc + df_control.relig_recode'
model = smf.ols(formula, data=df_control)
results = model.fit()
params_agesexraceseiincome = results.params
results.summary()

0,1,2,3
Dep. Variable:,df_control.polviews,R-squared:,0.049
Model:,OLS,Adj. R-squared:,0.048
Method:,Least Squares,F-statistic:,895.4
Date:,"Sun, 23 Apr 2017",Prob (F-statistic):,0.0
Time:,12:24:24,Log-Likelihood:,-90265.0
No. Observations:,52664,AIC:,180500.0
Df Residuals:,52660,BIC:,180600.0
Df Model:,3,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.0173,0.000,-45.859,0.000,-0.018,-0.017
df_control.marital,-0.0335,0.003,-10.320,0.000,-0.040,-0.027
df_control.age,0.0102,0.000,42.254,0.000,0.010,0.011
df_control.sex,-0.0262,0.001,-35.192,0.000,-0.028,-0.025
df_control.race,-0.0295,0.001,-46.594,0.000,-0.031,-0.028
df_control.degree,-0.0335,0.006,-5.377,0.000,-0.046,-0.021
df_control.sei10,0.0025,0.000,8.269,0.000,0.002,0.003
df_control.coninc,5.478e-17,7.31e-18,7.496,0.000,4.05e-17,6.91e-17
df_control.relig_recode,-0.1682,0.005,-33.848,0.000,-0.178,-0.158

0,1,2,3
Omnibus:,291.539,Durbin-Watson:,1.924
Prob(Omnibus):,0.0,Jarque-Bera (JB):,226.466
Skew:,-0.075,Prob(JB):,6.66e-50
Kurtosis:,2.716,Cond. No.,5670000000000000.0
