In [1]:
from __future__ import print_function, division

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from IPython.display import display

import thinkstats2_mod
import thinkplot

import statsmodels.formula.api as smf

In [2]:
dct = thinkstats2_mod.ReadStataDct('GSS/GSS.dct')
df = dct.ReadFixedWidth('GSS/GSS.dat')
display(df.shape)
# display(df)

(62466, 182)

In [3]:
# Filter out years without relevant data
df = df[df.year >= 1974]
display(df.shape)

# Filter out invalid polviews
# df = df[df.polviews > 0]
df = df[df.polviews <= 7]
display(df.shape)

# Filter out invalid age respondants
df = df[df.age <= 89]
display(df.shape)

# Filter out work status no answers
df = df[df.wrkstat != 9]
display(df.shape)

# Filter out marital status no answers
df = df[df.marital != 9]
display(df.shape)

# Filter out years of education w/ don't know, no answer
df = df[df.educ <= 20]
display(df.shape)

# Filter out degree of education invalid
df = df[df.degree <= 4]
display(df.shape)

# Filter out perceived class invalid
df = df[df.class_ > 0]
df = df[df.class_ <= 4]
display(df.shape)

# Filter out total fam income invalid
df = df[df.income > 0]
df = df[df.income <= 12]
display(df.shape)

df_control = df.copy(deep=True)

# display(df)

(59349, 182)

(56741, 182)

(56562, 182)

(56551, 182)

(56542, 182)

(56450, 182)

(56395, 182)

(53914, 182)

(48881, 182)

In [4]:
formula = 'df_control.polviews ~ df_control.age + df_control.wrkstat + df_control.marital + df_control.educ + df_control.degree + df_control.class_ + df_control.income'
model = smf.mnlogit(formula, data=df)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 1.759130
         Iterations 6


0,1,2,3
Dep. Variable:,df_control.polviews,No. Observations:,48881.0
Model:,MNLogit,Df Residuals:,48825.0
Method:,MLE,Df Model:,49.0
Date:,"Wed, 19 Apr 2017",Pseudo R-squ.:,0.01729
Time:,23:51:39,Log-Likelihood:,-85988.0
converged:,True,LL-Null:,-87501.0
,,LLR p-value:,0.0

df_control.polviews=1,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.3537,0.325,4.170,0.000,0.717,1.990
df_control.age,-0.0035,0.002,-1.587,0.112,-0.008,0.001
df_control.wrkstat,0.0058,0.016,0.366,0.714,-0.025,0.037
df_control.marital,0.1490,0.022,6.910,0.000,0.107,0.191
df_control.educ,-0.0807,0.023,-3.433,0.001,-0.127,-0.035
df_control.degree,0.2634,0.057,4.602,0.000,0.151,0.376
df_control.class_,-0.0005,0.055,-0.009,0.993,-0.108,0.107
df_control.income,-0.1495,0.015,-10.220,0.000,-0.178,-0.121
df_control.polviews=2,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.7398,0.243,11.290,0.000,2.264,3.215


In [5]:
df_1974 = df_control.copy(deep=True)

# Select 1974
df_1974 = df_1974[df_1974.year == 1974]
display(df_1974.shape)

(1294, 182)

In [6]:
formula = 'df_1974.polviews ~ df_1974.age + df_1974.wrkstat + df_1974.marital + df_1974.educ + df_1974.degree + df_1974.class_ + df_1974.income'
model = smf.mnlogit(formula, data=df_1974)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 1.573435
         Iterations 7


0,1,2,3
Dep. Variable:,df_1974.polviews,No. Observations:,1294.0
Model:,MNLogit,Df Residuals:,1246.0
Method:,MLE,Df Model:,42.0
Date:,"Wed, 19 Apr 2017",Pseudo R-squ.:,0.02766
Time:,23:51:39,Log-Likelihood:,-2036.0
converged:,True,LL-Null:,-2094.0
,,LLR p-value:,7.805e-09

df_1974.polviews=2,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,4.8165,1.904,2.529,0.011,1.084,8.549
df_1974.age,-0.0267,0.015,-1.823,0.068,-0.055,0.002
df_1974.wrkstat,0.1120,0.108,1.036,0.300,-0.100,0.324
df_1974.marital,-0.3523,0.144,-2.453,0.014,-0.634,-0.071
df_1974.educ,-0.1281,0.139,-0.925,0.355,-0.400,0.144
df_1974.degree,0.2827,0.357,0.791,0.429,-0.418,0.983
df_1974.class_,0.0878,0.373,0.236,0.814,-0.643,0.819
df_1974.income,-0.0002,0.079,-0.002,0.998,-0.154,0.154
df_1974.polviews=3,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,3.3932,1.907,1.779,0.075,-0.345,7.132


In [7]:
df_2016 = df_control.copy(deep=True)

# Select 2016
df_2016 = df_2016[df_2016.year == 2016]
display(df_2016.shape)

(2337, 182)

In [8]:
formula = 'df_2016.polviews ~ df_2016.age + df_2016.wrkstat + df_2016.marital + df_2016.educ + df_2016.degree + df_2016.class_ + df_2016.income'
model = smf.mnlogit(formula, data=df_2016)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 1.670273
         Iterations 7


0,1,2,3
Dep. Variable:,df_2016.polviews,No. Observations:,2337.0
Model:,MNLogit,Df Residuals:,2289.0
Method:,MLE,Df Model:,42.0
Date:,"Wed, 19 Apr 2017",Pseudo R-squ.:,0.02417
Time:,23:51:39,Log-Likelihood:,-3903.4
converged:,True,LL-Null:,-4000.1
,,LLR p-value:,2.688e-21

df_2016.polviews=2,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.0569,1.147,0.921,0.357,-1.191,3.305
df_2016.age,-0.0040,0.007,-0.537,0.591,-0.018,0.010
df_2016.wrkstat,0.0454,0.055,0.832,0.405,-0.061,0.152
df_2016.marital,-0.0314,0.072,-0.437,0.662,-0.172,0.109
df_2016.educ,0.0014,0.083,0.017,0.986,-0.161,0.164
df_2016.degree,0.1465,0.190,0.769,0.442,-0.227,0.520
df_2016.class_,-0.3394,0.179,-1.892,0.059,-0.691,0.012
df_2016.income,0.0526,0.048,1.103,0.270,-0.041,0.146
df_2016.polviews=3,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.0178,1.180,0.863,0.388,-1.295,3.330


In [9]:
formula = 'df_control.polviews ~ df_control.age'
model = smf.mnlogit(formula, data=df_control)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 1.782779
         Iterations 6


0,1,2,3
Dep. Variable:,df_control.polviews,No. Observations:,48881.0
Model:,MNLogit,Df Residuals:,48867.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 19 Apr 2017",Pseudo R-squ.:,0.004075
Time:,23:51:40,Log-Likelihood:,-87144.0
converged:,True,LL-Null:,-87501.0
,,LLR p-value:,9.926e-150

df_control.polviews=1,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-0.2913,0.095,-3.074,0.002,-0.477,-0.106
df_control.age,-0.0073,0.002,-3.610,0.000,-0.011,-0.003
df_control.polviews=2,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1602,0.068,17.051,0.000,1.027,1.294
df_control.age,-0.0086,0.001,-5.997,0.000,-0.011,-0.006
df_control.polviews=3,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.3388,0.067,19.945,0.000,1.207,1.470
df_control.age,-0.0107,0.001,-7.555,0.000,-0.013,-0.008
df_control.polviews=4,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.9056,0.060,31.501,0.000,1.787,2.024


In [10]:
formula = 'df_control.polviews ~ df_control.age + df_control.marital'
model = smf.mnlogit(formula, data=df_control)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 1.775452
         Iterations 6


0,1,2,3
Dep. Variable:,df_control.polviews,No. Observations:,48881.0
Model:,MNLogit,Df Residuals:,48860.0
Method:,MLE,Df Model:,14.0
Date:,"Wed, 19 Apr 2017",Pseudo R-squ.:,0.008169
Time:,23:51:41,Log-Likelihood:,-86786.0
converged:,True,LL-Null:,-87501.0
,,LLR p-value:,7.149e-297

df_control.polviews=1,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.1288,0.130,-8.664,0.000,-1.384,-0.873
df_control.age,-9.939e-05,0.002,-0.047,0.962,-0.004,0.004
df_control.marital,0.1943,0.021,9.206,0.000,0.153,0.236
df_control.polviews=2,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.7632,0.090,8.480,0.000,0.587,0.940
df_control.age,-0.0051,0.001,-3.409,0.001,-0.008,-0.002
df_control.marital,0.0954,0.015,6.276,0.000,0.066,0.125
df_control.polviews=3,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.2987,0.088,14.755,0.000,1.126,1.471
df_control.age,-0.0103,0.001,-6.920,0.000,-0.013,-0.007


In [11]:
formula = 'df_control.polviews ~ df_control.marital'
model = smf.mnlogit(formula, data=df_control)
results = model.fit()
results.summary()

Optimization terminated successfully.
         Current function value: 1.779617
         Iterations 6


0,1,2,3
Dep. Variable:,df_control.polviews,No. Observations:,48881.0
Model:,MNLogit,Df Residuals:,48867.0
Method:,MLE,Df Model:,7.0
Date:,"Wed, 19 Apr 2017",Pseudo R-squ.:,0.005842
Time:,23:51:42,Log-Likelihood:,-86989.0
converged:,True,LL-Null:,-87501.0
,,LLR p-value:,1.823e-216

df_control.polviews=1,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,-1.1331,0.064,-17.754,0.000,-1.258,-1.008
df_control.marital,0.1942,0.020,9.885,0.000,0.156,0.233
df_control.polviews=2,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.4928,0.043,11.413,0.000,0.408,0.577
df_control.marital,0.1132,0.014,7.871,0.000,0.085,0.141
df_control.polviews=3,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.7672,0.042,18.247,0.000,0.685,0.850
df_control.marital,0.0421,0.014,2.946,0.003,0.014,0.070
df_control.polviews=4,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,2.0395,0.037,54.453,0.000,1.966,2.113
