# A generational model of support for gun control

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [1]:
# Configure Jupyter so figures appear in the notebook
%matplotlib inline

# Configure Jupyter to display the assigned value after an assignment
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'

import pandas as pd
import numpy as np

import thinkstats2
import thinkplot
import utils

import matplotlib.pyplot as plt
import matplotlib

import seaborn as sns
sns.set(style='white', font_scale=1.0, context='talk')

from collections import Counter

import statsmodels.formula.api as smf
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.discrete.discrete_model import Logit

In [2]:
def read_samples(iters=101):
    """Read samples.
    
    iters: number of times to run
    """
    for i in range(iters):
        key = 'iter%d' % i
        sample = pd.read_hdf('iterations2016.h5', key)
        yield sample

In [3]:
for sample in read_samples(1):
    pass

### Run logistic models

In [4]:
sample.shape

(40339, 68)

In [5]:
sample.columns

Index(['year', 'gunage', 'gunnum', 'owngun', 'rowngun', 'realinc', 'conrinc',
       'hispanic', 'cohort', 'ballot', 'wtssall', 'gun', 'gunlaw', 'cappun',
       'id_', 'age', 'educ', 'sex', 'race', 'income', 'rincome', 'srcbelt',
       'polviews', 'natcrime', 'adults', 'cohort5', 'cohort10', 'year8',
       'year4', 'age10', 'age5', 'age3', 'twenties', 'thirties', 'forties',
       'fifties', 'sixties', 'seventies', 'eighties', 'nineties', 'favor',
       'gunhome', 'threatened', 'spendcrime', 'topincome', 'lowincome',
       'liberal', 'moderate', 'conservative', 'female', 'ishisp', 'black',
       'otherrace', 'urban', 'suburban', 'rural', 'college', 'lowrealinc',
       'highrealinc', 'ones', 'c', 'a', 'y', 'c2', 'a2', 'y2', 'y3', 'ac'],
      dtype='object')

In [6]:
# not including Hispanic, due to too much missing data

varnames = ['nineties', 'eighties', 'seventies', 'fifties', 'forties', 'thirties', 'twenties',
            'female', 'black', 'otherrace', 'conservative', 'liberal', 'lowrealinc', 'highrealinc',
            'college', 'urban', 'rural']

all_varnames = varnames + ['y', 'y2', 'y3', 'favor']

['nineties',
 'eighties',
 'seventies',
 'fifties',
 'forties',
 'thirties',
 'twenties',
 'female',
 'black',
 'otherrace',
 'conservative',
 'liberal',
 'lowrealinc',
 'highrealinc',
 'college',
 'urban',
 'rural',
 'y',
 'y2',
 'y3',
 'favor']

In [7]:
def copy_nan(df, varname, newvar):
    """Put a NaN into newvar in any place where varname is Nan.
    
    df: DataFrame
    varname: string old var name
    newvar: string new var name
    """
    df.loc[df[varname].isnull(), newvar] = np.nan

In [8]:
def make_boolean(df, varname, values, newvar):
    """Make a boolean variable.
    
    df: DataFrame
    varname: name of base variable
    values: sequence of values for varname
    newvar: name of new variable (recode)
    """
    #assert numnull(df[varname]) == 0
    df[newvar] = df[varname].isin(values)
    copy_nan(df, varname, newvar)

In [9]:
def make_booleans(df):
    df['cohort10'] = utils.RoundIntoBins(df, 'cohort', 10)
    make_boolean(df, 'cohort10', [1920], 'twenties')
    make_boolean(df, 'cohort10', [1930], 'thirties')
    make_boolean(df, 'cohort10', [1940], 'forties')
    make_boolean(df, 'cohort10', [1950], 'fifties')
    make_boolean(df, 'cohort10', [1960], 'sixties')
    make_boolean(df, 'cohort10', [1970], 'seventies')
    make_boolean(df, 'cohort10', [1980], 'eighties')
    make_boolean(df, 'cohort10', [1990], 'nineties')
    make_boolean(df, 'gunlaw', [1.0], 'favor')
    make_boolean(df, 'owngun', [1.0], 'gunhome')
    make_boolean(df, 'gun', [1.0], 'threatened')
    make_boolean(df, 'natcrime', [1.0], 'spendcrime')
    make_boolean(df, 'income', [12], 'topincome')
    make_boolean(df, 'income', [1,2,3,4,5,6,7,8], 'lowincome')
    make_boolean(df, 'polviews', [1,2,3], 'liberal')
    make_boolean(df, 'polviews', [4], 'moderate')
    make_boolean(df, 'polviews', [6,7,8], 'conservative')
    make_boolean(df, 'sex', [2], 'female')
    make_boolean(df, 'hispanic', [2], 'ishisp')
    make_boolean(df, 'race', [2], 'black')
    make_boolean(df, 'race', [3], 'otherrace')
    make_boolean(df, 'srcbelt', [1,2,5], 'urban')
    make_boolean(df, 'srcbelt', [3,4], 'suburban')
    make_boolean(df, 'srcbelt', [6], 'rural')

    df['college'] = df['educ'] >= 13 
    copy_nan(df, 'educ', 'college')
    
    quantile25 = df['realinc'].quantile(0.25)
    df['lowrealinc'] = df['realinc'] <= quantile25 
    copy_nan(df, 'realinc', 'lowrealinc')

    quantile75 = df['realinc'].quantile(0.75)
    df['highrealinc'] = df['realinc'] >= quantile75 
    copy_nan(df, 'realinc', 'highrealinc')

In [10]:
def replace_invalid(df):
    df.gunlaw.replace([8, 9, 0], np.nan, inplace=True)
    df.owngun.replace([3, 8, 9, 0], np.nan, inplace=True)
    df.gun.replace([8, 9, 0], np.nan, inplace=True)
    df.natcrime.replace([8, 9, 0], np.nan, inplace=True)
    df.income.replace([0, 13, 98, 99], np.nan, inplace=True)
    df.realinc.replace([0], np.nan, inplace=True)                  # TODO: check this
    df.educ.replace([98,99], np.nan, inplace=True)
    df.polviews.replace([8, 9, 0], np.nan, inplace=True)
    df.age.replace([98, 99], np.nan, inplace=True)               # 89 means 89 or older
    df.hispanic.replace([98, 99, 0], np.nan, inplace=True)
    df.cohort.replace([9999], np.nan, inplace=True)

In [12]:
gss = utils.read_gss('gss_gun')
replace_invalid(gss)
gss = gss.dropna(subset=['gunlaw', 'age', 'cohort'])
make_booleans(gss)
gss.shape

(41878, 53)

In [13]:
for varname in varnames:
    print(varname, sum(gss[varname].isnull()))

nineties 0
eighties 0
seventies 0
fifties 0
forties 0
thirties 0
twenties 0
female 0
black 0
otherrace 0
conservative 4719
liberal 4719
lowrealinc 3833
highrealinc 3833
college 88
urban 0
rural 0


Select just the columns we need

In [14]:
data = sample[all_varnames]
data.shape

(40339, 21)

In [15]:
formula = ('favor ~ y + y2 + y3 + nineties + eighties + seventies + fifties + forties + thirties + twenties + '
           'female + black + otherrace + conservative + liberal + lowrealinc + highrealinc + ' 
           'college + urban + rural')
model = smf.logit(formula, data=data).fit()

model.summary()

Optimization terminated successfully.
         Current function value: 0.513959
         Iterations 6


0,1,2,3
Dep. Variable:,favor,No. Observations:,40339.0
Model:,Logit,Df Residuals:,40318.0
Method:,MLE,Df Model:,20.0
Date:,"Tue, 18 Jun 2019",Pseudo R-squ.:,0.05506
Time:,16:39:37,Log-Likelihood:,-20733.0
converged:,True,LL-Null:,-21941.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.1744,0.043,27.375,0.000,1.090,1.258
y,0.0319,0.002,13.465,0.000,0.027,0.037
y2,6.145e-06,0.000,0.059,0.953,-0.000,0.000
y3,-8.132e-05,6.78e-06,-11.987,0.000,-9.46e-05,-6.8e-05
nineties,-0.3695,0.105,-3.534,0.000,-0.575,-0.165
eighties,-0.2247,0.064,-3.489,0.000,-0.351,-0.098
seventies,-0.1245,0.052,-2.378,0.017,-0.227,-0.022
fifties,-0.0974,0.036,-2.675,0.007,-0.169,-0.026
forties,-0.0518,0.039,-1.338,0.181,-0.128,0.024


Make a row for someone in 2016 with all booleans false.

In [16]:
def make_base():
    y = 2016 - 1990
    y2 = y**2
    y3 = y**3

    d = dict(y=y, y2=y2, y3=y3)
    for varname in varnames:
        d[varname] = 0

    return pd.Series(d)

base = make_base()
base

y                  26
y2                676
y3              17576
nineties            0
eighties            0
seventies           0
fifties             0
forties             0
thirties            0
twenties            0
female              0
black               0
otherrace           0
conservative        0
liberal             0
lowrealinc          0
highrealinc         0
college             0
urban               0
rural               0
dtype: int64

Make a DataFrame that contains one row for each case we want to consider.

In [17]:
def make_df_pred():
    def add_yminus(df, varname, offset):
        """Add a column with y minus an offset.
        
        df: DataFrame
        varname: string new var name
        offset: how much to shift y
        """
        df.loc[varname] = base
        df.loc[varname, 'y'] += offset
        df.loc[varname, 'y2'] = df.loc[varname, 'y']**2
        df.loc[varname, 'y3'] = df.loc[varname, 'y']**3
    
    base = make_base()
    df_pred = pd.DataFrame(columns=base.index, dtype=float)    
    df_pred.loc['base'] = base

    for varname in varnames:
        df_pred.loc[varname] = base
        df_pred.loc[varname, varname] = 1
    
    add_yminus(df_pred, 'yminus10', -10)
    add_yminus(df_pred, 'yminus20', -20)
    add_yminus(df_pred, 'yminus30', -30)
    add_yminus(df_pred, 'yminus40', -40)
    
    #df_pred.loc['lowest combo'] = base
    #low_vars = ['gunhome', 'nineties', 'rural', 
    #            'conservative', 'lowrealinc']
    #df_pred.loc['lowest combo', low_vars] = 1
    
    #df_pred.loc['highest combo'] = base
    #high_vars = ['female', 'otherrace', 'liberal', 
    #            'college', 'highrealinc']
    #df_pred.loc['highest combo', high_vars] = 1
    
    return df_pred
    
df_pred = make_df_pred()

Unnamed: 0,y,y2,y3,nineties,eighties,seventies,fifties,forties,thirties,twenties,female,black,otherrace,conservative,liberal,lowrealinc,highrealinc,college,urban,rural
base,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nineties,26.0,676.0,17576.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eighties,26.0,676.0,17576.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seventies,26.0,676.0,17576.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fifties,26.0,676.0,17576.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
forties,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
thirties,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
twenties,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
female,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
black,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
pred = model.predict(df_pred) * 100

base            64.080850
nineties        55.214334
eighties        58.763637
seventies       61.167410
fifties         61.810301
forties         62.880192
thirties        60.500798
twenties        62.068934
female          79.087298
black           71.545728
otherrace       77.716199
conservative    53.454136
liberal         70.576994
lowrealinc      63.627451
highrealinc     64.864965
college         66.536168
urban           56.404695
rural           45.337279
yminus10        79.469058
yminus20        79.388753
yminus30        74.117223
yminus40        72.152232
dtype: float64

In [19]:
pred - pred['base']

base             0.000000
nineties        -8.866515
eighties        -5.317213
seventies       -2.913440
fifties         -2.270549
forties         -1.200658
thirties        -3.580052
twenties        -2.011916
female          15.006448
black            7.464878
otherrace       13.635349
conservative   -10.626714
liberal          6.496144
lowrealinc      -0.453399
highrealinc      0.784115
college          2.455318
urban           -7.676155
rural          -18.743571
yminus10        15.388208
yminus20        15.307903
yminus30        10.036374
yminus40         8.071382
dtype: float64

In [20]:
def make_result(pred):
    """Make a DataFrame with one row per case.
    
    pred: series of predictions
    """
    result = pd.DataFrame()
    result['pred'] = pred
    result['offset'] = pred - pred['base']
    return result

result = make_result(pred)

Unnamed: 0,pred,offset
base,64.08085,0.0
nineties,55.214334,-8.866515
eighties,58.763637,-5.317213
seventies,61.16741,-2.91344
fifties,61.810301,-2.270549
forties,62.880192,-1.200658
thirties,60.500798,-3.580052
twenties,62.068934,-2.011916
female,79.087298,15.006448
black,71.545728,7.464878


### Iterate

To estimate uncertainty due to random sampling and missing values, we have to iterate the procedure we just ran.

In [21]:
results = []
for sample in read_samples():
    data = sample[all_varnames]
    model = smf.logit(formula, data=data).fit(disp=0)

    df_pred = make_df_pred()
    pred = model.predict(df_pred) * 100
    result = make_result(pred)
        
    results.append(result)

Process the results.

In [22]:
preds = [result.pred for result in results]
median, low, high = thinkstats2.PercentileRows(preds, [50, 5, 95])

estimates = pd.DataFrame(index=result.index)
estimates['low5'] = low
estimates['median'] = median
estimates['high95'] = high
estimates.round(0).astype(int)

Unnamed: 0,low5,median,high95
base,62,64,67
nineties,52,56,60
eighties,56,59,62
seventies,59,61,64
fifties,61,63,65
forties,61,62,65
thirties,59,61,63
twenties,60,63,65
female,78,79,81
black,70,72,74


In [23]:
def make_table(estimates):
    lines = estimates.round(1).to_html().split('\n')
    for line in lines:
        print(line)

Generate the table for the offsets.

In [24]:
preds = [result.offset for result in results]
median, low, high = thinkstats2.PercentileRows(preds, [50, 2.5, 97.5])

offsets = pd.DataFrame(index=result.index)
offsets['low2.5'] = low
offsets['median'] = median
offsets['high97.5'] = high
table = offsets.sort_values('median', ascending=False).round(0).astype(int)

Unnamed: 0,low2.5,median,high97.5
female,14,15,16
yminus10,13,15,17
yminus20,12,14,17
otherrace,9,12,15
yminus30,7,9,11
black,6,8,10
yminus40,5,7,9
liberal,5,6,7
college,2,3,4
highrealinc,0,1,2


In [25]:
output = pd.DataFrame(columns=['support', 'offset', '90% CI'])
for label, row in table.iterrows():
    low, median, high = row
    support = estimates.loc[label]['median'].round(0).astype(int)
    ci = '(%d, %d)' % (low, high)
    output.loc[label] = support, median, ci
    
output

Unnamed: 0,support,offset,90% CI
female,79,15,"(14, 16)"
yminus10,79,15,"(13, 17)"
yminus20,79,14,"(12, 17)"
otherrace,76,12,"(9, 15)"
yminus30,73,9,"(7, 11)"
black,72,8,"(6, 10)"
yminus40,72,7,"(5, 9)"
liberal,70,6,"(5, 7)"
college,67,3,"(2, 4)"
highrealinc,65,1,"(0, 2)"


In [26]:
def make_table(offsets):
    lines = offsets.sort_values('median').round(1).to_html().split('\n')
    for line in lines:
        print(line)

In [29]:
output.loc[:'liberal']

Unnamed: 0,support,offset,90% CI
female,79,15,"(14, 16)"
yminus10,79,15,"(13, 17)"
yminus20,79,14,"(12, 17)"
otherrace,76,12,"(9, 15)"
yminus30,73,9,"(7, 11)"
black,72,8,"(6, 10)"
yminus40,72,7,"(5, 9)"
liberal,70,6,"(5, 7)"


In [31]:
output.loc['college': 'thirties']

Unnamed: 0,support,offset,90% CI
college,67,3,"(2, 4)"
highrealinc,65,1,"(0, 2)"
lowrealinc,65,0,"(-1, 1)"
base,64,0,"(0, 0)"
fifties,63,-1,"(-3, 0)"
twenties,63,-2,"(-4, 1)"
forties,62,-2,"(-4, 1)"
seventies,61,-3,"(-5, -1)"
thirties,61,-3,"(-5, -2)"


In [32]:
output.loc['eighties': 'rural']

Unnamed: 0,support,offset,90% CI
eighties,59,-5,"(-8, -2)"
urban,56,-8,"(-10, -7)"
nineties,56,-8,"(-14, -4)"
conservative,55,-10,"(-11, -8)"
rural,46,-19,"(-21, -17)"
