# A generational model of support for gun control

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [1]:
# Configure Jupyter so figures appear in the notebook
%matplotlib inline

# Configure Jupyter to display the assigned value after an assignment
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'

import pandas as pd
import numpy as np

import thinkstats2
import thinkplot
import utils

import matplotlib.pyplot as plt
import matplotlib

import seaborn as sns
sns.set(style='white', font_scale=1.0, context='talk')

from collections import Counter

import statsmodels.formula.api as smf
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.discrete.discrete_model import Logit

In [2]:
def read_samples(iters=101):
    """Read samples.
    
    iters: number of times to run
    """
    for i in range(iters):
        key = 'iter%d' % i
        sample = pd.read_hdf('iterations2016.h5', key)
        yield sample

In [3]:
for sample in read_samples(1):
    pass

### Run logistic models

In [4]:
sample.shape

(40339, 72)

In [5]:
sample.columns

Index(['year', 'gunage', 'gunnum', 'owngun', 'rowngun', 'realinc', 'conrinc',
       'hispanic', 'cohort', 'ballot', 'wtssall', 'gun', 'gunlaw', 'cappun',
       'id_', 'age', 'educ', 'sex', 'race', 'income', 'rincome', 'srcbelt',
       'polviews', 'natcrime', 'adults', 'cohort5', 'cohort10', 'year8',
       'year4', 'age10', 'age5', 'age3', 'twenties', 'thirties', 'forties',
       'fifties', 'sixties', 'seventies', 'eighties', 'nineties', 'favor',
       'gunhome', 'threatened', 'spendcrime', 'topincome', 'lowincome',
       'extremelyliberal', 'liberal', 'slightlyliberal', 'moderate',
       'slightlyconservative', 'conservative', 'extremelyconservative',
       'female', 'ishisp', 'black', 'otherrace', 'urban', 'suburban', 'rural',
       'college', 'lowrealinc', 'highrealinc', 'ones', 'c', 'a', 'y', 'c2',
       'a2', 'y2', 'y3', 'ac'],
      dtype='object')

In [6]:
# not including Hispanic, due to too much missing data

varnames = ['nineties', 'eighties', 'seventies', 'fifties', 'forties', 'thirties', 'twenties',
            'female', 'black', 'otherrace', 'extremelyconservative', 'conservative',
            'slightlyconservative', 'moderate', 'slightlyliberal', 'liberal', 
            'extremelyliberal', 'lowrealinc', 'highrealinc',
            'college', 'urban', 'rural']

all_varnames = varnames + ['y', 'y2', 'y3', 'favor']

['nineties',
 'eighties',
 'seventies',
 'fifties',
 'forties',
 'thirties',
 'twenties',
 'female',
 'black',
 'otherrace',
 'extremelyconservative',
 'conservative',
 'slightlyconservative',
 'moderate',
 'slightlyliberal',
 'liberal',
 'extremelyliberal',
 'lowrealinc',
 'highrealinc',
 'college',
 'urban',
 'rural',
 'y',
 'y2',
 'y3',
 'favor']

In [7]:
def copy_nan(df, varname, newvar):
    """Put a NaN into newvar in any place where varname is Nan.
    
    df: DataFrame
    varname: string old var name
    newvar: string new var name
    """
    df.loc[df[varname].isnull(), newvar] = np.nan

In [8]:
def make_boolean(df, varname, values, newvar):
    """Make a boolean variable.
    
    df: DataFrame
    varname: name of base variable
    values: sequence of values for varname
    newvar: name of new variable (recode)
    """
    #assert numnull(df[varname]) == 0
    df[newvar] = df[varname].isin(values)
    copy_nan(df, varname, newvar)

In [9]:
def make_booleans(df):
    df['cohort10'] = utils.RoundIntoBins(df, 'cohort', 10)
    make_boolean(df, 'cohort10', [1920], 'twenties')
    make_boolean(df, 'cohort10', [1930], 'thirties')
    make_boolean(df, 'cohort10', [1940], 'forties')
    make_boolean(df, 'cohort10', [1950], 'fifties')
    make_boolean(df, 'cohort10', [1960], 'sixties')
    make_boolean(df, 'cohort10', [1970], 'seventies')
    make_boolean(df, 'cohort10', [1980], 'eighties')
    make_boolean(df, 'cohort10', [1990], 'nineties')
    make_boolean(df, 'gunlaw', [1.0], 'favor')
    make_boolean(df, 'owngun', [1.0], 'gunhome')
    make_boolean(df, 'gun', [1.0], 'threatened')
    make_boolean(df, 'natcrime', [1.0], 'spendcrime')
    make_boolean(df, 'income', [12], 'topincome')
    make_boolean(df, 'income', [1,2,3,4,5,6,7,8], 'lowincome')
    make_boolean(df, 'polviews', [1], 'extremelyliberal')
    make_boolean(df, 'polviews', [2], 'liberal')
    make_boolean(df, 'polviews', [3], 'slightlyliberal')
    make_boolean(df, 'polviews', [4], 'moderate')
    make_boolean(df, 'polviews', [5], 'slightlyconservative')
    make_boolean(df, 'polviews', [6], 'conservative')
    make_boolean(df, 'polviews', [7], 'extremelyconservative')
    make_boolean(df, 'sex', [2], 'female')
    make_boolean(df, 'hispanic', [2], 'ishisp')
    make_boolean(df, 'race', [2], 'black')
    make_boolean(df, 'race', [3], 'otherrace')
    make_boolean(df, 'srcbelt', [1,2,5], 'urban')
    make_boolean(df, 'srcbelt', [3,4], 'suburban')
    make_boolean(df, 'srcbelt', [6], 'rural')

    df['college'] = df['educ'] >= 13 
    copy_nan(df, 'educ', 'college')
    
    quantile25 = df['realinc'].quantile(0.25)
    df['lowrealinc'] = df['realinc'] <= quantile25 
    copy_nan(df, 'realinc', 'lowrealinc')

    quantile75 = df['realinc'].quantile(0.75)
    df['highrealinc'] = df['realinc'] >= quantile75 
    copy_nan(df, 'realinc', 'highrealinc')

In [10]:
def replace_invalid(df):
    df.gunlaw.replace([8, 9, 0], np.nan, inplace=True)
    df.owngun.replace([3, 8, 9, 0], np.nan, inplace=True)
    df.gun.replace([8, 9, 0], np.nan, inplace=True)
    df.natcrime.replace([8, 9, 0], np.nan, inplace=True)
    df.income.replace([0, 13, 98, 99], np.nan, inplace=True)
    df.realinc.replace([0], np.nan, inplace=True)                  # TODO: check this
    df.educ.replace([98,99], np.nan, inplace=True)
    df.polviews.replace([8, 9, 0], np.nan, inplace=True)
    df.age.replace([98, 99], np.nan, inplace=True)               # 89 means 89 or older
    df.hispanic.replace([98, 99, 0], np.nan, inplace=True)
    df.cohort.replace([9999], np.nan, inplace=True)

In [11]:
gss = utils.read_gss('gss_gun')
replace_invalid(gss)
gss = gss.dropna(subset=['gunlaw', 'age', 'cohort'])
make_booleans(gss)
gss.shape

(41878, 57)

In [12]:
for varname in varnames:
    print(varname, sum(gss[varname].isnull()))

nineties 0
eighties 0
seventies 0
fifties 0
forties 0
thirties 0
twenties 0
female 0
black 0
otherrace 0
extremelyconservative 4719
conservative 4719
slightlyconservative 4719
moderate 4719
slightlyliberal 4719
liberal 4719
extremelyliberal 4719
lowrealinc 3833
highrealinc 3833
college 88
urban 0
rural 0


Select just the columns we need

In [13]:
data = sample[all_varnames]
data.shape

(40339, 26)

In [14]:
formula = ('favor ~ y + y2 + y3 + nineties + eighties + seventies + fifties + forties + thirties + twenties + '
           'female + black + otherrace + extremelyconservative + conservative + slightlyconservative + '
           'moderate + slightlyliberal + liberal + extremelyliberal + lowrealinc + highrealinc + '
           'college + urban + rural')
model = smf.logit(formula, data=data).fit()

model.summary()

         Current function value: 0.515093
         Iterations: 35


  bse_ = np.sqrt(np.diag(self.cov_params()))
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


0,1,2,3
Dep. Variable:,favor,No. Observations:,40339.0
Model:,Logit,Df Residuals:,40314.0
Method:,MLE,Df Model:,24.0
Date:,"Thu, 11 Jul 2019",Pseudo R-squ.:,0.05308
Time:,14:16:56,Log-Likelihood:,-20778.0
converged:,False,LL-Null:,-21943.0
Covariance Type:,nonrobust,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,0.9762,,,,,
y,0.0353,0.002,14.842,0.000,0.031,0.040
y2,0.0004,0.000,3.679,0.000,0.000,0.001
y3,-9.417e-05,6.81e-06,-13.822,0.000,-0.000,-8.08e-05
nineties,-0.3982,0.107,-3.723,0.000,-0.608,-0.189
eighties,-0.1924,0.066,-2.925,0.003,-0.321,-0.063
seventies,-0.1728,0.051,-3.370,0.001,-0.273,-0.072
fifties,-0.0248,0.036,-0.683,0.495,-0.096,0.046
forties,0.0067,0.039,0.172,0.863,-0.069,0.082


Make a row for someone in 2016 with all booleans false.

In [15]:
def make_base():
    y = 2016 - 1990
    y2 = y**2
    y3 = y**3

    d = dict(y=y, y2=y2, y3=y3)
    for varname in varnames:
        d[varname] = 0

    return pd.Series(d)

base = make_base()
base

y                           26
y2                         676
y3                       17576
nineties                     0
eighties                     0
seventies                    0
fifties                      0
forties                      0
thirties                     0
twenties                     0
female                       0
black                        0
otherrace                    0
extremelyconservative        0
conservative                 0
slightlyconservative         0
moderate                     0
slightlyliberal              0
liberal                      0
extremelyliberal             0
lowrealinc                   0
highrealinc                  0
college                      0
urban                        0
rural                        0
dtype: int64

Make a DataFrame that contains one row for each case we want to consider.

In [16]:
def make_df_pred():
    def add_yminus(df, varname, offset):
        """Add a column with y minus an offset.
        
        df: DataFrame
        varname: string new var name
        offset: how much to shift y
        """
        df.loc[varname] = base
        df.loc[varname, 'y'] += offset
        df.loc[varname, 'y2'] = df.loc[varname, 'y']**2
        df.loc[varname, 'y3'] = df.loc[varname, 'y']**3
    
    base = make_base()
    df_pred = pd.DataFrame(columns=base.index, dtype=float)    
    df_pred.loc['base'] = base

    for varname in varnames:
        df_pred.loc[varname] = base
        df_pred.loc[varname, varname] = 1
    
    add_yminus(df_pred, 'yminus10', -10)
    add_yminus(df_pred, 'yminus20', -20)
    add_yminus(df_pred, 'yminus30', -30)
    add_yminus(df_pred, 'yminus40', -40)
    
    #df_pred.loc['lowest combo'] = base
    #low_vars = ['gunhome', 'nineties', 'rural', 
    #            'conservative', 'lowrealinc']
    #df_pred.loc['lowest combo', low_vars] = 1
    
    #df_pred.loc['highest combo'] = base
    #high_vars = ['female', 'otherrace', 'liberal', 
    #            'college', 'highrealinc']
    #df_pred.loc['highest combo', high_vars] = 1
    
    return df_pred
    
df_pred = make_df_pred()

Unnamed: 0,y,y2,y3,nineties,eighties,seventies,fifties,forties,thirties,twenties,...,slightlyconservative,moderate,slightlyliberal,liberal,extremelyliberal,lowrealinc,highrealinc,college,urban,rural
base,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nineties,26.0,676.0,17576.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eighties,26.0,676.0,17576.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seventies,26.0,676.0,17576.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fifties,26.0,676.0,17576.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
forties,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
thirties,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
twenties,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
female,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
black,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
pred = model.predict(df_pred) * 100

base                     62.278827
nineties                 52.578225
eighties                 57.663793
seventies                58.142619
fifties                  61.694854
forties                  62.435058
thirties                 59.928403
twenties                 62.712399
female                   77.390752
black                    69.661729
otherrace                75.852415
extremelyconservative    53.745930
conservative             57.017727
slightlyconservative     63.263835
moderate                 66.728903
slightlyliberal          71.463725
liberal                  73.274561
extremelyliberal         70.831347
lowrealinc               63.073288
highrealinc              62.705874
college                  65.334001
urban                    52.105091
rural                    42.096955
yminus10                 77.815581
yminus20                 76.527910
yminus30                 69.995202
yminus40                 69.330638
dtype: float64

In [18]:
pred - pred['base']

base                      0.000000
nineties                 -9.700602
eighties                 -4.615034
seventies                -4.136208
fifties                  -0.583973
forties                   0.156231
thirties                 -2.350424
twenties                  0.433572
female                   15.111925
black                     7.382902
otherrace                13.573588
extremelyconservative    -8.532897
conservative             -5.261100
slightlyconservative      0.985008
moderate                  4.450076
slightlyliberal           9.184898
liberal                  10.995734
extremelyliberal          8.552520
lowrealinc                0.794461
highrealinc               0.427047
college                   3.055174
urban                   -10.173736
rural                   -20.181872
yminus10                 15.536754
yminus20                 14.249083
yminus30                  7.716375
yminus40                  7.051811
dtype: float64

In [19]:
def make_result(pred):
    """Make a DataFrame with one row per case.
    
    pred: series of predictions
    """
    result = pd.DataFrame()
    result['pred'] = pred
    result['offset'] = pred - pred['base']
    return result

result = make_result(pred)

Unnamed: 0,pred,offset
base,62.278827,0.0
nineties,52.578225,-9.700602
eighties,57.663793,-4.615034
seventies,58.142619,-4.136208
fifties,61.694854,-0.583973
forties,62.435058,0.156231
thirties,59.928403,-2.350424
twenties,62.712399,0.433572
female,77.390752,15.111925
black,69.661729,7.382902


### Iterate

To estimate uncertainty due to random sampling and missing values, we have to iterate the procedure we just ran.

In [20]:
results = []
for sample in read_samples():
    data = sample[all_varnames]
    model = smf.logit(formula, data=data).fit(disp=0)

    df_pred = make_df_pred()
    pred = model.predict(df_pred) * 100
    result = make_result(pred)
        
    results.append(result)



Process the results.

In [21]:
preds = [result.pred for result in results]
median, low, high = thinkstats2.PercentileRows(preds, [50, 5, 95])

estimates = pd.DataFrame(index=result.index)
estimates['low5'] = low
estimates['median'] = median
estimates['high95'] = high
estimates.round(0).astype(int)

Unnamed: 0,low5,median,high95
base,59,61,63
nineties,48,52,56
eighties,53,55,58
seventies,55,57,60
fifties,57,59,61
forties,57,59,62
thirties,55,57,60
twenties,56,59,61
female,75,77,78
black,67,69,71


In [22]:
def make_table(estimates):
    lines = estimates.round(1).to_html().split('\n')
    for line in lines:
        print(line)

Generate the table for the offsets.

In [23]:
preds = [result.offset for result in results]
median, low, high = thinkstats2.PercentileRows(preds, [50, 2.5, 97.5])

offsets = pd.DataFrame(index=result.index)
offsets['low2.5'] = low
offsets['median'] = median
offsets['high97.5'] = high
table = offsets.sort_values('median', ascending=False).round(0).astype(int)

Unnamed: 0,low2.5,median,high97.5
female,14,16,17
yminus10,14,16,18
yminus20,13,15,18
otherrace,11,13,15
liberal,9,10,12
slightlyliberal,8,10,11
yminus30,7,9,12
black,7,8,10
extremelyliberal,5,8,11
yminus40,5,8,10


In [24]:
output = pd.DataFrame(columns=['support', 'offset', '90% CI'])
for label, row in table.iterrows():
    low, median, high = row
    support = estimates.loc[label]['median'].round(0).astype(int)
    ci = '(%d, %d)' % (low, high)
    output.loc[label] = support, median, ci
    
output

Unnamed: 0,support,offset,90% CI
female,77,16,"(14, 17)"
yminus10,76,16,"(14, 18)"
yminus20,76,15,"(13, 18)"
otherrace,74,13,"(11, 15)"
liberal,71,10,"(9, 12)"
slightlyliberal,70,10,"(8, 11)"
yminus30,70,9,"(7, 12)"
black,69,8,"(7, 10)"
extremelyliberal,69,8,"(5, 11)"
yminus40,68,8,"(5, 10)"


In [25]:
def make_table(offsets):
    lines = offsets.sort_values('median').round(1).to_html().split('\n')
    for line in lines:
        print(line)

In [26]:
output.loc[:'liberal']

Unnamed: 0,support,offset,90% CI
female,77,16,"(14, 17)"
yminus10,76,16,"(14, 18)"
yminus20,76,15,"(13, 18)"
otherrace,74,13,"(11, 15)"
liberal,71,10,"(9, 12)"


In [27]:
output.loc['college': 'thirties']

Unnamed: 0,support,offset,90% CI
college,64,3,"(2, 4)"
slightlyconservative,62,2,"(0, 3)"
highrealinc,62,1,"(0, 2)"
base,61,0,"(0, 0)"
lowrealinc,61,0,"(-2, 1)"
forties,59,-2,"(-4, 0)"
fifties,59,-2,"(-3, 0)"
twenties,59,-2,"(-4, 0)"
thirties,57,-4,"(-5, -2)"


In [28]:
output.loc['eighties': 'rural']

Unnamed: 0,support,offset,90% CI
eighties,55,-5,"(-8, -2)"
urban,53,-8,"(-10, -7)"
extremelyconservative,52,-9,"(-12, -6)"
nineties,52,-9,"(-15, -3)"
rural,42,-19,"(-21, -17)"
