# A generational model of support for gun control

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [1]:
# Configure Jupyter so figures appear in the notebook
%matplotlib inline

# Configure Jupyter to display the assigned value after an assignment
%config InteractiveShell.ast_node_interactivity='last_expr_or_assign'

import pandas as pd
import numpy as np

import thinkstats2
import thinkplot
import utils

import matplotlib.pyplot as plt
import matplotlib

import seaborn as sns
sns.set(style='white', font_scale=1.0, context='talk')
RED, BLUE, GREEN, PURPLE, ORANGE, YELLOW = sns.color_palette('Set1')

from collections import Counter

import statsmodels.formula.api as smf
from statsmodels.discrete.discrete_model import MNLogit
from statsmodels.discrete.discrete_model import Logit

In [2]:
def read_samples(iters=101):
    """Lazily load the stored samples from 'iterations.h5'.

    iters: how many consecutive keys ('iter0', 'iter1', ...) to read

    yields: whatever is stored under each key, in key order
    """
    for index in range(iters):
        yield pd.read_hdf('iterations.h5', 'iter%d' % index)

In [3]:
# Pull just the first stored sample to develop the model on.
sample = next(read_samples(1))

### Run logistic models

In [4]:
# (rows, columns) of the sample loaded above
sample.shape

(40339, 67)

In [5]:
# List the columns available in the sample
sample.columns

Index(['year', 'conrinc', 'cohort', 'ballot', 'wtssall', 'income', 'finrela',
       'realinc', 'sex', 'homosex', 'hispanic', 'rowngun', 'owngun', 'id_',
       'age', 'race', 'srcbelt', 'polviews', 'natcrime', 'gunlaw', 'gun',
       'gunage', 'gunnum', 'educ', 'cohort5', 'cohort10', 'year8', 'year4',
       'age10', 'age5', 'age3', 'twenties', 'thirties', 'forties', 'fifties',
       'sixties', 'seventies', 'eighties', 'nineties', 'favor', 'gunhome',
       'threatened', 'spendcrime', 'topincome', 'lowincome', 'liberal',
       'moderate', 'conservative', 'female', 'ishisp', 'black', 'otherrace',
       'urban', 'suburban', 'rural', 'college', 'lowrealinc', 'highrealinc',
       'ones', 'c', 'a', 'y', 'c2', 'a2', 'y2', 'y3', 'ac'],
      dtype='object')

In [6]:
# not including Hispanic, due to too much missing data

# Boolean predictors used in the model. 'sixties', 'moderate' and 'suburban'
# are columns of the sample but are left out of this list -- presumably they
# serve as the reference categories (TODO confirm).
varnames = ['nineties', 'eighties', 'seventies', 'fifties', 'forties', 'thirties', 'twenties',
            'female', 'black', 'otherrace', 'conservative', 'liberal', 'lowrealinc', 'highrealinc',
            'college', 'urban', 'rural', 'gunhome']

# Every column the model needs: the predictors, the y / y2 / y3 terms,
# and the dependent variable 'favor'.
all_varnames = varnames + ['y', 'y2', 'y3', 'favor']

['nineties',
 'eighties',
 'seventies',
 'fifties',
 'forties',
 'thirties',
 'twenties',
 'female',
 'black',
 'otherrace',
 'conservative',
 'liberal',
 'lowrealinc',
 'highrealinc',
 'college',
 'urban',
 'rural',
 'gunhome',
 'y',
 'y2',
 'y3',
 'favor']

In [7]:
def copy_nan(df, varname, newvar):
    """Propagate missing values from one column to another, in place.

    Every row where df[varname] is null gets NaN in df[newvar].

    df: DataFrame (modified in place)
    varname: string name of the column checked for nulls
    newvar: string name of the column that receives NaN
    """
    missing = df[varname].isnull()
    df.loc[missing, newvar] = np.nan

In [8]:
def make_boolean(df, varname, values, newvar):
    """Recode a column as a membership flag, keeping missing values missing.

    df[newvar] is set to whether df[varname] is one of `values`, and is
    then overwritten with NaN wherever df[varname] is null.

    df: DataFrame (modified in place)
    varname: name of the base variable
    values: sequence of values of varname that count as True
    newvar: name of the new variable (the recode)
    """
    source = df[varname]
    df[newvar] = source.isin(values)
    # a null in the base variable must stay null in the recode
    copy_nan(df, varname, newvar)

In [9]:
def make_booleans(df):
    """Add the boolean recode columns to df, in place.

    Derives 'cohort10' from 'cohort', then adds one flag column per entry
    of the recode table below, plus 'college', 'lowrealinc' and
    'highrealinc'. Each flag is NaN wherever its source column is null.

    df: DataFrame holding the source columns named in the recode table,
        plus 'cohort', 'educ' and 'realinc'
    """
    df['cohort10'] = utils.RoundIntoBins(df, 'cohort', 10)

    # (source column, values that count as True, new column)
    recodes = [
        ('cohort10', [1920], 'twenties'),
        ('cohort10', [1930], 'thirties'),
        ('cohort10', [1940], 'forties'),
        ('cohort10', [1950], 'fifties'),
        ('cohort10', [1960], 'sixties'),
        ('cohort10', [1970], 'seventies'),
        ('cohort10', [1980], 'eighties'),
        ('cohort10', [1990], 'nineties'),
        ('gunlaw', [1.0], 'favor'),
        ('owngun', [1.0], 'gunhome'),
        ('gun', [1.0], 'threatened'),
        ('natcrime', [1.0], 'spendcrime'),
        ('income', [12], 'topincome'),
        ('income', [1, 2, 3, 4, 5, 6, 7, 8], 'lowincome'),
        ('polviews', [1, 2, 3], 'liberal'),
        ('polviews', [4], 'moderate'),
        # Was [6, 7, 8]: replace_invalid turns polviews code 8 into NaN, so 8
        # could never match, and code 5 fell into none of the three groups.
        # NOTE(review): assumes the 1-7 liberal-to-conservative scale --
        # confirm against the GSS codebook.
        ('polviews', [5, 6, 7], 'conservative'),
        ('sex', [2], 'female'),
        ('hispanic', [2], 'ishisp'),
        ('race', [2], 'black'),
        ('race', [3], 'otherrace'),
        ('srcbelt', [1, 2, 5], 'urban'),
        ('srcbelt', [3, 4], 'suburban'),
        ('srcbelt', [6], 'rural'),
    ]
    for varname, values, newvar in recodes:
        make_boolean(df, varname, values, newvar)

    # educ of 13 or more counts as college
    df['college'] = df['educ'] >= 13
    copy_nan(df, 'educ', 'college')

    # bottom and top quarters of realinc; the cutoffs are computed from df itself
    quantile25 = df['realinc'].quantile(0.25)
    df['lowrealinc'] = df['realinc'] <= quantile25
    copy_nan(df, 'realinc', 'lowrealinc')

    quantile75 = df['realinc'].quantile(0.75)
    df['highrealinc'] = df['realinc'] >= quantile75
    copy_nan(df, 'realinc', 'highrealinc')

In [10]:
def replace_invalid(df):
    """Replace the invalid codes in the survey columns with NaN, in place.

    Each column is replaced and assigned back to df. Calling
    `df.col.replace(..., inplace=True)` on a column accessor is a chained
    in-place operation, which pandas deprecates and which stops updating
    df once Copy-on-Write is enabled.

    df: DataFrame containing every column named below (modified in place)
    """
    # (column, codes to treat as missing)
    invalid_codes = [
        ('gunlaw', [8, 9, 0]),
        ('owngun', [3, 8, 9, 0]),
        ('gun', [8, 9, 0]),
        ('natcrime', [8, 9, 0]),
        ('income', [0, 13, 98, 99]),
        ('realinc', [0]),                  # TODO: check this
        ('educ', [98, 99]),
        ('polviews', [8, 9, 0]),
        ('age', [98, 99]),                 # 89 means 89 or older
        ('hispanic', [98, 99, 0]),
        ('cohort', [9999]),
    ]
    for varname, codes in invalid_codes:
        df[varname] = df[varname].replace(codes, np.nan)

In [11]:
# Load the data with utils.ReadGss and turn invalid codes into NaN.
gss = utils.ReadGss('gss_gun')
replace_invalid(gss)
# Keep only rows that have gunlaw, age and cohort.
gss = gss.dropna(subset=['gunlaw', 'age', 'cohort'])
# Add the boolean recode columns (modifies gss in place).
make_booleans(gss)
gss.shape

(40339, 52)

In [12]:
# Report how many values are missing in each predictor
for varname in varnames:
    n_missing = gss[varname].isnull().sum()
    print(varname, n_missing)

nineties 0
eighties 0
seventies 0
fifties 0
forties 0
thirties 0
twenties 0
female 0
black 0
otherrace 0
conservative 4658
liberal 4658
lowrealinc 3712
highrealinc 3712
college 86
urban 0
rural 0
gunhome 3497


Select just the columns we need

In [13]:
# Restrict the sample to the predictors, the y terms and 'favor'
data = sample[all_varnames]
data.shape

(40339, 22)

In [14]:
# Build the formula from varnames so the model, the selected columns and the
# prediction rows all share one list of predictors. The resulting string is
# 'favor ~ y + y2 + y3 + nineties + ... + gunhome'.
predictors = ['y', 'y2', 'y3'] + varnames
formula = 'favor ~ ' + ' + '.join(predictors)
model = smf.logit(formula, data=data).fit()

model.summary()

Optimization terminated successfully.
         Current function value: 0.507006
         Iterations 6


0,1,2,3
Dep. Variable:,favor,No. Observations:,40339.0
Model:,Logit,Df Residuals:,40317.0
Method:,MLE,Df Model:,21.0
Date:,"Sat, 13 Oct 2018",Pseudo R-squ.:,0.07799
Time:,15:07:22,Log-Likelihood:,-20452.0
converged:,True,LL-Null:,-22182.0
,,LLR p-value:,0.0

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
Intercept,1.5249,0.045,33.550,0.000,1.436,1.614
y,0.0296,0.002,12.423,0.000,0.025,0.034
y2,9.082e-05,0.000,0.858,0.391,-0.000,0.000
y3,-7.73e-05,6.84e-06,-11.302,0.000,-9.07e-05,-6.39e-05
nineties,-0.4654,0.112,-4.166,0.000,-0.684,-0.246
eighties,-0.3440,0.065,-5.267,0.000,-0.472,-0.216
seventies,-0.1579,0.052,-3.034,0.002,-0.260,-0.056
fifties,0.0619,0.037,1.672,0.094,-0.011,0.135
forties,0.0641,0.039,1.645,0.100,-0.012,0.140


Make a row for someone in 2016 with all booleans false.

In [15]:
def make_base():
    """Build the reference case: y for 2016, with every boolean set to 0.

    returns: Series with y, y2 and y3 followed by one 0 entry per name
             in varnames
    """
    years = 2016 - 1990
    row = {'y': years, 'y2': years**2, 'y3': years**3}
    row.update((name, 0) for name in varnames)
    return pd.Series(row)

# Build the reference case and display it
base = make_base()
base

y                  26
y2                676
y3              17576
nineties            0
eighties            0
seventies           0
fifties             0
forties             0
thirties            0
twenties            0
female              0
black               0
otherrace           0
conservative        0
liberal             0
lowrealinc          0
highrealinc         0
college             0
urban               0
rural               0
gunhome             0
dtype: int64

Make a DataFrame that contains one row for each case we want to consider.

In [44]:
def make_df_pred():
    """Assemble the cases to predict, one row per case.

    Rows, in order: 'base' (the output of make_base), one row per name in
    varnames with only that variable set to 1, then 'yminus10' through
    'yminus40', which are the base row with y moved back by 10 to 40 and
    y2, y3 recomputed from the shifted y.

    returns: DataFrame indexed by case label, with the columns of make_base()
    """
    base = make_base()

    def shifted_base(offset):
        """Return a copy of base with y shifted by offset and y2, y3 recomputed."""
        row = base.copy()
        row['y'] += offset
        row['y2'] = row['y']**2
        row['y3'] = row['y']**3
        return row

    df_pred = pd.DataFrame(columns=base.index, dtype=float)
    df_pred.loc['base'] = base

    # one case per predictor: the base row with that single flag switched on
    for varname in varnames:
        case = base.copy()
        case[varname] = 1
        df_pred.loc[varname] = case

    # the base row at earlier values of y
    for years_back in (10, 20, 30, 40):
        df_pred.loc['yminus%d' % years_back] = shifted_base(-years_back)

    return df_pred
    
# Build the table of cases; the notebook is configured to display assigned values
df_pred = make_df_pred()

Unnamed: 0,y,y2,y3,nineties,eighties,seventies,fifties,forties,thirties,twenties,...,black,otherrace,conservative,liberal,lowrealinc,highrealinc,college,urban,rural,gunhome
base,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nineties,26.0,676.0,17576.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eighties,26.0,676.0,17576.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
seventies,26.0,676.0,17576.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
fifties,26.0,676.0,17576.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
forties,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
thirties,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
twenties,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
female,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
black,26.0,676.0,17576.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [45]:
# Model prediction for each case, scaled by 100 to read as a percentage
pred = model.predict(df_pred) * 100

base            74.241221
nineties        62.446486
eighties        67.333242
seventies       69.682272
fifties         72.817721
forties         73.834590
thirties        72.276778
twenties        73.031649
female          85.300399
black           79.685077
otherrace       81.082232
conservative    68.338998
liberal         80.041709
lowrealinc      70.461078
highrealinc     75.886907
college         74.940412
urban           68.753322
rural           63.429089
gunhome         53.805935
yminus10        85.339846
yminus20        85.405680
yminus30        81.700025
yminus40        79.761054
dtype: float64

In [46]:
# Difference between each case and the base case, in percentage points
pred - pred['base']

base             0.000000
nineties       -11.794734
eighties        -6.907979
seventies       -4.558949
fifties         -1.423500
forties         -0.406631
thirties        -1.964443
twenties        -1.209572
female          11.059178
black            5.443856
otherrace        6.841012
conservative    -5.902223
liberal          5.800488
lowrealinc      -3.780143
highrealinc      1.645687
college          0.699191
urban           -5.487898
rural          -10.812132
gunhome        -20.435285
yminus10        11.098625
yminus20        11.164459
yminus30         7.458804
yminus40         5.519833
dtype: float64

In [47]:
def make_result(pred):
    """Tabulate predictions next to their difference from the base case.

    pred: Series of predictions; must contain the label 'base'

    returns: DataFrame with one row per case and columns
             'pred' (the prediction) and 'offset' (pred minus pred['base'])
    """
    baseline = pred['base']
    columns = {'pred': pred, 'offset': pred - baseline}
    return pd.DataFrame(columns, columns=['pred', 'offset'])

# Predictions and offsets for the single sample fitted above
result = make_result(pred)

Unnamed: 0,pred,offset
base,74.241221,0.0
nineties,62.446486,-11.794734
eighties,67.333242,-6.907979
seventies,69.682272,-4.558949
fifties,72.817721,-1.4235
forties,73.83459,-0.406631
thirties,72.276778,-1.964443
twenties,73.031649,-1.209572
female,85.300399,11.059178
black,79.685077,5.443856


### Iterate

To estimate uncertainty due to random sampling and missing values, we have to iterate the procedure we just ran.

In [48]:
# The prediction cases do not depend on the sample, so build them once
# instead of rebuilding the same table on every iteration.
df_pred = make_df_pred()

# Refit the model on each stored sample and collect one result table per fit.
results = []
for sample in read_samples():
    data = sample[all_varnames]
    model = smf.logit(formula, data=data).fit(disp=0)

    pred = model.predict(df_pred) * 100
    result = make_result(pred)

    results.append(result)

Process the results.

In [49]:
# Gather the 'pred' column from every iteration and summarize it at the
# 50th, 5th and 95th percentiles.
# NOTE(review): presumably PercentileRows returns one row per requested
# percentile, computed across iterations -- confirm against thinkstats2.
preds = [result.pred for result in results]
median, low, high = thinkstats2.PercentileRows(preds, [50, 5, 95])

# `result` here is the table left over from the last iteration of the loop
# above; only its index (the case labels) is used.
estimates = pd.DataFrame(index=result.index)
estimates['low5'] = low
estimates['median'] = median
estimates['high95'] = high
estimates.round(0).astype(int)

Unnamed: 0,low5,median,high95
base,71,72,75
nineties,60,63,67
eighties,64,67,69
seventies,67,69,72
fifties,70,72,74
forties,70,73,74
thirties,69,71,73
twenties,69,72,74
female,83,84,86
black,75,77,80


In [50]:
def make_table(estimates):
    """Print a DataFrame as HTML, with values rounded to one decimal place.

    estimates: DataFrame to print
    """
    html = estimates.round(1).to_html()
    print(html)

Generate the table for the offsets.

In [51]:
# Same summary as for the predictions, but on the 'offset' column and at
# the 50th, 2.5th and 97.5th percentiles.
preds = [result.offset for result in results]
median, low, high = thinkstats2.PercentileRows(preds, [50, 2.5, 97.5])

# `result` is the table left over from the iteration loop; only its index is used.
offsets = pd.DataFrame(index=result.index)
offsets['low2.5'] = low
offsets['median'] = median
offsets['high97.5'] = high
# Largest median offset first, rounded to whole percentage points
table = offsets.sort_values('median', ascending=False).round(0).astype(int)

Unnamed: 0,low2.5,median,high97.5
yminus20,10,12,14
yminus10,10,12,14
female,11,11,13
yminus30,6,8,11
otherrace,5,7,9
yminus40,5,7,9
black,3,5,6
liberal,3,4,5
highrealinc,1,2,3
college,1,2,3


In [60]:
# Combine median support with the offset and its interval, one row per case.
# The bounds in `table` are the 2.5th and 97.5th percentiles of the offsets,
# which span a 95% interval, so the column is labelled accordingly.
output = pd.DataFrame(columns=['support', 'offset', '95% CI'])
for label, row in table.iterrows():
    low, median, high = row
    support = estimates.loc[label, 'median'].round(0).astype(int)
    ci = '(%d, %d)' % (low, high)
    output.loc[label] = support, median, ci

output

Unnamed: 0,support,offset,90% CI
yminus20,85,12,"(10, 14)"
yminus10,85,12,"(10, 14)"
female,84,11,"(11, 13)"
yminus30,81,8,"(6, 11)"
otherrace,80,7,"(5, 9)"
yminus40,79,7,"(5, 9)"
black,77,5,"(3, 6)"
liberal,77,4,"(3, 5)"
highrealinc,75,2,"(1, 3)"
college,74,2,"(1, 3)"


In [53]:
def make_table(offsets):
    """Print a DataFrame as HTML, sorted by 'median', rounded to one decimal.

    This redefines the make_table from the earlier cell; the difference is
    the ascending sort on the 'median' column.

    offsets: DataFrame with a 'median' column
    """
    ordered = offsets.sort_values('median')
    print(ordered.round(1).to_html())