# Searching for Simpson

Copyright 2021 Allen B. Downey

License: [Attribution-NonCommercial-ShareAlike 4.0 International (CC BY-NC-SA 4.0)](https://creativecommons.org/licenses/by-nc-sa/4.0/)

[Click here to run this notebook on Colab](https://colab.research.google.com/github/AllenDowney/ProbablyOverthinkingIt2/blob/master/simpson_wages.ipynb)

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# Load the 'gss0' table from the HDF5 file into a DataFrame.
gss = pd.read_hdf('gss_eda.3.hdf5', 'gss0')
# Display the (rows, columns) shape.
gss.shape

(64814, 169)

In [3]:
# Collapse polviews codes 1-7 into three labels.
recode_polviews = {
    **dict.fromkeys((1, 2, 3), 'Liberal'),
    4: 'Moderate',
    **dict.fromkeys((5, 6, 7), 'Conservative'),
}

In [4]:
# Recode polviews with recode_polviews and store the result in a new column.
gss['polviews3'] = gss['polviews'].replace(recode_polviews)
# Tabulate the recoded values.
gss['polviews3'].value_counts()

Moderate        21444
Conservative    19129
Liberal         14979
Name: polviews3, dtype: int64

> Generally speaking, do you usually think of yourself as a Republican, Democrat, Independent, or what?

The valid responses are:

```
0	Strong democrat
1	Not str democrat
2	Ind,near dem
3	Independent
4	Ind,near rep
5	Not str republican
6	Strong republican
7	Other party
```

You can [read the codebook for `partyid` here](https://gssdataexplorer.norc.org/projects/52787/variables/141/vshow).

In [5]:
# Collapse partyid codes 0-7 into four labels.
recode_partyid = {
    **dict.fromkeys((0, 1), 'Democrat'),
    **dict.fromkeys((2, 3, 4), 'Independent'),
    **dict.fromkeys((5, 6), 'Republican'),
    7: 'Other',
}

In [6]:
# Recode partyid with recode_partyid and store the result in a new column.
gss['partyid4'] = gss['partyid'].replace(recode_partyid)
# Tabulate the recoded values.
gss['partyid4'].value_counts()

Independent    23404
Democrat       23308
Republican     16617
Other           1064
Name: partyid4, dtype: int64

`degree`: Respondent's highest degree. The valid responses are:

```
0 	Lt high school
1 	High school
2 	Junior college
3 	Bachelor
4 	Graduate
8 	Don't know
9 	No answer
```



In [36]:
# Tabulate the highest-degree responses.
gss['degree'].value_counts()

1.0    33855
0.0    13274
3.0     9277
4.0     4465
2.0     3759
Name: degree, dtype: int64

> What is your religious preference? Is it Protestant, Catholic, Jewish, some other religion, or no religion?

```
1 	Protestant
2 	Catholic
3 	Jewish
4 	None
5 	Other
6 	Buddhism
7 	Hinduism
8 	Other eastern
9 	Moslem/islam
10 	Orthodox-christian
11 	Christian
12 	Native american
13 	Inter-nondenominational
```



In [39]:
# Collapse relig codes 1-13 into five labels.
recode_relig = {
    1: 'Protestant',
    2: 'Catholic',
    3: 'Other',
    4: 'None',
    **dict.fromkeys(range(5, 10), 'Other'),
    **dict.fromkeys((10, 11), 'Other Christian'),
    **dict.fromkeys((12, 13), 'Other'),
}

In [40]:
# Recode relig with recode_relig and store the result in a new column.
gss['relig5'] = gss['relig'].replace(recode_relig)
# Tabulate the recoded values.
gss['relig5'].value_counts()

Protestant         36378
Catholic           16501
None                7803
Other               2966
Other Christian      896
Name: relig5, dtype: int64

In [7]:
# Name of the response variable to analyze, and its column in gss.
yvarname = 'gunlaw'
yvar = gss[yvarname]

In [8]:
# Count how often each response occurs.
counts = yvar.value_counts()
counts

1.0    31917
2.0    10076
Name: gunlaw, dtype: int64

In [9]:
# Response with the highest count.
most_common = counts.idxmax()
most_common

1.0

In [10]:
# Build a mapping keyed by the observed responses: start with every
# response mapped to 0 ...
d = counts.copy()
d[:] = 0
# ... then map the most common response to 1.
d[most_common] = 1
d

1.0    1
2.0    0
Name: gunlaw, dtype: int64

In [12]:
# Recode the response using the mapping d and store the result as column 'y'.
gss['y'] = yvar.replace(d)
# Tabulate the recoded values.
gss['y'].value_counts()

1.0    31917
0.0    10076
Name: y, dtype: int64

In [13]:
# Count the rows where y is missing.
gss['y'].isnull().sum()

22821

In [14]:
# Name of the explanatory variable; copy that column to 'x'.
xvarname = 'year'
gss['x'] = gss[xvarname]

In [27]:
import statsmodels.formula.api as smf

# Fit a logit model of y on x using all rows of gss.
formula = 'y ~ x'
results = smf.logit(formula, data=gss).fit()

Optimization terminated successfully.
         Current function value: 0.550827
         Iterations 5


In [28]:
# Estimated coefficient for x.
param = results.params['x']
param

0.0032343540300543414

In [29]:
# p-value for the x coefficient.
pvalue = results.pvalues['x']
pvalue

6.733635524343815e-05

In [30]:
# Confidence interval for the x coefficient, as an array.
conf_int = results.conf_int().loc['x'].values
conf_int

array([0.00164379, 0.00482492])

In [31]:
def get_xresult(results, varname='x'):
    """Extract the estimate, p-value, and confidence interval for one term.

    results: fitted model results exposing `params`, `pvalues`,
             and `conf_int()`
    varname: name of the model term to look up (default 'x')

    returns: list [param, pvalue, conf_int], where conf_int is the array
             of values in the term's row of `results.conf_int()`
    """
    param = results.params[varname]
    pvalue = results.pvalues[varname]
    conf_int = results.conf_int().loc[varname].values
    return [param, pvalue, conf_int]

In [32]:
# Start a results table; the row labeled 'all' holds the values extracted
# from `results`.
# NOTE(review): 'conf_inf' looks like a typo for 'conf_int'; it is left
# unchanged here because it is the runtime column name.
columns = ['param', 'pvalue', 'conf_inf']
# presumably dtype=object so a cell can hold the confidence-interval array
result_df = pd.DataFrame(columns=columns, dtype=object)
result_df.loc['all'] = get_xresult(results)
result_df

Unnamed: 0,param,pvalue,conf_inf
all,0.003234,6.7e-05,"[0.0016437864240885886, 0.0048249216360200945]"


In [33]:
# Group the rows of gss by the grouping variable and print each group's size.
gvarname = 'partyid4'
grouped = gss.groupby(gvarname)

for name, group in grouped:
    print(name, len(group))

Democrat 23308
Independent 23404
Other 1064
Republican 16617


In [34]:
# Fit the same formula within each group, add a row to result_df for it,
# and print the group's x coefficient.
for name, group in grouped:
    # Note: this rebinds the module-level name `results` on every iteration.
    results = smf.logit(formula, data=group).fit(disp=False)
    result_df.loc[name] = get_xresult(results)
    print(name, results.params['x'])

Democrat 0.01936258108189358
Independent 0.00250683212965543
Other -0.013315263914553907
Republican -0.00936623324946794


In [35]:
# Display the results table.
result_df

Unnamed: 0,param,pvalue,conf_inf
all,0.003234,6.733636e-05,"[0.0016437864240885886, 0.0048249216360200945]"
Democrat,0.019363,2.9446669999999995e-38,"[0.01642815207928425, 0.022297010084502907]"
Independent,0.002507,0.06016695,"[-0.00010722608201531704, 0.005120890341326177]"
Other,-0.013315,0.007678432,"[-0.02310464633097545, -0.0035258814981323633]"
Republican,-0.009366,3.977877e-09,"[-0.012485542444758965, -0.006246924054176914]"


In [None]:
import matplotlib.pyplot as plt

# Set the default figure size for subsequent plots.
plt.rcParams['figure.figsize'] = (9, 4)

def decorate(**options):
    """Apply axis properties to the current axes and tidy the layout.

    Example:
    decorate(title='Title',
             xlabel='x',
             ylabel='y')

    The keyword arguments are passed to the `set` method of the current
    axes; see https://matplotlib.org/api/axes_api.html

    A legend is added only if the axes reports at least one legend handle.
    """
    axes = plt.gca()
    axes.set(**options)

    legend_entries = axes.get_legend_handles_labels()
    if legend_entries[0]:
        axes.legend(*legend_entries)

    plt.tight_layout()

In [None]:
def stretch_x(factor=0.03):
    """Widen the x-axis limits of the current plot on both sides.

    factor: fraction of the current x-range to add at each end
    """
    left, right = plt.xlim()
    margin = (right - left) * factor
    plt.xlim(left - margin, right + margin)

In [None]:
def anchor_legend(x, y):
    """Draw the legend with its upper left corner anchored at (x, y).

    x: x coordinate of the anchor point
    y: y coordinate of the anchor point
    """
    legend_options = dict(bbox_to_anchor=(x, y), loc='upper left', ncol=1)
    plt.legend(**legend_options)
    plt.tight_layout()

In [None]:
import numpy as np
from statsmodels.nonparametric.smoothers_lowess import lowess

def make_lowess(series):
    """Use LOWESS to compute a smooth line.

    series: pd.Series whose index holds the x values and whose values
            hold the y values

    returns: pd.Series of smoothed values, indexed by the x values that
             `lowess` returns
    """
    y = series.values
    x = series.index.values

    smooth = lowess(y, x)
    # `smooth` has two columns: the x values and the smoothed y values.
    index, data = np.transpose(smooth)

    # Index by the x values that came back from `lowess`, not by
    # `series.index`: by default `lowess` sorts its output by x and drops
    # missing values, so its rows need not match the input order or length.
    return pd.Series(data, index=index)

In [None]:
def plot_series_lowess(series, color, indexed=False, plot_series=True):
    """Plot a LOWESS-smoothed line, optionally over the raw data points.

    series: pd.Series
    color: string or tuple
    indexed: if True, rescale the smooth line so its first value is 100
    plot_series: if True, also draw the data points as faint markers
    """
    if plot_series:
        # label='_' presumably keeps the markers out of the legend --
        # confirm against matplotlib's handling of underscore labels.
        marker_options = dict(linewidth=0, marker='o', color=color,
                              alpha=0.1, label='_')
        series.plot(**marker_options)

    smooth = make_lowess(series)
    if indexed:
        smooth /= smooth.iloc[0] / 100

    # The overall total is drawn dashed; every other series is solid.
    if series.name == 'Total, all educational levels':
        style = '--'
    else:
        style = '-'
    smooth.plot(style=style, label=series.name, color=color)

In [None]:
def plot_columns_lowess(table, columns, colors, **options):
    """Draw one smoothed line per selected column of a DataFrame.

    table: DataFrame with a cross tabulation
    columns: list of column names, in the desired order
    colors: mapping from column names to colors
    options: extra keyword arguments forwarded to plot_series_lowess
    """
    for column_name in columns:
        plot_series_lowess(table[column_name], colors[column_name], **options)

In [None]:
# Plot color for each series name.
colors = {
    "Total, all educational levels": "gray",
    "Less than a high school diploma": "C0",
    "High school graduates, no college": "C1",
    "Some college or associate degree": "C2",
    "Bachelor's degree only": "C3",
    "Advanced degree": "C4",
}