# Extract data for the Political Alignment Case Study

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the nb_black IPython extension for this kernel session.
# NOTE(review): presumably this is what emits the Javascript display object
# after every cell below — confirm before removing.
%load_ext nb_black

<IPython.core.display.Javascript object>


Reading data from the [General Social Survey](https://gssdataexplorer.norc.org)


In [4]:
# Names of the GSS variables to pull out of the Stata file.
# The list is written in the order the variables were collected, with
# comments marking runs that share a name prefix; sorted() then puts the
# names in alphabetical order.
columns = sorted(
    [
        # survey year, respondent id and background variables
        "year", "id", "divorce", "sibs", "childs", "age", "educ", "degree",
        "sex", "race", "res16", "reg16", "adults", "income", "rincome",
        "region", "srcbelt",
        # partyid, pres* and polviews
        "partyid", "pres96", "pres00", "pres04", "pres08", "pres12",
        "polviews",
        # nat* series
        "natspac", "natenvir", "natheal", "natcity", "natcrime", "natdrug",
        "nateduc", "natrace", "natarms", "nataid", "natfare", "natroad",
        "natsoc", "natmass", "natpark", "natchld", "natsci", "natenrgy",
        # spk* / col* / lib* triples
        "spkath", "colath", "libath",
        "spksoc", "colsoc", "libsoc",
        "spkrac", "colrac", "librac",
        "spkcom", "colcom", "libcom",
        "spkmil", "colmil", "libmil",
        "spkhomo", "colhomo", "libhomo",
        "spkmslm", "colmslm", "libmslm",
        "cappun", "gunlaw", "grass",
        # relig* and related; relitenv / relitennv are merged into reliten
        # and dropped later in the notebook
        "relig", "fund", "attend", "reliten", "relitenv", "relitennv",
        "postlife", "pray", "popespks", "relig16", "prayer", "bible",
        # rac* series
        "racmar", "racdin", "racpush", "racseg", "racopen", "raclive",
        "racclos", "racdis", "racinteg", "rachome", "racschol", "racfew",
        "rachaf", "racmost", "racpres", "racchurh", "affrmact",
        "happy", "hapmar", "health", "life", "helpful", "fair", "trust",
        # con* series
        "confinan", "conbus", "conclerg", "coneduc", "confed", "conlabor",
        "conpress", "conmedic", "contv", "conjudge", "consci", "conlegis",
        "conarmy",
        "satjob", "class", "satfin", "finrela", "union",
        # fe* series (first part)
        "fehome", "fework", "fepres", "fepol",
        # ab* series
        "abdefect", "abnomore", "abhlth", "abpoor", "abrape", "absingle",
        "abany",
        "chldidel", "sexeduc", "divlaw", "premarsx", "teensex", "xmarsex",
        "homosex", "marhomo", "marhomo1", "pornlaw", "spanking", "letdie1",
        # pol* series
        "polhitok", "polabuse", "polmurdr", "polescap", "polattak",
        "fear", "owngun", "pistol", "rowngun", "hunt", "phone",
        # fe* series (second part)
        "fechld", "fehelp", "fepresch", "fefam",
        "racdif1", "racdif2", "racdif3", "racdif4",
        "god", "reborn", "savesoul", "racwork",
        "fejobaff", "discaffm", "discaffw", "fehire",
        "relpersn", "sprtprsn", "relexp", "spklang", "compuse", "hrsrelax",
        "trdunion", "wkracism", "wksexism", "wkharsex",
        "databank", "goodlife", "meovrwrk", "miracles", "relexper",
        "relactiv",
        # *sex series
        "matesex", "frndsex", "acqntsex", "pikupsex", "paidsex", "othersex",
        "sexsex", "sexfreq", "sexsex5", "sexornt",
        "hhrace", "cohort", "ballot",
        # weights
        "wtssall", "wtssps",
        "sexbirth", "sexnow", "eqwlth",
        "realinc", "realrinc", "coninc", "conrinc", "commun",
    ]
)

<IPython.core.display.Javascript object>

In [5]:
# List the raw data directory to see which GSS release files are available.
!ls ../data/raw

gss7221_r2.dta.gz  gss7222_r2.dta.gz   gss7222_r3.dta.gz  GSS_stata
gss7222_r1.dta.gz  gss7222_r3a.dta.gz  gss7222_r4.dta.gz  GSS_stata.zip


<IPython.core.display.Javascript object>

In [6]:
# 2022 data (release 4).
# The 2021 path that used to be assigned just above was overwritten before it
# was ever read, so only this file is loaded. To rebuild the 2021 extract,
# point filename at the gss7221_r2 file in ../data/raw instead.
filename = "../data/raw/gss7222_r4.dta.gz"

# Read only the selected variables. convert_categoricals=False keeps the
# stored codes rather than converting value labels to pandas Categoricals.
gss = pd.read_stata(filename, columns=columns, convert_categoricals=False)

<IPython.core.display.Javascript object>

In [7]:
# Per-year summary statistics of the wtssps weights.
gss["wtssps"].groupby(gss["year"]).describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1972,1613.0,0.999827,0.46917,0.21325,0.725256,0.911848,1.1532,5.114857
1973,1504.0,0.999916,0.439902,0.296,0.731279,0.878354,1.128607,3.401482
1974,1484.0,1.000213,0.452839,0.334948,0.711713,0.866729,1.158761,5.038814
1975,1490.0,0.999844,0.465884,0.289444,0.697593,0.890888,1.181739,3.922165
1976,1499.0,1.000063,0.475174,0.277629,0.685789,0.899926,1.190005,4.063941
1977,1530.0,1.000094,0.494082,0.28979,0.66496,0.896281,1.216632,4.317543
1978,1532.0,1.000059,0.773778,0.290107,0.526309,0.687569,1.22523,6.156765
1980,1468.0,1.000086,0.712144,0.183707,0.653929,0.875932,1.079192,9.422703
1982,1860.0,1.0,0.827104,0.121504,0.520751,0.739754,1.246939,7.39304
1983,1599.0,0.999912,0.802026,0.154227,0.530458,0.692255,1.217181,8.714284


<IPython.core.display.Javascript object>

In [8]:
# weights are different in 2021 and 2022
# but we only use them for resampling within each year of the survey
#
# Fill missing wtssall values from wtssps and assign the result back to the
# column. Calling fillna(inplace=True) on gss["wtssall"] is chained
# assignment: pandas warns about it, and with Copy-on-Write enabled it would
# leave gss unchanged.
gss["wtssall"] = gss["wtssall"].fillna(gss["wtssps"])
gss["wtssall"].describe()

count    72390.000000
mean         1.000014
std          0.550871
min          0.073972
25%          0.549300
50%          0.961700
75%          1.098500
max         14.272462
Name: wtssall, dtype: float64

<IPython.core.display.Javascript object>

In the most recent data, `reliten` is split into two variables (`relitenv` and `relitennv`), with and without voluntary responses.
I'm combining them back into a single `reliten` variable.

In [9]:
# Summary of reliten before the split variables are merged into it.
gss.loc[:, "reliten"].describe()

count    61513.000000
mean         1.979955
std          0.998750
min          1.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          4.000000
Name: reliten, dtype: float64

<IPython.core.display.Javascript object>

In [10]:
# Fill gaps in reliten from relitenv first, then from relitennv, and store
# the merged result back in the reliten column.
reliten_combined = gss["reliten"]
for fallback in ("relitenv", "relitennv"):
    reliten_combined = reliten_combined.fillna(gss[fallback])
gss["reliten"] = reliten_combined

<IPython.core.display.Javascript object>

In [11]:
# Frequency of each reliten code after merging.
gss.loc[:, "reliten"].value_counts()

reliten
2.0    26154
1.0    24826
4.0     9727
3.0     6405
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [12]:
# The split variables have been folded into reliten, so remove them from
# the frame.
for leftover in ("relitenv", "relitennv"):
    del gss[leftover]

<IPython.core.display.Javascript object>

In [13]:
# Report the frame's dimensions, then show the first five rows.
n_rows, n_cols = gss.shape
print((n_rows, n_cols))
gss.head(5)

(72390, 207)


Unnamed: 0,abany,abdefect,abhlth,abnomore,abpoor,abrape,absingle,acqntsex,adults,affrmact,...,trdunion,trust,union,wkharsex,wkracism,wksexism,wtssall,wtssps,xmarsex,year
0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,,...,,3.0,,,,,0.4446,0.663196,,1972
1,,1.0,1.0,2.0,2.0,1.0,1.0,,2.0,,...,,1.0,,,,,0.8893,0.91737,,1972
2,,1.0,1.0,1.0,1.0,1.0,1.0,,2.0,,...,,2.0,,,,,0.8893,0.897413,,1972
3,,2.0,1.0,2.0,1.0,1.0,1.0,,2.0,,...,,2.0,,,,,0.8893,1.066341,,1972
4,,1.0,1.0,1.0,1.0,1.0,1.0,,2.0,,...,,2.0,,,,,0.8893,0.944324,,1972


<IPython.core.display.Javascript object>

In [14]:
# Delete any previous copy of the output file so the export starts from a
# clean file; -f keeps rm quiet when the file does not exist yet.
!rm -f ../data/interim/gss_pacs_2022.hdf

<IPython.core.display.Javascript object>

In [15]:
# Write the extract to HDF5 under the key "gss" with compression level 7.
# key is passed by keyword because recent pandas versions deprecate passing
# anything but the path positionally to to_hdf.
# mode="w" rewrites the file from scratch, so the cell gives the same result
# even if the old file was not removed first.
gss.to_hdf("../data/interim/gss_pacs_2022.hdf", key="gss", mode="w", complevel=7)

<IPython.core.display.Javascript object>

In [16]:
# Confirm the output file was written and check its size.
!ls -lh ../data/interim/gss_pacs_2022.hdf

-rw-rw-r-- 1 downey downey 10M Mar  6 16:48 ../data/interim/gss_pacs_2022.hdf


<IPython.core.display.Javascript object>

In [17]:
# Publish the refreshed extract: stage the HDF file, commit it, and push.
# Running this cell modifies the repository and its remote.
# NOTE(review): -f presumably forces the add past an ignore rule covering the
# data directory — confirm. The commit message names the release ("2022 r4"),
# so update it together with the input filename.
!git add -f ../data/interim/gss_pacs_2022.hdf
!git commit -m "Adding PACS data file for 2022 r4"
!git push

[main e954d99] Adding PACS data file for 2022 r4
 1 file changed, 0 insertions(+), 0 deletions(-)
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 12 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (5/5), 3.78 MiB | 2.52 MiB/s, done.
Total 5 (delta 3), reused 0 (delta 0)
remote: Resolving deltas: 100% (3/3), completed with 3 local objects.[K
To https://github.com/AllenDowney/GssExtract.git
   b74371e..e954d99  main -> main


<IPython.core.display.Javascript object>