# Extract data for the Political Alignment Case Study

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the nb_black IPython extension for this kernel session.
# NOTE(review): presumably this auto-formats each executed cell, which would
# explain the Javascript display object emitted after every cell — confirm.
%load_ext nb_black

<IPython.core.display.Javascript object>


Reading data from the [General Social Survey](https://gssdataexplorer.norc.org)


In [3]:
# Names of the variables (columns) to extract from the GSS Stata file.
# The list is sorted alphabetically and passed as `columns=` to
# `pd.read_stata` when the data file is loaded.
# "relitenv" and "relitennv" are only needed to backfill "reliten"; they are
# merged into it and then deleted further down in the notebook.
columns = sorted(
    [
        "year",
        "id",
        "divorce",
        "sibs",
        "childs",
        "age",
        "educ",
        "degree",
        "sex",
        "race",
        "res16",
        "reg16",
        "adults",
        "income",
        "rincome",
        "region",
        "srcbelt",
        "partyid",
        "pres96",
        "pres00",
        "pres04",
        "pres08",
        "pres12",
        "polviews",
        "natspac",
        "natenvir",
        "natheal",
        "natcity",
        "natcrime",
        "natdrug",
        "nateduc",
        "natrace",
        "natarms",
        "nataid",
        "natfare",
        "natroad",
        "natsoc",
        "natmass",
        "natpark",
        "natchld",
        "natsci",
        "natenrgy",
        "spkath",
        "colath",
        "libath",
        "spksoc",
        "colsoc",
        "libsoc",
        "spkrac",
        "colrac",
        "librac",
        "spkcom",
        "colcom",
        "libcom",
        "spkmil",
        "colmil",
        "libmil",
        "spkhomo",
        "colhomo",
        "libhomo",
        "spkmslm",
        "colmslm",
        "libmslm",
        "cappun",
        "gunlaw",
        "grass",
        "relig",
        "fund",
        "attend",
        "reliten",
        "relitenv",
        "relitennv",
        "postlife",
        "pray",
        "popespks",
        "relig16",
        "prayer",
        "bible",
        "racmar",
        "racdin",
        "racpush",
        "racseg",
        "racopen",
        "raclive",
        "racclos",
        "racdis",
        "racinteg",
        "rachome",
        "racschol",
        "racfew",
        "rachaf",
        "racmost",
        "racpres",
        "racchurh",
        "affrmact",
        "happy",
        "hapmar",
        "health",
        "life",
        "helpful",
        "fair",
        "trust",
        "confinan",
        "conbus",
        "conclerg",
        "coneduc",
        "confed",
        "conlabor",
        "conpress",
        "conmedic",
        "contv",
        "conjudge",
        "consci",
        "conlegis",
        "conarmy",
        "satjob",
        "class",
        "satfin",
        "finrela",
        "union",
        "fehome",
        "fework",
        "fepres",
        "fepol",
        "abdefect",
        "abnomore",
        "abhlth",
        "abpoor",
        "abrape",
        "absingle",
        "abany",
        "chldidel",
        "sexeduc",
        "divlaw",
        "premarsx",
        "teensex",
        "xmarsex",
        "homosex",
        "pornlaw",
        "spanking",
        "letdie1",
        "polhitok",
        "polabuse",
        "polmurdr",
        "polescap",
        "polattak",
        "fear",
        "owngun",
        "pistol",
        "rowngun",
        "hunt",
        "phone",
        "fechld",
        "fehelp",
        "fepresch",
        "fefam",
        "racdif1",
        "racdif2",
        "racdif3",
        "racdif4",
        "god",
        "reborn",
        "savesoul",
        "racwork",
        "fejobaff",
        "discaffm",
        "discaffw",
        "fehire",
        "relpersn",
        "sprtprsn",
        "relexp",
        "spklang",
        "compuse",
        "hrsrelax",
        "trdunion",
        "wkracism",
        "wksexism",
        "wkharsex",
        "databank",
        "goodlife",
        "meovrwrk",
        "miracles",
        "relexper",
        "relactiv",
        "matesex",
        "frndsex",
        "acqntsex",
        "pikupsex",
        "paidsex",
        "othersex",
        "sexsex",
        "sexfreq",
        "sexsex5",
        "sexornt",
        "hhrace",
        "cohort",
        "ballot",
        "wtssall",
        "wtssps",
        "sexbirth",
        "sexnow",
        "eqwlth",
        "realinc",
        "realrinc",
        "coninc",
        "conrinc",
        "commun",
    ]
)

<IPython.core.display.Javascript object>

In [4]:
# Path to the GSS Stata file to extract from. Only one release is read per
# run; the earlier release is kept here as a comment instead of a dead
# assignment that was immediately overwritten.
# 2021 data: "../data/raw/gss7221_r2.dta"

# 2022 data
filename = "../data/raw/gss7222_r1.dta"

# Read only the selected columns. convert_categoricals=False leaves the
# coded values as they are stored rather than converting them to
# pandas Categoricals.
gss = pd.read_stata(filename, columns=columns, convert_categoricals=False)

<IPython.core.display.Javascript object>

In [5]:
# Summary statistics of the wtssps weight within each survey year.
wtssps_by_year = gss.groupby("year")["wtssps"]
wtssps_by_year.describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1972,0.0,,,,,,,
1973,0.0,,,,,,,
1974,0.0,,,,,,,
1975,0.0,,,,,,,
1976,0.0,,,,,,,
1977,0.0,,,,,,,
1978,0.0,,,,,,,
1980,0.0,,,,,,,
1982,0.0,,,,,,,
1983,0.0,,,,,,,


<IPython.core.display.Javascript object>

In [6]:
# weights are different in 2021 and 2022
# but we only use them for resampling within each year of the survey
#
# Fill missing wtssall values from wtssps. The result is assigned back to the
# column explicitly: calling fillna(..., inplace=True) on gss["wtssall"]
# operates on a selected Series, which warns in recent pandas and does not
# update `gss` at all once Copy-on-Write is enabled.
gss["wtssall"] = gss["wtssall"].fillna(gss["wtssps"])
gss["wtssall"].describe()

count    72390.000000
mean         1.000014
std          0.550871
min          0.073972
25%          0.549300
50%          0.961700
75%          1.098500
max         14.272462
Name: wtssall, dtype: float64

<IPython.core.display.Javascript object>

In the most recent data, `reliten` is split into two variables, `relitenv` and `relitennv`, with and without voluntary responses.
I'm combining them back into a single `reliten` variable.

In [7]:
# Summary of reliten before it is backfilled from the split variables.
gss.loc[:, "reliten"].describe()

count    59755.000000
mean         1.972253
std          0.992474
min          1.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          4.000000
Name: reliten, dtype: float64

<IPython.core.display.Javascript object>

In [8]:
# Wherever reliten is missing, fall back to relitenv first and then to
# relitennv, and store the combined result back in reliten.
reliten_filled = gss["reliten"]
for fallback in ("relitenv", "relitennv"):
    reliten_filled = reliten_filled.fillna(gss[fallback])
gss["reliten"] = reliten_filled

<IPython.core.display.Javascript object>

In [9]:
# Frequency of each reliten code after the merge.
reliten = gss["reliten"]
reliten.value_counts()

reliten
2.0    25071
1.0    23759
4.0     8750
3.0     6083
Name: count, dtype: int64

<IPython.core.display.Javascript object>

In [10]:
# The split variables have been folded into reliten, so remove them from the
# frame, in the same order as before.
for helper_col in ("relitenv", "relitennv"):
    del gss[helper_col]

<IPython.core.display.Javascript object>

In [11]:
# Report the frame's dimensions, then display its first five rows.
n_rows, n_cols = gss.shape
print((n_rows, n_cols))
gss.head(5)

(72390, 205)


Unnamed: 0,abany,abdefect,abhlth,abnomore,abpoor,abrape,absingle,acqntsex,adults,affrmact,...,trdunion,trust,union,wkharsex,wkracism,wksexism,wtssall,wtssps,xmarsex,year
0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,,...,,3.0,,,,,0.4446,,,1972
1,,1.0,1.0,2.0,2.0,1.0,1.0,,2.0,,...,,1.0,,,,,0.8893,,,1972
2,,1.0,1.0,1.0,1.0,1.0,1.0,,2.0,,...,,2.0,,,,,0.8893,,,1972
3,,2.0,1.0,2.0,1.0,1.0,1.0,,2.0,,...,,2.0,,,,,0.8893,,,1972
4,,1.0,1.0,1.0,1.0,1.0,1.0,,2.0,,...,,2.0,,,,,0.8893,,,1972


<IPython.core.display.Javascript object>

In [12]:
# Delete any existing copy of the output file (-f: no error if it is absent).
# NOTE(review): presumably so the to_hdf call in the next cell starts from a
# fresh file instead of adding to an old one — confirm.
!rm -f ../data/interim/gss_pacs_2022.hdf

<IPython.core.display.Javascript object>

In [13]:
# Write the extract to an HDF5 file under the key "gss", compressed at level 7.
# `key` is passed by keyword: positional arguments after the path are
# deprecated in pandas 2.1+ and become keyword-only in pandas 3.0.
gss.to_hdf("../data/interim/gss_pacs_2022.hdf", key="gss", complevel=7)

<IPython.core.display.Javascript object>

In [14]:
# List the output file with a human-readable size to confirm it was written.
!ls -lh ../data/interim/gss_pacs_2022.hdf

-rw-rw-r-- 1 downey downey 9.6M Jan 27 20:43 ../data/interim/gss_pacs_2022.hdf


<IPython.core.display.Javascript object>

In [15]:
# Stage the output file (-f adds it even if it matches an ignore rule),
# commit it, and push to the remote.
# NOTE(review): these commands run every time the cell executes, including
# under "Run All", so each full re-run attempts a commit and push — confirm
# that is intended.
!git add -f ../data/interim/gss_pacs_2022.hdf
!git commit -m "Adding PACS data file for 2022"
!git push

[main 6ced411] Adding PACS data file for 2022
 1 file changed, 0 insertions(+), 0 deletions(-)
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 12 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (5/5), 9.12 MiB | 3.13 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/AllenDowney/GssExtract.git
   6a5f398..6ced411  main -> main


<IPython.core.display.Javascript object>