# Extract GSS 2022 r3

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the nb_black notebook extension (presumably the Black code formatter
# for notebook cells -- confirm against its documentation).
%load_ext nb_black

<IPython.core.display.Javascript object>

Download the Stata data from https://gss.norc.org/get-the-data/stata

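Move the downloaded archive to `data/raw` and unzip it. The cells below read the gzip-compressed file `gss7222_r3a.dta.gz` from that directory.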


In [3]:
# List the raw-data directory to see which GSS files are available locally.
!ls ../data/raw/

gss7221_r2.dta.gz  gss7222_r2.dta.gz   gss7222_r3.dta.gz
gss7222_r1.dta.gz  gss7222_r3a.dta.gz  GSS_stata


<IPython.core.display.Javascript object>

In [4]:
# Path (relative to this notebook) of the gzip-compressed Stata file to read.
filename = "../data/raw/gss7222_r3a.dta.gz"

<IPython.core.display.Javascript object>

In [5]:
# Variables to read from the Stata file. Both weight columns ("wtssall" and
# "wtssps") are requested because a later cell fills gaps in "wtssall" from
# "wtssps".
columns = [
    "year", "id", "age", "cohort", "educ", "degree", "sex", "race",
    "res16", "reg16", "rincome", "region", "srcbelt", "partyid",
    "relig", "relig16", "fund", "attend", "reliten",
    "happy", "hapmar", "health", "life", "fear", "helpful", "fair", "trust",
    "confinan", "conbus", "conclerg", "coneduc", "confed", "conlabor",
    "conpress", "conmedic", "contv", "conjudge", "consci", "conlegis", "conarmy",
    "satjob", "class", "satfin", "goodlife", "ballot",
    "wtssall", "wtssps", "sexbirth", "sexnow", "eqwlth",
    "realinc", "realrinc", "coninc", "conrinc",
    "commun", "clmtcaus", "clmtwrld", "clmtusa",
]
# Sort in place so the requested columns are in alphabetical order.
columns.sort()

<IPython.core.display.Javascript object>

In [6]:
# Read only the selected columns. convert_categoricals=False keeps coded
# variables as their numeric codes instead of converting them to labelled
# categoricals.
gss = pd.read_stata(filename, columns=columns, convert_categoricals=False)

<IPython.core.display.Javascript object>

In [7]:
# weights are different in 2021 and 2022 so mixing them in might seem like a bad idea,
# but we only use them for resampling within one year of the survey,
# so I think it's ok
#
# Assign the filled Series back to the column rather than calling
# fillna(..., inplace=True) on gss["wtssall"]: the in-place form acts on an
# intermediate object, which pandas warns about (chained assignment) and which
# leaves gss unchanged under copy-on-write.
gss["wtssall"] = gss["wtssall"].fillna(gss["wtssps"])
gss["wtssall"].describe()

count    72390.000000
mean         1.000014
std          0.550871
min          0.073972
25%          0.549300
50%          0.961700
75%          1.098500
max         14.272462
Name: wtssall, dtype: float64

<IPython.core.display.Javascript object>

In [8]:
# "wtssps" was only needed to fill missing values of "wtssall" in the cell
# above, so drop it from the frame.
del gss["wtssps"]

<IPython.core.display.Javascript object>

In [9]:
# Sanity check: print the (rows, columns) shape, then display the first rows.
print(gss.shape)
gss.head()

(72390, 57)


Unnamed: 0,age,attend,ballot,class,clmtcaus,clmtusa,clmtwrld,cohort,commun,conarmy,...,rincome,satfin,satjob,sex,sexbirth,sexnow,srcbelt,trust,wtssall,year
0,23.0,2.0,,3.0,,,,1949.0,,,...,,3.0,3.0,2.0,,,3.0,3.0,0.4446,1972
1,70.0,7.0,,3.0,,,,1902.0,,,...,,2.0,,1.0,,,3.0,1.0,0.8893,1972
2,48.0,4.0,,2.0,,,,1924.0,,,...,,1.0,2.0,2.0,,,3.0,2.0,0.8893,1972
3,27.0,0.0,,3.0,,,,1945.0,,,...,,3.0,1.0,2.0,,,3.0,2.0,0.8893,1972
4,61.0,0.0,,2.0,,,,1911.0,,,...,,1.0,,2.0,,,3.0,2.0,0.8893,1972


<IPython.core.display.Javascript object>

In [10]:
# NOTE(review): resample_by_year comes from the project-local utils module; it
# presumably draws a weighted resample within each survey year using the
# "wtssall" column -- confirm against utils.py.
from utils import resample_by_year

# Seed NumPy's global random state before resampling (presumably so the
# resample is reproducible -- assumes resample_by_year draws from it).
np.random.seed(17)
sample = resample_by_year(gss, "wtssall")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [11]:
# Delete any previous copy of the extract before it is written again below
# (-f: no error if the file does not exist).
!rm -f ../data/interim/gss_extract_2022_3a.hdf

<IPython.core.display.Javascript object>

In [12]:
# Write the resampled extract to HDF5 under the key "gss" with compression
# level 6. The key is passed by keyword because pandas deprecated positional
# arguments to to_hdf other than the path.
sample.to_hdf("../data/interim/gss_extract_2022_3a.hdf", key="gss", complevel=6)

<IPython.core.display.Javascript object>

In [13]:
!ls -lh ../data/interim/gss_eds_2022.hdf

-rw-rw-r-- 1 downey downey 9.1M Aug  9 17:15 ../data/interim/gss_eds_2022.hdf


<IPython.core.display.Javascript object>

In [14]:
# Stage, commit and push the refreshed extract.
# -f force-adds the file (presumably because data/interim is git-ignored -- confirm).
!git add -f ../data/interim/gss_extract_2022_3a.hdf
!git commit -m "Updating GSS extract"
!git push

[main 0e851fb] Updating GSS extract
 1 file changed, 0 insertions(+), 0 deletions(-)
 rewrite data/interim/gss_extract_2022_3a.hdf (62%)
Enumerating objects: 9, done.
Counting objects: 100% (9/9), done.
Delta compression using up to 12 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (5/5), 3.58 MiB | 2.66 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/AllenDowney/GssExtract.git
   9cf7f52..0e851fb  main -> main


<IPython.core.display.Javascript object>