# Extract GSS 2024 r1

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils import resample_by_year

Download the Stata data from https://gss.norc.org/get-the-data/stata

Move the downloaded archive to `data/raw` and unzip it.


In [4]:
# List the raw-data directory to check which GSS files are present.
!ls ../data/raw/

gss7221_r2.dta.gz  gss7222_r3a.dta.gz  gss7224_r1.dta.gz
gss7222_r1.dta.gz  gss7222_r3.dta.gz   GSS_stata
gss7222_r2.dta.gz  gss7222_r4.dta.gz   GSS_stata.zip


In [5]:
# Path to the gzip-compressed Stata file that this notebook extracts from.
filename = "../data/raw/gss7224_r1.dta.gz"

In [6]:
# Names of the GSS variables to extract, stored in alphabetical order.
# They are written as whitespace-separated words (several per line) and
# split into a list before sorting.
columns = sorted(
    """
    year id age cohort educ degree sex race
    res16 reg16 rincome region srcbelt
    partyid polviews
    relig relig16 fund attend reliten
    happy hapmar health life fear helpful fair trust
    confinan conbus conclerg coneduc confed conlabor conpress
    conmedic contv conjudge consci conlegis conarmy
    satjob class satfin goodlife
    ballot wtssall wtssps
    sexbirth sexnow sexbirth1 sexnow1
    eqwlth realinc realrinc coninc conrinc
    commun clmtcaus clmtwrld clmtusa
    pres16 pres20
    """.split()
)

In [7]:
# Load only the selected variables. convert_categoricals=False keeps the
# stored numeric codes instead of converting labelled values to Categoricals.
gss = pd.read_stata(
    filename,
    columns=columns,
    convert_categoricals=False,
)

In [8]:
# Use `wtssps` wherever `wtssall` is missing, so `wtssall` can serve as the
# single weight column.
#
# Weights are different in 2021 and 2022, so mixing them in might seem like a
# bad idea, but we only use them for resampling within one year of the survey,
# so I think it's ok.
#
# The result is assigned back rather than using fillna(..., inplace=True) on
# the selected column: that chained in-place form triggers a FutureWarning in
# pandas 2.x and does not modify `gss` once Copy-on-Write is enabled.
gss["wtssall"] = gss["wtssall"].fillna(gss["wtssps"])
gss["wtssall"].describe()

count    75699.000000
mean         1.000013
std          0.576286
min          0.073972
25%          0.549300
50%          0.956994
75%          1.098500
max         14.272462
Name: wtssall, dtype: float64

In [9]:
# `wtssps` was only needed to fill gaps in `wtssall`; remove it from the frame.
del gss["wtssps"]

In [10]:
# Show the dimensions of the extract and preview its first rows.
print(gss.shape)
gss.head()

(75699, 62)


Unnamed: 0,age,attend,ballot,class,clmtcaus,clmtusa,clmtwrld,cohort,commun,conarmy,...,satjob,sex,sexbirth,sexbirth1,sexnow,sexnow1,srcbelt,trust,wtssall,year
0,23.0,2.0,,3.0,,,,1949.0,,,...,3.0,2.0,,,,,3.0,3.0,0.4446,1972
1,70.0,7.0,,3.0,,,,1902.0,,,...,,1.0,,,,,3.0,1.0,0.8893,1972
2,48.0,4.0,,2.0,,,,1924.0,,,...,2.0,2.0,,,,,3.0,2.0,0.8893,1972
3,27.0,0.0,,3.0,,,,1945.0,,,...,1.0,2.0,,,,,3.0,2.0,0.8893,1972
4,61.0,0.0,,2.0,,,,1911.0,,,...,,2.0,,,,,3.0,2.0,0.8893,1972


In [11]:
# resample_by_year is already imported in the first code cell, so it is not
# imported again here.
#
# Seed NumPy's global random generator so that re-running the notebook yields
# the same resample (assumes resample_by_year draws from np.random -- confirm
# in utils). The second argument names the weight column.
np.random.seed(17)
sample = resample_by_year(gss, "wtssall")

In [12]:
# Remove any previous copy of the extract (-f: no error if it does not exist)
# before the file is written again.
!rm -f ../data/interim/gss_extract_2024_1.hdf

In [13]:
# Write the resampled data to HDF5 under the key "gss" with compression level 6.
# `key` is passed by keyword: pandas 2.x deprecates positional arguments after
# the path in to_hdf, and pandas 3 makes them keyword-only.
sample.to_hdf("../data/interim/gss_extract_2024_1.hdf", key="gss", complevel=6)

In [14]:
# Confirm the extract was written and check its size.
!ls -lh ../data/interim/gss_extract_2024_1.hdf

-rw-rw-r-- 1 downey downey 4.1M Aug 13 10:51 ../data/interim/gss_extract_2024_1.hdf


In [15]:
# Commit the new extract and push it to the remote repository.
# `git add -f` adds the file even if it matches an ignore rule.
# NOTE(review): running all cells will commit and push -- confirm that is intended.
!git add -f ../data/interim/gss_extract_2024_1.hdf
!git commit -m "Updating GSS extract"
!git push

[main ad1de11] Updating GSS extract
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 data/interim/gss_extract_2024_1.hdf
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 12 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (5/5), 3.92 MiB | 2.36 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/AllenDowney/GssExtract.git
   f974baf..ad1de11  main -> main
