# Extract GSS Data for the PACS Workshop

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils import resample_by_year

Download the Stata data from https://gss.norc.org/get-the-data/stata

Move the downloaded archive to `data/raw` and unzip it there.

In [2]:
# List the raw data directory to see which GSS releases are available locally.
!ls ../data/raw/

gss7221_r2.dta.gz  gss7222_r2.dta.gz   gss7222_r3.dta.gz  GSS_stata
gss7222_r1.dta.gz  gss7222_r3a.dta.gz  gss7222_r4.dta.gz  GSS_stata.zip


In [3]:
# Path of the gzip-compressed Stata file to read; it is one of the files
# listed in ../data/raw/ (presumably the latest release -- confirm when updating).
filename = "../data/raw/gss7222_r4.dta.gz"

In [4]:
# Names of the GSS variables to extract, kept in alphabetical order.
# Both weight variables (wtssall and wtssps) are included.
columns = sorted(
    """
    year id age cohort educ degree sex race rincome region
    srcbelt class partyid polviews relig attend
    happy hapmar health life fear helpful fair trust
    satjob satfin goodlife
    ballot wtssall wtssps
    """.split()
)

In [5]:
# Read only the selected variables. With convert_categoricals=False the
# columns keep their numeric codes rather than being turned into
# labelled categoricals.
gss = pd.read_stata(
    filename,
    columns=columns,
    convert_categoricals=False,
)

In [6]:
# Weights are different in the most recent iterations.
# We'll combine them into a single column, but we will never
# mix weights from different iterations.
#
# Assign the filled Series back to the column rather than calling
# fillna(..., inplace=True) on gss["wtssall"]: the in-place call operates
# on an intermediate object, which triggers a FutureWarning in pandas 2.x
# and leaves the DataFrame unchanged once Copy-on-Write is enabled.
gss["wtssall"] = gss["wtssall"].fillna(gss["wtssps"])
gss["wtssall"].describe()

count    72390.000000
mean         1.000014
std          0.550871
min          0.073972
25%          0.549300
50%          0.961700
75%          1.098500
max         14.272462
Name: wtssall, dtype: float64

In [7]:
# wtssps has already been folded into wtssall, so drop the now-redundant column.
del gss["wtssps"]

In [8]:
# Sanity check: print the (rows, columns) shape and preview the first rows.
print(gss.shape)
gss.head()

(72390, 29)


Unnamed: 0,age,attend,ballot,class,cohort,degree,educ,fair,fear,goodlife,...,region,relig,rincome,satfin,satjob,sex,srcbelt,trust,wtssall,year
0,23.0,2.0,,3.0,1949.0,3.0,16.0,2.0,,,...,3,3.0,,3.0,3.0,2.0,3.0,3.0,0.4446,1972
1,70.0,7.0,,3.0,1902.0,0.0,10.0,2.0,,,...,3,2.0,,2.0,,1.0,3.0,1.0,0.8893,1972
2,48.0,4.0,,2.0,1924.0,1.0,12.0,1.0,,,...,3,1.0,,1.0,2.0,2.0,3.0,2.0,0.8893,1972
3,27.0,0.0,,3.0,1945.0,3.0,17.0,2.0,,,...,3,5.0,,3.0,1.0,2.0,3.0,2.0,0.8893,1972
4,61.0,0.0,,2.0,1911.0,1.0,12.0,2.0,,,...,3,1.0,,1.0,,2.0,3.0,2.0,0.8893,1972


In [9]:
# resample_by_year is already imported in the notebook's import cell,
# so it is not imported again here.
# Seed NumPy's global random state so the resampled extract is reproducible.
np.random.seed(17)
# NOTE(review): presumably draws a weighted resample within each survey year
# using the "wtssall" weights -- confirm against utils.resample_by_year.
sample = resample_by_year(gss, "wtssall")

In [10]:
# Remove any previous extract so the file written next starts from scratch
# (-f keeps this from failing when the file does not exist yet).
!rm -f ../data/interim/gss_extract_pacs_workshop.hdf

In [11]:
# Write the resampled extract to HDF5 under the key "gss" with compression
# level 6. The key is passed by keyword: positional arguments to to_hdf
# other than the path are deprecated in pandas 2.x.
sample.to_hdf(
    "../data/interim/gss_extract_pacs_workshop.hdf",
    key="gss",
    complevel=6,
)

In [12]:
# Confirm the extract was written and check its size on disk.
!ls -lh ../data/interim/gss_extract_pacs_workshop.hdf

-rw-rw-r-- 1 downey downey 1.9M Apr 23 13:26 ../data/interim/gss_extract_pacs_workshop.hdf


In [13]:
# Publish the new extract to the repository.
# -f forces the add even if the path is ignored
# (presumably data/interim is covered by .gitignore -- confirm).
# NOTE(review): this cell commits and pushes as a side effect of running the
# notebook; skip it when re-running only to regenerate the file locally.
!git add -f ../data/interim/gss_extract_pacs_workshop.hdf
!git commit -m "Updating extract for the PACS workshop"
!git push

[main 6dd0983] Updating extract for the PACS workshop
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 data/interim/gss_extract_pacs_workshop.hdf
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 12 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (5/5), 1.76 MiB | 1.36 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/AllenDowney/GssExtract.git
   e954d99..6dd0983  main -> main
