# Extract data for Elements of Data Science

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the nb_black IPython extension for this kernel session.
# NOTE(review): presumably this auto-formats each executed cell with Black and is
# the source of the Javascript display objects after every cell -- confirm.
%load_ext nb_black

<IPython.core.display.Javascript object>


Reading data from the [General Social Survey](https://gssdataexplorer.norc.org)


In [3]:
# Names of the GSS variables to pull out of the raw Stata file.
# They are listed in the order they were added; the list is sorted
# alphabetically right after it is built.
columns = [
    "year",
    "id",
    "divorce",
    "sibs",
    "childs",
    "age",
    "educ",
    "degree",
    "sex",
    "race",
    "res16",
    "reg16",
    "adults",
    "income",
    "rincome",
    "region",
    "srcbelt",
    "partyid",
    "pres96",
    "pres00",
    "pres04",
    "pres08",
    "pres12",
    "polviews",
    "natspac",
    "natenvir",
    "natheal",
    "natcity",
    "natcrime",
    "natdrug",
    "nateduc",
    "natrace",
    "natarms",
    "nataid",
    "natfare",
    "natroad",
    "natsoc",
    "natmass",
    "natpark",
    "natchld",
    "natsci",
    "natenrgy",
    "spkath",
    "colath",
    "libath",
    "spksoc",
    "colsoc",
    "libsoc",
    "spkrac",
    "colrac",
    "librac",
    "spkcom",
    "colcom",
    "libcom",
    "spkmil",
    "colmil",
    "libmil",
    "spkhomo",
    "colhomo",
    "libhomo",
    "spkmslm",
    "colmslm",
    "libmslm",
    "cappun",
    "gunlaw",
    "grass",
    "relig",
    "fund",
    "attend",
    "reliten",
    "postlife",
    "pray",
    "popespks",
    "relig16",
    "prayer",
    "bible",
    "racmar",
    "racdin",
    "racpush",
    "racseg",
    "racopen",
    "raclive",
    "racclos",
    "racdis",
    "racinteg",
    "rachome",
    "racschol",
    "racfew",
    "rachaf",
    "racmost",
    "racpres",
    "racchurh",
    "affrmact",
    "happy",
    "hapmar",
    "health",
    "life",
    "helpful",
    "fair",
    "trust",
    "confinan",
    "conbus",
    "conclerg",
    "coneduc",
    "confed",
    "conlabor",
    "conpress",
    "conmedic",
    "contv",
    "conjudge",
    "consci",
    "conlegis",
    "conarmy",
    "satjob",
    "class",
    "satfin",
    "finrela",
    "union",
    "fehome",
    "fework",
    "fepres",
    "fepol",
    "abdefect",
    "abnomore",
    "abhlth",
    "abpoor",
    "abrape",
    "absingle",
    "abany",
    "chldidel",
    "sexeduc",
    "divlaw",
    "premarsx",
    "teensex",
    "xmarsex",
    "homosex",
    "pornlaw",
    "spanking",
    "letdie1",
    "polhitok",
    "polabuse",
    "polmurdr",
    "polescap",
    "polattak",
    "fear",
    "owngun",
    "pistol",
    "rowngun",
    "hunt",
    "phone",
    "fechld",
    "fehelp",
    "fepresch",
    "fefam",
    "racdif1",
    "racdif2",
    "racdif3",
    "racdif4",
    "god",
    "reborn",
    "savesoul",
    "racwork",
    "fejobaff",
    "discaffm",
    "discaffw",
    "fehire",
    "relpersn",
    "sprtprsn",
    "relexp",
    "spklang",
    "compuse",
    "hrsrelax",
    "trdunion",
    "wkracism",
    "wksexism",
    "wkharsex",
    "databank",
    "goodlife",
    "meovrwrk",
    "miracles",
    "relexper",
    "relactiv",
    "matesex",
    "frndsex",
    "acqntsex",
    "pikupsex",
    "paidsex",
    "othersex",
    "sexsex",
    "sexfreq",
    "sexsex5",
    "sexornt",
    "hhrace",
    "cohort",
    "ballot",
    "wtssall",
    "wtssps",
    "sexbirth",
    "sexnow",
    "eqwlth",
    "realinc",
    "realrinc",
    "coninc",
    "conrinc",
    "commun",
]

# Put the names in alphabetical order before they are handed to the reader.
columns.sort()

<IPython.core.display.Javascript object>

In [4]:
# Location of the raw GSS Stata file, relative to this notebook.
filename = "../data/raw/gss7222_r1.dta"

# Read only the selected variables and keep the stored numeric codes
# rather than converting labelled values to pandas categoricals.
gss = pd.read_stata(
    filename,
    convert_categoricals=False,
    columns=columns,
)

<IPython.core.display.Javascript object>

In [5]:
# Weights are different in 2021; mixing them in would be problematic,
# but we only use them for resampling within one year of the survey.
#
# Fill missing `wtssall` values from `wtssps` and assign the result back to
# the column. Calling `fillna(..., inplace=True)` on `gss["wtssall"]` acts on
# an intermediate Series: it raises a FutureWarning on pandas 2.x and does not
# update `gss` at all once Copy-on-Write is enabled.
gss["wtssall"] = gss["wtssall"].fillna(gss["wtssps"])
gss["wtssall"].describe()

count    72390.000000
mean         1.000014
std          0.550871
min          0.073972
25%          0.549300
50%          0.961700
75%          1.098500
max         14.272462
Name: wtssall, dtype: float64

<IPython.core.display.Javascript object>

In [6]:
# `wtssps` is no longer needed as a separate column, so remove it from the frame.
gss = gss.drop(columns="wtssps")

<IPython.core.display.Javascript object>

In [7]:
# Quick sanity check: (rows, columns) of the extract, then its first five rows.
extract_shape = gss.shape
print(extract_shape)
gss.head(n=5)

(72390, 204)


Unnamed: 0,abany,abdefect,abhlth,abnomore,abpoor,abrape,absingle,acqntsex,adults,affrmact,...,teensex,trdunion,trust,union,wkharsex,wkracism,wksexism,wtssall,xmarsex,year
0,,1.0,1.0,1.0,1.0,1.0,1.0,,1.0,,...,,,3.0,,,,,0.4446,,1972
1,,1.0,1.0,2.0,2.0,1.0,1.0,,2.0,,...,,,1.0,,,,,0.8893,,1972
2,,1.0,1.0,1.0,1.0,1.0,1.0,,2.0,,...,,,2.0,,,,,0.8893,,1972
3,,2.0,1.0,2.0,1.0,1.0,1.0,,2.0,,...,,,2.0,,,,,0.8893,,1972
4,,1.0,1.0,1.0,1.0,1.0,1.0,,2.0,,...,,,2.0,,,,,0.8893,,1972


<IPython.core.display.Javascript object>

In [8]:
# Delete any previous copy of the output file; -f keeps this quiet when the
# file does not exist yet.
# NOTE(review): presumably done so the HDF file written next starts from
# scratch instead of being opened as an existing store -- confirm.
!rm -f ../data/interim/gss_eds_2022.hdf

<IPython.core.display.Javascript object>

In [9]:
# Write the extract to a compressed HDF5 file under the key "gss".
# `key` is passed by keyword: positional use is deprecated in pandas 2.1+ and
# every argument after the path is keyword-only in pandas 3.
# mode="w" always creates a fresh file, so the result does not depend on
# whether an older copy was removed first.
gss.to_hdf(
    "../data/interim/gss_eds_2022.hdf",
    key="gss",
    mode="w",
    complevel=6,
)

<IPython.core.display.Javascript object>

In [10]:
# Confirm the output file exists and show its size in human-readable units.
!ls -lh ../data/interim/gss_eds_2022.hdf

-rw-rw-r-- 1 downey downey 9.0M Jul 24 15:58 ../data/interim/gss_eds_2022.hdf


<IPython.core.display.Javascript object>

In [11]:
# Publish the regenerated data file: stage it, commit it, and push.
# -f stages the file even if it matches an ignore rule.
# Caution: running this cell (including via "Run All") commits and pushes to
# the remote repository as a side effect.
!git add -f ../data/interim/gss_eds_2022.hdf
!git commit -m "Updating EDS data file"
!git push

[main 6a5f398] Updating EDS data file
 1 file changed, 0 insertions(+), 0 deletions(-)
 create mode 100644 data/interim/gss_eds_2022.hdf
Enumerating objects: 8, done.
Counting objects: 100% (8/8), done.
Delta compression using up to 12 threads
Compressing objects: 100% (4/4), done.
Writing objects: 100% (5/5), 8.61 MiB | 5.20 MiB/s, done.
Total 5 (delta 2), reused 0 (delta 0)
remote: Resolving deltas: 100% (2/2), completed with 2 local objects.[K
To https://github.com/AllenDowney/GssExtract.git
   869670c..6a5f398  main -> main


<IPython.core.display.Javascript object>