# EDS Case Study

Load and resample GSS data

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Read extract

https://gssdataexplorer.norc.org/projects/52787/extracts

In [2]:
import re

class FixedWidthVariables:
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame with 'start', 'end' and 'name' columns
        index_base: are the indices 0 or 1 based?

        Attributes:
        variables: the DataFrame passed in
        colspecs: list of (start, end) index tuples
        names: list of string variable names
        """
        self.variables = variables

        # shift the indices by index_base; with the default of 0 this
        # leaves them unchanged, with 1 it converts to 0-based indices
        shifted = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pairs of int; the builtin int is
        # used because the np.int alias was removed in NumPy 1.24
        pairs = shifted.astype(int).values.tolist()
        self.colspecs = [tuple(pair) for pair in pairs]

        # store a plain list so read_fwf never has to evaluate a Series
        self.names = variables['name'].tolist()

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename (or anything pd.read_fwf accepts)
        options: keyword arguments passed through to pd.read_fwf

        returns: DataFrame with one column per variable
        """
        return pd.read_fwf(filename,
                           colspecs=self.colspecs,
                           names=self.names,
                           **options)


def ReadStataDct(dct_file, **options):
    """Parses a Stata dictionary file into a set of column descriptions.

    Only lines containing a `_column(N)` marker are used; every other
    line is skipped.

    dct_file: string filename
    options: keyword arguments forwarded to open()

    returns: FixedWidthVariables object built with index_base=1
    """
    converters = {
        'byte': int,
        'int': int,
        'long': int,
        'float': float,
        'double': float,
        'numeric': float,
    }

    rows = []
    with open(dct_file, **options) as fp:
        for raw in fp:
            found = re.search(r'_column\(([^)]*)\)', raw)
            if found is None:
                continue

            first_col = int(found.group(1))
            tokens = raw.split()
            kind, label, fmt = tokens[1:4]

            # any str* type maps to str; other types must be in the table
            py_type = str if kind.startswith('str') else converters[kind]
            description = ' '.join(tokens[4:]).strip('"')
            rows.append((first_col, py_type, label.lower(), fmt, description))

    variables = pd.DataFrame(
        rows, columns=['start', 'type', 'name', 'fstring', 'desc'])

    # each variable ends where the next one starts; the last variable
    # has no successor, so its end is set to 0
    variables['end'] = variables['start'].shift(-1)
    variables.loc[len(variables) - 1, 'end'] = 0

    return FixedWidthVariables(variables, index_base=1)

def read_gss(dirname):
    """Loads the GSS extract stored in the given directory.

    The column layout is read from <dirname>/GSS.dct and the gzipped
    fixed width data from <dirname>/GSS.dat.gz.

    dirname: string

    returns: DataFrame
    """
    layout = ReadStataDct(dirname + '/GSS.dct')
    data_path = dirname + '/GSS.dat.gz'
    return layout.ReadFixedWidth(data_path, compression='gzip')

In [3]:
# Load the extract from the 'gss_eda' directory, print its dimensions,
# and preview the first rows.
gss = read_gss('gss_eda')
print(gss.shape)
gss.head()

(64814, 105)


Unnamed: 0,year,id_,agewed,divorce,sibs,childs,age,educ,paeduc,maeduc,...,ballot,wtssall,adults,compuse,databank,wtssnr,spkrac,spkcom,spkmil,spkmslm
0,1972,1,0,0,3,0,23,16,10,97,...,0,0.4446,1,0,0,1.0,0,1,0,0
1,1972,2,21,2,4,5,70,10,8,8,...,0,0.8893,2,0,0,1.0,0,2,0,0
2,1972,3,20,2,5,4,48,12,8,8,...,0,0.8893,2,0,0,1.0,0,2,0,0
3,1972,4,24,2,5,0,27,17,16,12,...,0,0.8893,2,0,0,1.0,0,1,0,0
4,1972,5,22,2,2,2,61,12,8,8,...,0,0.8893,2,0,0,1.0,0,1,0,0


In [4]:
# Groups of (codes to blank out, columns they apply to).
# Note: age 89 is a valid value (89 means 89 or older), so it is kept.
# Note: sibs contains some unlikely numbers that are left untouched.
_CODE_GROUPS = (
    ((0,), ('realinc',)),
    ((9,), ('attend', 'childs', 'adults')),
    ((8, 9), ('degree', 'padeg', 'madeg', 'spdeg', 'partyid')),
    ((0, 8, 9), (
        'colhomo', 'libhomo', 'cappun', 'gunlaw', 'grass', 'fepol',
        'abany', 'prayer', 'sexeduc', 'premarsx', 'xmarsex', 'racmar',
        'spanking', 'racpres', 'fear', 'databank', 'affrmact', 'happy',
        'hapmar', 'natspac', 'natenvir', 'natheal', 'natcity', 'natcrime',
        'natdrug', 'nateduc', 'natrace', 'natarms', 'nataid', 'natfare',
        'health', 'life', 'helpful', 'fair', 'trust', 'conclerg',
        'coneduc', 'confed', 'conpress', 'conjudge', 'conlegis', 'conarmy',
        'spkhomo', 'spkath', 'colath', 'libath', 'spkrac', 'spkcom',
        'spkmil', 'satjob', 'satfin', 'finrela', 'union_', 'res16',
        'fund', 'memchurh', 'fund16', 'reliten', 'postlife', 'pray',
        'sprel16', 'hunt', 'polviews', 'compuse', 'divorce',
        'pres04', 'pres08', 'pres12',
    )),
    ((0, 5, 8, 9), ('homosex', 'class_')),
    ((-1, 8, 9), ('chldidel',)),
    ((0, 98, 99), ('agewed', 'relig', 'relig16', 'age')),
    ((-1, 98, 99), ('sibs',)),
    ((97, 98, 99), ('educ', 'maeduc', 'paeduc', 'speduc')),
    ((0, 9999), ('cohort', 'marcohrt')),
    ((0, 2, 9), ('phone',)),
    ((0, 3, 8, 9), ('owngun', 'pistol')),
)

# Maps each column name to the tuple of codes replaced with NaN.
INVALID_CODES = {
    column: codes
    for codes, columns in _CODE_GROUPS
    for column in columns
}


def replace_invalid(df):
    """Replaces invalid codes with NaN, modifying df in place.

    For every column listed in INVALID_CODES, the codes given there are
    replaced with np.nan. Each cleaned column is assigned back to the
    DataFrame rather than using replace(inplace=True) on a Series taken
    from it: that chained form does not update the DataFrame when pandas
    Copy-on-Write is active.

    df: DataFrame containing every column named in INVALID_CODES

    returns: None

    raises: KeyError if any expected column is missing; nothing is
            modified in that case
    """
    missing = [column for column in INVALID_CODES if column not in df.columns]
    if missing:
        raise KeyError('columns missing from DataFrame: {}'.format(missing))

    for column, codes in INVALID_CODES.items():
        df[column] = df[column].replace(list(codes), np.nan)

# Recode the invalid codes in the loaded data as NaN (updates gss itself;
# nothing is returned).
replace_invalid(gss)

In [6]:
import utils

# Seed NumPy's global random number generator before resampling.
# NOTE(review): presumably utils.resample_by_year draws from np.random, so
# the fixed seed makes the sample reproducible -- confirm in utils.
np.random.seed(19)
# 'wtssall' is passed as the second argument; presumably it names the
# sampling-weight column -- verify against utils.resample_by_year.
sample = utils.resample_by_year(gss, 'wtssall')

In [7]:
# Delete any existing output file (rm reports an error if there is none),
# then store the resampled frame in eds.gss.hdf5 under the key 'gss'.
!rm eds.gss.hdf5
sample.to_hdf('eds.gss.hdf5', 'gss')

rm: cannot remove 'eds.gss.hdf5': No such file or directory


In [9]:
# Time reading the stored frame back from the HDF5 file and check its shape.
# Note that this rebinds gss to the resampled data that was written above.
%time gss = pd.read_hdf('eds.gss.hdf5', 'gss')
gss.shape

CPU times: user 12 ms, sys: 32 ms, total: 44 ms
Wall time: 44.7 ms


(64814, 105)