# Exploratory Data Analysis

Load and validate GSS data

Allen Downey

[MIT License](https://en.wikipedia.org/wiki/MIT_License)

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='white')

import utils
from utils import decorate
from distribution import Pmf, Cdf

In [2]:
def underride(d, **options):
    """Fill in defaults: copy each option into d unless d already has that key.

    d: dictionary (updated in place)
    options: keyword args to add to d

    returns: the same dictionary d
    """
    # keep only the options whose key is absent, then add them in one step
    missing = {key: val for key, val in options.items() if key not in d}
    d.update(missing)
    return d

## Loading and validation


In [3]:
import re

class FixedWidthVariables(object):
    """Represents a set of variables in a fixed width file."""

    def __init__(self, variables, index_base=0):
        """Initializes.

        variables: DataFrame with (at least) 'start', 'end' and 'name' columns
        index_base: are the indices 0 or 1 based?

        Attributes:
        variables: the DataFrame passed in
        colspecs: list of [start, end] pairs of int, after subtracting index_base
        names: Series of string variable names
        """
        self.variables = variables

        # subtract index_base so 1-based column positions become 0-based
        colspecs = variables[['start', 'end']] - index_base

        # convert colspecs to a list of pairs of int; the builtin `int` is used
        # because the `np.int` alias was removed in NumPy 1.24
        self.colspecs = colspecs.astype(int).values.tolist()
        self.names = variables['name']

    def ReadFixedWidth(self, filename, **options):
        """Reads a fixed width ASCII file.

        filename: string filename (or anything else pd.read_fwf accepts)
        options: additional keyword args passed to pd.read_fwf

        returns: DataFrame
        """
        # hand read_fwf a plain list of names rather than a Series
        df = pd.read_fwf(filename,
                         colspecs=self.colspecs,
                         names=list(self.names),
                         **options)
        return df


def ReadStataDct(dct_file, **options):
    """Builds fixed-width column specifications from a Stata dictionary file.

    Only lines containing `_column(N)` are used. On such a line, N is the
    start position and whitespace-separated tokens 1-3 are the type, the
    variable name and the format string; the remaining tokens form the
    description.

    dct_file: string filename
    options: dict of options passed to open()

    returns: FixedWidthVariables object
    """
    stata_types = {
        'byte': int, 'int': int, 'long': int,
        'float': float, 'double': float, 'numeric': float,
    }

    records = []
    with open(dct_file, **options) as dct:
        for line in dct:
            found = re.search(r'_column\(([^)]*)\)', line)
            if found is None:
                continue

            start = int(found.group(1))
            tokens = line.split()
            type_name, raw_name, fstring = tokens[1:4]

            # any type starting with 'str' is a string; others must be in the map
            vtype = str if type_name.startswith('str') else stata_types[type_name]
            long_desc = ' '.join(tokens[4:]).strip('"')
            records.append((start, vtype, raw_name.lower(), fstring, long_desc))

    variables = pd.DataFrame(
        records, columns=['start', 'type', 'name', 'fstring', 'desc'])

    # each field ends where the next one starts; the last field has no
    # successor, so its end is set to 0
    variables['end'] = variables['start'].shift(-1)
    last_row = len(variables) - 1
    variables.loc[last_row, 'end'] = 0

    return FixedWidthVariables(variables, index_base=1)

def read_gss(dirname):
    """Loads the GSS extract stored in a directory.

    Expects `GSS.dct` (Stata dictionary) and `GSS.dat.gz` (gzipped
    fixed-width data) inside dirname.

    dirname: string

    returns: DataFrame
    """
    dict_path = '/'.join((dirname, 'GSS.dct'))
    data_path = '/'.join((dirname, 'GSS.dat.gz'))

    fixed_width = ReadStataDct(dict_path)
    return fixed_width.ReadFixedWidth(data_path, compression='gzip')

In [4]:
# Load the raw GSS extract from the `gss_eda` directory (GSS.dct + GSS.dat.gz),
# then show its (rows, columns) shape and the first few rows as a sanity check
gss = read_gss('gss_eda')
print(gss.shape)
gss.head()

(62466, 101)


Unnamed: 0,year,id_,agewed,divorce,sibs,childs,age,educ,paeduc,maeduc,...,memchurh,realinc,cohort,marcohrt,ballot,wtssall,adults,compuse,databank,wtssnr
0,1972,1,0,0,3,0,23,16,10,97,...,0,18951.0,1949,0,0,0.4446,1,0,0,1.0
1,1972,2,21,2,4,5,70,10,8,8,...,0,24366.0,1902,1923,0,0.8893,2,0,0,1.0
2,1972,3,20,2,5,4,48,12,8,8,...,0,24366.0,1924,1944,0,0.8893,2,0,0,1.0
3,1972,4,24,2,5,0,27,17,16,12,...,0,30458.0,1945,1969,0,0.8893,2,0,0,1.0
4,1972,5,22,2,2,2,61,12,8,8,...,0,50763.0,1911,1933,0,0.8893,2,0,0,1.0


In [5]:
# Distribution of the `adults` variable among the 2016 respondents, ordered by value
adults_2016 = gss.loc[gss['year'] == 2016, 'adults']
adults_2016.value_counts().sort_index()

1     994
2    1436
3     317
4      88
5      23
6       6
8       2
9       1
Name: adults, dtype: int64

In [6]:
def replace_invalid(df):
    """Replaces the codes that mark invalid or missing answers with NaN.

    Modifies df in place.

    df: DataFrame with columns realinc, educ, age, cohort and adults

    returns: None
    """
    invalid_codes = {
        'realinc': [0],
        'educ': [98, 99],
        # 89 means 89 or older
        'age': [98, 99],
        'cohort': [9999],
        'adults': [9],
    }
    for column, codes in invalid_codes.items():
        # Assign the result back to the frame. Calling
        # `df.<column>.replace(..., inplace=True)` acts on an intermediate
        # Series, which leaves df unchanged under pandas Copy-on-Write.
        df[column] = df[column].replace(codes, np.nan)

# Clean `gss` in place; the function returns None, so this cell displays nothing
replace_invalid(gss)

The proportion of women in this dataset is slightly higher than it probably is in the population, even after weighting.

The issue seems to be that the GSS excludes people living in institutions, including prisons and army housing, which disproportionately excludes men.

In [10]:
# Sex codes of the 2010 respondents only; the cells below compare weighting schemes on them
sex = gss.loc[gss.year==2010, 'sex']

In [11]:
# Unweighted share of each sex code among the 2010 respondents
pmf = Pmf([1,2])
for code in [1, 2]:
    pmf[code] = (sex == code).sum()
pmf.normalize()
pmf

1    0.43591
2    0.56409
Name: Pmf, dtype: float64

In [12]:
# Share of each sex code among the 2010 respondents, weighted by `wtssall`.
# NOTE(review): `sex` holds only 2010 rows while `gss.wtssall` holds every year;
# the product aligns on the index, and rows outside 2010 presumably drop out
# as NaN when summed. Confirm.
pmf = Pmf([1,2])
for code in [1, 2]:
    pmf[code] = ((sex == code) * gss.wtssall).sum()
pmf.normalize()
pmf

1    0.451634
2    0.548366
Name: Pmf, dtype: float64

In [13]:
# Same comparison using the `wtssnr` weights instead of `wtssall`
pmf = Pmf([1,2])
for code in [1, 2]:
    pmf[code] = ((sex == code) * gss.wtssnr).sum()
pmf.normalize()
pmf

1    0.453784
2    0.546216
Name: Pmf, dtype: float64

In [14]:
# Weight each 2010 respondent by `wtssall` times the `adults` value
# (presumably to account for household size; TODO confirm the rationale)
pmf = Pmf([1,2])
for code in [1, 2]:
    pmf[code] = ((sex == code) * gss.wtssall * gss.adults).sum()
pmf.normalize()
pmf

1    0.463868
2    0.536132
Name: Pmf, dtype: float64

In [15]:
# Reference proportions to compare the weighted GSS shares against.
# NOTE(review): these look like population counts for sex codes 1 and 2 from
# an external source; the source is not recorded here. TODO cite it.
reference_counts = {1: 114173831, 2: 121043794}

pmf = Pmf([1,2])
for code, count in reference_counts.items():
    pmf[code] = count
pmf.normalize()
pmf

1    0.485397
2    0.514603
Name: Pmf, dtype: float64

In [16]:
# Build `wtsample` from `wtssall`, scaling up respondents with sex == 1.
# The factor 1.145 brings the weighted 2010 share of code 1 to about 0.485,
# in line with the reference proportions computed above.
sex_is_1 = gss['sex'] == 1
gss['wtsample'] = gss['wtssall'].where(~sex_is_1, gss['wtssall'] * 1.145)

In [17]:
# Check the adjusted weights: share of each sex code in 2010, weighted by `wtsample`
pmf = Pmf([1,2])
for code in [1, 2]:
    pmf[code] = ((sex == code) * gss.wtsample).sum()
pmf.normalize()
pmf

1    0.485338
2    0.514662
Name: Pmf, dtype: float64

In [18]:
# Columns kept for the resampled dataset
variables = [
    'year', 'age', 'cohort', 'sex', 'race',
    'educ', 'realinc', 'adults', 'wtssall',
]

subset = gss.loc[:, variables]
subset.head()

Unnamed: 0,year,age,cohort,sex,race,educ,realinc,adults,wtssall
0,1972,23.0,1949.0,2,1,16.0,18951.0,1.0,0.4446
1,1972,70.0,1902.0,1,1,10.0,24366.0,2.0,0.8893
2,1972,48.0,1924.0,2,1,12.0,24366.0,2.0,0.8893
3,1972,27.0,1945.0,2,1,17.0,30458.0,2.0,0.8893
4,1972,61.0,1911.0,2,1,12.0,50763.0,2.0,0.8893


In [19]:
# drop the 65 respondents with unknown household size
# subset = subset.dropna(subset=['adults'])

In [20]:
# Fix the seed so the resampling below gives the same result on every run.
# NOTE(review): assumes utils.resample_by_year draws from NumPy's global RNG. Confirm.
np.random.seed(19)
# Resample `subset` using `wtssall` as the weight column
# (presumably within each survey year; see utils.resample_by_year)
sample = utils.resample_by_year(subset, 'wtssall')

In [21]:
# Write the resampled data, replacing any previous file. mode='w' truncates an
# existing gss.hdf5, so no shell `rm` is needed; `rm` is not portable and
# prints an error when the file does not exist yet.
sample.to_hdf('gss.hdf5', key='gss', mode='w')

In [22]:
# Read the resampled data back and time the load.
# Note: this rebinds `gss` to the resampled frame written above, so from here
# on `gss` no longer refers to the raw frame returned by read_gss.
%time gss = pd.read_hdf('gss.hdf5', 'gss')
gss.shape

CPU times: user 12 ms, sys: 4 ms, total: 16 ms
Wall time: 17.1 ms


(62466, 9)

In [23]:
# First rows of the resampled frame read back from gss.hdf5
gss.head()

Unnamed: 0,year,age,cohort,sex,race,educ,realinc,adults,wtssall
0,1972,26.0,1946.0,1,1,18.0,13537.0,2.0,0.8893
1,1972,38.0,1934.0,2,1,12.0,18951.0,1.0,0.4446
2,1972,57.0,1915.0,1,1,12.0,30458.0,3.0,1.3339
3,1972,61.0,1911.0,2,1,14.0,37226.0,2.0,0.8893
4,1972,59.0,1913.0,1,1,12.0,30458.0,2.0,0.8893


In [24]:
# Validate `year`: check the count and the min/max range of survey years
gss['year'].describe()

count    62466.000000
mean      1994.072359
std         12.937941
min       1972.000000
25%       1984.000000
50%       1994.000000
75%       2006.000000
max       2016.000000
Name: year, dtype: float64

In [25]:
# Validate `sex`: values should be limited to the codes 1 and 2
gss['sex'].describe()

count    62466.000000
mean         1.541415
std          0.498286
min          1.000000
25%          1.000000
50%          2.000000
75%          2.000000
max          2.000000
Name: sex, dtype: float64

In [26]:
# Validate `age`: codes 98 and 99 were replaced with NaN by replace_invalid,
# and 89 stands for "89 or older", so the maximum should not exceed 89
gss['age'].describe()

count    62281.000000
mean        44.648320
std         17.072244
min         18.000000
25%         30.000000
50%         43.000000
75%         57.000000
max         89.000000
Name: age, dtype: float64

In [27]:
# Validate `cohort`: the code 9999 was replaced with NaN by replace_invalid
gss['cohort'].describe()

count    62282.000000
mean      1949.429996
std         20.734302
min       1883.000000
25%       1935.000000
50%       1951.000000
75%       1964.000000
max       1998.000000
Name: cohort, dtype: float64

In [28]:
# Validate `race`: check the count and the range of category codes
gss['race'].describe()

count    62466.000000
mean         1.254955
std          0.554694
min          1.000000
25%          1.000000
50%          1.000000
75%          1.000000
max          3.000000
Name: race, dtype: float64

In [29]:
# Validate `educ`: codes 98 and 99 were replaced with NaN by replace_invalid
gss['educ'].describe()

count    62304.000000
mean        12.831311
std          3.117027
min          0.000000
25%         12.000000
50%         12.000000
75%         15.000000
max         20.000000
Name: educ, dtype: float64

In [30]:
# Validate `realinc`: the value 0 was replaced with NaN by replace_invalid,
# so the minimum should be positive
gss['realinc'].describe()

count     55499.000000
mean      34702.430164
std       30665.659411
min         234.000000
25%       13750.000000
50%       26015.000000
75%       43426.000000
max      162607.000000
Name: realinc, dtype: float64

In [31]:
# Validate `wtssall`: the sampling weights should have no missing values
gss['wtssall'].describe()

count    62466.000000
mean         1.213340
std          0.585544
min          0.411898
25%          0.918400
50%          1.062100
75%          1.515500
max          8.739876
Name: wtssall, dtype: float64