In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils import value_counts, decorate


## Data

Monitoring the Future: A Continuing Study of American Youth (8th- and 10th-Grade Surveys), 2022:  (ICPSR 38883)

https://www.icpsr.umich.edu/web/NAHDAP/studies/38883#

In [3]:
from glob import glob

filenames = glob('data/ICPSR*.zip')
filenames

['data/ICPSR_02475-V1.zip',
 'data/ICPSR_35166-V2.zip',
 'data/ICPSR_37183-V1.zip',
 'data/ICPSR_38502-V1.zip',
 'data/ICPSR_20180-V2.zip',
 'data/ICPSR_37415-V1.zip',
 'data/ICPSR_34574-V2.zip',
 'data/ICPSR_02523-V1.zip',
 'data/ICPSR_02390-V1.zip',
 'data/ICPSR_02752-V2.zip',
 'data/ICPSR_03752-V2.zip',
 'data/ICPSR_38189-V1.zip',
 'data/ICPSR_33902-V1.zip',
 'data/ICPSR_02350-V2.zip',
 'data/ICPSR_04537-V2.zip',
 'data/ICPSR_22500-V1.zip',
 'data/ICPSR_30984-V1.zip',
 'data/ICPSR_36149-V1.zip',
 'data/ICPSR_38883-V1.zip',
 'data/ICPSR_36407-V1.zip',
 'data/ICPSR_02940-V1.zip',
 'data/ICPSR_04263-V2.zip',
 'data/ICPSR_04018-V2.zip',
 'data/ICPSR_03426-V1.zip',
 'data/ICPSR_02521-V2.zip',
 'data/ICPSR_02522-V2.zip',
 'data/ICPSR_25422-V2.zip',
 'data/ICPSR_36799-V1.zip',
 'data/ICPSR_02476-V1.zip',
 'data/ICPSR_37842-V1.zip',
 'data/ICPSR_28402-V1.zip',
 'data/ICPSR_39171-V1.zip',
 'data/ICPSR_03183-V1.zip']

In [4]:
import zipfile


def read_dta_from_zip(zip_filename, index=0):
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        # Get the list of files and find the first .dta file
        file_list = zip_ref.namelist()
        dta_files = [f for f in file_list if f.lower().endswith('.dta')]
        if len(dta_files) == 0:
            print(file_list)
            raise FileNotFoundError("No .dta file found in the ZIP archive.")

        # Read the .dta file into a DataFrame
        stata_path = dta_files[index]
        with zip_ref.open(stata_path) as dta_file:
            df = pd.read_stata(dta_file, convert_categoricals=False)

    return df

In [5]:
import zipfile
import pandas as pd
from functools import reduce

def read_all_dta_from_zip(zip_filename):
    dfs = []
    
    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        dta_files = [f for f in zip_ref.namelist() if f.lower().endswith('.dta')]

        if not dta_files:
            raise FileNotFoundError("No .dta files found in the ZIP archive.")

        for path in dta_files:
            with zip_ref.open(path) as dta_file:
                df = pd.read_stata(dta_file, convert_categoricals=False)
                dfs.append(df)
                
    return dfs

    # Merge all dataframes on 'CASEID'
    merged_df = reduce(lambda left, right: pd.merge(left, right, on="CASEID", how="outer"), 
                       dfs)
    return merged_df


In [6]:
def get_year_to_filename(filenames, index=0):
    year_to_filename = {}

    for zip_filename in filenames:
        print(zip_filename)
        df = read_dta_from_zip(zip_filename, index=index)

        years = df['V1'].value_counts()
        if len(years) != 1:
            raise ValueError(f"Unexpected number of unique years in V1 for {zip_filename}: {len(years)}")

        year = int(years.index[0])
        if year < 1900:
            year += 1900
        print(year)
        year_to_filename[year] = zip_filename

    return year_to_filename


In [7]:
ZIPFILE = {
    1991: 'data/ICPSR_02521-V2.zip',
    1992: 'data/ICPSR_02522-V2.zip',
    1993: 'data/ICPSR_02523-V1.zip',
    191994: 'data/ICPSR_02475-V1.zip',
    1995: 'data/ICPSR_02390-V1.zip',
    1996: 'data/ICPSR_02350-V2.zip',
    1997: 'data/ICPSR_02476-V1.zip',
    1998: 'data/ICPSR_02752-V2.zip',
    1999: 'data/ICPSR_02940-V1.zip',
    2000: 'data/ICPSR_03183-V1.zip',
    2001: 'data/ICPSR_03426-V1.zip',
    2002: 'data/ICPSR_03752-V2.zip',
    2003: 'data/ICPSR_04018-V2.zip',
    2004: 'data/ICPSR_04263-V2.zip',
    2005: 'data/ICPSR_04537-V2.zip',
    2006: 'data/ICPSR_20180-V2.zip',
    2007: 'data/ICPSR_22500-V1.zip',
    2008: 'data/ICPSR_25422-V2.zip',
    2009: 'data/ICPSR_28402-V1.zip',
    2010: 'data/ICPSR_30984-V1.zip',
    2011: 'data/ICPSR_33902-V1.zip',
    2012: 'data/ICPSR_34574-V2.zip',
    2013: 'data/ICPSR_35166-V2.zip',
    2014: 'data/ICPSR_36149-V1.zip',
    2015: 'data/ICPSR_36407-V1.zip',
    2016: 'data/ICPSR_36799-V1.zip',
    2017: 'data/ICPSR_37183-V1.zip',
    2018: 'data/ICPSR_37415-V1.zip',
    2019: 'data/ICPSR_37842-V1.zip',
    2020: 'data/ICPSR_38189-V1.zip',
    2021: 'data/ICPSR_38502-V1.zip',
    2022: 'data/ICPSR_38883-V1.zip',
    2023: 'data/ICPSR_39171-V1.zip'
}

In [8]:
year = 2021
zip_filename = ZIPFILE[year]
zip_filename

df0 = read_dta_from_zip(zip_filename, index=0)


In [9]:
from collections import defaultdict

def make_defaultdict(d, default_value):
    return defaultdict(lambda: default_value, d)

SPLIT = {
    1994: True,
}
SPLIT = make_defaultdict(SPLIT, False)

In [10]:
if SPLIT[year]:
    df2 = read_dta_from_zip(zip_filename, index=2)
    df = pd.concat([df_8, df_10], ignore_index=True)
else:
    df = df0

In [11]:
df.head()

Unnamed: 0,AI_08,AI_10,V1,V3,V5,V501,V507,V508,V509,SURVEY_VERSION,...,V7099D,V7133D,V7134D,V7135D,V7142D,V7143D,V7144D,V7139D,V7140D,V7141D
0,10001.0,,2021,1,1.144673,8,3,0,1,3,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8
1,10002.0,,2021,1,0.676939,8,2,1,1,4,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8
2,10003.0,,2021,1,0.606877,8,1,0,0,3,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8
3,10004.0,,2021,1,0.601276,8,1,0,0,3,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8
4,10005.0,,2021,1,1.004319,8,3,0,1,3,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8


In [12]:
df.shape

(23238, 695)

In [13]:
df.head()

Unnamed: 0,AI_08,AI_10,V1,V3,V5,V501,V507,V508,V509,SURVEY_VERSION,...,V7099D,V7133D,V7134D,V7135D,V7142D,V7143D,V7144D,V7139D,V7140D,V7141D
0,10001.0,,2021,1,1.144673,8,3,0,1,3,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8
1,10002.0,,2021,1,0.676939,8,2,1,1,4,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8
2,10003.0,,2021,1,0.606877,8,1,0,0,3,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8
3,10004.0,,2021,1,0.601276,8,1,0,0,3,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8
4,10005.0,,2021,1,1.004319,8,3,0,1,3,...,-9,-9,-9,-9,-8,-8,-8,-8,-8,-8


In [14]:
for column in df.columns:
    print(column)

AI_08
AI_10
V1
V3
V5
V501
V507
V508
V509
SURVEY_VERSION
V545
V548
V7101
V7104
V7105
V7112
V7115
V7118
V7127
V7097
V7133
V7139
V7142
V8451
V7426
V7121
V7124
V7164
V7145
V7109
V7152
V7155
V7158
V7161
V7601
V8480
V7648
V7780
V7783
V7786
V7693
V7794
V7831
V7106
V7113
V7116
V7119
V7128
V7098
V7134
V7140
V7143
V8452
V7122
V7125
V7165
V7146
V7110
V7153
V7156
V7159
V7162
V7602
V7488
V7489
V7491
V7492
V7814
V8481
V7495
V7554
V7561
V7564
V7566
V7568
V7569
V7694
V7819
V7781
V7784
V7787
V7795
V7832
V7102
V7107
V7114
V7117
V7120
V7129
V7099
V7135
V7141
V7144
V8453
V7427
V7123
V7126
V7166
V7147
V7111
V7154
V7157
V7160
V7163
V7603
V8482
V7615
V7616
V7617
V7642
V7695
V7669
V7782
V7785
V7788
V7796
V7833
V7108
V7641
V8454
V7731
V7442
V7441
V8413
V7443
V7444
V7445
V8417
V8418
V8419
V8483
V8421
V7446
V7447
V8424
V8425
V7448
V7751
V7449
V8564
V7548
V7789
V7103
V7180
V7181
V7475
V7476
V7477
V7478
V7479
V7480
V7790
V7829
V7830
V7547
V7549
V7550
V7643
V7176
V7587
V7724
V7885
V7791
V7792
V7793
V7834
V7835
V783

In [15]:
WEIGHT = {
    1994: 'V5',
    2022: 'V5'
}
WEIGHT = make_defaultdict(WEIGHT, 'V5')

In [16]:
df[WEIGHT[year]].describe()

count    23238.000000
mean         1.000000
std          0.720223
min          0.125902
25%          0.536388
50%          0.753045
75%          1.345625
max          8.817833
Name: V5, dtype: float64

In [17]:
GRADE = {
    1994: 'V1101',
    2022: 'V501'
}
GRADE = make_defaultdict(GRADE, 'V501')

In [18]:
df[GRADE[year]] = df[GRADE[year]].replace([0, 9], np.nan)
# TODO Replace code 2 with grade 8
# TODO Replace code 4 with grade 10

value_counts(df[GRADE[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
8,11446
10,11792


In [19]:
"""
V7341: 07970:D05 MN=ACHV/WMN=HOME F2
Item number: 07970
How much do you agree or disagree with each statement below?
It is usually better for everyone involved if the man is the achiever 
outside the home and the woman takes care of
the home and family
1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"
Responses from the Western region intentionally deleted.
"""

'\nV7341: 07970:D05 MN=ACHV/WMN=HOME F2\nItem number: 07970\nHow much do you agree or disagree with each statement below?\nIt is usually better for everyone involved if the man is the achiever \noutside the home and the woman takes care of\nthe home and family\n1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"\nResponses from the Western region intentionally deleted.\n'

In [20]:
FEFAM = {
    1994: 'V1141',
    2022: 'V7341'
}
FEFAM = make_defaultdict(FEFAM, 'V7341')

In [21]:
df[FEFAM[year]] = df[FEFAM[year]].replace([0, 9, -8, -9], np.nan)
value_counts(df[FEFAM[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,2210
2.0,917
3.0,1324
4.0,745
5.0,551
,17491


In [22]:
def set_target(df, varname, values, newname):
    valid = df[varname].notna()
    df[newname] = np.where(valid, df[varname].isin(values), np.nan)

In [23]:
set_target(df, FEFAM[year], [1, 2], 'fefam')
df['fefam'].mean()

np.float64(0.5441099704193493)

In [24]:
"""V7339: 07930:D06 MEN+WOMN/=$,=WRK F2
Item number: 07930
The next questions ask your opinions about a number of different topics. How much do you agree or disagree with
each statement below?
Men and women should be paid the same money if they do the same work
1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"
"""

'V7339: 07930:D06 MEN+WOMN/=$,=WRK F2\nItem number: 07930\nThe next questions ask your opinions about a number of different topics. How much do you agree or disagree with\neach statement below?\nMen and women should be paid the same money if they do the same work\n1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"\n'

In [25]:
FEWORK = {
    1994: 'V1139',
    2022: 'V7339'
}
FEWORK = make_defaultdict(FEWORK, 'V7339')

In [26]:
df[FEWORK[year]] = df[FEWORK[year]].replace([0, 9, -8, -9], np.nan)
value_counts(df[FEWORK[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,230
2.0,182
3.0,359
4.0,1095
5.0,5565
,15807


In [27]:
set_target(df, FEWORK[year], [4, 5], 'fework')
value_counts(df['fework'])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
0.0,771
1.0,6660
,15807


In [28]:
df['fework'].mean()

np.float64(0.8962454582155833)

In [29]:
"""
V7340: 07950:D06 WMN SHD =JOB OPP F2
Item number: 07950
How much do you agree or disagree with each statement below?
A woman should have exactly the same job opportunities as a man
1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"
""";

In [30]:
FEJOB = {
    1994: 'V1140',
    2022: 'V7340'
}
FEJOB = make_defaultdict(FEJOB, 'V7340')

In [31]:
df[FEJOB[year]] = df[FEJOB[year]].replace([0, 9, -8, -9], np.nan)
value_counts(df[FEJOB[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,316
2.0,313
3.0,460
4.0,1397
5.0,4930
,15822


In [32]:
set_target(df, FEJOB[year], [4, 5], 'fejob')
value_counts(df['fejob'])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
0.0,1089
1.0,6327
,15822


In [33]:
df['fejob'].mean()

np.float64(0.8531553398058253)

In [34]:
"""
V7202: 00030:R01 R'S SEX F1234
Item number: 00030
What is your sex?
- 382 -1="Male" 2="Female" 3="Other" 4="Prefer not to answer"
"""

'\nV7202: 00030:R01 R\'S SEX F1234\nItem number: 00030\nWhat is your sex?\n- 382 -1="Male" 2="Female" 3="Other" 4="Prefer not to answer"\n'

In [35]:
GENDER = {
    1994: 'V1227',
    2022: 'V7202'
}
GENDER = make_defaultdict(GENDER, 'V7202')

In [36]:
df[GENDER[year]] = df[GENDER[year]].replace([0, 9, -9], np.nan)
value_counts(df[GENDER[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,10561
2.0,9906
3.0,1060
,1711


In [37]:
df.groupby(GENDER[year])[['fejob', 'fework', 'fefam']].mean()

Unnamed: 0_level_0,fejob,fework,fefam
V7202,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0.775533,0.84399,0.384305
2.0,0.937841,0.95614,0.699415
3.0,0.885387,0.894886,0.72973


In [38]:
df.groupby([GRADE[year], GENDER[year]])[['fejob', 'fework', 'fefam']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,fejob,fework,fefam
V501,V7202,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8,1.0,0.745986,0.829226,0.379536
8,2.0,0.937539,0.944513,0.690063
8,3.0,0.909574,0.904762,0.735294
10,1.0,0.80463,0.858512,0.389177
10,2.0,0.938132,0.96734,0.708558
10,3.0,0.857143,0.883436,0.723577


In [39]:
"""
V1070: 89501:R01 RACE--B/W/H F1234
Item Number: 00041-49
Recoded from the following: Q. C04 (00041-00049): "How do you describe yourself? (Select one or more responses.)
A.Black or African American; B. Mexican American or Chicano; C. Cuban American; D. Puerto Rican;
E. Other Hispanic or Latino; F. Asian American; G. White (Caucasian); H. American Indian or Alaska Native;
I. Native Hawaiian or Other Pacific Islander."
1="Black or African American" , 2="White (Caucasian)",
3=Hispanic ("Mexican . . . " or "Cuban . . . " or "Puerto Rican" or "Other Hispanic . . . ").
All other responses, including those of respondents who fell into more than one of the three recoded categories,
were recoded to missing data.
"""

'\nV1070: 89501:R01 RACE--B/W/H F1234\nItem Number: 00041-49\nRecoded from the following: Q. C04 (00041-00049): "How do you describe yourself? (Select one or more responses.)\nA.Black or African American; B. Mexican American or Chicano; C. Cuban American; D. Puerto Rican;\nE. Other Hispanic or Latino; F. Asian American; G. White (Caucasian); H. American Indian or Alaska Native;\nI. Native Hawaiian or Other Pacific Islander."\n1="Black or African American" , 2="White (Caucasian)",\n3=Hispanic ("Mexican . . . " or "Cuban . . . " or "Puerto Rican" or "Other Hispanic . . . ").\nAll other responses, including those of respondents who fell into more than one of the three recoded categories,\nwere recoded to missing data.\n'

In [40]:
"""
"""

'\n'

In [41]:
"""
"""

'\n'

In [42]:
"""
"""

'\n'

In [43]:
"""
"""

'\n'

In [44]:
"""
"""

'\n'

In [45]:
"""
"""

'\n'

In [46]:
"""
"""

'\n'

In [47]:
"""
"""

'\n'