In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils import value_counts, decorate


## Data

Monitoring the Future: A Continuing Study of American Youth (8th- and 10th-Grade Surveys), 2022 (ICPSR 38883)

https://www.icpsr.umich.edu/web/NAHDAP/studies/38883#

In [3]:
import zipfile


def read_dta_from_zip(zip_filename):
    """Load the first Stata (.dta) member of a ZIP archive as a DataFrame.

    Args:
        zip_filename: Path of the ZIP archive to open.

    Returns:
        pandas.DataFrame read from the first archive member whose name ends
        in ".dta" (case-insensitive). The file is read with
        `convert_categoricals=False`, so raw codes are kept rather than
        being converted to labelled categoricals.

    Raises:
        FileNotFoundError: If no member of the archive ends in ".dta".
    """
    with zipfile.ZipFile(zip_filename, 'r') as archive:
        # Members are scanned in archive order; the first match wins.
        for member in archive.namelist():
            if member.lower().endswith('.dta'):
                break
        else:
            raise FileNotFoundError("No .dta file found in the ZIP archive.")

        # Parse while the archive is still open, then hand the frame back.
        with archive.open(member) as stata_file:
            return pd.read_stata(stata_file, convert_categoricals=False)

In [4]:
# Archive for ICPSR study 38883 (see the Data section above); the relative
# path is resolved against the notebook's working directory.
zip_filename = 'ICPSR_38883-V1.zip'
# Read the first .dta member of the archive into a DataFrame.
df = read_dta_from_zip(zip_filename)

In [5]:
# (rows, columns) of the loaded survey file.
df.shape

(21839, 663)

In [6]:
# Print every column name, one per line.
# NOTE(review): this emits one line per column (663 according to df.shape);
# consider a shorter display if the full listing is not needed.
for column in df.columns:
    print(column)

AI_08
AI_10
V1
V3
V5
V501
V507
V508
V509
V545
V548
V549
V550
V7101
V7105
V7112
V7115
V7118
V7127
V7097
V7133
V7136
V7139
V7142
V8451
V7426
V7121
V7124
V7164
V7145
V7109
V7158
V7161
V7601
V8480
V7648
V7780
V7783
V7786
V7693
V7831
V7106
V7113
V7116
V7119
V7128
V7098
V7134
V7137
V7140
V7143
V8452
V7122
V7125
V7165
V7146
V7110
V7159
V7162
V7602
V7488
V7489
V7491
V7492
V7814
V8481
V7495
V7554
V7561
V7564
V7566
V7568
V7569
V7694
V7819
V7781
V7784
V7787
V7832
V7102
V7107
V7114
V7117
V7120
V7129
V7099
V7135
V7138
V7141
V7144
V8453
V7427
V7123
V7126
V7166
V7147
V7111
V7160
V7163
V7603
V8482
V7615
V7616
V7617
V7642
V7695
V7669
V7782
V7785
V7788
V7833
V7108
V7641
V7442
V7441
V8413
V7443
V7444
V7445
V8417
V8418
V8419
V8483
V8421
V7446
V7447
V8424
V8425
V7448
V7751
V7449
V8564
V7548
V7789
V7103
V7180
V7181
V7475
V7476
V7477
V7478
V7479
V7480
V7790
V7829
V7830
V7547
V7549
V7550
V7643
V7176
V7587
V7724
V7885
V7791
V7793
V7834
V7835
V7836
V7837
V7838
V7839
V7840
V7841
V7842
V7884
V7557
V7558
V7428
V74

In [7]:
# Column used as the respondent's grade; the tabulation below shows only the
# values 8 and 10.
GRADE = 'V501'
# Tabulate with the value_counts helper imported from utils.
value_counts(df[GRADE])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
8,9889
10,11950


In [8]:
"""
V7341: 07970:D05 MN=ACHV/WMN=HOME F2
Item number: 07970
How much do you agree or disagree with each statement below?
It is usually better for everyone involved if the man is the achiever 
outside the home and the woman takes care of
the home and family
1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"
Responses from the Western region intentionally deleted.
"""

'\nV7341: 07970:D05 MN=ACHV/WMN=HOME F2\nItem number: 07970\nHow much do you agree or disagree with each statement below?\nIt is usually better for everyone involved if the man is the achiever \noutside the home and the woman takes care of\nthe home and family\n1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"\nResponses from the Western region intentionally deleted.\n'

In [9]:
# V7341: agreement with "better ... if the man is the achiever outside the
# home and the woman takes care of the home and family"
# (1="Disagree" ... 5="Agree"), per the codebook excerpt in the previous cell.
FEFAM = 'V7341'
# Recode -8 and -9 to NaN before tabulating.
# NOTE(review): presumably these are the survey's missing-data codes — confirm
# against the codebook.
df[FEFAM] = df[FEFAM].replace([-8, -9], np.nan)
value_counts(df[FEFAM])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,1981
2.0,968
3.0,1360
4.0,786
5.0,541
,16203


In [10]:
def set_target(df, varname, values, newname):
    """Add a 0/1/NaN indicator column derived from `df[varname]`.

    The new column `df[newname]` is 1.0 where `df[varname]` is one of
    `values`, 0.0 where it holds any other non-missing value, and NaN where
    `df[varname]` is missing.

    Args:
        df: DataFrame to modify in place.
        varname: Name of the existing source column.
        values: Collection of source values that count as a hit.
        newname: Name of the column to create or overwrite.

    Returns:
        None. The result is written to `df[newname]`.
    """
    source = df[varname]
    is_hit = source.isin(values)
    # Missing responses stay missing instead of being counted as 0.
    df[newname] = np.where(source.isna(), np.nan, is_hit)

In [11]:
# Indicator column 'fefam': 1.0 for responses 1 ("Disagree") or 2 ("Mostly
# Disagree"), 0.0 for other valid responses, NaN where the response is missing.
set_target(df, FEFAM, [1, 2], 'fefam')
# Mean of the indicator over non-missing rows, i.e. the share of valid
# respondents who disagree with the statement.
df['fefam'].mean()

np.float64(0.5232434350603264)

In [12]:
"""V7339: 07930:D06 MEN+WOMN/=$,=WRK F2
Item number: 07930
The next questions ask your opinions about a number of different topics. How much do you agree or disagree with
each statement below?
Men and women should be paid the same money if they do the same work
1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"
"""

'V7339: 07930:D06 MEN+WOMN/=$,=WRK F2\nItem number: 07930\nThe next questions ask your opinions about a number of different topics. How much do you agree or disagree with\neach statement below?\nMen and women should be paid the same money if they do the same work\n1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"\n'

In [13]:
# V7339: agreement with "Men and women should be paid the same money if they
# do the same work" (1="Disagree" ... 5="Agree"), per the codebook excerpt in
# the previous cell.
FEWORK = 'V7339'
# Recode -8 and -9 to NaN before tabulating.
# NOTE(review): presumably these are the survey's missing-data codes — confirm
# against the codebook.
df[FEWORK] = df[FEWORK].replace([-8, -9], np.nan)
value_counts(df[FEWORK])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,244
2.0,170
3.0,371
4.0,1032
5.0,5041
,14981


In [14]:
# Indicator column 'fework': 1.0 for responses 4 ("Mostly Agree") or 5
# ("Agree"), 0.0 for other valid responses, NaN where the response is missing.
set_target(df, FEWORK, [4, 5], 'fework')
value_counts(df['fework'])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
0.0,785
1.0,6073
,14981


In [15]:
# Share of valid (non-missing) respondents coded 1.0 in 'fework'.
df['fework'].mean()

np.float64(0.8855351414406533)

In [16]:
"""
V7340: 07950:D06 WMN SHD =JOB OPP F2
Item number: 07950
How much do you agree or disagree with each statement below?
A woman should have exactly the same job opportunities as a man
1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"
""";

In [17]:
# V7340: agreement with "A woman should have exactly the same job
# opportunities as a man" (1="Disagree" ... 5="Agree"), per the codebook
# excerpt in the previous cell.
FEJOB = 'V7340'
# Recode -8 and -9 to NaN before tabulating.
# NOTE(review): presumably these are the survey's missing-data codes — confirm
# against the codebook.
df[FEJOB] = df[FEJOB].replace([-8, -9], np.nan)
value_counts(df[FEJOB])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,316
2.0,262
3.0,439
4.0,1318
5.0,4509
,14995


In [18]:
# Indicator column 'fejob': 1.0 for responses 4 ("Mostly Agree") or 5
# ("Agree"), 0.0 for other valid responses, NaN where the response is missing.
set_target(df, FEJOB, [4, 5], 'fejob')
value_counts(df['fejob'])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
0.0,1017
1.0,5827
,14995


In [19]:
# Share of valid (non-missing) respondents coded 1.0 in 'fejob'.
df['fejob'].mean()

np.float64(0.8514026884862653)

In [20]:
"""
V7202: 00030:R01 R'S SEX F1234
Item number: 00030
What is your sex?
1="Male" 2="Female" 3="Other" 4="Prefer not to answer"
"""

'\nV7202: 00030:R01 R\'S SEX F1234\nItem number: 00030\nWhat is your sex?\n1="Male" 2="Female" 3="Other" 4="Prefer not to answer"\n'

In [21]:
# V7202: "What is your sex?" (1="Male" 2="Female" 3="Other"
# 4="Prefer not to answer"), per the codebook excerpt in the previous cell.
GENDER = 'V7202'
# Recode -9 to NaN before tabulating.
# NOTE(review): the attitude items recode both -8 and -9, this one only -9.
# The counts displayed below sum to the full 21839 rows with no -8 present,
# so the result is the same on this file — confirm the codes against the
# codebook.
df[GENDER] = df[GENDER].replace(-9, np.nan)
value_counts(df[GENDER])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,9930
2.0,8997
3.0,407
4.0,910
,1595


In [22]:
# Mean of each indicator within each V7202 response group; rows whose V7202
# value is NaN do not form a group, and NaN indicator values are skipped by
# mean().
df.groupby(GENDER)[['fejob', 'fework', 'fefam']].mean()

Unnamed: 0_level_0,fejob,fework,fefam
V7202,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0.770482,0.828992,0.362346
2.0,0.941236,0.954284,0.683067
3.0,0.886792,0.869159,0.712644
4.0,0.888112,0.889273,0.670996


In [23]:
# Same indicator means, broken down by grade (V501) and then by V7202
# response group.
df.groupby([GRADE, GENDER])[['fejob', 'fework', 'fefam']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,fejob,fework,fefam
V501,V7202,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
8,1.0,0.753623,0.839421,0.342259
8,2.0,0.93483,0.948699,0.655979
8,3.0,0.9,0.901961,0.804878
8,4.0,0.921429,0.929577,0.622807
10,1.0,0.783978,0.82064,0.378543
10,2.0,0.946897,0.95921,0.707393
10,3.0,0.875,0.839286,0.630435
10,4.0,0.856164,0.85034,0.717949


In [24]:
"""
V1070: 89501:R01 RACE--B/W/H F1234
Item Number: 00041-49
Recoded from the following: Q. C04 (00041-00049): "How do you describe yourself? (Select one or more responses.)
A.Black or African American; B. Mexican American or Chicano; C. Cuban American; D. Puerto Rican;
E. Other Hispanic or Latino; F. Asian American; G. White (Caucasian); H. American Indian or Alaska Native;
I. Native Hawaiian or Other Pacific Islander."
1="Black or African American" , 2="White (Caucasian)",
3=Hispanic ("Mexican . . . " or "Cuban . . . " or "Puerto Rican" or "Other Hispanic . . . ").
All other responses, including those of respondents who fell into more than one of the three recoded categories,
were recoded to missing data.
"""

'\nV1070: 89501:R01 RACE--B/W/H F1234\nItem Number: 00041-49\nRecoded from the following: Q. C04 (00041-00049): "How do you describe yourself? (Select one or more responses.)\nA.Black or African American; B. Mexican American or Chicano; C. Cuban American; D. Puerto Rican;\nE. Other Hispanic or Latino; F. Asian American; G. White (Caucasian); H. American Indian or Alaska Native;\nI. Native Hawaiian or Other Pacific Islander."\n1="Black or African American" , 2="White (Caucasian)",\n3=Hispanic ("Mexican . . . " or "Cuban . . . " or "Puerto Rican" or "Other Hispanic . . . ").\nAll other responses, including those of respondents who fell into more than one of the three recoded categories,\nwere recoded to missing data.\n'

In [25]:
"""
"""

'\n'

In [26]:
"""
"""

'\n'

In [27]:
"""
"""

'\n'

In [28]:
"""
"""

'\n'

In [29]:
"""
"""

'\n'

In [30]:
"""
"""

'\n'

In [31]:
"""
"""

'\n'

In [32]:
"""
"""

'\n'