In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils import value_counts, decorate

In [3]:
from os.path import basename, exists

def download(url):
    """Fetch `url` into the current directory unless it is already there.

    The local file name is the last path component of `url`. When a file
    with that name already exists nothing is downloaded and nothing is
    printed; otherwise the file is retrieved and a confirmation line is
    printed. Returns None in both cases.
    """
    target = basename(url)
    if exists(target):
        return

    from urllib.request import urlretrieve
    saved_path, _headers = urlretrieve(url, target)
    print('Downloaded ' + saved_path)

## Use IRN mapping

In [4]:
# Fetch the IRN reference table into the working directory (skipped when the file already exists).
download('https://raw.githubusercontent.com/dawaldron/boys-gender-equality/refs/heads/main/data-in/MtF/irn_ref_edit.csv')

In [79]:
# Load the IRN reference table indexed by (year, irn) and preview the first rows.
irn_ref = pd.read_csv('irn_ref_edit.csv').set_index(['year', 'irn'])
irn_ref.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,ds,form,V4,varname
year,irn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1991,30,ICPSR_02521,1,8th,v1226
1991,30,ICPSR_02521,2,8th,v2226
1991,30,ICPSR_02521,3,8th,v1226
1991,30,ICPSR_02521,4,8th,v2226
1991,90,ICPSR_02521,1,8th,v1230


In [81]:
# Every reference row for year 1991, irn 30.
irn_ref.loc[1991, 30]

Unnamed: 0_level_0,Unnamed: 1_level_0,ds,form,V4,varname
year,irn,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1991,30,ICPSR_02521,1,8th,v1226
1991,30,ICPSR_02521,2,8th,v2226
1991,30,ICPSR_02521,3,8th,v1226
1991,30,ICPSR_02521,4,8th,v2226


In [62]:
# Survey year used by the exploratory cells in this section.
year = 1991

In [122]:
filename = 'irn_ref_edit.csv'
irn_ref = pd.read_csv(filename).set_index(['year', 'irn', 'form'])

# Add irn 5 rows with mapping info: for every year and every form, register
# the variable 'V5' under irn 5, reusing the `ds` and `V4` values of that
# year's first irn 30 row.
years = irn_ref.index.get_level_values('year').unique()
# The loop variable is deliberately not named `year`, so the notebook-level
# `year` chosen in an earlier cell is not overwritten by this loop.
for survey_year in years:
    ds = irn_ref.loc[(survey_year, 30), 'ds'].iloc[0]
    v4 = irn_ref.loc[(survey_year, 30), 'V4'].iloc[0]

    # The list of forms is read from the first irn listed for the year.
    irns = irn_ref.loc[survey_year].index.get_level_values('irn').unique()
    irn = irns[0]
    forms = irn_ref.loc[(survey_year, irn)].index.get_level_values('form').unique()

    for form in forms:
        irn_ref.loc[(survey_year, 5, form), ['ds', 'V4', 'varname']] = [ds, v4, 'V5']

    # New rows are appended at the end; re-sort so the index stays ordered
    # for the lookups of the next iteration.
    irn_ref = irn_ref.sort_index()

# Show the rows just added. (This used to display `irn_mapping`, a name that
# is not defined in the cells shown and raised NameError on a fresh kernel.)
irn_ref.xs(5, level='irn')

Unnamed: 0_level_0,Unnamed: 1_level_0,ds,V4,varname
year,form,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1991,1,ICPSR_02521,8th,V5
1991,2,ICPSR_02521,8th,V5
1991,3,ICPSR_02521,8th,V5
1991,4,ICPSR_02521,8th,V5
1992,1,ICPSR_02522,8th,V5
...,...,...,...,...
2019,1,ICPSR_37842,8th,V5
2020,1,ICPSR_38189,8th,V5
2021,1,ICPSR_38502,8th,V5
2022,1,ICPSR_38883,8th,V5


In [123]:
from mtf import get_irn_mapping

# Load the mapping through the project helper and show its irn 5 rows.
# NOTE(review): presumably equivalent to the table built by hand above -- confirm.
irn_ref = get_irn_mapping()
irn_ref.xs(5, level='irn')

Unnamed: 0_level_0,Unnamed: 1_level_0,ds,V4,varname
year,form,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1991,1,ICPSR_02521,8th,V5
1991,2,ICPSR_02521,8th,V5
1991,3,ICPSR_02521,8th,V5
1991,4,ICPSR_02521,8th,V5
1992,1,ICPSR_02522,8th,V5
...,...,...,...,...
2019,1,ICPSR_37842,8th,V5
2020,1,ICPSR_38189,8th,V5
2021,1,ICPSR_38502,8th,V5
2022,1,ICPSR_38883,8th,V5


In [77]:
from mtf import read_year_data

# Load the currently selected `year` through the project helper and preview it.
df = read_year_data(year)
df.head()

Unnamed: 0,CASEID,30,7930,7950,24770,5
0,1,1,5,5,0,0.8556
1,2,1,1,1,0,0.8815
2,3,1,9,9,9,1.2194
3,4,1,5,5,9,0.4084
4,5,9,9,9,9,0.4367


In [71]:
df.columns

Index(['CASEID', 'V1', 'V3', 'V4', 'V5', 'V507', 'V508', 'V509', 'V1174',
       'V1175',
       ...
       'V2375', 'V2376', 'V2377', 'V2378', 'V2379', 'V2380', 'V2381', 'V2382',
       'V1225', 'V2225'],
      dtype='object', length=555)

## Data

Monitoring the Future: A Continuing Study of American Youth (8th- and 10th-Grade Surveys)

https://www.icpsr.umich.edu/web/NAHDAP/studies/38883#

In [6]:
from glob import glob

# Collect the ICPSR archives found in the local data/ directory.
filenames = glob('data/ICPSR*.zip')
filenames

['data/ICPSR_02475-V1.zip',
 'data/ICPSR_35166-V2.zip',
 'data/ICPSR_37183-V1.zip',
 'data/ICPSR_38502-V1.zip',
 'data/ICPSR_20180-V2.zip',
 'data/ICPSR_37415-V1.zip',
 'data/ICPSR_34574-V2.zip',
 'data/ICPSR_02523-V1.zip',
 'data/ICPSR_02390-V1.zip',
 'data/ICPSR_02752-V2.zip',
 'data/ICPSR_03752-V2.zip',
 'data/ICPSR_38189-V1.zip',
 'data/ICPSR_33902-V1.zip',
 'data/ICPSR_02350-V2.zip',
 'data/ICPSR_04537-V2.zip',
 'data/ICPSR_22500-V1.zip',
 'data/ICPSR_30984-V1.zip',
 'data/ICPSR_36149-V1.zip',
 'data/ICPSR_38883-V1.zip',
 'data/ICPSR_36407-V1.zip',
 'data/ICPSR_02940-V1.zip',
 'data/ICPSR_04263-V2.zip',
 'data/ICPSR_04018-V2.zip',
 'data/ICPSR_03426-V1.zip',
 'data/ICPSR_02521-V2.zip',
 'data/ICPSR_02522-V2.zip',
 'data/ICPSR_25422-V2.zip',
 'data/ICPSR_36799-V1.zip',
 'data/ICPSR_02476-V1.zip',
 'data/ICPSR_37842-V1.zip',
 'data/ICPSR_28402-V1.zip',
 'data/ICPSR_39171-V1.zip',
 'data/ICPSR_03183-V1.zip']

In [7]:
import zipfile


def read_dta_from_zip(zip_filename, index=0):
    """Read one Stata file out of a ZIP archive into a DataFrame.

    Parameters
    ----------
    zip_filename : path of the ZIP archive.
    index : position of the wanted file among the archive members whose
        name ends in ``.dta`` (case-insensitive), in archive listing order.

    Returns
    -------
    DataFrame read with ``convert_categoricals=False``, i.e. coded values
    are kept as stored rather than replaced by their labels.

    Raises
    ------
    FileNotFoundError if the archive contains no ``.dta`` member.
    IndexError if `index` is out of range for the ``.dta`` members.
    """
    with zipfile.ZipFile(zip_filename, 'r') as archive:
        stata_members = [
            member for member in archive.namelist()
            if member.lower().endswith('.dta')
        ]

        if not stata_members:
            raise FileNotFoundError("No .dta file found in the ZIP archive.")

        with archive.open(stata_members[index]) as handle:
            frame = pd.read_stata(handle, convert_categoricals=False)

    return frame

In [8]:
import zipfile
import pandas as pd
from functools import reduce

def read_all_dta_from_zip(zip_filename):
    """Read every Stata file in a ZIP archive.

    Parameters
    ----------
    zip_filename : path of the ZIP archive.

    Returns
    -------
    list of DataFrames, one per archive member whose name ends in ``.dta``
    (case-insensitive), in archive listing order. Files are read with
    ``convert_categoricals=False`` so coded values are kept as stored.
    The frames are returned separately, not merged.

    Raises
    ------
    FileNotFoundError if the archive contains no ``.dta`` member.
    """
    dfs = []

    with zipfile.ZipFile(zip_filename, 'r') as zip_ref:
        dta_files = [f for f in zip_ref.namelist() if f.lower().endswith('.dta')]

        if not dta_files:
            raise FileNotFoundError("No .dta files found in the ZIP archive.")

        for path in dta_files:
            with zip_ref.open(path) as dta_file:
                dfs.append(pd.read_stata(dta_file, convert_categoricals=False))

    # The CASEID merge that used to follow this return could never run and
    # has been removed; callers receive the list of frames, as before.
    return dfs


In [9]:
def get_year_to_filename(filenames, index=0):
    """Build a ``{survey year: archive path}`` dict by inspecting each archive.

    For every path in `filenames`, the ``.dta`` member at position `index`
    is read and its ``V1`` column is expected to hold a single distinct
    value, the survey year. Values below 1900 are treated as two-digit
    years and shifted by 1900. Each path and the year found are printed as
    progress output.

    Raises ValueError when ``V1`` does not hold exactly one distinct value.
    """
    lookup = {}

    for zip_filename in filenames:
        print(zip_filename)
        frame = read_dta_from_zip(zip_filename, index=index)

        # value_counts() yields one entry per distinct (non-missing) year.
        years = frame['V1'].value_counts()
        if len(years) != 1:
            raise ValueError(f"Unexpected number of unique years in V1 for {zip_filename}: {len(years)}")

        survey_year = int(years.index[0])
        # Two-digit codes such as 92 become 1992.
        survey_year = survey_year + 1900 if survey_year < 1900 else survey_year
        print(survey_year)
        lookup[survey_year] = zip_filename

    return lookup


In [10]:
# Survey year -> path of the ICPSR archive holding that year's data.
# NOTE(review): presumably transcribed from the output of get_year_to_filename() -- confirm.
ZIPFILE = {
    1991: 'data/ICPSR_02521-V2.zip',
    1992: 'data/ICPSR_02522-V2.zip',
    1993: 'data/ICPSR_02523-V1.zip',
    1994: 'data/ICPSR_02475-V1.zip',
    1995: 'data/ICPSR_02390-V1.zip',
    1996: 'data/ICPSR_02350-V2.zip',
    1997: 'data/ICPSR_02476-V1.zip',
    1998: 'data/ICPSR_02752-V2.zip',
    1999: 'data/ICPSR_02940-V1.zip',
    2000: 'data/ICPSR_03183-V1.zip',
    2001: 'data/ICPSR_03426-V1.zip',
    2002: 'data/ICPSR_03752-V2.zip',
    2003: 'data/ICPSR_04018-V2.zip',
    2004: 'data/ICPSR_04263-V2.zip',
    2005: 'data/ICPSR_04537-V2.zip',
    2006: 'data/ICPSR_20180-V2.zip',
    2007: 'data/ICPSR_22500-V1.zip',
    2008: 'data/ICPSR_25422-V2.zip',
    2009: 'data/ICPSR_28402-V1.zip',
    2010: 'data/ICPSR_30984-V1.zip',
    2011: 'data/ICPSR_33902-V1.zip',
    2012: 'data/ICPSR_34574-V2.zip',
    2013: 'data/ICPSR_35166-V2.zip',
    2014: 'data/ICPSR_36149-V1.zip',
    2015: 'data/ICPSR_36407-V1.zip',
    2016: 'data/ICPSR_36799-V1.zip',
    2017: 'data/ICPSR_37183-V1.zip',
    2018: 'data/ICPSR_37415-V1.zip',
    2019: 'data/ICPSR_37842-V1.zip',
    2020: 'data/ICPSR_38189-V1.zip',
    2021: 'data/ICPSR_38502-V1.zip',
    2022: 'data/ICPSR_38883-V1.zip',
    2023: 'data/ICPSR_39171-V1.zip'
}

In [11]:
# Select the survey year explored in the rest of this section and locate its archive.
year = 1992
zip_filename = ZIPFILE[year]
zip_filename

'data/ICPSR_02522-V2.zip'

In [12]:
from collections import defaultdict

import bisect

class YearLookupDict:
    """Read-only mapping from a year to the value in force for that year.

    Built from ``{start_year: value}``. Indexing with a year returns the
    value of the latest start year that is <= the requested year, so each
    entry stays valid until the next listed year.

    Note that ``year in lookup`` is stricter than indexing: it is True only
    for years that are listed explicitly as start years.
    """

    def __init__(self, year_value_pairs, default=None):
        """Store the pairs sorted by year.

        year_value_pairs : dict mapping start year -> value; may be empty.
        default : value returned for years earlier than every start year.
            ``None`` means "no default": such lookups raise KeyError.
        """
        # Build the two parallel tuples explicitly so that an empty mapping
        # yields an empty lookup instead of failing to unpack.
        ordered = sorted(year_value_pairs.items())
        self._years = tuple(start for start, _ in ordered)
        self._values = tuple(value for _, value in ordered)
        self._default = default

    def __getitem__(self, year):
        """Return the value in force for `year`.

        Raises KeyError when `year` precedes every start year (or the lookup
        is empty) and no default was given.
        """
        # bisect_right gives the insertion point after any equal start year,
        # so the entry just before it is the latest start year <= year.
        i = bisect.bisect_right(self._years, year) - 1
        if i < 0:
            if self._default is not None:
                return self._default
            raise KeyError(f"No entry for year {year} and no default set.")
        return self._values[i]

    def __contains__(self, year):
        """True only when `year` is itself one of the listed start years."""
        return year in self._years

    def items(self):
        """Iterate over ``(start_year, value)`` pairs in ascending year order."""
        return zip(self._years, self._values)

    def get(self, year, default=None):
        """Like indexing, but return `default` instead of raising KeyError."""
        try:
            return self[year]
        except KeyError:
            return default

In [13]:
# Groups of .dta member indices to read from a year's archive. Each inner
# list is passed to read_forms(), which merges those files on CASEID; the
# merged groups are then concatenated. An entry applies from its year until
# the next listed year (YearLookupDict lookup).
INDICES = YearLookupDict({
    1991: [[0, 1], [2, 3]],
    1997: [[0, 1], [4, 5]],
    2012: [[0]],
})

In [14]:
def read_forms(zip_filename, indices):
    """Read several .dta files from one archive and join them on CASEID.

    Parameters
    ----------
    zip_filename : path of the ZIP archive.
    indices : positions (among the archive's .dta members) of the files to
        read; at least one is required.

    Returns
    -------
    DataFrame produced by outer-merging the files left to right on
    ``CASEID``. When two files share a column name, the left column keeps
    its name and the right one gets the ``_y`` suffix.
    """
    merge_suffixes = [None, '_y']

    def merge_on_caseid(left, right):
        return pd.merge(left, right, on="CASEID", how="outer", suffixes=merge_suffixes)

    frames = [read_dta_from_zip(zip_filename, index=position) for position in indices]
    return reduce(merge_on_caseid, frames)

In [15]:
INDICES[year]

[[0, 1], [2, 3]]

In [16]:
# Read each group of files for the selected year (merged on CASEID within a
# group), then stack the groups row-wise with a fresh integer index.
dfs = [read_forms(zip_filename, indices) for indices in INDICES[year]]
df = pd.concat(dfs, ignore_index=True).copy()

In [17]:
df.head()

Unnamed: 0,CASEID,V1,V3,V4,V5,V507,V508,V509,V1173,V1174,...,V2134,V2135,V2136,V2325,V2326,V2327,V2328,V2329,V1224,V2223
0,1,92,1,70001,0.5647,3,0,1,9,1,...,4.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,,
1,2,92,1,70002,0.6976,3,0,1,9,9,...,9.0,9.0,9.0,1.0,1.0,1.0,1.0,9.0,,
2,3,92,1,70003,0.5744,3,1,1,9,9,...,9.0,9.0,9.0,1.0,1.0,3.0,6.0,9.0,,
3,4,92,1,70004,0.3331,4,0,1,9,9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,,
4,5,92,1,70005,0.8917,3,0,1,2,1,...,2.0,1.0,3.0,1.0,1.0,1.0,1.0,2.0,,


In [18]:
df.shape

(17010, 554)

In [19]:
df.head()

Unnamed: 0,CASEID,V1,V3,V4,V5,V507,V508,V509,V1173,V1174,...,V2134,V2135,V2136,V2325,V2326,V2327,V2328,V2329,V1224,V2223
0,1,92,1,70001,0.5647,3,0,1,9,1,...,4.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,,
1,2,92,1,70002,0.6976,3,0,1,9,9,...,9.0,9.0,9.0,1.0,1.0,1.0,1.0,9.0,,
2,3,92,1,70003,0.5744,3,1,1,9,9,...,9.0,9.0,9.0,1.0,1.0,3.0,6.0,9.0,,
3,4,92,1,70004,0.3331,4,0,1,9,9,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,,
4,5,92,1,70005,0.8917,3,0,1,2,1,...,2.0,1.0,3.0,1.0,1.0,1.0,1.0,2.0,,


In [20]:
# Print every column name, one per line (the default repr of df.columns is truncated).
for column in df.columns:
    print(column)

CASEID
V1
V3
V4
V5
V507
V508
V509
V1173
V1174
V1175
V1176
V1177
V1178
V1179
V1180
V1181
V1182
V1183
V1184
V1185
V1186
V1187
V1188
V1189
V1190
V1191
V1192
V1193
V1194
V1195
V1196
V1197
V1198
V1199
V1200
V1201
V1202
V1203
V1204
V1205
V1206
V1207
V1208
V1209
V1210
V1211
V1212
V1213
V1214
V1215
V1216
V1217
V1218
V1219
V1220
V1221
V1222
V1223
V1225
V1226
V1228
V1229
V1230
V1231
V1237
V1238
V1239
V1240
V1242
V1243
V1244
V1245
V1247
V1248
V1249
V1250
V1251
V1254
V1255
V1256
V1257
V1258
V1259
V1260
V1128
V1131
V1132
V1133
V1262
V1263
V1264
V1265
V1266
V1267
V1268
V1252
V1253
V1261
V1269
V1270
V1101
V1102
V1103
V1104
V1105
V1106
V1107
V1108
V1109
V1110
V1111
V1112
V1113
V1114
V1115
V1116
V1117
V1118
V1119
V1120
V1246
V1121
V1122
V1123
V1124
V1125
V1126
V1127
V1138
V1139
V1140
V1141
V1142
V1143
V1144
V1145
V1146
V1147
V1148
V1149
V1150
V1151
V1152
V1153
V1154
V1155
V1156
V1157
V1158
V1159
V1160
V1161
V1162
V1163
V1164
V1165
V1166
V1167
V1168
V1169
V1170
V1171
V1172
V1129
V1130
V1271
V1272
V1273


In [21]:
# Name of the weight column, by survey year. The single entry applies to
# 1991 and every later year (YearLookupDict lookup).
WEIGHT = YearLookupDict({1991: 'V5'})

In [22]:
# Summary statistics of the weight column for the selected year.
df[WEIGHT[year]].describe()

count    17010.000000
mean         0.998470
std          0.618610
min          0.176100
25%          0.602925
50%          0.840150
75%          1.258425
max          5.434700
Name: V5, dtype: float64

In [23]:
# Name of the grade column, by survey year. An entry applies from its year
# until the next listed year (YearLookupDict lookup), so 'V1101' is used
# through 2011 and 'V501' from 2012 on.
GRADE = YearLookupDict({
    1991: 'V1101',
    2011: 'V1101',
    2012: 'V501',
})

In [24]:
# Treat the codes 0, 9 and -9 in the grade column as missing, then tabulate it.
df[GRADE[year]] = df[GRADE[year]].replace([0, 9, -9], np.nan)
# TODO Replace code 2 with grade 8
# TODO Replace code 4 with grade 10

value_counts(df[GRADE[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
2.0,9411
4.0,7438
,161


In [25]:
"""
V7202: 00030:R01 R'S SEX F1234
Item number: 00030
What is your sex?
1="Male" 2="Female" 3="Other" 4="Prefer not to answer"
""";

In [26]:
# Name of the respondent-sex column, by survey year. An entry applies from
# its year until the next listed year (YearLookupDict lookup).
# NOTE(review): the following cell rebinds GENDER from irn_ref, so on a
# top-to-bottom run this table is superseded -- confirm which one is wanted.
GENDER = YearLookupDict({
    1991: 'V1226',
    1992: 'V1225',
    1993: 'V1226',
    1994: 'V1227',
    1995: 'V1233',
    1996: 'V1235',
    1998: 'V1233',
    2001: 'V1232',
    2004: 'V1233',
    2006: 'V1246',
    2010: 'V2238',
    2012: 'V7202',
})

In [27]:
# Derive the sex column name per year from the IRN mapping: item 30, form 1,
# upper-cased to match the column names of the data files.
gender_vars = irn_ref.query('irn == 30 and form == 1')['varname'].str.upper()
# Keep only the 'year' level as the index. With the full (year, irn, form)
# index, GENDER[year] would be a one-row Series rather than a column name,
# and df[GENDER[year]] would select a DataFrame instead of a column.
# NOTE(review): assumes a single matching row per year -- confirm.
gender_vars.index = gender_vars.index.get_level_values('year')
GENDER = gender_vars

In [28]:
# Treat the codes 0, 9 and -9 in the sex column as missing, then tabulate it.
df[GENDER[year]] = df[GENDER[year]].replace([0, 9, -9], np.nan)
value_counts(df[GENDER[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,8118
2.0,8412
,480


In [29]:
# Cross-tabulate grade (rows) against sex (columns), keeping missing values as their own category.
pd.crosstab(df[GRADE[year]], df[GENDER[year]], dropna=False)

V1225,1.0,2.0,NaN
V1101,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2.0,4442,4656,313
4.0,3575,3715,148
,101,41,19


In [30]:
# Boolean mask of respondents whose sex code is 1 ("Male" in the codebook excerpt above), and their count.
male = df[GENDER[year]] == 1
male.sum()

np.int64(8118)

In [31]:
# df = df.loc[male]

## Target variables

In [32]:
def set_target(df, varname, values, newname):
    """Add a 0/1 indicator column to `df`, in place.

    ``df[newname]`` becomes 1.0 where ``df[varname]`` is one of `values`,
    0.0 where it holds any other non-missing value, and NaN where it is
    missing. Nothing is returned.
    """
    responses = df[varname]
    is_target = responses.isin(values)
    # Missing responses stay missing instead of counting as "not in values".
    df[newname] = np.where(responses.isna(), np.nan, is_target)

### FEWORK

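Share of respondents who answer "Agree" to: men and women should be paid the same money if they do the same work (item 07930).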

In [33]:
"""V7339: 07930:D06 MEN+WOMN/=$,=WRK F2
Item number: 07930
The next questions ask your opinions about a number of different topics. How much do you agree or disagree with
each statement below?
Men and women should be paid the same money if they do the same work
1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"
"""

'V7339: 07930:D06 MEN+WOMN/=$,=WRK F2\nItem number: 07930\nThe next questions ask your opinions about a number of different topics. How much do you agree or disagree with\neach statement below?\nMen and women should be paid the same money if they do the same work\n1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"\n'

In [34]:
# Name of the column for item 07930 ("same money for the same work"), by
# survey year. An entry applies from its year until the next listed year
# (YearLookupDict lookup).
FEWORK = YearLookupDict({
    1991: 'V1139',
    1992: 'V2137',
    1995: 'V2138',
    2004: 'V1141',
    2006: 'V2139',
    2009: 'V2140',
    2012: 'V7339',
})

In [35]:
# Treat the codes 0, 9, -8 and -9 as missing, leaving the 1-5 agreement scale, then tabulate.
df[FEWORK[year]] = df[FEWORK[year]].replace([0, 9, -8, -9], np.nan)
value_counts(df[FEWORK[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,760
2.0,313
3.0,417
4.0,1921
5.0,13426
,173


In [36]:
# fework = 1 for response 5 ("Agree"), 0 for any other valid response, NaN when missing.
set_target(df, FEWORK[year], [5], 'fework')
value_counts(df['fework'])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
0.0,3411
1.0,13426
,173


In [37]:
# Unweighted share of valid responses with fework == 1 (missing values are skipped by mean()).
df['fework'].mean()

np.float64(0.7974104650472175)

### FEJOB

Share of respondents who answer "Agree" to: a woman should have exactly the same job opportunities as a man (item 07950).

In [38]:
"""
V7340: 07950:D06 WMN SHD =JOB OPP F2
Item number: 07950
How much do you agree or disagree with each statement below?
A woman should have exactly the same job opportunities as a man
1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"
""";

In [39]:
# Name of the column for item 07950 ("same job opportunities"), by survey
# year. An entry applies from its year until the next listed year
# (YearLookupDict lookup).
FEJOB = YearLookupDict({
    1991: 'V1140',
    1992: 'V2138',
    1995: 'V2139',
    2004: 'V1142',
    2006: 'V2140',
    2009: 'V2141',
    2012: 'V7340',
})

In [40]:
# Treat the codes 0, 9, -8 and -9 as missing, leaving the 1-5 agreement scale, then tabulate.
df[FEJOB[year]] = df[FEJOB[year]].replace([0, 9, -8, -9], np.nan)
value_counts(df[FEJOB[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,1032
2.0,585
3.0,578
4.0,3007
5.0,11623
,185


In [41]:
# fejob = 1 for response 5 ("Agree"), 0 for any other valid response, NaN when missing.
set_target(df, FEJOB[year], [5], 'fejob')
value_counts(df['fejob'])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
0.0,5202
1.0,11623
,185


In [42]:
# Unweighted share of valid responses with fejob == 1 (missing values are skipped by mean()).
df['fejob'].mean()

np.float64(0.6908172362555721)

In [43]:
# Keep only rows where both the fejob target and the weight are present, so the two averages below use the same rows.
valid = df.dropna(subset=['fejob', WEIGHT[year]])

In [44]:
# Unweighted mean of fejob over the rows kept in `valid`.
np.average(valid['fejob'])

np.float64(0.6908172362555721)

In [45]:
# Same mean, weighted by the year's weight column.
np.average(valid['fejob'], weights=valid[WEIGHT[year]])

np.float64(0.6886721747442165)


### FEFAM

In [46]:
"""
V7341: 07970:D05 MN=ACHV/WMN=HOME F2
Item number: 07970
How much do you agree or disagree with each statement below?
It is usually better for everyone involved if the man is the achiever 
outside the home and the woman takes care of
the home and family
1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"
Responses from the Western region intentionally deleted.
"""

'\nV7341: 07970:D05 MN=ACHV/WMN=HOME F2\nItem number: 07970\nHow much do you agree or disagree with each statement below?\nIt is usually better for everyone involved if the man is the achiever \noutside the home and the woman takes care of\nthe home and family\n1="Disagree" 2="Mostly Disagree" 3="Neither" 4="Mostly Agree" 5="Agree"\nResponses from the Western region intentionally deleted.\n'

In [47]:
# Name of the column for item 07970 ("man is the achiever / woman takes
# care of the home"), by survey year. An entry applies from its year until
# the next listed year (YearLookupDict lookup).
FEFAM = YearLookupDict({
    1991: 'V1141',
    1992: 'V2139',
    1995: 'V2140',
    2004: 'V1143',
    2006: 'V2141',
    2009: 'V2142',
    2012: 'V7341',
})

In [48]:
# Treat the codes 0, 6, 9, -8 and -9 as missing, leaving the 1-5 agreement scale, then tabulate.
# NOTE(review): code 6 is dropped only for this item -- confirm its meaning against the codebook.
df[FEFAM[year]] = df[FEFAM[year]].replace([0, 6, 9, -8, -9], np.nan)
value_counts(df[FEFAM[year]])

Unnamed: 0_level_0,counts
values,Unnamed: 1_level_1
1.0,6715
2.0,2741
3.0,2617
4.0,2431
5.0,2279
,227


In [49]:
# fefam = 1 for response 1 ("Disagree"), 0 for any other valid response, NaN when missing;
# then show the unweighted share with fefam == 1.
set_target(df, FEFAM[year], [1], 'fefam')
df['fefam'].mean()

np.float64(0.4001072513853304)

## Summarize

In [50]:
# Unweighted mean of each target by sex code.
df.groupby(GENDER[year])[['fejob', 'fework', 'fefam']].mean()

Unnamed: 0_level_0,fejob,fework,fefam
V1225,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1.0,0.687024,0.797655,0.394486
2.0,0.696251,0.799018,0.40535


In [51]:
# Unweighted mean of each target by grade code and sex code.
df.groupby([GRADE[year], GENDER[year]])[['fejob', 'fework', 'fefam']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,fejob,fework,fefam
V1101,V1225,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2.0,1.0,0.689158,0.786335,0.398576
2.0,2.0,0.696688,0.784539,0.401
4.0,1.0,0.686529,0.814386,0.391846
4.0,2.0,0.697618,0.817518,0.410985


## Debug one year

In [52]:
from mtf import process_year, compute_target_means

In [53]:
# Recompute the selected year through the mtf module, keep rows with gender == 1.0,
# and take unweighted target means to compare against the values computed above.
df2 = process_year(year, force_run=True).query('gender == 1.0')
result = compute_target_means(df2, weighted=False)

In [54]:
result

{'fework': np.float64(0.7976546906187625),
 'fefam': np.float64(0.3944862155388471),
 'fejob': np.float64(0.6870238541276383)}

In [55]:
# NOTE(review): `stop` is not defined in the cells shown, so this cell raises
# NameError -- presumably a deliberate way to halt "Run All" here; confirm.
stop

NameError: name 'stop' is not defined

In [None]:
def run_all_years():
    """Compute the weighted target means for every year in ZIPFILE.

    Each year is reprocessed with ``force_run=True``, restricted to rows
    with ``gender == 1.0``, and summarized with ``compute_target_means``.
    The year is printed as progress output.

    Returns
    -------
    DataFrame indexed by ``year`` with one column per target. (Previously
    the frame was built but never returned, so the work was discarded.)
    """
    results = []
    for year in sorted(ZIPFILE):
        print(year)
        df = process_year(year, force_run=True).query('gender == 1.0')
        result = compute_target_means(df, weighted=True)
        # Added after computing the means so 'year' is not itself treated as a target.
        result['year'] = year
        results.append(result)

    # One row per year, indexed by year.
    results_df = pd.DataFrame(results).set_index('year', drop=True)
    return results_df


In [None]:
# NOTE(review): `results_df` is not bound at notebook scope in the cells shown;
# the result of run_all_years() has to be assigned to it before this cell -- confirm.
results_df.plot()

## Validate

In [None]:
# Fetch the published series used for validation (skipped when the file already exists).
download('https://raw.githubusercontent.com/dawaldron/boys-gender-equality/refs/heads/main/d3/mtf-gender-socmed.csv')

In [None]:
# Load the published series indexed by year; rows with a missing `value` are labeled '1-4'.
# NOTE(review): the meaning of the '1-4' label is not visible here -- confirm against the source file.
mtf = pd.read_csv('mtf-gender-socmed.csv').set_index('year')
mtf['value'] = mtf['value'].fillna('1-4')
mtf.tail()

In [None]:
# Rows for the "jobop" question whose value label is '1-4'.
subset = mtf.query('question == "jobop" and value=="1-4"')
subset.tail()

In [None]:
# Published percentage series for "jobop", indexed by year.
jobop_series = subset['pct']
jobop_series.tail(6)

In [None]:
# NOTE(review): `subset` is still the "jobop" selection made two cells above, so
# eqpay_series is identical to jobop_series. It presumably should be taken from a
# selection on the equal-pay question -- confirm the question code in the CSV.
eqpay_series = subset['pct']
eqpay_series.tail()

In [None]:
# Plot both published series as points on the same axes.
eqpay_series.plot(style='.', label='eqpay')
jobop_series.plot(style='.', label='jobop')
decorate()

In [None]:
# Overlay the replicated fejob series (line) on the published jobop series (points).
# NOTE(review): requires `results_df` at notebook scope, which no cell shown assigns -- confirm.
results_df['fejob'].plot(label='replicated')
jobop_series.plot(style='.', label='jobop from csv')
decorate()