In [44]:
from genpeds import Admissions, Enrollment, Graduation
import numpy as np
import pandas as pd
from plotly.subplots import make_subplots
import plotly.graph_objects as go

# General Trends

In [2]:
# Load the three genpeds tables for the snapshot years analysed below.
# merge_with_char=True is passed to every run() call (presumably it joins
# institutional characteristics onto each row -- confirm against genpeds docs).
admit_df = Admissions([2003, 2013, 2023]).run(
    merge_with_char=True,
)
enroll_df = Enrollment([1993, 2003, 2013, 2023]).run(
    merge_with_char=True,
    student_level='undergrad',
)
grad_df = Graduation([2003, 2013, 2023]).run(
    merge_with_char=True,
    degree_level='bach',
)

In [149]:
# Sanity check: how many 2023 rows does each dataset contain?
for cat, df in [
    ('admissions', admit_df),
    ('enrollment', enroll_df),
    ('graduation', grad_df),
]:
    n_obs = len(df.loc[df['year'] == 2023])
    # The outer f-string uses double quotes: reusing the enclosing quote type
    # inside a replacement field is a SyntaxError before Python 3.12.
    print(f"2023 Observations ({cat}): {n_obs}")

2023 Observations (admissions): 1972
2023 Observations (enrollment): 5647
2023 Observations (graduation): 2004


## Empirical PDF and CDF

In [150]:
def plot_empirical_PMFandCDF(df, var, year=2023):
    """Show the empirical CDF and a probability histogram of ``var`` side by side.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column ``var``. Rows where ``var`` is null are
        dropped before plotting.
    var : str
        Name of the column to plot. Values are rounded to 0 decimals first.
    year : int or str, default 2023
        Label shown in parentheses in both subplot titles. It is only a label;
        the caller is responsible for passing a frame filtered to that year.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    # Sorted, rounded, non-null observations drive both panels.
    values = np.sort(df.loc[df[var].notnull(), var].round(0))
    # Empirical CDF: the i-th smallest observation sits at cumulative share i/n.
    cdf = np.arange(1, len(values) + 1) / len(values)

    fig = make_subplots(
        1, 2,
        subplot_titles=(f'CDF of {var} ({year})', f'Histogram of {var} ({year})'),
    )

    fig.add_trace(
        go.Scatter(x=values, y=cdf, mode='lines'),
        row=1, col=1,
    )

    fig.add_trace(
        go.Histogram(x=values, histnorm='probability', marker_color='lightblue'),
        row=1, col=2,
    )

    fig.update_layout(
        showlegend=False,
        width=1000, height=400,
        template='plotly_dark',
        margin={'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 50}
    )
    fig.show()

In [151]:
# admissions: distribution of the male acceptance rate over the 2023 rows
plot_empirical_PMFandCDF(
    df=admit_df.loc[admit_df['year'] == 2023],
    var='accept_rate_men',
)

In [152]:
# enrollment: distribution of the male enrollment share over the 2023 rows
plot_empirical_PMFandCDF(
    df=enroll_df.loc[enroll_df['year'] == 2023],
    var='totmen_share',
)

In [153]:
# graduation: distribution of the male graduation rate over the 2023 rows
plot_empirical_PMFandCDF(
    df=grad_df.loc[grad_df['year'] == 2023],
    var='gradrate_totmen',
)

## Time Differences

In [154]:
# plot time differences
def plot_time_differences(df,yr1,yr2,var):
    """Scatter each school's ``var`` in ``yr1`` (x) against ``yr2`` (y).

    Schools are matched across the two years on the ``id`` column; only rows
    with a non-null ``var`` in both years are kept. A y = x reference line is
    drawn so that points above it mark an increase from ``yr1`` to ``yr2``.
    The subtitle reports the share of matched schools whose value increased.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least the columns ``id``, ``year`` and ``var``.
    yr1, yr2 : int
        The two values of ``year`` to compare.
    var : str
        Name of the column to compare.

    Returns
    -------
    None
        The figure is rendered with ``fig.show()``.
    """
    col1, col2 = f'{var}_{yr1}', f'{var}_{yr2}'

    # Only the join key and the compared column are needed; carrying every
    # other column through the merge just duplicates them with suffixes.
    keep = ['id', var]
    has_value = df[var].notnull()
    df1 = df.loc[(df['year'] == yr1) & has_value, keep]
    df2 = df.loc[(df['year'] == yr2) & has_value, keep]
    df_merged = pd.merge(df1, df2, on=['id'], suffixes=[f'_{yr1}', f'_{yr2}'])

    # Guard the denominator: with no school present in both years the share
    # is undefined, so report NaN instead of raising ZeroDivisionError.
    n_matched = len(df_merged)
    if n_matched:
        n_increased = int((df_merged[col1] < df_merged[col2]).sum())
        share_increased = n_increased / n_matched
    else:
        share_increased = float('nan')

    # y = x reference line over 0..100 (assumes var is on a 0-100 scale).
    line = np.arange(0,101)

    fig = go.Figure()
    fig.add_trace(go.Scatter(
        x=df_merged[col1],
        y=df_merged[col2],
        mode='markers',
        marker_color='pink',
        marker_opacity=.5
    ))
    fig.add_trace(go.Scatter(
        x=line,
        y=line,
        mode='lines',
        marker_color='white'
    ))
    fig.update_layout(
        title={'text': f'{yr1} (x-axis) and {yr2} (y-axis): Points above line mean an increase in {var}',
               'subtitle': {'text': f'Share of schools that INCREASED {var}: <b>{round(share_increased,2)}'}},
        xaxis={'title':{'text': f'{var} ({yr1})'}}, yaxis={'title':{'text': f'{var} ({yr2})'}},
        showlegend=False,
        width=1000, height=400,
        template='plotly_dark',
        margin={'pad': 0, 'b': 20, 'l': 20, 'r': 20, 't': 50}
    )
    fig.show()


In [155]:
# admissions: male acceptance rate, 2003 vs 2023
plot_time_differences(
    df=admit_df,
    yr1=2003,
    yr2=2023,
    var='accept_rate_men',
)

In [156]:
# enrollment: male enrollment share, 1993 vs 2023
plot_time_differences(
    df=enroll_df,
    yr1=1993,
    yr2=2023,
    var='totmen_share',
)

In [157]:
# graduation: male graduation rate, 2003 vs 2023
plot_time_differences(
    df=grad_df,
    yr1=2003,
    yr2=2023,
    var='gradrate_totmen',
)

In [158]:
def wtd_quantile(df,var,weight_var,quantile):
    """Return the weighted ``quantile`` of column ``var``.

    Rows where either ``var`` or ``weight_var`` is null are ignored. The
    remaining values are sorted, their weights accumulated, and the first
    value whose cumulative weight reaches ``quantile`` of the total weight
    is returned (no interpolation between observations).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    var : str
        Column whose quantile is wanted.
    weight_var : str
        Column of per-row weights.
    quantile : float
        Target quantile, between 0 and 1 inclusive.

    Returns
    -------
    scalar
        An element of ``df[var]``, or ``numpy.nan`` when no row has both a
        value and a weight.

    Raises
    ------
    ValueError
        If ``quantile`` lies outside [0, 1].
    """
    if not 0 <= quantile <= 1:
        raise ValueError(f'quantile must be between 0 and 1, got {quantile!r}')

    valid = df.loc[df[var].notnull() & df[weight_var].notnull()]
    if valid.empty:
        # Nothing to rank; mirror the usual "no data" convention.
        return np.nan

    values = valid[var].to_numpy()
    weights = valid[weight_var].to_numpy()

    order = np.argsort(values)
    srt_dat = values[order]
    cum_wt = np.cumsum(weights[order])

    # Use the last cumulative value as the total. np.sum and np.cumsum can
    # round differently for float weights, which could push the cutoff just
    # past cum_wt[-1] and make searchsorted return len(cum_wt).
    cutoff = cum_wt[-1] * quantile
    cutoff_idx = np.searchsorted(cum_wt, cutoff, side='left')
    # Defensive clamp so the lookup can never run one past the end.
    cutoff_idx = min(int(cutoff_idx), len(srt_dat) - 1)
    return srt_dat[cutoff_idx]

def wtd_iqr(df,var,weight_var):
    """Return the weighted 25th, 50th and 75th percentiles of ``var`` as a tuple.

    Each percentile is obtained from ``wtd_quantile`` using ``weight_var`` as
    the per-row weight.
    """
    quartile_levels = ((1/4), (2/4), (3/4))
    return tuple(
        wtd_quantile(df, var, weight_var, level)
        for level in quartile_levels
    )

In [159]:
# Weighted median of each measure per survey year. Rows are weighted by `tot`,
# which is built per dataset from the eval expression in the second field.
# The years available for each dataset are listed explicitly, so no dataset
# identity check is needed to skip 1993 for the tables that start in 2003.
for df,totqry,var,ttl,yrs in [
    (admit_df,'tot = tot_enrolled','accept_rate_men','Male Acceptance Rate',[2003,2013,2023]),
    (enroll_df,'tot = totmen + totwomen','totmen_share','Male Enrollment Share',[1993,2003,2013,2023]),
    (grad_df,'tot = totmen + totwomen','gradrate_totmen','Male Graduation Rate',[2003,2013,2023]),
    (grad_df,'tot = totmen + totwomen','gradrate_totwomen','Female Graduation Rate',[2003,2013,2023]),
]:
    print(f'Weighted Medians of {ttl}:')
    # The weight column does not depend on the year: compute it once per dataset.
    df = df.eval(totqry)
    for yr in yrs:
        val = wtd_quantile(df=df.query(f'year=={yr}'),var=var,weight_var='tot',quantile=.5)
        print(f'{(yr)}: {round(val,2)}')
    print()

Weighted Medians of Male Acceptance Rate:
2003: 71.13
2013: 65.09
2023: 74.01

Weighted Medians of Male Enrollment Share:
1993: 43.81
2003: 42.4
2013: 43.72
2023: 43.15

Weighted Medians of Male Graduation Rate:
2003: 51.66
2013: 55.75
2023: 60.92

Weighted Medians of Female Graduation Rate:
2003: 59.14
2013: 62.75
2023: 69.01



In [160]:
# 2023 weighted quartiles of each measure; rows are weighted by `tot`, which is
# built per dataset from the eval expression in the second field.
for df, totqry, var, ttl in [
    (admit_df, 'tot = tot_enrolled', 'accept_rate_men', 'Male Acceptance Rate'),
    (enroll_df, 'tot = totmen + totwomen', 'totmen_share', 'Male Enrollment Share'),
    (grad_df, 'tot = totmen + totwomen', 'gradrate_totmen', 'Male Graduation Rate'),
    (grad_df, 'tot = totmen + totwomen', 'gradrate_totwomen', 'Female Graduation Rate'),
]:
    print(f'Interquartile Range (weighted) (2023) of {ttl}:')
    df = df.eval(totqry)
    a, b, c = wtd_iqr(df.query('year==2023'), var, 'tot')
    for label, val in zip(('25th', '50th', '75th'), (a, b, c)):
        print(f'{label}: {round(val,2)}')
    # Blank line between datasets (the original appended "\n" to the last print).
    print()

Interquartile Range (weighted) (2023) of Male Acceptance Rate:
25th: 53.42
50th: 74.01
75th: 86.14

Interquartile Range (weighted) (2023) of Male Enrollment Share:
25th: 38.84
50th: 43.15
75th: 46.88

Interquartile Range (weighted) (2023) of Male Graduation Rate:
25th: 47.38
50th: 60.92
75th: 75.5

Interquartile Range (weighted) (2023) of Female Graduation Rate:
25th: 57.01
50th: 69.01
75th: 81.72

