In [None]:
import os
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_colwidth', -1)
import numpy as np
import matplotlib.pyplot as plt
import probscale
import scipy
import scipy.stats as stats
import math
plt.style.use('fivethirtyeight')

In [None]:
# Import relevant dataframes:
# Three dated CSV snapshots are read from absolute paths under C:\Users\Public, so this
# cell only runs on a machine with that directory layout. The row count of each frame is
# printed right after it is loaded. Later cells join these tables on the 'Job ID' column.

main_table = pd.read_csv(r"C:\Users\Public\_Data\_Data\Latest_Version\Main_Data\Main_Table_01192020.csv")
print(len(main_table))
faculty_table = pd.read_csv(r"C:\Users\Public\_Data\_Data\Latest_Version\Faculty_Data\Faculty_Table_11222019.csv")
print(len(faculty_table))
skill_table = pd.read_csv(r"C:\Users\Public\_Data\_Data\Latest_Version\Skills_Data\Skill_Table_06072019.csv")
print(len(skill_table))


In [None]:
# postdoc mask useful for isolating all ['Post-Doctoral' = 1] postings by Job ID
# Despite its name, `mask` is a one-column DataFrame of 'Job ID' values (not a boolean
# vector); other tables are inner-merged against it to keep only post-doc postings.
mask = faculty_table[faculty_table['Post-Doctoral']==1][['Job ID']]
print(len(mask))

In [None]:
# Load the BLS state employment table and print its row count.
bls = pd.read_csv(r"C:\Users\Public\_Data\_Data\bls_states\bls_state_employment_thousands.csv")
print(len(bls))

# The file name suggests 'Jobs' is stored in thousands; scale it to individual jobs.
# (Safe to re-run: the frame is re-read from disk at the top of this same cell.)
bls['Jobs'] = bls['Jobs']*1000

In [None]:
# Displays the whole BLS frame.
bls

In [None]:
def title_swap(string):
    """Translate a raw category label into its preferred display name.

    Labels listed in the lookup table below are replaced by their display
    name; any other value is returned unchanged.
    """
    display_names = {
        'Community and Social Services': 'Counseling and Religious Life',
        'Customer and Client Support': 'Online Support and University Information',
        'Hospitality, Food, and Tourism': 'Event Management and Hospitality',
        'Planning and Analysis': 'Analysis',
        'Curriculum and Instructional Designer / Developer': 'Curriculum and Instructional Designer',
        'Special Education Teacher': 'Accessibility and Disability Services',
        'Teaching Assistant': 'Faculty Support',
        'Tutor': 'Academic Tutor',
        'Clerical and Administrative': 'Administrative',
        'na': 'Uncategorized',
    }
    return display_names.get(string, string)
    
def val_year(year1, year2):
    """Return the two years as an ascending ``(earlier, later)`` tuple.

    The arguments are swapped when given in descending order.

    Raises:
        ValueError: if both years are the same.
    """
    # Already ascending: nothing to do.
    if not year1 >= year2:
        return (year1, year2)

    # Not strictly greater means the two years are equal.
    if not year1 > year2:
        raise ValueError('Years cannot be the same.')

    return (year2, year1)

def cat_validate(actual, desired_list):
    """Report whether ``actual`` is a member of ``desired_list`` (uses ``in``)."""
    is_member = actual in desired_list
    return is_member

In [None]:
# boolean_vec = (main_table['Job Title'].str.contains('postdoc', case=False, na=False) |
#                       main_table['Job Title'].str.contains('post doc', case=False, na=False)|
#                       main_table['Job Title'].str.contains('post-doc', case=False, na=False))

# main_table['post_doc'] = boolean_vec + 0

# faculty_table = faculty_table.merge(main_table[['Job ID', 'post_doc']], on='Job ID', how='inner')

In [None]:
# def boolean(x):
#     if x > 0:
#         return 1
#     else:
#         return 0

# faculty_table['Post-Doctoral'] = faculty_table['post_doc'].apply(boolean)

In [None]:
# faculty_table['Post-Doctoral'].sum()

In [None]:
def t_test(mean, null, std, n):
    """One-sample t statistic for a sample mean against a hypothesised mean.

    The previous body recomputed the standard deviation from an array named
    ``arr`` that is not defined in this function, and always tested against 0;
    it now uses the ``std`` and ``null`` arguments it is given.

    Args:
        mean: sample mean.
        null: hypothesised population mean.
        std: sample standard deviation.
        n: sample size (must be positive).

    Returns:
        Tuple ``(mean, std, se, t)`` where ``se = std / sqrt(n)`` and
        ``t = (mean - null) / se``. When ``se`` is 0, ``t`` is a signed
        infinity, or NaN if ``mean`` equals ``null``.

    Raises:
        ValueError: if ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError('n must be a positive sample size.')

    se = std/math.sqrt(n)
    diff = mean - null

    if se == 0:
        # No spread: the statistic is undefined when the means agree and
        # unbounded otherwise.
        t = float('nan') if diff == 0 else math.copysign(float('inf'), diff)
    else:
        t = diff/se

    return mean, std, se, t

def two_t_test(mean1, mean2, null, var1, var2, n1, n2, sides='two', side='+'):
    """Welch's unequal-variance t-test computed from summary statistics.

    Args:
        mean1, mean2: sample means.
        null: hypothesised value of ``mean1 - mean2``.
        var1, var2: sample variances.
        n1, n2: sample sizes.
        sides: ``'two'`` for a two-sided test, ``'one'`` for one-sided.
        side: for one-sided tests, ``'+'`` tests ``mean1 - mean2 > null``;
            any other value tests ``mean1 - mean2 < null``.

    Returns:
        Dict with keys ``'diff'`` (``mean1 - mean2``), ``'se'`` (standard
        error of the difference), ``'t'`` (the statistic, always signed as
        ``(diff - null) / se``) and ``'p'``. Returns ``None`` when ``sides``
        is not recognised and the standard error is positive.

        When the combined standard error is not positive the test is
        degenerate: ``t`` is a signed infinity with ``p`` 0 (or 1 for a
        one-sided test in the opposite direction), and ``t`` is NaN with
        ``p`` 1 when the difference equals ``null`` exactly.
    """
    diff = mean1 - mean2
    shift = diff - null
    comb_er = var1/n1 + var2/n2

    if not comb_er > 0:
        if shift == 0:
            # No spread and no difference: nothing contradicts the null.
            return {'diff': diff, 'se': 0, 't': float('nan'), 'p': 1.0}
        p = 0.0
        if sides == 'one' and ((side == '+') != (shift > 0)):
            # The difference points away from the tested direction.
            p = 1.0
        return {'diff': diff, 'se': 0, 't': math.copysign(float('inf'), shift), 'p': p}

    se = math.sqrt(comb_er)
    comb_t = shift/se
    # Welch-Satterthwaite approximation of the degrees of freedom.
    df = comb_er**2/(((var1/n1)**2/(n1-1))+((var2/n2)**2/(n2-1)))

    if sides == 'two':
        p = 2*scipy.stats.t.cdf(-math.fabs(comb_t), df)
    elif sides == 'one':
        # Upper tail for '+', lower tail otherwise; the reported t keeps its sign.
        p = scipy.stats.t.cdf(-comb_t if side == '+' else comb_t, df)
    else:
        return None

    return {'diff': diff, 'se': se, 't': comb_t, 'p': p}


In [None]:
def breakout(df, year, category, sig_level=0.01, drop=True, Job_ID=True):
    """Count one year's observations per category and attach proportions.

    Args:
        df: frame with a 'Year' column, the ``category`` column, and either a
            'Job ID' column (``Job_ID=True``) or a 'Jobs' column
            (``Job_ID=False``).
        year: value of 'Year' to keep.
        category: name of the column to break the year down by.
        sig_level: one-sided significance level used to derive the minimum
            category count.
        drop: when True, categories whose count is below the derived minimum
            are removed.
        Job_ID: when True, each category's count is its number of rows and the
            total is the number of unique 'Job ID' values; when False, the
            count is the sum of 'Jobs' and the total is the overall 'Jobs' sum.

    Returns:
        ``(cat, num)``: ``cat`` has columns 'index' (category value), 'count',
        'inc' (``count / num``) and 'var' (``inc * (1 - inc)``); ``num`` is the
        total for the year.
    """
    df = df[df['Year']==year]
    if Job_ID:
        num = df['Job ID'].nunique()
    else:
        num = int(df['Jobs'].sum())

    # Find the smallest number of ones among `num` zero/one observations for which
    # a one-sample t-test against 0 is significant (halved two-sided p-value).
    # If no count qualifies, i ends at num + 1.
    arr = np.zeros(num)
    i = 1
    while i <= num:
        arr[:i] = 1
        p = stats.ttest_1samp(arr, popmean=0).pvalue
        if p/2 <= sig_level:
            print(num, '->', i)
            break
        i += 1

    if Job_ID:
        counts = df[category].value_counts()
    else:
        counts = df.groupby(category)['Jobs'].sum()

    # Name the columns explicitly so the result does not depend on the pandas
    # version or on which category column was requested.
    cat = counts.rename_axis('index').reset_index(name='count')
    cat['inc'] = np.true_divide(cat['count'], num)

    if drop:
        # .copy() so the 'var' assignment below writes to an independent frame.
        cat = cat[cat['count']>=i].copy()

    cat['var'] = cat['inc'] * (1 - cat['inc'])

    return cat, num

def validate(row, n2, n1):
    """Two-sided Welch p-value for one merged row.

    ``row['inc_x']`` / ``row['var_x']`` are tested with sample size ``n1`` and
    ``row['inc_y']`` / ``row['var_y']`` with sample size ``n2``. Note the
    positional order of the size arguments: ``n2`` comes first.
    """
    return two_t_test(row['inc_x'], row['inc_y'], 0, row['var_x'], row['var_y'], n1, n2)['p']

def growth(df1, df2, n1, n2):
    """Merge two breakout tables on 'index' and compute the change between them.

    Args:
        df1, df2: frames with 'index', 'count', 'inc' and 'var' columns; after
            the merge ``df1`` columns carry the ``_x`` suffix and ``df2``
            columns the ``_y`` suffix.
        n1, n2: totals belonging to ``df1`` and ``df2`` respectively.

    Returns:
        Rows present in both inputs, with 'p_change' (Welch p-value for the
        change in 'inc') and 'growth' (relative change in raw 'count'),
        sorted by 'growth' descending with a fresh index.
    """
    df = df1.merge(df2, on='index', how='outer', indicator=True)
    # .copy() so the new columns are written to an independent frame.
    df = df[df['_merge']=='both'].copy()

    # validate() takes (row, n2, n1); passing the sizes in that order pairs
    # inc_x with n1 and inc_y with n2. Previously they were passed as (n1, n2),
    # which swapped the two sample sizes inside the test.
    df['p_change'] = [validate(row, n2, n1) for _, row in df.iterrows()]
#     df = df[df['p_change']<=0.01]
#     df['growth'] = np.true_divide(df['inc_y'] - df['inc_x'], df['inc_x'])
    df['growth'] = np.true_divide(df['count_y'] - df['count_x'], df['count_x'])
    return df.sort_values('growth', ascending=False).reset_index(drop=True)
    
# Job IDs of post-doc postings, and the postings that carry one of those IDs.
mask = faculty_table[faculty_table['Post-Doctoral'] == 1][['Job ID']]
post = main_table.merge(mask, on='Job ID', how='inner')

# Everything that is not a post-doc posting. `post` comes out of a merge and has
# a fresh 0..n-1 index, so main_table.drop(post.index) removed the first
# len(post) rows rather than the post-doc rows; filter on 'Job ID' instead.
main = main_table[~main_table['Job ID'].isin(mask['Job ID'])]
main = main[main['BEA_zone']!='Territories']

year1 = 2007
year2 = 2017

# Post-doc postings by institution. Only the first year is built with drop=False.
inst1, num1 = breakout(post, year1, 'IPEDS Institution Name', drop=False)
inst2, num2 = breakout(post, year2, 'IPEDS Institution Name')

inst_g = growth(inst1, inst2, num1, num2)

# Non-post-doc postings by institution. num1/num2 are reused for every pair below.
main_inst1, num1 = breakout(main, year1, 'IPEDS Institution Name')
main_inst2, num2 = breakout(main, year2, 'IPEDS Institution Name')

main_inst_g = growth(main_inst1, main_inst2, num1, num2)

# Non-post-doc postings by institution BEA zone.
main_zone1, num1 = breakout(main, year1, 'Institution_BEA_zone')
main_zone2, num2 = breakout(main, year2, 'Institution_BEA_zone')

main_zone_g = growth(main_zone1, main_zone2, num1, num2)

# Post-doc postings by institution BEA zone.
zone1, num1 = breakout(post, year1, 'Institution_BEA_zone')
zone2, num2 = breakout(post, year2, 'Institution_BEA_zone')

zone_g = growth(zone1, zone2, num1, num2)

# BLS employment by BEA zone; counts come from the 'Jobs' column.
baseline_zone1, num1 = breakout(bls, year1, 'BEA_zone', Job_ID=False)
baseline_zone2, num2 = breakout(bls, year2, 'BEA_zone', Job_ID=False)

baseline_zone_g = growth(baseline_zone1, baseline_zone2, num1, num2)

In [None]:
# Institution-level growth table for the post-doc postings frame.
inst_g

In [None]:
# Institution-level growth table for the `main` frame.
main_inst_g

In [None]:
# Institution-BEA-zone growth table for the post-doc postings frame.
zone_g

In [None]:
# Institution-BEA-zone growth table for the `main` frame.
main_zone_g

In [None]:
# BEA-zone growth table for the BLS employment frame.
baseline_zone_g

In [None]:
# Total BLS 'Jobs' per BEA zone, summed over every row of `bls` (no year filter here).
cat = pd.DataFrame(bls[['BEA_zone', 'Jobs']].groupby(['BEA_zone']).sum()).reset_index()
cat = cat.rename(columns={'Jobs':'count'})
cat

In [None]:
# Names of the ten most frequent institutions in main_table.
# NOTE(review): relies on value_counts().reset_index() producing a column named 'index',
# which depends on the pandas version - confirm. `index` is reassigned in the next cell.
index = pd.DataFrame(main_table['IPEDS Institution Name'].value_counts()[:10]).reset_index()['index'].values

In [None]:
# Yearly post-doc posting counts for the 25 institutions at the top of inst_g.
years = [2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
index = inst_g[:25]['index'].values
main = main_table.merge(mask, on='Job ID', how='inner')
w_df = main[main['IPEDS Institution Name'].isin(index)]

breakdown = pd.DataFrame(w_df[['IPEDS Institution Name', 'Year', 'Job ID']].groupby(['IPEDS Institution Name', 'Year']).count()).reset_index()

# One row per institution, one column per entry of `years`; institution/year pairs
# with no postings stay at 0. The width follows len(years) instead of a literal 9.
total = np.zeros((len(index), len(years)))
for row, name in enumerate(index):
    temp = breakdown[breakdown['IPEDS Institution Name']==name]
    for col, year in enumerate(years):
        docs = temp[temp['Year']==year]['Job ID'].values
        if len(docs) == 1:
            total[row, col] = docs[0]

ind = np.arange(total.shape[1])

fig, ax = plt.subplots(figsize=(20,10))

for name, series in zip(index, total):
    ax.plot(ind, series, label=name)

plt.xticks(ind, years)
plt.legend(loc='best')
plt.show()

In [None]:
# Post-doc postings from Penn State's main campus; the 2017 rows are displayed.
is_penn_state = main_table['IPEDS Institution Name'] == 'Pennsylvania State University-Main Campus'
penn = main_table[is_penn_state].merge(mask, on='Job ID', how='inner')

penn[penn['Year'] == 2017]

In [None]:
# Displays whatever `main` currently holds; several cells reassign this name, so the
# content depends on which of them ran last.
main

In [None]:
def graph_growth(df, category, title, top=10, color1='royalblue', color2='gold', year_labels=('2007', '2017')):
    """Grouped bar chart of before/after counts with a growth label per group.

    Args:
        df: frame with 'index', 'count_x', 'count_y' and 'growth' columns; the
            first ``top`` rows are drawn in their existing order.
        category: text used for the x-axis label and inside the y-axis label.
        title: figure title.
        top: number of leading rows to draw (previously ignored in favour of a
            hard-coded 10).
        color1, color2: bar colours for the 'count_x' and 'count_y' series.
        year_labels: legend labels for the two series.
    """
    df = df[:top]

    labels = df['index'].values

#     first = df['inc_x'].values
#     second = df['inc_y'].values
    first = df['count_x'].values
    second = df['count_y'].values

    pct_growth = df['growth'].values

    ind = np.arange(len(labels))
    width = 0.2

    fig, ax = plt.subplots(figsize=(12,6))

    ax.bar(ind-width/2, first, width, color=color1, label=year_labels[0])
    ax.bar(ind+width/2, second, width, color=color2, label=year_labels[1])

    # Tallest bar overall; sets the gap between a bar and its text label.
    # Guarded so that an empty frame draws an empty chart instead of raising.
    o_max = max(np.max(first), np.max(second)) if len(ind) else 0

    for i in range(len(ind)):
        # Place the label just above the taller of the two bars (never below 0).
        bar_top = max(0, first[i], second[i])
        plt.text(ind[i]-1.5*width, bar_top+o_max/100, str(round(pct_growth[i]*100, 2)) + '% Growth')

    plt.xticks(ind, labels, rotation=45, ha='right')
    plt.xlabel(category)
    plt.ylabel('Number of Postings in ' + category)
    plt.legend(loc='upper right')
    plt.title(title)

    plt.show()
    


In [None]:
# BEA-zone growth charts: the `main` frame first, then post-doc postings.
graph_growth(main_zone_g, 'BEA Zone', 'Change in Total Data BEA Zone sizes from 2007 to 2017')
graph_growth(zone_g, 'BEA Zone', 'Change in Post-Doc BEA Zone sizes from 2007 to 2017')

# Same chart for the BLS employment baseline.
graph_growth(baseline_zone_g, 'BEA Zone', 'Change in Baseline BLS Employment in BEA Zone from 2007 to 2017')

# Institution-level versions, currently disabled:
# graph_growth(main_inst_g, 'IPEDS Institution', 'Change in Relative Institution sizes from\n2007 to 2017 in all Postings',
#              color1='maroon', color2='seagreen')
# graph_growth(inst_g, 'IPEDS Institution', 'Change in Relative Institution sizes from\n2007 to 2017 in Post-Doc Postings',
#              color1='maroon', color2='seagreen')

In [None]:
# Superimposed categorical growth:
#     Two growth tables are drawn on one figure so that the category changes of
#     one dataset can be read against those of another over the same two years.
#     An example is post-doc postings compared to HEJP postings as a whole in
#     2007 and 2017.

def super_graph_growth(df1, df2, category, years, title, type1, type2, colors1, colors2, scale1=1, scale2=1, top=10):
    """Overlay the before/after counts of two growth tables on twin y-axes.

    Args:
        df1, df2: growth tables with 'index', 'count_x', 'count_y' and 'growth'
            columns; they are inner-merged on 'index'.
        category: text used in the axis labels.
        years: sequence whose first two entries label the two periods.
        title: figure title.
        type1, type2: dataset names used in the legend and y-axis labels.
        colors1, colors2: (first period, second period) bar colours per dataset.
        scale1, scale2: divisors applied to each dataset's counts.
        top: number of categories kept, ranked by ``df2``'s growth.
    """
    merged = df1.merge(df2, on='index', how='inner')
    merged = merged.sort_values(by='growth_y', ascending=False)[:top]

#     display(merged)

    tick_labels = merged['index'].values
    positions = np.arange(len(tick_labels))
    bar_width = 0.4
    left_of_tick = positions - bar_width/2
    right_of_tick = positions + bar_width/2

    # After the merge, a trailing _x marks df1's columns and a trailing _y df2's.
    df1_before = merged['count_x_x'].values/scale1
    df1_after = merged['count_y_x'].values/scale1

    df2_before = merged['count_x_y'].values/scale2
    df2_after = merged['count_y_y'].values/scale2

    # Both axes share one upper limit: 10% above the tallest scaled bar.
    df1_peak = max(np.amax(df1_before), np.amax(df1_after))
    df2_peak = max(np.amax(df2_before), np.amax(df2_after))
    y_limit = max(df1_peak, df2_peak) * 1.1

    before_tag = str(years[0])
    after_tag = str(years[1])

    # Wide bars for df2 go on the base axis.
    fig, base_ax = plt.subplots(figsize=(12, 6))

    wide_before = base_ax.bar(left_of_tick, df2_before, bar_width, color=colors2[0], label=before_tag + ' ' + type2)
    wide_after = base_ax.bar(right_of_tick, df2_after, bar_width, color=colors2[1], label=after_tag + ' ' + type2)

    base_ax.set_ylim(top=y_limit)
    base_ax.set_ylabel('Number of ' + type2 + ' Jobs in ' + category + '\nScale Factor: ' + str(scale2))

    # Half-width bars for df1 go on a twin axis, centred on the same positions.
    twin_ax = base_ax.twinx()
    twin_ax.grid(False)

    narrow_before = twin_ax.bar(left_of_tick, df1_before, bar_width/2, color=colors1[0], label=before_tag + ' ' + type1)
    narrow_after = twin_ax.bar(right_of_tick, df1_after, bar_width/2, color=colors1[1], label=after_tag + ' ' + type1)

    twin_ax.set_ylim(top=y_limit)
    twin_ax.set_ylabel('Number of ' + type1 + ' Jobs in ' + category + '\nScale Factor: ' + str(scale1))

    plt.xticks(positions, tick_labels, rotation=45, ha='right')
    plt.xlabel(category)

    handles = [narrow_before, wide_before, narrow_after, wide_after]
    plt.legend(handles, [h.get_label() for h in handles], loc='best')

    plt.title(title)

    plt.show()

# Post-doc zone counts against the `main` zone counts; the latter are divided by 100.
super_graph_growth(zone_g, main_zone_g, 'BEA Zone', (2007, 2017), 'Post-Doc BEA Zone Growth vs HEJP Baseline\n2007 to 2017', 
                   'Post-Docs', 'HEJP', ('royalblue', 'goldenrod'), ('cornflowerblue', 'gold'), scale2=100)

# `main` zone counts (divided by 100) against BLS employment (divided by 10000).
super_graph_growth(main_zone_g, baseline_zone_g, 'BEA Zone', (2007, 2017), 'HEJP BEA Zone Growth vs BLS Baseline\n2007 to 2017',
                   'HEJP', 'BLS', ('cornflowerblue', 'gold'), ('lightskyblue', 'bisque'), scale1=100, scale2=10000)

In [None]:
# Per Capita Post-Doc investigations
def percap_breakout(df, category, year):
    """Postings per institution within each category for one year.

    Args:
        df: frame with 'Year', 'IPEDS Institution Name' and ``category`` columns.
        category: column to group by.
        year: value of 'Year' to keep.

    Returns:
        Frame with columns ``category``, 'counts' (rows in the category),
        'heads' (distinct non-null institutions in the category) and 'percap'
        (``counts / heads``), sorted by 'percap' descending. Categories with
        no named institution are left out.
    """
    df = df[df['Year']==year]

    # Distinct institutions per category.
    heads = df.groupby(category)['IPEDS Institution Name'].nunique().rename('heads').reset_index()
    heads = heads[heads['heads'] > 0]

#     A minimum postings-per-institution filter (>= 6) was tried here and disabled.

    # Rows per category. Columns are named explicitly so the result does not
    # depend on how this pandas version labels value_counts() output.
    total = df[category].value_counts().rename_axis(category).reset_index(name='counts')

    final = total.merge(heads, on=category, how='inner')

    final['percap'] = np.true_divide(final['counts'], final['heads'])

    final = final.sort_values(by='percap', ascending=False).reset_index(drop=True)

    return final

# Compare post-doc posting counts and postings-per-institution by BEA zone for two years.
category = 'Institution_BEA_zone'
year1 = 2007
year2 = 2017
title = 'Posting Counts and Per Institution Averages for\nPost docs from '+ str(year1) +' to '+ str(year2)

# Third entry of each tuple is used for that axis's label and tick colour.
colors1 = ('olivedrab', 'darkgreen', 'tab:green')
colors2 = ('rebeccapurple', 'indigo', 'tab:purple')

percap1 = percap_breakout(post, category, year1)
percap2 = percap_breakout(post, category, year2)

# _x columns come from year1, _y columns from year2; zones are ordered by year2 counts.
final = percap1.merge(percap2, on=category, how='inner')
final = final.sort_values(by='counts_y', ascending=False)

display(final)

fig, ax1 = plt.subplots(figsize=(12,6))

labels = final['Institution_BEA_zone'].values
ind = np.arange(len(labels))
width = 0.2

counts1 = final['counts_x'].values
counts2 = final['counts_y'].values

# percap1/percap2 are rebound here from DataFrames to plain value arrays.
percap1 = final['percap_x'].values
percap2 = final['percap_y'].values

# Four bars per zone: the two counts on the left axis ...
bar1 = ax1.bar(ind-1.5*width, counts1, width, color=colors1[0], label='Count ' + str(year1))
bar2 = ax1.bar(ind-0.5*width, counts2, width, color=colors1[1], label='Count ' + str(year2))

ax1.set_ylabel('Number of Post-Doc Postings by BEA Zone', color=colors1[2])
ax1.tick_params(axis='y', labelcolor=colors1[2])

# ... and the two per-institution averages on a twin right axis.
ax2 = ax1.twinx()
ax2.grid(False)

bar3  = ax2.bar(ind+0.5*width, percap1, width, color=colors2[0], label='Per Institution ' + str(year1))
bar4  = ax2.bar(ind+1.5*width, percap2, width, color=colors2[1], label='Per Institution ' + str(year2))

ax2.set_ylabel('Average Post-Doc Postings Per Institution\nby BEA Zone', color='tab:purple')
ax2.tick_params(axis='y', labelcolor=colors2[2])

# One combined legend for the bars of both axes.
bars = [bar1, bar2, bar3, bar4]
labs = [b.get_label() for b in bars]
plt.legend(bars, labs, loc='upper center')

plt.xticks(ind, labels, rotation=45, ha='right')
plt.title(title)

plt.show()

In [None]:
# Specialized Skill Shift



In [None]:
# Re-display the post-doc BEA-zone growth table.
zone_g

In [None]:
# Hand-entered spot check of two_t_test on two proportions. The first variance,
# 0.131500, equals 0.155761 * (1 - 0.155761); the second had been entered as the
# proportion itself (0.234685) instead of p * (1 - p), so it is computed here.
two_t_test(0.155761, 0.234685, 0, 0.131500, 0.234685 * (1 - 0.234685), 2812, 5697, sides='two', side='+')

In [None]:
# Lower-tail probability of a t distribution with 25 degrees of freedom at -1.96.
stats.t.cdf(-1.96, 25)

In [None]:
# Salary summary over ALL postings with a positive 'Annual Salary'.
# `main` is still assigned so later cells that read it see the post-doc frame.
main = main_table.merge(mask, on='Job ID', how='inner')

wSal = main_table[main_table['Annual Salary']>0]

# Share of postings that report a salary. The denominator must be the same
# population as the numerator; it previously used the post-doc subset.
print(wSal['Job ID'].nunique()/main_table['Job ID'].nunique())

desc = wSal['Annual Salary'].describe()

display(desc)
# Interquartile range is Q3 - Q1 (this previously subtracted the median).
print("IQR:", desc['75%']-desc['25%'] )

In [None]:
# Salary summary over post-doc postings with a positive 'Annual Salary'.
main = main_table.merge(mask, on='Job ID', how='inner')

wSal = main[main['Annual Salary']>0]

# Share of post-doc postings that report a salary.
print(wSal['Job ID'].nunique()/main['Job ID'].nunique())

desc = wSal['Annual Salary'].describe()

display(desc)
# Interquartile range is Q3 - Q1 (this previously subtracted the median).
print("IQR:", desc['75%']-desc['25%'] )

In [None]:
# Median of the salaries currently held in `wSal`.
wSal['Annual Salary'].median()

In [None]:
# Box plot of salaries with whiskers running to the minimum and maximum.
# whis=[0, 100] (percentiles) replaces the string 'range', which newer
# matplotlib releases no longer accept.
fig, ax = plt.subplots(figsize=(3,10))
result = ax.boxplot(wSal['Annual Salary'].values, whis=[0, 100])
plt.show()

In [None]:
# Histogram (75 bins) of the salaries currently held in `wSal`.
plt.hist(wSal['Annual Salary'], bins=75)
plt.show()

In [None]:
# Salary plotted against its plotting position.
# presumably plot_pos returns positions on a 0-1 scale with the matching values,
# hence the x100 to read as percentiles - confirm against probscale.
position, sal = probscale.plot_pos(wSal['Annual Salary'].values)
position *= 100
fig, ax = plt.subplots(figsize=(12,6))
ax.plot(position, sal, marker='.', linestyle='none', label='Salary')
ax.set_xlabel('Percentile')
ax.set_ylabel('Annual Salary')
plt.show()

In [None]:
# Report how many post-doc postings lack an occupation.
main = main_table.merge(mask, on='Job ID', how='inner')
total = mask['Job ID'].nunique()
# NOTE(review): this takes the count of the single most frequent 'Occupation' value and
# reports it as "no recorded occupation"; that only holds if the missing-value placeholder
# is the most common value - confirm against the data.
null = main['Occupation'].value_counts()[:1].values[0]

print('There are', total, 'number of observations, of which', null, 'have no recorded occupation.\nThis represents', str(round(null/total * 100, 2)) + '%', 'of the Post-Doc postings.')

In [None]:
# Bar chart of the ten most common career areas among post-doc postings,
# leaving out rows whose career area is the 'na' placeholder.
category = "Career Area"

has_category = main_table[category] != 'na'
w_occ = main_table[has_category]
post_w = w_occ.merge(mask, on='Job ID', how='inner')

category_counts = post_w[category].value_counts()
occ = pd.DataFrame(category_counts).reset_index()

df = occ[:10]
labels = df['index']
values = df[category]

# One integer bar position per label.
ind = np.arange(len(labels))

fig, ax = plt.subplots(figsize=(12,6))
ax.bar(x=ind, height=values)

plt.xticks(ind, labels, rotation='vertical')
plt.xlabel(category + 's')
plt.ylabel('Number of Jobs in\n' + category)

plt.show()

In [None]:
# Welch test: salaries in `wSal` against salaries of post-doc postings that have a
# career area and a positive salary.
# NOTE(review): post_w looks like a subset of the rows in wSal, so the two samples
# may not be independent - confirm before relying on the p-value.
post_desc = wSal['Annual Salary'].describe()
post_w = post_w[post_w['Annual Salary']>0]
post_occ_desc = post_w['Annual Salary'].describe()

one = post_desc
two = post_occ_desc

# Welch's test takes sample variances (ddof=1); np.var defaults to ddof=0.
var_one = wSal['Annual Salary'].var(ddof=1)
var_two = post_w['Annual Salary'].var(ddof=1)

two_t_test(one['mean'], two['mean'], 0, var_one, var_two, one['count'], two['count'], sides='two')

In [None]:
# The 25 most frequent occupations within the 'Planning and Analysis' career area.
main_table[main_table['Career Area']=='Planning and Analysis']['Occupation'].value_counts()[:25]

In [None]:
# Benchmarks


In [None]:
# Build a (year, statistic, zone) array for selected BEA zones. The three statistics
# are postings, institutions, and postings per institution, counting only institutions
# with at least 6 post-doc postings in that zone and year.
post = main_table.merge(mask, on='Job ID', how='inner')
post = post[post['IPEDS Institution Name'].notnull()]
years = [2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
# zones = list(post['Institution_BEA_zone'].unique())[:8]
zones = ['Plains', 'Mideast', 'New England']

# Postings per (zone, year, institution).
postings_per_institution = post.groupby(['Institution_BEA_zone', 'Year', 'IPEDS Institution Name']).count()['Job ID']
post = pd.DataFrame(postings_per_institution).reset_index()

zone_tables = []
for zone in zones:
    zone_rows = post[post['Institution_BEA_zone']==zone]
    yearly_stats = []
    for year in years:
        active = zone_rows[(zone_rows['Year']==year) & (zone_rows['Job ID']>=6)]

        jobs = active['Job ID'].sum()
        inst = active['IPEDS Institution Name'].nunique()
        per = jobs/inst

        yearly_stats.append(np.array((jobs, inst, per)))

    # Rows are years, columns are (jobs, institutions, jobs per institution).
    zone_tables.append(np.vstack([np.zeros((0, 3))] + yearly_stats))

# Stack the per-zone tables along a third axis.
tensor = np.dstack([np.zeros((len(years), 3, 0))] + zone_tables)

print(tensor.shape)

In [None]:
# One line chart per statistic held in `tensor`, with one line per zone.
ind = np.arange(len(years))
colors = ['royalblue', 'limegreen', 'firebrick', 'c', 'indigo', 'gold', 'deeppink', 'grey']
titles = ['Post-Doc Counts', 'Number of Institutions', 'Post per Institution']

for stat_pos, stat_title in enumerate(titles):
    fig, ax = plt.subplots(figsize=(12,9))
    for zone_pos, zone_name in enumerate(zones):
        ax.plot(tensor[:, stat_pos, zone_pos], color=colors[zone_pos], label=zone_name)
    plt.title(stat_title)
    plt.xticks(ind, years)
    plt.legend(loc='best')
    plt.show()

In [None]:
# Unique institutions and unique postings per (BEA zone, year) over all post-doc
# postings, including those with no institution name.
post = main_table.merge(mask, on='Job ID', how='inner')

# The counted columns are selected explicitly. Whether the group keys also show up
# as data columns in nunique() output depends on the pandas version, so dropping
# them afterwards could raise a KeyError.
group_w = post.groupby(['BEA_zone', 'Year'])[['IPEDS Institution Name', 'Job ID']].nunique()

group_w['per'] = np.true_divide(group_w['Job ID'], group_w['IPEDS Institution Name'])
display(group_w)

group_w = group_w.reset_index()

In [None]:
# Unique institutions and unique postings per (BEA zone, year), restricted to
# post-doc postings that have an institution name.
post = main_table.merge(mask, on='Job ID', how='inner')
post = post[~post['IPEDS Institution Name'].isnull()]

# The counted columns are selected explicitly. Whether the group keys also show up
# as data columns in nunique() output depends on the pandas version, so dropping
# them afterwards could raise a KeyError.
group = post.groupby(['BEA_zone', 'Year'])[['IPEDS Institution Name', 'Job ID']].nunique()

group['per'] = np.true_divide(group['Job ID'], group['IPEDS Institution Name'])
display(group)

group = group.reset_index()

In [None]:
# Per (zone, year): institution count, posting count, postings per institution, and
# 'no_inst', the difference between group_w's and group's posting counts.
all_posting_counts = group_w[['BEA_zone', 'Year', 'Job ID']].rename(columns={'Job ID': 'other'})

total = (
    group
    .merge(all_posting_counts, on=['BEA_zone', 'Year'], how='inner')
    .assign(no_inst=lambda merged: merged['other'] - merged['Job ID'])
    .rename(columns={'IPEDS Institution Name': 'inst_count', 'Job ID': 'count'})
    .drop(columns='other')
    .set_index(keys=['BEA_zone', 'Year'])
)
total

In [None]:
# Postings per (year, institution) in the 'Plains' zone, taken from the current `post` frame.
temp = pd.DataFrame(post[post['BEA_zone'] == 'Plains'].groupby(['BEA_zone', 'Year', 'IPEDS Institution Name']).count()['Job ID'])

display(temp)

# How many institution-years have each posting count.
temp['Job ID'].value_counts()

In [None]:
# Post-doc postings attributed to Princeton University with a given BEA zone value.
main = main_table.merge(mask, on='Job ID', how='inner')
# NOTE(review): the zone literal is spelled 'Rocky Mountiains'; unless the data carries
# the same spelling this filter matches nothing - confirm.
main = main[(main['IPEDS Institution Name']=='Princeton University')&(main['BEA_zone']=='Rocky Mountiains')]
main

In [None]:
fac = faculty_table.merge(mask, on='Job ID', how='inner')

def get_fields(df, year):

    df = df[df['Year']==year]
    
    fields = pd.DataFrame(df[NSF].sum()).reset_index()

    fields = fields.rename(columns={0:'count'})

    fields = fields.sort_values(by='count', ascending=False)

    return fields

fields_10 = get_fields(fac, 2010)
fields_17 = get_fields(fac, 2017)

fields = fields_10.merge(fields_17, on='index', how='outer')

fields

In [None]:
NSF = ['Agricultural sciences and natural resources', 'Biological and biomedical sciences', 'Health sciences', 'Chemistry', 'Geosciences, atmospheric, and ocean sciences', 'Physics and astronomy', 'Computer and information sciences', 'Mathematics and statistics', 'Psychology', 'Anthropology', 'Economics', 'Political science and government', 'Sociology', 'Other social sciences',   'Foreign languages and literature', 'History', 'Letters', 'Other humanities and arts', 'Business management and administration', 'Communication', 'FS_Life_sciences', 'FS_Physical_sciences_and_earth_sciences', 'FS_Mathematics_and_computer_sciences', 'FS_Psychology_and_social_sciences', 'FS_Engineering', 'FS_Education', 'FS_Humanities_and_arts', 'FS_Others']
fac = faculty_table[faculty_table['Post-Doctoral']==1]

def fac_cat(df, year, min):
    """Per-field totals and proportions for one year.

    Args:
        df: frame with 'Year', 'Job ID' and the columns listed in ``NSF``.
        year: value of 'Year' to keep.
        min: fields are kept only when their total is strictly greater than this.

    Returns:
        ``(fields, num)``: ``fields`` has columns 'index' (field column name),
        'count' (column sum), 'inc' (``count / num``) and 'var'
        (``inc * (1 - inc)``); ``num`` is the number of unique 'Job ID' values
        in the year.
    """
    df = df[df['Year']==year]
    num = df['Job ID'].nunique()

    fields = pd.DataFrame(df[NSF].sum()).reset_index().rename(columns={0:'count'})
    fields['inc'] = np.true_divide(fields['count'], num)

    # .copy() so the 'var' assignment writes to an independent frame rather
    # than to a slice of the unfiltered one.
    fields = fields[fields['count']>min].copy()
    fields['var'] = fields['inc'] * (1 - fields['inc'])

    return fields, num
    
# Field growth 2007 -> 2017 over the whole faculty table (fields with a total above 6).
main_fac07, num07 = fac_cat(faculty_table, 2007, 6)
main_fac17, num17 = fac_cat(faculty_table, 2017, 6)

main_grow = growth(main_fac07, main_fac17, num07, num17)

# Same for the post-doc rows only; num07/num17 are overwritten here.
fac07, num07 = fac_cat(fac, 2007, 6)
fac17, num17 = fac_cat(fac, 2017, 6)

grow = growth(fac07, fac17, num07, num17)

graph_growth(main_grow, 'NSF Field', 'Change in Relative NSF Field sizes from 2007 to 2017')
graph_growth(grow, 'NSF Field', 'Change in Post-Doc Relative NSF Representation from 2007 to 2017')

In [None]:
# Skill rows with the two leftover 'Unnamed' columns removed, restricted to
# post-doc postings (skill_p) and to postings flagged Faculty == 1 (skill_f).
skill_core = skill_table.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
faculty_ids = faculty_table[faculty_table['Faculty']==1][['Job ID']]

skill_p = skill_core.merge(mask, on='Job ID', how='inner')
skill_f = skill_core.merge(faculty_ids, on='Job ID', how='inner')

In [None]:
def _skill_incidence(frame):
    """Count rows per 'Skill Name' and divide by the frame's unique 'Job ID' count."""
    # Columns are named explicitly so the result does not depend on how this
    # pandas version labels value_counts() output.
    counts = frame['Skill Name'].value_counts().rename_axis('index').reset_index(name='count')
    counts['inc'] = np.true_divide(counts['count'], frame['Job ID'].nunique())
    return counts

def skill_break(df, year, NSF=None):
    """Skill frequency tables for one year, for all skills and specialized skills.

    Args:
        df: skill rows with 'Year', 'Job ID', 'Skill Name' and
            'Is Specialized Skill?' columns.
        year: value of 'Year' to keep.
        NSF: optional name of a ``faculty_table`` column; when given, only
            postings whose value in that column is 1 are kept.

    Returns:
        ``(skills, s_skills)``: two frames with columns 'index' (skill name),
        'count' and 'inc' (count divided by the number of unique postings in
        that frame). ``s_skills`` covers only rows where
        'Is Specialized Skill?' equals 1.
    """
    df = df[df['Year']==year]
    s_df = df[df['Is Specialized Skill?']==1]

    if NSF is not None:
        field_ids = faculty_table[faculty_table[NSF]==1][['Job ID']]
        df = df.merge(field_ids, on='Job ID', how='inner')
        s_df = s_df.merge(field_ids, on='Job ID', how='inner')

    return _skill_incidence(df), _skill_incidence(s_df)

def s_graph_rank(df, title, color='blue', top=10):
    """Bar chart of the 'count' column for the first ``top`` rows of ``df``.

    ``df`` needs 'index' (skill names, used as tick labels) and 'count' columns.
    """
    leading = df[:top]
    skill_names = leading['index'].values
    bar_heights = leading['count'].values
    positions = np.arange(len(skill_names))

    fig, ax = plt.subplots(figsize=(12,6))
    ax.bar(positions, bar_heights, color=color)

    plt.xticks(positions, skill_names, rotation=45, ha='right')
    plt.xlabel('Skill Name')
    plt.ylabel('Number with Skill')
#     plt.legend(loc='upper right')
    plt.title(title)

    plt.show()


In [None]:
# Fields to report on, and one (first, second) colour pair per field, in the same order.
nsfs = ['Biological and biomedical sciences', 'Chemistry', 'Computer and information sciences', 'Economics', 'Psychology']
colors = [('royalblue', 'cornflowerblue'),('maroon', 'firebrick'),('goldenrod', 'gold'),('darkgreen', 'forestgreen'),('indigo', 'rebeccapurple')]
# Years compared in the skill breakdown below (these rebind the notebook-level year1/year2).
year1 = 2010
year2 = 2017

def s_graph_growth(df, title, color='blue', top=10):
    """Bar chart of the 'change' column for the first ``top`` rows of ``df``.

    ``df`` needs 'index' (skill names, used as tick labels) and 'change' columns.
    """
    df = df[:top]

    fig, ax = plt.subplots(figsize=(12,6))

    labels = df['index'].values
    counts = df['change'].values

    ind = np.arange(len(labels))

    ax.bar(ind, counts, color=color)

    plt.xticks(ind, labels, rotation=45, ha='right')
    # The x-axis carries skill names (as in s_graph_rank); it was labelled 'change'.
    plt.xlabel('Skill Name')
    plt.ylabel('Change in Skill')
#     plt.legend(loc='upper right')
    plt.title(title)

    plt.show()

def get_diffs(df):
    """Add 'change' and 'growth' columns to ``df`` in place and return it sorted.

    'change' is ``inc_y - inc_x`` and 'growth' is that difference divided by
    ``inc_x``. The returned frame is sorted by 'growth' descending; the input
    frame itself gains both columns.
    """
    delta = df['inc_y'] - df['inc_x']
    df['change'] = delta
    df['growth'] = delta / df['inc_x']
    ranked = df.sort_values(by='growth', ascending=False)
    return ranked

# For each field: break down skills for post-doc and faculty postings in both years,
# then chart the top 2017 specialized post-doc skills and how their share changed.
for nsf, color in zip(nsfs, colors):
    print(nsf)

    # First comparison year.
    skills1, s_skills1 = skill_break(skill_p, year1, NSF=nsf)
    main_skills1, main_s_skills1 = skill_break(skill_f, year1, NSF=nsf)

    # Second comparison year.
    skills2, s_skills2 = skill_break(skill_p, year2, NSF=nsf)
    main_skills2, main_s_skills2 = skill_break(skill_f, year2, NSF=nsf)

#     main_s_skills1 = main_s_skills1.rename(columns={'index':'skill', 'Skill Name':'count'}).reset_index()
#     main_s_skills2 = main_s_skills2.rename(columns={'index':'skill', 'Skill Name':'count'}).reset_index()
#     display(main_s_skills1.merge(main_s_skills2, on='index', how='inner').drop(columns='index')[:10])

#     s_skills1 = s_skills1.rename(columns={'index':'skill', 'Skill Name':'count'}).reset_index()
#     s_skills2 = s_skills2.rename(columns={'index':'skill', 'Skill Name':'count'}).reset_index()
#     display(s_skills1.merge(s_skills2, on='index', how='inner').drop(columns='index')[:10])

    # Right-merge against the second year's top ten so every one of those skills is
    # kept, even when it did not appear in the first year.
    top_faculty_later = main_s_skills2[:10]
    fac_s = get_diffs(main_s_skills1.merge(top_faculty_later, on='index', how='right'))

    top_postdoc_later = s_skills2[:10]
    post_s = get_diffs(s_skills1.merge(top_postdoc_later, on='index', how='right'))

    display(post_s)

#     s_graph_rank(main_s_skills2, 'Top Ranked Skills for ' + nsf + '\nFaculty in 2017', color=color[0])
#     s_graph_growth(fac_s, 'Growth of Top 2017 Skills for ' + nsf + '\nFaculty in 2017', color=color[0])

    postdoc_color = color[1]
    s_graph_rank(s_skills2, 'Top Ranked Skills for ' + nsf + '\nPost-Docs in 2017', color=postdoc_color)
    s_graph_growth(post_s, 'Growth of Top 2017 Skills for ' + nsf + '\nPost-Docs in 2017', color=postdoc_color)
    
    
    
#     graph(main_s_skills1, 'Top 10 Requested ' + nsf + ' Specialized\nSkills in ' + str(year1) 
#           + ' for Faculty Postings', color=color)
#     graph(main_skills2, 'Top 10 Requested ' + nsf + ' Specialized\nSkills in ' + str(year2) 
#           + ' for Faculty Postings', color=color)

#     graph(s_skills1, 'Top 10 Requested ' + nsf + ' Specialized\nSkills in ' + str(year1) 
#           + ' for Post-Doc Postings', color=color)
#     graph(s_skills2, 'Top 10 Requested ' + nsf + ' Specialized\nSkills in ' + str(year2) 
#           + ' for Post-Doc Postings', color=color)

In [None]:
# Skill breakdown for 2007 post-doc postings flagged with the 'Letters' field.
skills1, s_skills1 = skill_break(skill_p, 2007, NSF='Letters')

In [None]:
# Specialized-skill table from the call above.
s_skills1

In [None]:
# Skill name frequencies for 2010 post-doc postings flagged with the 'Letters' field.
letters_ids = faculty_table.loc[faculty_table['Letters']==1, ['Job ID']]
skill = skill_table.merge(letters_ids, on='Job ID', how='inner')

skill = skill[skill['Year']==2010]

letters_postdoc_skills = skill.merge(mask, on='Job ID', how='inner')
letters_postdoc_skills['Skill Name'].value_counts()

In [None]:
# Total Categorical Size Changes

year1 = 2007
year2 = 2017

post = main_table.merge(mask, on='Job ID', how='inner')
# `post` comes out of a merge and has a fresh 0..n-1 index, so
# main_table.drop(post.index) removed the first len(post) rows rather than the
# post-doc rows; filter on 'Job ID' instead.
main = main_table[~main_table['Job ID'].isin(mask['Job ID'])]

# Unique postings per group in each year.
post1 = post[post['Year']==year1]['Job ID'].nunique()
main1 = main[main['Year']==year1]['Job ID'].nunique()

post2 = post[post['Year']==year2]['Job ID'].nunique()
main2 = main[main['Year']==year2]['Job ID'].nunique()

# bls1 = bls[bls['Year']==year1]['Jobs'].sum()
# bls2 = bls[bls['Year']==year2]['Jobs'].sum()

table = pd.DataFrame([['Post-Doc', post1, post2], ['HEJP', main1, main2]], columns=['Type', year1, year2])
# table.append(['BLS', bls1, bls2])

# Relative change from year1 to year2.
table['growth'] = np.true_divide(table[year2]-table[year1], table[year1])

In [None]:
# Bar chart of percent growth per posting type, taken from `table`.
labels = table['Type'].values
ind = np.arange(len(labels))
width=0.5

# Growth is stored as a fraction; plot it as a percentage.
x1 = table['growth'].values * 100
# x1 = table[year1].values
# x2 = table[year2].values

fig, ax = plt.subplots(figsize=(12,6))
ax.bar(ind, x1, width, color='forestgreen')
# ax.bar(ind-width/2, x1, width, color='orange')
# ax.bar(ind+width/2, x2, width, color='blue')

ax.set_xticks(ind)
ax.set_xticklabels(labels)
ax.set_ylabel('Percent Growth of Category')
ax.set_xlabel('Posting Type')
plt.show()

In [None]:
# Post-doc postings whose 'Institution_State' differs from their 'State',
# totalled per year and institution state.
cat1 = 'Institution_State'
cat2 = 'State'

pair_counts = post.groupby(['Year', cat1, cat2]).count()[['Job ID']]
diff = pd.DataFrame(pair_counts).reset_index()

states_differ = diff[cat1] != diff[cat2]
diff = diff[states_differ]

diff.groupby(['Year', cat1]).sum()

In [None]:
# Displays the whole current `post` frame.
post