In [None]:
import os
import pandas as pd
# Widen pandas' display limits so wide and long frames render in full in this notebook.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)
# None disables truncation of cell text; the legacy sentinel -1 is rejected by current pandas.
pd.set_option('display.max_colwidth', None)
import numpy as np
import matplotlib.pyplot as plt
import probscale
import scipy
import scipy.stats as stats
import math
plt.style.use('fivethirtyeight')

In [None]:
# Load the three source tables and echo each table's row count right after it is read.

main_table = pd.read_csv(
    r"C:\Users\Public\_Data\_Data\Latest_Version\Main_Data\Main_Table_10182019.csv"
)
print(main_table.shape[0])

faculty_table = pd.read_csv(
    r"C:\Users\Public\_Data\_Data\Latest_Version\Faculty_Data\Faculty_Table_11222019.csv"
)
print(faculty_table.shape[0])

skill_table = pd.read_csv(
    r"C:\Users\Public\_Data\_Data\Latest_Version\Skills_Data\Skill_Table_06072019.csv"
)
print(skill_table.shape[0])

In [None]:
# One-column frame of the Job IDs whose faculty_table row has Post-Doctoral == 1.
# Inner-merging another table with it on 'Job ID' isolates the post-doc postings.
mask = faculty_table.loc[faculty_table['Post-Doctoral'].eq(1), ['Job ID']]

In [None]:
def title_swap(string):
    """Return the preferred display name for a raw category label.

    Labels present in the rename table are replaced by their mapped wording;
    every other value is returned unchanged.
    """
    renames = {
        'Community and Social Services': 'Counseling and Religious Life',
        'Customer and Client Support': 'Online Support and University Information',
        'Hospitality, Food, and Tourism': 'Event Management and Hospitality',
        'Planning and Analysis': 'Analysis',
        'Curriculum and Instructional Designer / Developer': 'Curriculum and Instructional Designer',
        'Special Education Teacher': 'Accessibility and Disability Services',
        'Teaching Assistant': 'Faculty Support',
        'Tutor': 'Academic Tutor',
        'Clerical and Administrative': 'Administrative',
        'na': 'Uncategorized',
    }
    return renames.get(string, string)
    
def val_year(year1, year2):
    """Return the two years ordered as an (earlier, later) tuple.

    Raises:
        ValueError: if the two years are equal.
    """
    if year1 > year2:
        # Supplied in descending order: hand them back swapped.
        return (year2, year1)
    if year1 >= year2:
        # Not greater but still >=, i.e. the two years are the same.
        raise ValueError('Years cannot be the same.')
    return (year1, year2)

def cat_validate(actual, desired_list):
    """Report whether ``actual`` is contained in ``desired_list``."""
    is_member = actual in desired_list
    return is_member

In [None]:
# Postings whose job title mentions a post-doc in any of three spellings, matched
# case-insensitively; rows with a missing title never match (na=False).
job_titles = main_table['Job Title']
title_is_postdoc = (
    job_titles.str.contains('postdoc', case=False, na=False)
    | job_titles.str.contains('post doc', case=False, na=False)
    | job_titles.str.contains('post-doc', case=False, na=False)
)
post_doc = main_table[title_is_postdoc]
print(len(post_doc))

In [None]:
# Number of rows in `mask`, i.e. faculty_table rows with Post-Doctoral == 1.
print(len(mask))

In [None]:
# boolean_vec = (main_table['Job Title'].str.contains('postdoc', case=False, na=False) |
#                       main_table['Job Title'].str.contains('post doc', case=False, na=False)|
#                       main_table['Job Title'].str.contains('post-doc', case=False, na=False))

# main_table['post_doc'] = boolean_vec + 0

# faculty_table = faculty_table.merge(main_table[['Job ID', 'post_doc']], on='Job ID', how='inner')

In [None]:
# def boolean(x):
#     if x > 0:
#         return 1
#     else:
#         return 0

# faculty_table['Post-Doctoral'] = faculty_table['post_doc'].apply(boolean)

In [None]:
# faculty_table['Post-Doctoral'].sum()

In [None]:
def t_test(mean, null, std, n):
    """Compute a one-sample t statistic from summary statistics.

    Parameters
    ----------
    mean : float
        Sample mean.
    null : float
        Hypothesised population mean the sample mean is tested against.
    std : float
        Sample standard deviation.
    n : int
        Sample size.

    Returns
    -------
    tuple
        ``(mean, std, se, t)`` where ``se = std / sqrt(n)`` is the standard
        error of the mean and ``t = (mean - null) / se``.
    """
    # The function only receives summary statistics, so the standard error is
    # derived from the supplied std rather than from raw observations.
    se = std / math.sqrt(n)
    t = (mean - null) / se

    return mean, std, se, t

def two_t_test(mean1, mean2, null, var1, var2, n1, n2, sides='two', side='+'):
    """Welch's unequal-variance t-test computed from summary statistics.

    Parameters
    ----------
    mean1, mean2 : float
        Sample means.
    null : float
        Hypothesised value of ``mean1 - mean2``.
    var1, var2 : float
        Sample variances.
    n1, n2 : int
        Sample sizes.
    sides : {'two', 'one'}
        Two-sided or one-sided test.
    side : str
        Only used when ``sides == 'one'``: ``'+'`` gives the upper-tail
        p-value, any other value gives the lower-tail p-value.

    Returns
    -------
    dict or None
        Keys ``'val'`` and ``'diff'`` (both ``mean1 - mean2``), ``'se'``,
        ``'t'`` and ``'p'``. Both difference keys are returned for every test
        type so callers written against either branch keep working. ``None``
        is returned for an unrecognised ``sides`` value.
    """
    diff = mean1 - mean2
    comb_er = var1/n1 + var2/n2
    se = math.sqrt(comb_er)
    comb_t = (diff - null)/se
    # Welch-Satterthwaite approximation of the degrees of freedom.
    df = comb_er**2/(((var1/n1)**2/(n1-1))+((var2/n2)**2/(n2-1)))

    if sides == 'two':
        p = 2*scipy.stats.t.cdf(-math.fabs(comb_t), df)
    elif sides == 'one':
        # Upper tail: P(T >= t) equals cdf(-t) by symmetry. Only the value fed
        # to the cdf is negated; the reported statistic keeps its real sign.
        tail_t = -comb_t if side == '+' else comb_t
        p = scipy.stats.t.cdf(tail_t, df)
    else:
        return None

    return {'val': diff, 'diff': diff, 'se': se, 't': comb_t, 'p': p}


In [None]:
def breakout(df, year, category, sig_level=0.01, drop=True):
    """Tabulate how often each value of ``category`` occurs in one year.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'Year', 'Job ID' and ``category``.
    year : int
        Only rows with this 'Year' are used.
    category : str
        Column whose values are counted.
    sig_level : float
        One-sided significance level used to derive the minimum count.
    drop : bool
        When True, values occurring fewer times than the minimum count are
        removed from the result.

    Returns
    -------
    (pandas.DataFrame, int)
        A frame with columns 'index' (category value), 'count', 'inc'
        (count / number of unique Job IDs) and 'var' (inc * (1 - inc)),
        ordered by descending count, and the number of unique Job IDs.
    """
    df = df[df['Year'] == year]
    num = df['Job ID'].nunique()

    # Minimum count: the smallest number i of ones in a 0/1 vector of length
    # num whose one-sample t-test against 0 has a halved p-value <= sig_level.
    # If no i qualifies, the loop ends with i == num + 1.
    arr = np.zeros(num)
    i = 1
    while i <= num:
        arr[:i] = 1
        p = stats.ttest_1samp(arr, popmean=0).pvalue
        if p/2 <= sig_level:
            print(num, '->', i)
            break
        i += 1

    # Name the columns explicitly: what reset_index() calls them after
    # value_counts() differs between pandas versions.
    counts = df[category].value_counts()
    cat = pd.DataFrame({'index': counts.index.values, 'count': counts.values})
    if drop:
        # copy() so the column assignments below write to an independent frame.
        cat = cat[cat['count'] >= i].copy()
    cat['inc'] = np.true_divide(cat['count'], num)
    cat['var'] = cat['inc'] * (1 - cat['inc'])

    return cat, num

def validate(row, n2, n1):
    """Return the two-sided p-value comparing one row's two incidences.

    ``row['inc_x']`` / ``row['var_x']`` are paired with sample size ``n1`` and
    ``row['inc_y']`` / ``row['var_y']`` with ``n2``; the null difference is 0.
    Note that ``n2`` comes before ``n1`` in the signature.
    """
    result = two_t_test(
        row['inc_x'], row['inc_y'], 0,
        row['var_x'], row['var_y'],
        n1, n2,
    )
    return result['p']

def growth(df1, df2, n1, n2):
    """Compare category incidences between two breakout tables.

    Parameters
    ----------
    df1, df2 : pandas.DataFrame
        Tables with columns 'index', 'count', 'inc' and 'var'. After the
        merge, df1's columns carry the suffix ``_x`` and df2's ``_y``.
    n1, n2 : int
        Sample sizes behind df1 and df2 respectively.

    Returns
    -------
    pandas.DataFrame
        Categories present in both tables, with 'p_change' (two-sided p-value
        of the incidence difference) and 'growth' ((inc_y - inc_x) / inc_x),
        sorted by descending growth. Rows are not filtered by 'p_change'.
    """
    df = df1.merge(df2, on='index', how='outer', indicator=True)
    # copy() so the new columns are written to an independent frame, not a slice.
    df = df[df['_merge'] == 'both'].copy()
    # validate() declares its sizes as (row, n2, n1). Passing them by keyword
    # pairs inc_x/var_x with n1 and inc_y/var_y with n2; passing (n1, n2)
    # positionally would swap the two sample sizes.
    df['p_change'] = [validate(row, n2=n2, n1=n1) for _, row in df.iterrows()]
    df['growth'] = np.true_divide(df['inc_y'] - df['inc_x'], df['inc_x'])
    return df.sort_values('growth', ascending=False).reset_index(drop=True)
    
# Restrict main_table to the postings flagged Post-Doctoral == 1 in faculty_table.
mask = faculty_table[faculty_table['Post-Doctoral'] == 1][['Job ID']]
post = main_table.merge(mask, on='Job ID', how='inner')

# The two years compared by every growth table in this cell.
year1 = 2007
year2 = 2017

# Institutions within post-doc postings. Only the year1 call passes drop=False,
# so low-count institutions are kept for year1 but filtered out for year2.
inst1, num1 = breakout(post, year1, 'IPEDS Institution Name', drop=False)
inst2, num2 = breakout(post, year2, 'IPEDS Institution Name')

inst_g = growth(inst1, inst2, num1, num2)

# Institutions within all postings. num1/num2 are rebound by each pair of breakout calls.
main_inst1, num1 = breakout(main_table, year1, 'IPEDS Institution Name')
main_inst2, num2 = breakout(main_table, year2, 'IPEDS Institution Name')

main_inst_g = growth(main_inst1, main_inst2, num1, num2)

# BEA zones within all postings.
main_zone1, num1 = breakout(main_table, year1, 'BEA_Zone')
main_zone2, num2 = breakout(main_table, year2, 'BEA_Zone')

main_zone_g = growth(main_zone1, main_zone2, num1, num2)

# BEA zones within post-doc postings.
zone1, num1 = breakout(post, year1, 'BEA_Zone')
zone2, num2 = breakout(post, year2, 'BEA_Zone')

zone_g = growth(zone1, zone2, num1, num2)

In [None]:
# Institution growth table for post-doc postings (year1 vs year2).
inst_g

In [None]:
# Institution growth table for all postings (year1 vs year2).
main_inst_g

In [None]:
# BEA zone growth table for post-doc postings (year1 vs year2).
zone_g

In [None]:
# BEA zone growth table for all postings (year1 vs year2).
main_zone_g

In [None]:
# Names of the ten institutions with the most rows in main_table.
# Read straight from the value_counts() index: the column names produced by
# reset_index() after value_counts() differ between pandas versions.
index = main_table['IPEDS Institution Name'].value_counts().index[:10].values

In [None]:
# Yearly post-doc posting counts for the first 25 institutions of inst_g,
# drawn as one line per institution.
years = [2007, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017]
index = inst_g[:25]['index'].values
main = main_table.merge(mask, on='Job ID', how='inner')
w_df = main[main['IPEDS Institution Name'].apply(cat_validate, args=(index,))]

# Number of postings per (institution, year).
breakdown = (
    w_df[['IPEDS Institution Name', 'Year', 'Job ID']]
    .groupby(['IPEDS Institution Name', 'Year'])
    .count()
    .reset_index()
)

# One row per institution, one column per entry of `years`;
# a year without a matching breakdown row keeps a count of 0.
per_institution = []
for name in index:
    temp = breakdown[breakdown['IPEDS Institution Name'] == name]
    counts = np.zeros(9,)
    for i, year in enumerate(years):
        docs = temp[temp['Year'] == year]['Job ID'].values
        if len(docs) == 1:
            counts[i] = docs[0]
    per_institution.append(counts)
total = np.vstack([np.zeros((0, 9))] + per_institution)

ind = np.arange(total.shape[1])

fig, ax = plt.subplots(figsize=(20, 10))

for label, series in zip(index, total):
    ax.plot(ind, series, label=label)

plt.xticks(ind, years)
plt.legend(loc='best')
plt.show()

In [None]:
# Post-doc postings of one institution, then the subset posted in 2017.
penn = (
    main_table
    .loc[main_table['IPEDS Institution Name'] == 'Pennsylvania State University-Main Campus']
    .merge(mask, on='Job ID', how='inner')
)

penn.loc[penn['Year'] == 2017]

In [None]:
# Display `main` (main_table inner-merged with the post-doc mask in an earlier cell).
# NOTE(review): this renders the whole frame; consider main.head() to keep output small.
main

In [None]:
def graph_growth(df, category, title, top=10, color1='royalblue', color2='gold',
                 label1='2007', label2='2017'):
    """Draw paired bars of the two incidences for the first ``top`` rows of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Growth table with columns 'index', 'inc_x', 'inc_y' and 'growth'.
        Rows are plotted in their existing order.
    category : str
        Name used in the x-axis label and in the y-axis label.
    title : str
        Figure title.
    top : int
        Number of leading rows to plot.
    color1, color2 : str
        Bar colours for the 'inc_x' and 'inc_y' series.
    label1, label2 : str
        Legend labels for the 'inc_x' and 'inc_y' series.
    """
    df = df[:top]

    labels = df['index'].values
    first = df['inc_x'].values
    second = df['inc_y'].values
    pct_growth = df['growth'].values

    ind = np.arange(len(labels))
    width = 0.2

    fig, ax = plt.subplots(figsize=(12,6))

    ax.bar(ind - width/2, first, width, color=color1, label=label1)
    ax.bar(ind + width/2, second, width, color=color2, label=label2)

    # Tallest bar overall; 1% of it is the gap between a bar pair and its
    # annotation. An empty table has no bars, so there is nothing to measure.
    o_max = max(np.max(first), np.max(second)) if len(labels) else 0

    for i in range(len(ind)):
        # Annotate just above the taller bar of the pair (never below zero).
        i_max = max(0, first[i], second[i])
        ax.text(ind[i] - 1.5*width, i_max + o_max/100,
                str(round(pct_growth[i]*100, 2)) + '% Growth')

    ax.set_xticks(ind)
    ax.set_xticklabels(labels, rotation=45, ha='right')
    ax.set_xlabel(category)
    ax.set_ylabel('Incidence of ' + category)
    ax.legend(loc='upper right')
    ax.set_title(title)

    plt.show()
    
# BEA zone charts: all postings first, then post-doc postings (default colours).
graph_growth(
    main_zone_g,
    category='BEA Zone',
    title='Change in Total Data BEA Zone sizes from 2007 to 2017',
)
graph_growth(
    zone_g,
    category='BEA Zone',
    title='Change in Post-Doc BEA Zone sizes from 2007 to 2017',
)

# Institution charts: all postings first, then post-doc postings (maroon / seagreen).
graph_growth(
    main_inst_g,
    category='IPEDS Institution',
    title='Change in Relative Institution sizes from\n2007 to 2017 in all Postings',
    color1='maroon',
    color2='seagreen',
)
graph_growth(
    inst_g,
    category='IPEDS Institution',
    title='Change in Relative Institution sizes from\n2007 to 2017 in Post-Doc Postings',
    color1='maroon',
    color2='seagreen',
)

In [None]:
# BEA zone growth table for post-doc postings, shown again for reference.
zone_g

In [None]:
# Ad-hoc two-sided test with hand-entered means, variances and sample sizes.
# NOTE(review): the fifth argument (var2) repeats the value passed as mean2 (0.234685);
# confirm this is the intended variance and not a pasted incidence.
two_t_test(0.155761, 0.234685, 0, 0.131500, 0.234685, 2812, 5697, sides='two', side='+')

In [None]:
# Lower-tail probability at -1.96 of a t distribution with 25 degrees of freedom.
stats.t.cdf(-1.96, 25)

In [None]:
# Field columns that fac_cat sums over (read from this global when fac_cat is called).
NSF = ['Agricultural sciences and natural resources', 'Biological and biomedical sciences', 'Health sciences', 'Chemistry', 'Geosciences, atmospheric, and ocean sciences', 'Physics and astronomy', 'Computer and information sciences', 'Mathematics and statistics', 'Psychology', 'Anthropology', 'Economics', 'Political science and government', 'Sociology', 'Other social sciences',   'Foreign languages and literature', 'History', 'Letters', 'Other humanities and arts', 'Business management and administration', 'Communication', 'FS_Life_sciences', 'FS_Physical_sciences_and_earth_sciences', 'FS_Mathematics_and_computer_sciences', 'FS_Psychology_and_social_sciences', 'FS_Engineering', 'FS_Education', 'FS_Humanities_and_arts', 'FS_Others']
# faculty_table rows flagged Post-Doctoral == 1.
fac = faculty_table[faculty_table['Post-Doctoral']==1]

def fac_cat(df, year, min, fields=None):
    """Tabulate how many postings of one year are flagged for each field column.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'Year', 'Job ID' and one numeric column per field.
    year : int
        Only rows with this 'Year' are used.
    min : number
        Fields whose summed count is not strictly greater than this are dropped.
    fields : list of str or str, optional
        Field columns to sum. Defaults to the module-level ``NSF`` value at
        call time. A single column name is accepted as well.

    Returns
    -------
    (pandas.DataFrame, int)
        A frame with columns 'index' (field), 'count', 'inc' (count / number of
        unique Job IDs) and 'var' (inc * (1 - inc)), and the number of unique
        Job IDs in that year.
    """
    if fields is None:
        fields = NSF
    # A bare column name would otherwise select a Series and sum to a scalar.
    fields = [fields] if isinstance(fields, str) else list(fields)

    df = df[df['Year'] == year]
    num = df['Job ID'].nunique()

    out = pd.DataFrame(df[fields].sum()).reset_index().rename(columns={0: 'count'})
    out['inc'] = np.true_divide(out['count'], num)

    # copy() so the 'var' column is written to an independent frame, not a slice.
    out = out[out['count'] > min].copy()
    out['var'] = out['inc'] * (1 - out['inc'])

    return out, num
    
# Field tables for all faculty_table rows; fields with a count of 6 or fewer are dropped.
main_fac07, num07 = fac_cat(faculty_table, 2007, 6)
main_fac17, num17 = fac_cat(faculty_table, 2017, 6)

main_grow = growth(main_fac07, main_fac17, num07, num17)

# Same tables for the post-doc subset; num07/num17 are rebound here.
fac07, num07 = fac_cat(fac, 2007, 6)
fac17, num17 = fac_cat(fac, 2017, 6)

grow = growth(fac07, fac17, num07, num17)

# Chart the first rows of each growth table (all rows first, then post-doc).
graph_growth(main_grow, 'NSF Field', 'Change in Relative NSF Field sizes from 2007 to 2017')
graph_growth(grow, 'NSF Field', 'Change in Post-Doc Relative NSF Representation from 2007 to 2017')

In [None]:
# Skill rows restricted to post-doc postings (skill_p) and to postings with
# Faculty == 1 (skill_f); the two leftover 'Unnamed' columns are dropped first.
skill_p = (
    skill_table
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
    .merge(mask, on='Job ID', how='inner')
)
skill_f = (
    skill_table
    .drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])
    .merge(
        faculty_table.loc[faculty_table['Faculty'] == 1, ['Job ID']],
        on='Job ID',
        how='inner',
    )
)

def _skill_counts(names):
    """Frequency table of a Series, most frequent value first.

    The columns are named 'index' (the value) and 'Skill Name' (its count)
    explicitly, because the names reset_index() produces after value_counts()
    differ between pandas versions.
    """
    counts = names.value_counts()
    return pd.DataFrame({'index': counts.index.values, 'Skill Name': counts.values})


def skill_break(df, year, NSF=None):
    """Count skill mentions in one year, optionally within one field.

    Parameters
    ----------
    df : pandas.DataFrame
        Skill rows with columns 'Year', 'Job ID', 'Skill Name' and
        'Is Specialized Skill?'.
    year : int
        Only rows with this 'Year' are used.
    NSF : str, optional
        Name of a faculty_table column. When given, only Job IDs whose
        faculty_table row has that column equal to 1 are kept. This parameter
        shadows the module-level ``NSF`` inside the function.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Counts over all skills, and counts over rows with
        'Is Specialized Skill?' == 1. Each frame has the columns 'index'
        (skill) and 'Skill Name' (count).
    """
    df = df[df['Year'] == year]

    if NSF is not None:
        # Local name chosen so the notebook-level `mask` is not shadowed.
        field_ids = faculty_table[faculty_table[NSF] == 1][['Job ID']]
        df = df.merge(field_ids, on='Job ID', how='inner')

    skills = _skill_counts(df['Skill Name'])
    s_skills = _skill_counts(df[df['Is Specialized Skill?'] == 1]['Skill Name'])

    return skills, s_skills

# NOTE: this rebinds NSF from the list of field columns (used by fac_cat) to a single
# field name; re-running the fac_cat calls after this cell will see the string instead.
NSF = 'Economics'
# These also rebind year1/year2, which an earlier cell set to 2007 and 2017.
year1 = 2010
year2 = 2017

# Skill counts for the chosen field in year1: post-doc postings, then Faculty == 1 postings.
skills1, s_skills1 = skill_break(skill_p, year1, NSF=NSF)
main_skills1, main_s_skills1 = skill_break(skill_f, year1, NSF=NSF)

# Same pair of tables for year2.
skills2, s_skills2 = skill_break(skill_p, year2, NSF=NSF)
main_skills2, main_s_skills2 = skill_break(skill_f, year2, NSF=NSF)

def graph(df, title, top=10):
    """Bar chart of the first ``top`` rows of a skill-count table.

    ``df`` must have the columns 'index' (bar labels) and 'Skill Name'
    (bar heights); rows are plotted in their existing order.
    """
    head = df[:top]

    fig, ax = plt.subplots(figsize=(12,6))

    skill_names = head['index'].values
    heights = head['Skill Name'].values
    positions = np.arange(len(skill_names))

    ax.bar(positions, heights)

    plt.xticks(positions, skill_names, rotation=45, ha='right')
    plt.xlabel('Skill Name')
    plt.ylabel('Incidence of Skill')
    plt.title(title)

    plt.show()


In [None]:
# Specialized-skill counts for the chosen field in year1, from the skill_f subset.
# NOTE(review): skill_f holds Faculty == 1 postings only, while the title says "All Postings" — confirm.
graph(main_s_skills1, 'Top 10 Requested Specialized Skills in ' + NSF + '\nfor All Postings')

In [None]:
# Specialized-skill counts for the chosen field in year1, from the post-doc subset (skill_p).
graph(s_skills1, 'Top 10 Requested Specialized Skills in ' + NSF + '\nfor Post-Doc Postings')

In [None]:
# All-skill counts for the chosen field in year1, post-doc subset.
skills1

In [None]:
# Displays the same skills1 table as the previous cell.
skills1

In [None]:
# Salary coverage and distribution across ALL postings.
# `main` (the post-doc subset) is still built so the name stays defined as before.
main = main_table.merge(mask, on='Job ID', how='inner')

wSal = main_table[main_table['Annual Salary']>0]

# Share of postings with a positive salary. wSal is drawn from main_table, so
# the denominator is main_table too, not the post-doc subset.
print(wSal['Job ID'].nunique()/main_table['Job ID'].nunique())

desc = wSal['Annual Salary'].describe()

display(desc)
# Interquartile range: third quartile minus first quartile.
print("IQR:", desc['75%']-desc['25%'] )

In [None]:
# Salary coverage and distribution within post-doc postings.
main = main_table.merge(mask, on='Job ID', how='inner')

wSal = main[main['Annual Salary']>0]

# Share of post-doc postings that report a positive salary.
print(wSal['Job ID'].nunique()/main['Job ID'].nunique())

desc = wSal['Annual Salary'].describe()

display(desc)
# Interquartile range: third quartile minus first quartile.
print("IQR:", desc['75%']-desc['25%'] )

In [None]:
# Median of the positive salaries currently held in wSal.
wSal['Annual Salary'].median()

In [None]:
# Box plot of the salaries in wSal with whiskers spanning the full data range.
fig, ax = plt.subplots(figsize=(3,10))
# (0, 100) puts the whiskers at the minimum and maximum; it replaces the
# string 'range', which matplotlib no longer accepts.
result = ax.boxplot(wSal['Annual Salary'].values, whis=(0, 100))
plt.show()

In [None]:
# Histogram of the salaries in wSal using 75 bins.
plt.hist(x=wSal['Annual Salary'], bins=75)
plt.show()

In [None]:
# Salary against plotting position, with positions rescaled by 100 for the 'Percentile' axis.
position, sal = probscale.plot_pos(wSal['Annual Salary'].values)
position *= 100

fig, ax = plt.subplots(figsize=(12,6))
ax.plot(position, sal, label='Salary', linestyle='none', marker='.')
ax.set_xlabel('Percentile')
ax.set_ylabel('Annual Salary')
plt.show()

In [None]:
# Share of post-doc postings attributed to "no recorded occupation".
main = main_table.merge(mask, on='Job ID', how='inner')
# Denominator: number of distinct Job IDs in the post-doc mask.
total = mask['Job ID'].nunique()
# NOTE(review): this takes the count of the MOST FREQUENT 'Occupation' value and treats it
# as the missing-value count; confirm that the top value really is the missing marker.
null = main['Occupation'].value_counts()[:1].values[0]

print('There are', total, 'number of observations, of which', null, 'have no recorded occupation.\nThis represents', str(round(null/total * 100, 2)) + '%', 'of the Post-Doc postings.')

In [None]:
category = "Career Area"

# Post-doc postings whose `category` value is not 'na'.
w_occ = main_table[main_table[category]!='na']
post_w = w_occ.merge(mask, on='Job ID', how='inner')

# Counts per category value, most frequent first. The columns are named
# explicitly ('index' = value, `category` = count) because the names
# reset_index() produces after value_counts() differ between pandas versions.
category_counts = post_w[category].value_counts()
occ = pd.DataFrame({'index': category_counts.index.values, category: category_counts.values})

# Plot the ten most frequent values.
df = occ[:10]

fig, ax = plt.subplots(figsize=(12,6))

labels = df['index']
values = df[category]

ind = np.arange(len(labels))

ax.bar(x = ind, height = values)

plt.xticks(ind, labels, rotation = 'vertical')
plt.xlabel(category + 's')
plt.ylabel('Number of Jobs in\n' + category)

plt.show()

In [None]:
# Compare mean salary of wSal with that of post-doc postings that have a
# recorded Career Area and a positive salary.
post_desc = wSal['Annual Salary'].describe()
post_w = post_w[post_w['Annual Salary']>0]
post_occ_desc = post_w['Annual Salary'].describe()

one = post_desc
two = post_occ_desc

# Welch's test expects sample variances (ddof=1), matching the 'std' that
# describe() reports; np.var would default to the population variance (ddof=0).
# NOTE(review): post_w is built from the same postings as wSal, so the two
# samples overlap — confirm that an independent-samples test is intended.
two_t_test(one['mean'], two['mean'], 0,
           wSal['Annual Salary'].var(ddof=1), post_w['Annual Salary'].var(ddof=1),
           one['count'], two['count'], sides='two')

In [None]:
# The 25 most frequent 'Occupation' values among postings whose Career Area is 'Planning and Analysis'.
main_table[main_table['Career Area']=='Planning and Analysis']['Occupation'].value_counts()[:25]

In [None]:
# Benchmarks