In [None]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import math
plt.style.use('fivethirtyeight')

In [None]:
# Base folder that holds the data extracts read by this notebook.
root_directory = r"C:\Users\Public\_Data\Old_HEJP_Data\Latest_Version"

# Postings table: read it and print the row count as a quick load check.
main_table_path = root_directory + r"\Main_Data\Main_Table_01192020.csv"
jobs_table = pd.read_csv(main_table_path)
print(len(jobs_table))

# Faculty / NSF field table: same read-then-count check.
faculty_table_path = root_directory + r"\Faculty_Data\Faculty_Table_11222019.csv"
NSF_table = pd.read_csv(faculty_table_path)
print(len(NSF_table))

# Loading of the skills table is currently disabled.
# skill_table = pd.read_csv(root_directory + r"\Skills_Data\Skill_Table_06072019.csv")
# print(len(skill_table))

In [None]:
#########################################################
# Visualization (1)
# Growth of Job Postings by NSF Field of Study: 2007-2017
#########################################################

# Only the NSF field table is needed for this visualization since all analysis is
# done on faculty postings and their NSF fields. To isolate the relevant postings
# we drop all observations of non-faculty jobs and of post-doctoral positions.
#
# Both conditions must hold at the same time, hence `&`. Combining them with `|`
# would keep every non-faculty posting that merely is not a post-doc. The
# post-doc test is written as `!= 1` to match the filter used in Visualization (2).
table = NSF_table[(NSF_table['Faculty'] == 1) & (NSF_table['Post-Doctoral'] != 1)]

# Sums the posting indicators of every requested NSF field for a single year
# and returns the totals as a tidy two-column table.
def extract_fields(df, year, fields):
    """Count postings per NSF field for one year.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Year' column and one indicator column per field.
    year : int
        Only rows whose 'Year' equals this value are counted.
    fields : list of str
        Names of the indicator columns to total.

    Returns
    -------
    pandas.DataFrame
        One row per field with columns 'index' (the field name) and
        'count' (the sum of that field's indicator column).
    """
    # Keep only the postings of the requested year
    year_rows = df.loc[df['Year'] == year]

    # Summing an indicator column gives the number of postings in that field
    totals = pd.DataFrame(year_rows[fields].sum())

    # Move the field names into a column and give the totals a readable name
    return totals.reset_index().rename(columns={0: 'count'})
    
# The NSF fields plotted in this visualization. Some columns are deliberately
# left out because they are uninteresting, would introduce misleading numbers,
# or are far too specific. For example, "Other humanities and arts" is omitted
# in the Old_HEJP sample because of uncertainty about how BG populates that
# category. Health sciences is omitted as well: its growth outstrips every
# other field, so it gets its own set of visualizations. Including it here
# would stretch the scale and make the other fields hard to read.
NSF_fields = [
    'Agricultural sciences and natural resources',
    'Biological and biomedical sciences',
    'Chemistry',
    'Geosciences, atmospheric, and ocean sciences',
    'Physics and astronomy',
    'Computer and information sciences',
    'Mathematics and statistics',
    'Psychology',
    'Anthropology',
    'Economics',
    'Political science and government',
    'Sociology',
    'Other social sciences',
    'FS_Engineering',
    'FS_Education',
    'Foreign languages and literature',
    'History',
    'Letters',
    'Business management and administration',
    'Communication',
]

# Per-field posting totals for the first and last year of the comparison.
NSF_07, NSF_17 = (extract_fields(table, yr, NSF_fields) for yr in (2007, 2017))

# Measure growth between the two years that we extracted
def get_growth(df1, df2, min_count=1000):
    """Compute relative growth per field between two yearly count tables.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Earlier-year table with columns 'index' and 'count'.
    df2 : pandas.DataFrame
        Later-year table with columns 'index' and 'count'.
    min_count : int, optional
        Fields are kept only when their later-year count is strictly greater
        than this value. Defaults to 1000.

    Returns
    -------
    pandas.DataFrame
        Columns 'index', 'count_x' (earlier year), 'count_y' (later year) and
        'growth' = (count_y - count_x) / count_x, sorted by ascending growth
        with a fresh 0..n-1 index. A field with an earlier-year count of zero
        yields an infinite or NaN growth value.
    """
    # After the merge the two 'count' columns are suffixed _x (df1) and _y (df2)
    df = df1.merge(df2, on='index')

    # Drop small fields. The explicit copy makes the result an independent
    # frame, so adding a column below does not raise SettingWithCopyWarning.
    df = df[df['count_y'] > min_count].copy()

    df['growth'] = (df['count_y'] - df['count_x']) / df['count_x']

    return df.sort_values(by='growth').reset_index(drop=True)
    
# Growth per NSF field from the 2007 totals to the 2017 totals.
NSF_growth = get_growth(NSF_07, NSF_17)

# Show the growth table in the notebook output before plotting it.
display(NSF_growth)

# Graph using matplotlib's pyplot
def Graph_NSF(df, title):
    """Draw a horizontal grouped bar chart of posting counts per NSF field.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'index' (field name), 'count_x' (drawn as the
        '2007' bars), 'count_y' (drawn as the '2017' bars) and 'growth'
        (fractional growth, e.g. 0.25 for +25%).
    title : str
        Title placed above the chart.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    growth = df['growth'].values

    x1 = df['count_x'].values
    x2 = df['count_y'].values

    categories = df['index'].values

    # One y position per field; the two bars of a field sit side by side
    ind = np.arange(len(growth))
    width = 0.2

    fig, ax = plt.subplots(figsize=(10, 10))

    ax.barh(ind + width, x1, width, label='2007', color='goldenrod')
    ax.barh(ind, x2, width, label='2017', color='royalblue')

    for k in range(len(growth)):
        # The label text is passed positionally: Matplotlib renamed the `s`
        # keyword to `text`, so `s=` fails on current releases.
        # The label is placed slightly to the right of the 2017 bar.
        ax.annotate(str(round(growth[k]*100, 1)) + '% Growth', xy=(x2[k]+250, ind[k]))

    ax.set_yticks(ind)
    ax.set_yticklabels(categories)
    ax.set_ylabel('NSF Category')
    ax.set_xlabel('Number of Postings')
    ax.set_title(title)

    ax.legend(loc='upper right')

    plt.show()
    
    
# Plot the 2007 vs. 2017 posting counts for each field in the growth table.
Graph_NSF(NSF_growth, 'Growth of Job Postings by NSF Field: 2007-2017')

In [None]:
##################################################
# Visualization (2)
# All Faculty Openings: 2007-2017
# Grouped by employee type and by institution type
##################################################

# The two years being compared
year1, year2 = 2007, 2017

# Health sciences is removed again, but by a different method: the whole field
# has to go rather than a single column being ignored. A posting is dropped
# only when Health sciences is its sole field. A posting listed in two faculty
# categories, e.g. [Health sciences == 1 AND Biological and biomedical
# sciences == 1], stays in the sample.
has_several_fields = NSF_table['Number of Detailed Fields of Study'] > 1
not_health_sciences = NSF_table['Health sciences'] != 1
nsf = NSF_table[has_several_fields | not_health_sciences]

# Faculty postings only, excluding post-docs
nsf = nsf[(nsf['Faculty'] == 1) & (nsf['Post-Doctoral'] != 1)]

# Attach the institution-type indicator dummies from the jobs table
institution_columns = ['Job ID', '2-year', '4-year', 'Public', 'Private', 'R1']
full = jobs_table[institution_columns].merge(nsf, on='Job ID', how='inner')

# Keep only the two comparison years
full = full[full['Year'].isin([year1, year2])]

# Institution sectors for which growth is reported
sectors = ['All Higher Education', 'Public', 'Private', '4-year', '2-year', 'R1']

def get_sectorwise_counts(df, year, sectors):
    """Total the faculty posting categories for each institution sector.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Year' column, one indicator column per sector name, and
        the columns 'Tenure_Line', 'Full-time Contingent',
        'Part-time Contingent' and 'Contingent'.
    year : int
        Only rows whose 'Year' equals this value are counted.
    sectors : list of str
        Sector names. 'All Higher Education' means every row; any other name
        selects the rows whose column of that name equals 1.

    Returns
    -------
    pandas.DataFrame
        One row per sector, in the order given, with integer columns for the
        four categories and the sector name in an 'index' column. The
        full-time and part-time contingent columns hold estimates: the
        'Contingent' total split in proportion to the observed full-time and
        part-time sums.
    """
    categories = ['Tenure_Line', 'Full-time Contingent', 'Part-time Contingent', 'Contingent']

    df = df[df['Year'] == year]

    # Collect one Series of category sums per sector and combine them once.
    # DataFrame.append no longer exists in pandas 2.x, and growing a frame
    # inside a loop is quadratic anyway.
    sector_sums = []
    for s in sectors:
        if s != 'All Higher Education':
            temp = df[df[s] == 1]
        else:
            temp = df

        sector_sums.append(temp[categories].sum().rename(s))

    # Columns are sectors after the concat; transpose so each sector is a row
    all_sectors = pd.concat(sector_sums, axis=1).transpose()

    # Perform FTC and PTC sum estimation as detailed in Best Practices Manual
    total = all_sectors['Full-time Contingent'] + all_sectors['Part-time Contingent']

    ftc_perc = all_sectors['Full-time Contingent'] / total
    ptc_perc = all_sectors['Part-time Contingent'] / total

    # A sector with no full-time or part-time contingent postings gives 0/0
    # = NaN. Report 0 for it instead of failing in the integer cast below.
    all_sectors['Full-time Contingent'] = (all_sectors['Contingent'] * ftc_perc).round().fillna(0)
    all_sectors['Part-time Contingent'] = (all_sectors['Contingent'] * ptc_perc).round().fillna(0)

    all_sectors = all_sectors.astype('int').reset_index()

    return all_sectors

# Sector-by-category posting counts for the first and the last comparison year.
sectors_07 = get_sectorwise_counts(full, year1, sectors)
sectors_17 = get_sectorwise_counts(full, year2, sectors)

# Graph results using pyplot
def graph_sectors(df1, df2):
    """Plot one grouped bar chart per posting category, comparing two years.

    A separate figure is drawn for each of 'Tenure_Line',
    'Full-time Contingent' and 'Part-time Contingent'. Every figure has one
    pair of horizontal bars per row of the inputs, annotated with the
    percentage growth from ``df1`` to ``df2``.

    Reads the module-level ``year1`` and ``year2`` for the legend labels and
    the figure titles.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Earlier-year counts with an 'index' column (row labels) and the three
        category columns.
    df2 : pandas.DataFrame
        Later-year counts with the same layout and row order as ``df1``.

    Returns
    -------
    None
        Each figure is shown with ``plt.show()``.
    """
    ind = np.arange(len(df1))
    labels = df1['index']
    width = 0.35

    for s in ['Tenure_Line', 'Full-time Contingent', 'Part-time Contingent']:

        x1 = df1[s]
        x2 = df2[s]

        # Percentage growth per row
        growth = round((x2 - x1) / x1 * 100, 2)

        fig, ax = plt.subplots(figsize=(12, 10))

        ax.barh(ind + width, x1, width, label=year1, color='royalblue')
        ax.barh(ind, x2, width, label=year2, color='gold')

        ax.set_yticks(ind)
        ax.set_yticklabels(labels)
        ax.set_ylabel('Institution Type')
        ax.set_xlabel('Number of Postings')
        ax.set_title(f'Institution Growth for {s} Postings, {year1} to {year2}')

        for k in range(len(labels)):
            # The label text is passed positionally: Matplotlib renamed the
            # `s` keyword to `text`, so `s=` fails on current releases.
            # .iloc keeps the lookup positional, matching the bar positions.
            ax.annotate(str(round(growth.iloc[k], 1)) + '% Growth', xy=(x2.iloc[k] + 250, ind[k]))

        ax.legend(loc='upper right')

        plt.show()

# Draw the per-category sector charts comparing the two yearly count tables.
graph_sectors(sectors_07, sectors_17)

In [None]:
#####################################################
# Visualization (3)
# Share of Faculty vs Non-Faculty Openings: 2007-2017
# Grouped by type of institution
#####################################################

year1, year2 = 2007, 2017

# Remove the "Health Care including Nursing" career area because of the recent
# massive growth in teaching hospitals. We believe this growth is due to a
# dispersion of medical research responsibilities away from the NIH and into
# higher ed. If so, we do not want this endogenous force to influence the
# ratios of Faculty and Non-Faculty: non-faculty postings would be greatly
# inflated at institutions that have teaching hospitals. This visual is only
# concerned with the traditional university side of the equation.
main = jobs_table[jobs_table['Career Area'] != 'Health Care including Nursing']

# Keep only the two comparison years
main = main[main['Year'].isin([year1, year2])]

# Build a Job ID mask that leaves out post-docs and postings whose only field
# is Health sciences
not_postdoc = NSF_table['Post-Doctoral'] != 1
not_health_only = (NSF_table['Number of Detailed Fields of Study'] > 1) | (NSF_table['Health sciences'] != 1)
mask = NSF_table.loc[not_postdoc & not_health_only, ['Job ID', 'Faculty']]

# The inner merge keeps only the postings present in the mask
main = main.merge(mask, how='inner')

# Graph Data using pyplot
def faculty_differences(df, title):
    """Plot the faculty vs. non-faculty share of postings for two years.

    Draws one stacked bar per year (non-faculty at the bottom, faculty on
    top), scaled to percentages and annotated with each share.

    Reads the module-level ``year1`` and ``year2``. Rows whose 'Year' equals
    ``year1`` fill the first bar; every other row fills the second bar.

    Parameters
    ----------
    df : pandas.DataFrame
        Grouped counts with columns 'Year', 'Faculty' (0 is treated as
        non-faculty, anything else as faculty) and 'Job ID' (the number of
        postings in that group).
    title : str
        Title placed above the chart.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    # Position 0 holds the year1 count, position 1 the year2 count
    faculty = np.zeros(2)
    non_fac = np.zeros(2)
    for _, row in df.iterrows():
        slot = 0 if row['Year'] == year1 else 1
        if row['Faculty'] == 0:
            non_fac[slot] = row['Job ID']
        else:
            faculty[slot] = row['Job ID']

    years = [year1, year2]
    ind = np.arange(len(years))

    total = faculty + non_fac

    prop_fac = np.true_divide(faculty, total) * 100
    prop_non = np.true_divide(non_fac, total) * 100

    fig, ax = plt.subplots(figsize=(6, 6))

    # The faculty share is stacked on top of the non-faculty share
    ax.bar(ind, prop_fac, width=0.5, label='Faculty', color='deepskyblue', bottom=prop_non)
    ax.bar(ind, prop_non, width=0.5, label='Non-Faculty', color='goldenrod')

    for k in range(len(ind)):
        # The label text is passed positionally: Matplotlib renamed the `s`
        # keyword to `text`, so `s=` fails on current releases.
        # Each label sits at the vertical middle of its bar segment.
        ax.annotate('(' + str(round(prop_fac[k], 1)) + '%)', xy=(ind[k]-0.1, prop_non[k]+0.5*prop_fac[k]))
        ax.annotate('(' + str(round(prop_non[k], 1)) + '%)', xy=(ind[k]-0.1, 0.5*prop_non[k]))

    ax.set_xticks(ind)
    ax.set_xticklabels(years)
    ax.set_ylabel('Proportion')
    ax.set_xlabel('Year')
    ax.set_title(title)

    ax.legend(loc='lower center')
    plt.show()
    
# Same sectors as in (2), listed in the order the charts should appear
sectors = ['All Higher Education', 'Public', 'Private', 'R1', '4-year', '2-year']

for s in sectors:
    # 'All Higher Education' uses every posting; any other sector keeps only
    # the postings flagged for it and gets ' Institutions' added to its title.
    whole_sample = (s == 'All Higher Education')
    df = main if whole_sample else main[main[s] == 1]
    s = s if whole_sample else s + ' Institutions'

    # Number of postings per (Year, Faculty) group, counted on Job ID
    groups = df.groupby(['Year', 'Faculty'])[['Job ID']].count().reset_index()

    faculty_differences(groups, s)