In [None]:
import pandas as pd
pd.set_option('display.max_colwidth', None)
import numpy as np
import os
import math
import matplotlib.pyplot as plt

In [None]:
def _load_table(path):
    """Read one parquet file, apply ``convert_dtypes`` and print two sanity counts.

    Prints the number of rows followed by the number of distinct ``BGTJobId``
    values, then returns the loaded frame.
    """
    table = pd.read_parquet(path).convert_dtypes()
    # Comparing these two numbers shows whether a job id can repeat in the table.
    print(len(table))
    print(table['BGTJobId'].nunique())
    return table


main_table = _load_table(r"A:\HEJP_revision_02152020\Jobs.parquet")

faculty_table = _load_table(r"A:\HEJP_revision_02152020\Faculty_and_Postdoc_Fields.parquet")

app_table = _load_table(r"A:\HEJP_revision_02152020\Faculty_Appointments.parquet")

skill_table = _load_table(r"A:\HEJP_revision_02152020\Skills.parquet")

In [None]:
# Show the faculty / postdoc fields table as this cell's output for a visual check.
faculty_table

In [None]:
def tenure(value):
    """Collapse a summed tenure indicator into a 0/1 tenure-line flag.

    Parameters
    ----------
    value : scalar
        One element of ``Tenured + Tenure_Track``, or a missing value
        (``pd.NA``, ``None`` or float NaN).

    Returns
    -------
    int or pd.NA
        ``pd.NA`` when ``value`` is missing, ``1`` when ``value`` equals 1 or
        is at least 2, and ``0`` otherwise.
    """
    # pd.isna recognises pd.NA, None and NaN alike. An identity check against
    # pd.NA alone would let NaN fall through to 0 and make None raise TypeError
    # in the comparison below.
    if pd.isna(value):
        return pd.NA
    if value >= 2 or value == 1:
        return 1
    return 0

# Sum the two tenure indicators per posting, then let tenure() reduce the sum
# to a single 0/1 flag (missing sums stay missing).
app_table['Tenure_Line'] = (app_table['Tenured'] + app_table['Tenure_Track']).apply(tenure)

In [None]:
# Postings whose job title mentions "Philosophy" (case-insensitive; missing titles excluded).
is_philosophy_title = main_table['Job Title'].str.contains('Philosophy', case=False, na=False)
phil = main_table[is_philosophy_title]

# Appointment rows restricted to those Philosophy postings.
app = app_table.merge(phil[['BGTJobId']], how='inner', on='BGTJobId')

# Occupation breakdown of the Philosophy postings.
display(phil['Occupation'].value_counts())

# Start and end years compared by get_cats().
year1, year2 = 2007, 2019

def get_cats(year):
    """Summarise the Philosophy postings of a single year.

    Reads the notebook-level frames ``main_table``, ``phil`` and ``app``.

    Parameters
    ----------
    year : value compared against the ``Year`` column of ``main_table`` and ``phil``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Jobs'``, ``'Contingent'`` and ``'Tenure_Line'`` with two
        columns: ``'Philosophy'`` (distinct job count / column sums for the
        year) and ``'inc'`` (that value divided by the number of distinct
        ``BGTJobId`` values in ``main_table`` for the same year).
    """
    # Denominator: every distinct posting in the year, regardless of title.
    postings_in_year = main_table[main_table['Year'] == year]
    total = postings_in_year['BGTJobId'].nunique()

    # Job ids of the Philosophy postings in the year.
    phil_in_year = phil[phil['Year'] == year]
    mask = phil_in_year[['BGTJobId']]

    # Appointment rows belonging to those job ids.
    matched = app.merge(mask, how='inner')

    counts = pd.Series(
        [
            mask['BGTJobId'].nunique(),
            matched['Contingent'].sum(),
            matched['Tenure_Line'].sum(),
        ],
        index=['Jobs', 'Contingent', 'Tenure_Line'],
    )

    summary = counts.to_frame('Philosophy')
    summary['inc'] = summary['Philosophy'] / total
    return summary

# One summary per comparison year; reset_index exposes the category labels as
# an 'index' column so the two summaries can be joined on it.
result1, result2 = (get_cats(yr).reset_index() for yr in (year1, year2))

# After the join, *_x columns hold year1 values and *_y columns hold year2 values.
# growth_r: relative change of the raw figures; growth_p: relative change of the shares.
result = result1.merge(result2, on='index').assign(
    growth_r=lambda t: (t['Philosophy_y'] - t['Philosophy_x']) / t['Philosophy_x'],
    growth_p=lambda t: (t['inc_y'] - t['inc_x']) / t['inc_x'],
)

result

# jobs2 = phil[phil['Year']==year2]['BGTJobId'].nunique()

# print(f'There were {jobs1} Philosophy jobs in {year1} and {jobs2} in {year2}\n')

# print(f'Raw growth from {year1} to {year2} is {round(float(jobs2-jobs1)/float(jobs1) * 100, 2)}%')

In [None]:
# Philosophy-titled postings whose occupation is not 'College Professor / Instructor'.
phil[phil['Occupation'].ne('College Professor / Instructor')]

In [None]:
# Column totals of the faculty / postdoc field table, restricted to Philosophy postings.
(
    faculty_table
    .drop(columns='JobInsertDate')          # every remaining column is cast to int below
    .astype(int)
    .merge(phil[['BGTJobId']], how='inner')  # keep only Philosophy job ids
    .sum()
)

In [None]:
# Postings whose occupation (rather than job title) mentions "philosophy", case-insensitively.
main_table.loc[main_table['Occupation'].str.contains('philosophy', case=False, na=False)]

In [None]:
# Frequency of each Tenure_Line value across all appointment rows.
app_table['Tenure_Line'].value_counts()

In [None]:
# Show the appointments table as this cell's output for a visual check.
app_table

In [None]:
# Comparison years for the skill analysis (rebinds year1 / year2 for the cells that follow).
year1, year2 = 2010, 2019

# Keep only the skill rows that belong to one of the two comparison years.
skill_small = skill_table[skill_table['Year'].isin([year1, year2])]

# skill_p = skill_small.merge(mask, on='BGTJobId', how='inner')
# skill_f = skill_table.drop(columns=['Unnamed: 0', 'Unnamed: 0.1']).merge(faculty_table[faculty_table['Faculty']==1][['BGTJobId']], 
#                                                                          on='BGTJobId', how='inner')

# Method for isolating the skills in a year
def _skill_counts(frame):
    """Tabulate how often each skill occurs in ``frame``.

    Returns a DataFrame with columns ``'index'`` (skill name), ``'count'``
    (number of rows carrying the skill, most frequent first) and ``'inc'``
    (``count`` divided by the number of distinct ``BGTJobId`` values in ``frame``).
    """
    counts = frame['Skill Name'].value_counts()
    # Name both axes explicitly: what value_counts() calls its index and values
    # differs between pandas 1.x and 2.x, so relying on the defaults (and then
    # renaming 'Skill Name' -> 'count') produces duplicate 'count' columns and
    # no 'index' column on pandas >= 2.0.
    table = counts.rename_axis('index').reset_index(name='count')
    table['inc'] = table['count'] / frame['BGTJobId'].nunique()
    return table


# Method for isolating the skills in a year
def skill_break(df, year, NSF=None):
    """Break down skill demand for one year.

    Parameters
    ----------
    df : pandas.DataFrame
        Skill rows with at least the columns ``'Year'``, ``'BGTJobId'``,
        ``'Skill Name'`` and ``'Specialized'``.
    year : value compared against ``df['Year']``.
    NSF : str, optional
        Name of a column in the notebook-level ``faculty_table``. When given,
        only job ids whose value in that column equals 1 are kept.

    Returns
    -------
    (skills, s_skills) : tuple of pandas.DataFrame
        Skill tables (columns ``'index'``, ``'count'``, ``'inc'``) for all
        rows of the year and for the rows with ``Specialized == 1``.
    """
    df = df[df['Year'] == year]
    s_df = df[df['Specialized'] == 1]

    if NSF is not None:
        # Restrict both frames to postings flagged for the requested field.
        mask = faculty_table[faculty_table[NSF] == 1][['BGTJobId']]
        df = df.merge(mask, on='BGTJobId', how='inner')
        s_df = s_df.merge(mask, on='BGTJobId', how='inner')

    return _skill_counts(df), _skill_counts(s_df)

# Method for graphing the raw ranks of the skills
def s_graph_rank(df, title, color='blue', top=10):
    """Draw a vertical bar chart of the first ``top`` rows of a skill table.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``'index'`` column (used as tick labels) and a ``'count'``
        column (bar heights). Rows are plotted in the order given.
    title : str
        Figure title.
    color : matplotlib color, default ``'blue'``
        Fill color of the bars.
    top : int, default 10
        Number of leading rows to plot.
    """
    leading = df[:top]

    fig, ax = plt.subplots(figsize=(12, 6))

    tick_labels = leading['index'].values
    heights = leading['count'].values
    positions = np.arange(len(tick_labels))

    ax.bar(positions, heights, color=color)

    # Slanted, right-anchored labels keep long skill names readable.
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_labels, rotation=45, ha='right')
    ax.set_xlabel('Skill Name')
    ax.set_ylabel('Number with Skill')
    ax.set_title(title)

    plt.show()

# NSF fields of interest: one entry per pass of the plotting loop further down.
nsfs = ['Economics'] * 2
# (dark, light) color pairs, looked up by the same position as nsfs.
colors = [
    ('royalblue', 'cornflowerblue'),
    ('maroon', 'firebrick'),
    ('goldenrod', 'gold'),
    ('darkgreen', 'forestgreen'),
    ('indigo', 'rebeccapurple'),
]

def s_graph_growth(df, title, color='blue', top=10):
    """Draw a horizontal bar chart of the ``'change'`` values of a skill table.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs an ``'index'`` column (used as tick labels) and a ``'change'``
        column (bar lengths). Rows are plotted in the order given, the first
        row at the bottom of the chart.
    title : str
        Figure title.
    color : matplotlib color, default ``'blue'``
        Fill color of the bars.
    top : int, default 10
        Number of leading rows to plot.
    """
    df = df[:top]

    fig, ax = plt.subplots(figsize=(6, 12))

    labels = df['index'].values
    changes = df['change'].values
    positions = np.arange(len(labels))

    ax.barh(positions, changes, color=color)

    ax.set_yticks(positions)
    ax.set_yticklabels(labels, rotation=45, ha='right')
    # The y axis lists skill names (the bar lengths on x are the change), so it
    # is labelled like the category axis of s_graph_rank rather than 'change'.
    ax.set_ylabel('Skill Name')
    ax.set_xlabel('Change in Demand')
    ax.set_title(title)

    plt.show()

def get_diffs(df):
    """Add change / growth columns to a merged skill table and rank by change.

    ``df`` must contain ``'inc_x'`` and ``'inc_y'``. Two columns are written
    onto ``df`` itself: ``'change'`` (``inc_y - inc_x``) and ``'growth'``
    (``change`` relative to ``inc_x``). The return value is a copy of ``df``
    sorted by ``'change'``, largest first.
    """
    delta = df['inc_y'] - df['inc_x']
    df['change'] = delta
    df['growth'] = delta / df['inc_x']
    return df.sort_values('change', ascending=False)

# Job ids of appointments flagged FT_Contingent == 1 and Tenure_Line == 1 respectively
# (rows where the flag is missing are excluded explicitly).
mask_ftc = app_table[app_table['FT_Contingent'].eq(1) & app_table['FT_Contingent'].notnull()][['BGTJobId']]
mask_tenure = app_table[app_table['Tenure_Line'].eq(1) & app_table['Tenure_Line'].notnull()][['BGTJobId']]

# Pass 0 analyses the FT-contingent postings, pass 1 the tenure-line postings.
masks_by_pass = {0: mask_ftc, 1: mask_tenure}

for i, nsf in enumerate(nsfs):
    color = colors[i]

    print(nsf)

    if i in masks_by_pass:
        df = skill_small.merge(masks_by_pass[i], how='inner')

    # Faculty-wide comparison, currently disabled:
#     main_skills1, main_s_skills1 = skill_break(skill_f, year1, NSF=nsf)
#     main_skills2, main_s_skills2 = skill_break(skill_f, year2, NSF=nsf)
#     fac_s = main_s_skills1.merge(main_s_skills2[:10], on='index', how='right')
#     fac_s = get_diffs(fac_s)

    skills1, s_skills1 = skill_break(df, year1, NSF=nsf)
    skills2, s_skills2 = skill_break(df, year2, NSF=nsf)

    # Take the ten leading specialized skills of year2 and attach their year1
    # figures; the right join keeps skills that have no year1 row.
    post_s = get_diffs(s_skills1.merge(s_skills2[:10], on='index', how='right'))

    display(post_s)

    s_graph_rank(s_skills2, f'Top Ranked Skills for {nsf}\n in {year2}', color=color[1])
    s_graph_growth(post_s, f'Change in Demanding Percentage for Top\n{nsf} Skills in {year2}', color=color[1])

In [None]:
# List the column names of the appointments table.
app_table.columns