In [None]:
import math
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from scipy import stats

pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)
# NOTE(review): newer pandas releases appear to expect None instead of -1 here - confirm against the installed version.
pd.set_option('display.max_colwidth', -1)
plt.style.use('default')

In [None]:
# Load the three source tables and echo each table's row count as a quick sanity check.

main_table = pd.read_csv(
    r"C:\Users\Public\_Data\_Data\Latest_Version\Main_Data\Main_Table_10182019.csv"
)
print(main_table.shape[0])
faculty_table = pd.read_csv(
    r"C:\Users\Public\_Data\_Data\Latest_Version\Faculty_Data\Faculty_Table_11222019.csv"
)
print(faculty_table.shape[0])
skill_table = pd.read_csv(
    r"C:\Users\Public\_Data\_Data\Latest_Version\Skills_Data\Skill_Table_06072019.csv"
)
print(skill_table.shape[0])

In [None]:
# Single-column frame of the Job IDs whose 'Post-Doctoral' flag equals 1;
# inner-merging another table on it keeps only those postings.
mask = faculty_table.loc[faculty_table['Post-Doctoral'] == 1, ['Job ID']]

In [None]:
def title_swap(string):
    """Translate a raw category title into its replacement label.

    Titles listed in the lookup below are replaced; any other value is
    returned unchanged.
    """
    replacements = {
        'Community and Social Services': 'Counseling and Religious Life',
        'Customer and Client Support': 'Online Support and University Information',
        'Hospitality, Food, and Tourism': 'Event Management and Hospitality',
        'Planning and Analysis': 'Analysis',
        'Curriculum and Instructional Designer / Developer': 'Curriculum and Instructional Designer',
        'Special Education Teacher': 'Accessibility and Disability Services',
        'Teaching Assistant': 'Faculty Support',
        'Tutor': 'Academic Tutor',
        'Clerical and Administrative': 'Administrative',
        'na': 'Uncategorized',
    }
    # Fall back to the input itself when no replacement is registered.
    return replacements.get(string, string)
    
def val_year(year1, year2):
    """Return the two years as an (earlier, later) tuple.

    Raises:
        ValueError: if the two years compare as equal.
    """
    # Already in ascending order: nothing to do.
    if not year1 >= year2:
        return (year1, year2)

    # year1 >= year2 but not strictly greater means the years coincide.
    if not year1 > year2:
        raise ValueError('Years cannot be the same.')

    return (year2, year1)

def cat_validate(actual, desired_list):
    """Return True when `actual` is contained in `desired_list`, else False."""
    is_allowed = actual in desired_list
    return is_allowed

In [None]:
# Rows of main_table whose 'Job Title' contains any of the three post-doc
# spellings (case-insensitive; missing titles count as no match).
post_doc = main_table[
    np.logical_or.reduce([
        main_table['Job Title'].str.contains(spelling, case=False, na=False)
        for spelling in ('postdoc', 'post doc', 'post-doc')
    ])
]
print(post_doc.shape[0])

In [None]:
# Job IDs flagged 'Post-Doctoral' == 1 in the faculty table, and how many there are.
mask = faculty_table.loc[faculty_table['Post-Doctoral'] == 1, ['Job ID']]
print(mask.shape[0])

In [None]:
# True where the job title contains 'postdoc', 'post doc' or 'post-doc'
# (case-insensitive; missing titles count as no match).
boolean_vec = (main_table['Job Title'].str.contains('postdoc', case=False, na=False) |
               main_table['Job Title'].str.contains('post doc', case=False, na=False) |
               main_table['Job Title'].str.contains('post-doc', case=False, na=False))

# Store the flag as 0/1 integers.
main_table['post_doc'] = boolean_vec.astype('int64')

# Drop any 'post_doc' column left behind by a previous run of this cell before
# merging; otherwise a re-run yields 'post_doc_x'/'post_doc_y' and the later
# faculty_table['post_doc'] lookup fails.
faculty_table = (
    faculty_table
    .drop(columns=['post_doc'], errors='ignore')
    .merge(main_table[['Job ID', 'post_doc']], on='Job ID', how='inner')
)

In [None]:
def boolean(x):
    """Collapse a number to a 0/1 flag: 1 when x is strictly positive, otherwise 0."""
    return 1 if x > 0 else 0

# Overwrite the 'Post-Doctoral' flag with the title-derived 'post_doc' value, forced to 0/1.
faculty_table['Post-Doctoral'] = faculty_table['post_doc'].map(boolean)

In [None]:
# Sum of the 0/1 'Post-Doctoral' column, i.e. the number of faculty rows flagged as post-doctoral.
faculty_table['Post-Doctoral'].sum()

In [None]:
def t_test(mean, null, std, n):
    """Compute a one-sample t statistic from summary statistics.

    Args:
        mean: sample mean.
        null: hypothesised population mean the sample mean is tested against.
        std: sample standard deviation.
        n: number of observations.

    Returns:
        Tuple ``(mean, std, se, t)`` where ``se = std / sqrt(n)`` and
        ``t = (mean - null) / se``.

    Raises:
        ValueError: if ``n`` is not positive.
    """
    if n <= 0:
        raise ValueError('n must be a positive sample size.')

    # The summary statistics are taken as given; there is no raw sample to
    # recompute the variance from inside this function.
    se = std/math.sqrt(n)
    t = (mean - null)/se

    return mean, std, se, t

def two_t_test(mean1, mean2, null, var1, var2, n1, n2, sides='two', side='+'):
    """Welch's unequal-variance two-sample t-test from summary statistics.

    Args:
        mean1, mean2: sample means of group 1 and group 2.
        null: hypothesised value of ``mean1 - mean2``.
        var1, var2: sample variances of the two groups.
        n1, n2: sample sizes of the two groups.
        sides: ``'two'`` for a two-sided p-value, ``'one'`` for one-sided.
        side: only used when ``sides == 'one'``; ``'+'`` gives the upper-tail
            probability P(T >= t), any other value the lower-tail P(T <= t).

    Returns:
        Dict with keys ``'val'`` (mean1 - mean2), ``'se'`` (standard error of
        the difference), ``'t'`` (the t statistic, with the same sign
        convention for every option) and ``'p'``. Returns None when ``sides``
        is neither ``'two'`` nor ``'one'``.
    """
    diff = mean1 - mean2
    comb_er = var1/n1 + var2/n2
    se = math.sqrt(comb_er)
    comb_t = (diff - null)/se
    # Welch-Satterthwaite approximation of the degrees of freedom.
    df = comb_er**2/(((var1/n1)**2/(n1-1))+((var2/n2)**2/(n2-1)))

    if sides == 'two':
        p = 2*scipy.stats.t.cdf(-math.fabs(comb_t), df)
    elif sides == 'one':
        # Upper tail via symmetry: P(T >= t) == cdf(-t). Only the value passed
        # to the CDF is negated; the reported statistic keeps its sign.
        tail_t = -comb_t if side == '+' else comb_t
        p = scipy.stats.t.cdf(tail_t, df)
    else:
        return None

    return {'val': diff, 'se': se, 't': comb_t, 'p': p}


In [None]:
def breakout(df, year, category, sig_level=0.01):
    """Share of one year's postings falling in each value of `category`.

    Only category values whose count reaches a minimum threshold are kept.
    The threshold is the smallest number of ones ``i`` for which a 0/1 vector
    of length ``num`` has a mean significantly above zero (one-sample t-test,
    two-sided p-value halved, compared with ``sig_level``). When the threshold
    is found, ``num -> i`` is printed.

    Args:
        df: frame with 'Year', 'Job ID' and `category` columns.
        year: value of 'Year' to keep.
        category: name of the column to tabulate.
        sig_level: one-sided significance level used for the threshold.

    Returns:
        Tuple ``(cat, num)``. ``num`` is the number of unique 'Job ID' values
        in that year. ``cat`` has columns 'index' (category value), 'count',
        'inc' (count / num) and 'var' (inc * (1 - inc)).
    """
    df = df[df['Year'] == year]
    num = df['Job ID'].nunique()

    arr = np.zeros(num)
    i = 1
    while i <= num:
        arr[:i] = 1
        p = stats.ttest_1samp(arr, popmean=0).pvalue
        if p/2 <= sig_level:
            print(num, '->', i)
            break
        i += 1
    # If no i qualified, i == num + 1 here and the filter below keeps nothing.

    # Name the columns explicitly: what value_counts().reset_index() calls
    # them depends on the pandas version, and later merges join on 'index'.
    counts = df[category].value_counts()
    cat = pd.DataFrame({'index': counts.index, 'count': counts.values})
    # .copy() so the new columns are added to an independent frame, not a slice.
    cat = cat[cat['count'] >= i].copy()
    cat['inc'] = np.true_divide(cat['count'], num)
    cat['var'] = cat['inc'] * (1 - cat['inc'])

    return cat, num

def validate(row, n2, n1):
    """Two-sided p-value for the inc_x vs inc_y difference in one merged row.

    `row` must provide 'inc_x', 'inc_y', 'var_x' and 'var_y'. `n1` is used as
    the sample size of the `_x` values and `n2` as that of the `_y` values.
    Note the positional order of the parameters is (row, n2, n1).
    """
    outcome = two_t_test(
        row['inc_x'], row['inc_y'], 0,
        row['var_x'], row['var_y'],
        n1, n2,
    )
    return outcome['p']

def growth(df1, df2, n1, n2, sig_level=0.01):
    """Relative change in share between two breakout tables.

    Args:
        df1, df2: frames with 'index', 'inc' and 'var' columns; after the
            merge df1's columns carry the `_x` suffix and df2's the `_y` suffix.
        n1, n2: sample sizes behind df1 and df2 respectively.
        sig_level: rows whose two-sided p-value exceeds this are dropped.

    Returns:
        Rows present in both inputs whose change is significant, with added
        'p_change' and 'growth' ((inc_y - inc_x) / inc_x) columns, sorted by
        'growth' in descending order with a fresh index.
    """
    merged = df1.merge(df2, on='index', how='outer', indicator=True)
    merged = merged[merged['_merge'] == 'both'].copy()

    # validate's positional order is (row, n2, n1), so the sizes go by keyword
    # to keep n1 paired with the _x columns and n2 with the _y columns.
    # Building the values row by row also works when `merged` has no rows.
    merged['p_change'] = pd.Series(
        [validate(row, n1=n1, n2=n2) for _, row in merged.iterrows()],
        index=merged.index,
        dtype=float,
    )

    merged = merged[merged['p_change'] <= sig_level].copy()
    merged['growth'] = np.true_divide(merged['inc_y'] - merged['inc_x'], merged['inc_x'])
    return merged.sort_values('growth', ascending=False).reset_index(drop=True)
    
# Restrict the main table to postings flagged post-doctoral in the faculty table.
mask = faculty_table.loc[faculty_table['Post-Doctoral'] == 1, ['Job ID']]
post = pd.merge(main_table, mask, on='Job ID', how='inner')

# 2007 vs 2017 shares by institution.
inst07, num07 = breakout(post, year=2007, category='IPEDS Institution Name')
inst17, num17 = breakout(post, year=2017, category='IPEDS Institution Name')

inst_g = growth(inst07, inst17, num07, num17)

# Same comparison by BEA zone; num07/num17 are reassigned by these calls.
zone07, num07 = breakout(post, year=2007, category='BEA_Zone')
zone17, num17 = breakout(post, year=2017, category='BEA_Zone')

zone_g = growth(zone07, zone17, num07, num17)

In [None]:
# Institution-level growth table returned by growth() for 2007 vs 2017.
inst_g

In [None]:
# BEA-zone-level growth table returned by growth() for 2007 vs 2017.
zone_g

In [None]:
# Two-sided comparison of two proportions: 0.155761 (n = 2812) vs 0.234685 (n = 5697).
# A 0/1 indicator with mean p has variance p * (1 - p): 0.131500 is that value
# for the first group, so the second variance is computed the same way rather
# than repeating the second mean.
two_t_test(0.155761, 0.234685, 0, 0.131500, 0.234685 * (1 - 0.234685), 2812, 5697, sides='two', side='+')

In [None]:
# Lower-tail probability at -1.96 of a t distribution with 25 degrees of freedom.
# Requires `stats` to be scipy.stats (e.g. `from scipy import stats`).
stats.t.cdf(-1.96, 25)

In [None]:
# Names of the faculty_table columns that fac_cat() sums per year (one column per field).
# NOTE(review): presumably NSF field-of-study categories plus 'FS_' roll-ups - confirm against the data dictionary.
NSF = ['Agricultural sciences and natural resources', 'Biological and biomedical sciences', 'Health sciences', 'Chemistry', 'Geosciences, atmospheric, and ocean sciences', 'Physics and astronomy', 'Computer and information sciences', 'Mathematics and statistics', 'Psychology', 'Anthropology', 'Economics', 'Political science and government', 'Sociology', 'Other social sciences', 'Aerospace, aeronautical, and astronautical engineering', 'Bioengineering and biomedical engineering', 'Chemical engineering', 'Civil engineering', 'Electrical, electronics, and communications engineering', 'Industrial and manufacturing engineering', 'Materials science engineering', 'Mechanical engineering', 'Other engineering', 'Education administration', 'Education research', 'Teacher education', 'Other education', 'Foreign languages and literature', 'History', 'Letters', 'Other humanities and arts', 'Business management and administration', 'Communication', 'FS_Life_sciences', 'FS_Physical_sciences_and_earth_sciences', 'FS_Mathematics_and_computer_sciences', 'FS_Psychology_and_social_sciences', 'FS_Engineering', 'FS_Education', 'FS_Humanities_and_arts', 'FS_Others']
# Faculty rows flagged as post-doctoral.
fac = faculty_table[faculty_table['Post-Doctoral']==1]

def fac_cat(df, year, min, fields=None):
    """Share of one year's postings carrying each field indicator column.

    Args:
        df: frame with 'Year', 'Job ID' and one numeric column per field.
        year: value of 'Year' to keep.
        min: fields are kept only when their summed count is strictly greater
            than this value.
        fields: column names to sum; defaults to the module-level ``NSF`` list.

    Returns:
        Tuple ``(table, num)``. ``num`` is the number of unique 'Job ID'
        values in that year. ``table`` has columns 'index' (field name),
        'count' (column sum), 'inc' (count / num) and 'var' (inc * (1 - inc)).
    """
    if fields is None:
        fields = NSF

    df = df[df['Year'] == year]
    num = df['Job ID'].nunique()

    table = pd.DataFrame(df[list(fields)].sum()).reset_index().rename(columns={0: 'count'})
    table['inc'] = np.true_divide(table['count'], num)

    # .copy() so 'var' is added to an independent frame rather than a slice.
    table = table[table['count'] > min].copy()
    table['var'] = table['inc'] * (1 - table['inc'])

    return table, num
    
# Field shares among post-doc postings in 2007 and 2017 (fields need a count above 6).
fac07, num07 = fac_cat(fac, year=2007, min=6)
fac17, num17 = fac_cat(fac, year=2017, min=6)

# NOTE: this rebinds `fac` from the filtered faculty table to the merged
# 2007/2017 table, so the two fac_cat calls above cannot be re-run afterwards
# without first re-running the cell that defines `fac`.
fac = pd.merge(fac07, fac17, on='index', how='inner')

growth(fac07, fac17, num07, num17)

In [None]:
# Skill rows for post-doc postings only, without the two leftover 'Unnamed' columns.
skill = pd.merge(
    skill_table.drop(columns=['Unnamed: 0', 'Unnamed: 0.1']),
    mask,
    on='Job ID',
    how='inner',
)

def skill_break(df, year, NSF=None):
    """Count skill mentions for one year, overall and for specialized skills.

    Args:
        df: frame with 'Year', 'Job ID', 'Skill Name' and
            'Is Specialized Skill?' columns.
        year: value of 'Year' to keep.
        NSF: optional name of a faculty_table column; when given, only
            postings whose value in that column equals 1 are counted.

    Returns:
        Tuple ``(skills, s_skills)``: the value counts of 'Skill Name' for all
        kept rows and for rows where 'Is Specialized Skill?' equals 1, each as
        a frame with the index reset.
    """
    in_year = df[df['Year'] == year]

    if NSF is not None:
        field_ids = faculty_table.loc[faculty_table[NSF] == 1, ['Job ID']]
        in_year = in_year.merge(field_ids, on='Job ID', how='inner')

    specialized = in_year[in_year['Is Specialized Skill?'] == 1]

    all_counts = pd.DataFrame(in_year['Skill Name'].value_counts()).reset_index()
    specialized_counts = pd.DataFrame(specialized['Skill Name'].value_counts()).reset_index()

    return all_counts, specialized_counts

skills17, s_skills17 = skill_break(skill, 2017, NSF='Economics')
# The display cells that follow read skills07 / s_skills07, so compute them
# here as well; otherwise they only exist if left over from an earlier session.
# NOTE(review): assumes the same 'Economics' filter was intended for 2007 - confirm.
skills07, s_skills07 = skill_break(skill, 2007, NSF='Economics')

In [None]:
# Presumably the 2007 counterpart of skills17; expects skills07 to have been
# produced by skill_break(...) in an earlier cell.
skills07

In [None]:
# Presumably the 2007 counterpart of s_skills17; expects s_skills07 to have
# been produced by skill_break(...) in an earlier cell.
s_skills07

In [None]:
# Post-doc postings from the main table, and the subset with a positive annual salary.
main = pd.merge(main_table, mask, on='Job ID', how='inner')

wSal = main.loc[main['Annual Salary'] > 0]

# Fraction of post-doc Job IDs that have a positive annual salary.
wSal['Job ID'].nunique() / main['Job ID'].nunique()

In [None]:
# Summary statistics of the positive annual salaries.
wSal['Annual Salary'].describe()

In [None]:
# Median of the positive annual salaries.
wSal['Annual Salary'].median()

In [None]:
# Distribution of positive annual salaries among post-doc postings, labelled
# so the figure can be read on its own.
plt.hist(wSal['Annual Salary'], bins=75)
plt.xlabel('Annual Salary')
plt.ylabel('Count')
plt.title('Annual salary of post-doctoral postings with a listed salary')
plt.show()

In [None]:
from mlxtend.evaluate import permutation_test

In [None]:
# Two 0/1 samples: 438 ones out of 2812 and 1337 ones out of 5697.
first = np.zeros(2812)
first[:438] = 1
second = np.zeros(5697)
second[:1337] = 1
# The default method='exact' enumerates every relabelling of the pooled
# sample, which is infeasible at this size; use the Monte Carlo approximation
# with a fixed seed so the p-value is reproducible.
permutation_test(first, second, method='approximate', num_rounds=10000, seed=0)