In [1]:
import os
import math
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 2000)
import numpy as np

In [2]:
jobs = pd.read_parquet(r"A:\HEJP_revision_02152020\Jobs.parquet").convert_dtypes()
print(len(jobs))

cert = pd.read_parquet(r"A:\HEJP_revision_02152020\Certifications.parquet").convert_dtypes()
print(len(cert))

6625009
1109288


In [3]:
def fix_sector(string):
    """Normalize a raw IPEDS sector label.

    Parameters
    ----------
    string : str or missing value
        Raw sector name. ``pd.NA``, ``None`` and ``NaN`` are all treated as
        missing.

    Returns
    -------
    str or pd.NA
        ``pd.NA`` for missing input. Otherwise the label with the truncated
        value "Sector unknown (not active" repaired and every comma removed.
    """
    # Recognise every missing marker; the identity check against pd.NA alone
    # let None / NaN fall through to the string methods and crash.
    if pd.isna(string):
        return pd.NA
    if string == "Sector unknown (not active":
        return "Sector unknown (not active)"
    # replace() strips commas wherever they occur, including position 0,
    # which the former `find(',') > 0` test skipped.
    return string.replace(',', '')

In [4]:
# Clean the raw sector labels in place. The helper is named `fix_sector`
# (lower-case s); the former `fix_Sector` spelling raised NameError.
jobs['IPEDS Sector Name'] = jobs['IPEDS Sector Name'].apply(fix_sector)

In [21]:
# NOTE(review): this reads the snake_case column 'IPEDS_Sector_Name', but the cell that
# renames the columns appears further down; on a fresh top-to-bottom run this raises KeyError.
jobs['IPEDS_Sector_Name'].value_counts()

Public 4-year or above                     2877639
Private not-for-profit 4-year or above     1919931
Public 2-year                               552286
Administrative Unit                         377234
Private for-profit 4-year or above          289199
Private not-for-profit 2-year                95061
Private for-profit 2-year                    55634
Private for-profit less-than 2-year          44559
Public less-than 2-year                      43791
Private not-for-profit less-than 2-year       9659
Sector unknown (not active)                     36
Name: IPEDS_Sector_Name, dtype: int64

In [23]:
jobs[jobs['IPEDS_Sector_Name']=='Private for-profit less-than 2-year']['IPEDS_Institution_Name'].value_counts()

Vatterott Education Center                                     2729
Gene Juarez Academy of Beauty-Federal Way                      2702
Everest College-Santa Ana                                      1921
Demmons School of Beauty                                       1601
Everest College-Alhambra                                       1185
Arizona Culinary Institute                                     1154
Everest College-Chicago                                         936
Computer Systems Institute                                      674
Everest College-Merrionette Park                                621
Tulsa Welding School-Houston                                    558
Imagine-Paul Mitchell Partner School                            448
Everest College-San Jose                                        434
Everest College-Fife                                            398
Everest College-Merrillville                                    384
Everest Institute-Grand Rapids                  

In [5]:
# Switch the space-separated source headers that later cells rely on to snake_case.
jobs = jobs.rename(columns={
    header: header.replace(' ', '_')
    for header in ('Job Title', 'Career Area', 'Annual Salary',
                   'IPEDS Institution Name', 'IPEDS Sector Name')
})

In [6]:
# Two-letter postal code -> BEA region. Built once at definition time instead
# of on every call, because BEA_Zone is applied row by row to a large frame.
_BEA_ZONE_BY_POSTAL_CODE = {'KS': 'Plains',
                            'MS': 'Southeast',
                            'NM': 'Southwest',
                            'MN': 'Plains',
                            'GA': 'Southeast',
                            'TN': 'Southeast',
                            'PA': 'Mideast',
                            'OH': 'Great Lakes',
                            'WI': 'Great Lakes',
                            'NJ': 'Mideast',
                            'TX': 'Southwest',
                            'AZ': 'Southwest',
                            'CA': 'Far West',
                            'MA': 'New England',
                            'FL': 'Southeast',
                            'RI': 'New England',
                            'NC': 'Southeast',
                            'IL': 'Great Lakes',
                            'KY': 'Southeast',
                            'NV': 'Far West',
                            'CO': 'Rocky Mountains',
                            'DC': 'Mideast',
                            'VA': 'Southeast',
                            'IA': 'Plains',
                            'UT': 'Rocky Mountains',
                            'IN': 'Great Lakes',
                            'NH': 'New England',
                            'OR': 'Far West',
                            'MO': 'Plains',
                            'DE': 'Mideast',
                            'CT': 'New England',
                            'MI': 'Great Lakes',
                            'SC': 'Southeast',
                            'MT': 'Rocky Mountains',
                            'OK': 'Southwest',
                            'NY': 'Mideast',
                            'ID': 'Rocky Mountains',
                            'WV': 'Southeast',
                            'MD': 'Mideast',
                            'AK': 'Far West',
                            'AR': 'Southeast',
                            'NE': 'Plains',
                            'AL': 'Southeast',
                            'LA': 'Southeast',
                            'WA': 'Far West',
                            'HI': 'Far West',
                            'VT': 'New England',
                            'ME': 'New England',
                            'SD': 'Plains',
                            'WY': 'Rocky Mountains',
                            'ND': 'Plains',
                            'PR': 'Territories',
                            'GU': 'Territories',
                            'VI': 'Territories',
                            'MP': 'Territories',
                            'AS': 'Territories',
                            'MH': 'Territories',
                            'FM': 'Territories'}


def BEA_Zone(state):
    """Return the BEA region for a two-letter state/territory postal code.

    Not to be confused with ``BEA_zone`` (lower-case z) defined elsewhere in
    this notebook, which is keyed by full state names.

    Parameters
    ----------
    state : str or missing value
        Postal abbreviation such as ``'KS'``. The literal string ``'na'`` and
        any non-string value (None, NaN, pd.NA, ...) count as missing.

    Returns
    -------
    str or None
        The region name, or ``None`` when the input is missing.

    Raises
    ------
    KeyError
        If ``state`` is a string that is not a known postal code.
    """
    # isinstance (rather than an exact type check) so str subclasses such as
    # numpy string scalars are looked up instead of silently becoming None.
    if not isinstance(state, str) or state == 'na':
        return None
    return _BEA_ZONE_BY_POSTAL_CODE[state]
    
jobs['BEA_Zone'] = jobs['State'].apply(BEA_Zone)

In [7]:
jobs.columns

Index(['BGTJobId', 'LIJobId', 'Year', 'Faculty', 'Postdoc', 'Diversity',
       'Job_Title', 'Occupation', 'Career_Area', 'Min_EDU', 'Max_EDU',
       'Min_Exp', 'Annual_Salary', 'MSA', 'State', 'IPEDS_Institution_Name',
       'IPEDS_Sector_Name', 'R1', 'BEA_Zone'],
      dtype='object')

In [24]:
# NOTE(review): `sectors` is only assigned in a cell further down this notebook; on a
# fresh top-to-bottom run this line raises NameError.
sectors[(sectors['IPEDS_Sector_Name']==2)]

Unnamed: 0_level_0,IPEDS_Sector_Name
IPEDS_Institution_Name,Unnamed: 1_level_1
Southwestern College,2
Louisiana State University-System Office,2
Sanford-Brown College-San Antonio,2
Everest College-Arlington,2
American Institute of Alternative Medicine,2


In [14]:
# Number of distinct sector labels recorded for each institution
# (nunique ignores missing values by default).
sectors = jobs.groupby(['IPEDS_Institution_Name'])[['IPEDS_Sector_Name']].nunique().sort_values('IPEDS_Sector_Name')

# An institution's sector is uncertain unless exactly one label was recorded:
# 0 means the sector is always missing, 2 or more means conflicting labels.
# The former `== 0 | == 2` test silently skipped institutions with three or
# more distinct labels.
uncertain_institutions = set(sectors[sectors['IPEDS_Sector_Name'] != 1].index)

# Colleges with Ambiguous Source Sectors
# Hand-curated sector assignments, keyed by the institution name exactly as it
# appears in 'IPEDS_Institution_Name'.
# Public 2-year
public_two_year = ['Dallas County Community College District',
                   'Ventura County Community College System Office',
                   'San Bernardino Community College District',
                   'Grossmont-Cuyamaca Community College District',
                   'Kentucky Community and Technical College System',
                   'State Center Community College District',
                   'South Orange County Community College District',
                   'Mississippi Community College Board', 
                   'Contra Costa Community College District Office', 
                   'Southwestern College', 
                   'Tennessee Board of Regents']

# Public 4-year
public_four_year =['North Orange County Community College District', 
                   'Yosemite Community College District Office', 
                   'Delaware Technical Community College-Central Office',
                   'California State University-Chancellors Office', 
                   'Colorado State University-System Office', 
                   'Oregon University System', 
                   'Southern Illinois University-System Office', 
                   'Texas A & M University-System Office', 
                   'The University of Texas System Office', 
                   'University System of Maryland', 
                   'University of California-System Administration Central Office',
                   'University of Colorado System Office',
                   'University of Hawaii System Office',
                   'University of Illinois University Administration',
                   'University of Louisiana-System Administration',
                   'University of Massachusetts-Central Office',
                   'University of North Carolina General Administration',
                   'University of North Texas System',
                   'University of Wisconsin Extension',
                   'University of Wisconsin-System Administration']

# Private for profit 2-year
private_two_year = ['Carrington College-Administrative Office']


# Private not for profit, 4-year
private_four_year = ['CUNY System Office',
                     'American Institute of Alternative Medicine',
                     'Sistema Universitario Ana G Mendez']

# NOTE(review): 'American Institute of Alternative Medicine' is listed here and in
# private_four_year above; this list is not part of `categories` below.
for_profit = ['American Institute of Alternative Medicine']

# Not part of `categories` below.
# NOTE(review): the name suggests these are not IPEDS institutions - confirm.
non_IPEDS = ['Everest College-Arlington', 
             'Sanford-Brown College-San Antonio']

# Not part of `categories` below.
# NOTE(review): these five names match the institutions displayed with two distinct
# sector labels elsewhere in this notebook - presumably copied from that output; confirm.
# 'Southwestern College' also appears in public_two_year.
multi = ['Southwestern College', 
         'Louisiana State University-System Office', 
         'Sanford-Brown College-San Antonio', 
         'Everest College-Arlington', 
         'American Institute of Alternative Medicine']

# The lists whose members are removed from `uncertain_institutions` in a later cell;
# for_profit, non_IPEDS and multi are deliberately or accidentally left out.
categories = [public_two_year, public_four_year, private_two_year, private_four_year]

In [15]:
# Every institution that posts under the 'Administrative Unit' sector is also uncertain.
uncertain_institutions.update(
    jobs[jobs['IPEDS_Sector_Name']=='Administrative Unit']['IPEDS_Institution_Name'].values
)

In [17]:
# Institutions that were assigned to a category by hand no longer count as uncertain.
for group in categories:
    uncertain_institutions.difference_update(group)

uncertain_institutions

{'American Institute of Alternative Medicine',
 'California State University-Chancellors Office',
 'Colorado State University-System Office',
 'Everest College-Arlington',
 'Louisiana State University-System Office',
 'Oregon University System',
 'Sanford-Brown College-San Antonio',
 'Sistema Universitario Ana G Mendez',
 'Southern Illinois University-System Office',
 'Southwestern College',
 'Tennessee Board of Regents',
 'Texas A & M University-System Office',
 'The University of Texas System Office',
 'University System of Maryland',
 'University of California-System Administration Central Office',
 'University of Colorado System Office',
 'University of Hawaii System Office',
 'University of Illinois University Administration',
 'University of Louisiana-System Administration',
 'University of Massachusetts-Central Office',
 'University of North Carolina General Administration',
 'University of North Texas System',
 'University of Wisconsin Extension',
 'University of Wisconsin-Syst

In [28]:
jobs[(jobs['IPEDS_Institution_Name']=='Southwestern College')&~jobs['IPEDS_Institution_Name'].isnull()]

Unnamed: 0,BGTJobId,LIJobId,Year,Faculty,Postdoc,Diversity,Job_Title,Occupation,Career_Area,Min_EDU,Max_EDU,Min_Exp,Annual_Salary,MSA,State,IPEDS_Institution_Name,IPEDS_Sector_Name,R1,BEA_Zone
840,38016762312,d181ddfef9b141bdb26d741e2543a69c4440b8ef,2016,0,0,0,Hvac Mechanic,HVAC Mechanic / Installer,"Maintenance, Repair, and Installation",,,4.0,,"San Diego-Carlsbad, CA",CA,Southwestern College,Private not-for-profit 4-year or above,0,Far West
1058,38018747214,bf392f2c5cdab5eb4b8f83351d7227cd2de518,2016,0,0,0,Financial Aid Technician - Board Of Financial ...,Financial Aid Counselor / Specialist,Finance,16,,2.0,,"San Diego-Carlsbad, CA",CA,Southwestern College,Private not-for-profit 4-year or above,0,Far West
1387,38018747363,c843aedceda5a55d815aa4e0f435823667eac98,2016,0,0,0,Lead Custodian,Janitor / Cleaner,"Hospitality, Food, and Tourism",12,,3.0,,"San Diego-Carlsbad, CA",CA,Southwestern College,Private not-for-profit 4-year or above,0,Far West
1632,38016761849,a715bba9993da1e61d21bd6faf371143d461cf1b,2016,0,0,0,Network Security Systems Analyst,Cyber Security Analyst,Information Technology,16,,3.0,,"San Diego-Carlsbad, CA",CA,Southwestern College,Private not-for-profit 4-year or above,0,Far West
3360,38018951629,62bcabaafbe5374758654061f484981c758,2016,1,0,0,Assistant Professor Of Exercise Science With A...,College Professor / Instructor,Education and Training,16,18,2.0,49789.0,"San Diego-Carlsbad, CA",CA,Southwestern College,Private not-for-profit 4-year or above,0,Far West
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6607855,290335444,306351505,2010,1,0,0,Psychology Teacher,College Professor / Instructor,Education and Training,18,,,,"Toledo, OH",OH,Southwestern College,Private not-for-profit 4-year or above,0,Great Lakes
6608967,297508623,298312144,2010,1,0,0,Business Instructor,College Professor / Instructor,Education and Training,18,,1.0,,"Cincinnati, OH-KY-IN",OH,Southwestern College,Private not-for-profit 4-year or above,0,Great Lakes
6611779,297508650,298312864,2010,1,0,0,Criminal Justice Department Chair,College Professor / Instructor,Education and Training,18,,1.0,,"Cincinnati, OH-KY-IN",OH,Southwestern College,Private not-for-profit 4-year or above,0,Great Lakes
6613525,353082977,74482ab682b485eec1b85c9cf9decc8f75fb1ce8,2013,0,0,0,Administrative Secretary II-Nursing,Secretary,Clerical and Administrative,16,,2.0,42000.0,"San Diego-Carlsbad, CA",CA,Southwestern College,Private not-for-profit 4-year or above,0,Far West


In [None]:
# Create Institution Table
# One row per institution name (first occurrence wins), sorted by name, with
# the rows that have no institution name removed afterwards.

inst = (
    jobs[['IPEDS_Institution_Name', 'IPEDS_Sector_Name', 'R1']]
    .drop_duplicates('IPEDS_Institution_Name')
    .sort_values(by='IPEDS_Institution_Name')
)

inst = inst[inst['IPEDS_Institution_Name'].notnull()]

inst

In [None]:
jobs[~jobs['IPEDS_Institution_Name'].isnull()&(jobs['IPEDS_Institution_Name']=='Everest College-Arlington')].groupby(['Year', 'IPEDS_Sector_Name']).count()

In [None]:
# OLD_HEJP

main_table = pd.read_csv(r"C:\Users\Public\_Data\_Data\Latest_Version\Main_Data\Main_Table_01072020.csv")
print(len(main_table))
# faculty_table = pd.read_csv(r"C:\Users\Public\_Data\_Data\Latest_Version\Faculty_Data\Faculty_Table_11222019.csv")
# print(len(faculty_table))
# skill_table = pd.read_csv(r"C:\Users\Public\_Data\_Data\Latest_Version\Skills_Data\Skill_Table_06072019.csv")
# print(len(skill_table))

In [5]:
main_table[main_table['R1']==1][['Job ID']].to_csv(r"C:\Users\Public\_Data\R1.csv", index=False)

In [None]:
# Full state/territory name -> BEA region. Built once at definition time
# instead of on every call, because BEA_zone is applied row by row.
_BEA_ZONE_BY_STATE_NAME = {'Kansas': 'Plains',
                           'Mississippi': 'Southeast',
                           'New Mexico': 'Southwest',
                           'Minnesota': 'Plains',
                           'Georgia': 'Southeast',
                           'Tennessee': 'Southeast',
                           'Pennsylvania': 'Mideast',
                           'Ohio': 'Great Lakes',
                           'Wisconsin': 'Great Lakes',
                           'New Jersey': 'Mideast',
                           'Texas': 'Southwest',
                           'Arizona': 'Southwest',
                           'California': 'Far West',
                           'Massachusetts': 'New England',
                           'Florida': 'Southeast',
                           'Rhode Island': 'New England',
                           'North Carolina': 'Southeast',
                           'Illinois': 'Great Lakes',
                           'Kentucky': 'Southeast',
                           'Nevada': 'Far West',
                           'Colorado': 'Rocky Mountains',
                           'District of Columbia': 'Mideast',
                           'Virginia': 'Southeast',
                           'Iowa': 'Plains',
                           'Utah': 'Rocky Mountains',
                           'Indiana': 'Great Lakes',
                           'New Hampshire': 'New England',
                           'Oregon': 'Far West',
                           'Missouri': 'Plains',
                           'Delaware': 'Mideast',
                           'Connecticut': 'New England',
                           'Michigan': 'Great Lakes',
                           'South Carolina': 'Southeast',
                           'Montana': 'Rocky Mountains',
                           'Oklahoma': 'Southwest',
                           'New York': 'Mideast',
                           'Idaho': 'Rocky Mountains',
                           'West Virginia': 'Southeast',
                           'Maryland': 'Mideast',
                           'Alaska': 'Far West',
                           'Arkansas': 'Southeast',
                           'Nebraska': 'Plains',
                           'Alabama': 'Southeast',
                           'Louisiana': 'Southeast',
                           'Washington': 'Far West',
                           'Hawaii': 'Far West',
                           'Vermont': 'New England',
                           'Maine': 'New England',
                           'South Dakota': 'Plains',
                           'Wyoming': 'Rocky Mountains',
                           'North Dakota': 'Plains',
                           'Puerto Rico': 'Territories',
                           'Guam': 'Territories',
                           'Virgin Islands of the U.S.': 'Territories',
                           'Northern Mariana Islands': 'Territories',
                           'American Samoa': 'Territories',
                           'Marshall Islands': 'Territories',
                           'Federated States of Micronesia': 'Territories'}


def BEA_zone(state):
    """Return the BEA region for a full state/territory name.

    Not to be confused with ``BEA_Zone`` (upper-case Z) defined elsewhere in
    this notebook, which is keyed by two-letter postal codes.

    Parameters
    ----------
    state : str or missing value
        Full name such as ``'Kansas'``.

    Returns
    -------
    str, missing value or None
        The region name for a string input; the input itself when it is a
        missing marker (NaN, None, pd.NA); ``None`` for any other value.

    Raises
    ------
    KeyError
        If ``state`` is a string that is not a known state/territory name.
    """
    if isinstance(state, str):
        return _BEA_ZONE_BY_STATE_NAME[state]
    # Pass missing markers through unchanged. pd.isna accepts None and pd.NA,
    # on which the former math.isnan call raised TypeError.
    if pd.isna(state):
        return state
    return None
    

In [None]:
# States/territories for which a BLS employment workbook "Emp_<name>.xlsx" is read.
states = ['Alabama', 'Alaska', 'Arizona', 'Arkansas', 'California', 'Colorado', 'Connecticut', 'Delaware', 
          'District of Columbia', 'Florida', 'Georgia', 'Hawaii', 'Idaho', 'Illinois', 'Indiana', 'Iowa', 'Kansas', 'Kentucky',
          'Louisiana', 'Maine', 'Maryland', 'Massachusetts', 'Michigan', 'Minnesota', 'Mississippi', 'Missouri', 'Montana',
          'Nebraska', 'Nevada', 'New Hampshire', 'New Jersey', 'New Mexico', 'New York', 'North Carolina', 'North Dakota',
          'Ohio', 'Oklahoma', 'Oregon', 'Pennsylvania', 'Puerto Rico', 'Rhode Island', 'South Carolina', 'South Dakota',
          'Tennessee', 'Texas', 'Utah', 'Vermont', 'Virginia', 'Virgin Islands of the U.S.', 'Washington', 'West Virginia',
          'Wisconsin', 'Wyoming']

input_loc = r'C:\Users\Public\_Data\_Data\bls_states\raw_bls\\'

# Collect one frame per state and concatenate once. This replaces growing `bls`
# with pd.concat inside the loop (quadratic copying) and the bootstrap on
# `state == 'Alabama'`, which broke as soon as the list was reordered.
state_frames = []
for state in states:
    df = pd.read_excel(f"{input_loc}Emp_{state}.xlsx")
    df['State'] = state
    # Keep the December figure as the year's job count.
    df = df[['Year', 'Dec', 'State']].rename(columns={'Dec':'Jobs'})
    state_frames.append(df)

bls = pd.concat(state_frames)

bls['BEA_zone'] = bls['State'].apply(BEA_zone)

# Drop year/state rows with no job count.
bls = bls[~bls['Jobs'].isnull()]

bls

In [None]:
bls.groupby(['Year', 'BEA_zone']).sum()

In [None]:
bls.to_csv(r"C:\Users\Public\_Data\_Data\bls_states\bls_state.csv", index=False)

In [None]:
#############################################################
# Generating Institution based State and BEA_zone identifiers
#############################################################
# 01/19/2020

main = main_table.copy()

# Get all states that each institution is found in and the number of postings in those states.
# groupby drops rows whose key is missing, so an institution that never has a 'State'
# recorded gets no row here and therefore no entry in `mapping` below.
# NOTE(review): this rebinds `states`, which another cell uses for the list of state names.
states = pd.DataFrame(main_table.groupby(['IPEDS Institution Name', 'State']).count()['Job ID'])

# Get ['IPEDS Institution Name', 'State'] as workable columns
states = states.reset_index()

# Resort the table lexicographically by institution name, and descending by number of postings within
states = states.sort_values(by=['IPEDS Institution Name', 'Job ID'], ascending=[True, False])

# Drop duplicates on institution s.t. only the top counted state is kept
states = states.drop_duplicates('IPEDS Institution Name')

# Dictionary for mapping institution to its most common state
mapping = {}

def populate_mapping(row, mapping):
    '''
    Record one institution's primary state in ``mapping``. Meant to be called with the
    Pandas.DataFrame.apply() function (axis=1).

    row:     a row holding 'IPEDS Institution Name' and 'State'
    mapping: dict updated in place; nothing is returned
    '''
    
    mapping[row['IPEDS Institution Name']] = row['State']
    
# Get mapping    
states.apply(populate_mapping, axis=1, args=(mapping,))

# Function returns the primary state of the institution
def primary_state(inst):
    '''
    Return the most common posting state of an institution.

    inst: institution name, or a missing marker (NaN / None / pd.NA)

    Returns the state from ``mapping``; NaN when the institution has no recorded
    state (it is then absent from ``mapping`` and a plain lookup raised KeyError);
    the input itself when it is a missing marker; None for any other value.
    '''
    if isinstance(inst, str):
        return mapping.get(inst, np.nan)
    # pd.isna accepts None and pd.NA, on which math.isnan raised TypeError.
    if pd.isna(inst):
        return inst
    return None
        
# Generate new 'Institution State' column for main table and BEA_zone from that
main['Institution_State'] = main['IPEDS Institution Name'].apply(primary_state)
main['Institution_BEA_zone'] = main['Institution_State'].apply(BEA_zone)

# Fix previous BEA_zone column
main['BEA_zone'] = main['State'].apply(BEA_zone)


In [None]:
# File to disk
main[['Job ID', 'Year', 'Job Title', 'Occupation', 'Career Area', 'Minimum EDU Requirements', 'Maximum EDU Requirements', 
      'Minimum Experience Requirements', 'Annual Salary', 'Metropolitan Statistical Area', 'State', 'BEA_zone', 
      'IPEDS Institution Name', 'IPEDS Sector Name', 'Institution_State', 'Institution_BEA_zone', 'R1', '4-year', '2-year',
      'Public', 'Private']].to_csv(r"C:\Users\Public\_Data\_Data\Latest_Version\Main_Data\Main_Table_01192020.csv", index=False)

In [None]:
# Job IDs of every posting flagged ['Post-Doctoral' = 1]; inner-merging a frame
# with this one-column table keeps only its postdoc postings.
mask = faculty_table.loc[faculty_table['Post-Doctoral'] == 1, ['Job ID']]

# Postings whose own state differs from the institution's primary state.
diff_state = main.loc[main['State'] != main['Institution_State']]
print('Postings where "State" and "Institution_State" don\'t match:\t', len(diff_state))

post_diff_state = diff_state.merge(mask, how='inner', on='Job ID')
print('Post-Doc postings with the above:\t\t\t\t', len(post_diff_state), '\n')

# Same two counts again after discarding rows with no institution state.
diff_state = diff_state[diff_state['Institution_State'].notnull()]
print('All with null Institution removed:\t\t\t\t', len(diff_state))

post_diff_state = diff_state.merge(mask, how='inner', on='Job ID')
print('Post-Docs with null Institutions removed:\t\t\t', len(post_diff_state))

In [None]:
# Independent Study Data Generation

# Rebuild the region column from 'State'. It is written back under the same name,
# 'BEA_Zone', that is dropped here and that the later cells group by and select;
# the former 'BEA_zone' spelling left those cells without the column (KeyError).
# NOTE(review): BEA_Zone expects two-letter postal codes, while another cell maps
# main['State'] with BEA_zone (full state names) - confirm which form 'State' holds here.
table = main_table.drop(columns=['BEA_Zone']).copy()

table['BEA_Zone'] = table['State'].apply(BEA_Zone)

In [None]:
table

In [None]:
table.to_csv(r"C:\Users\Public\_Data\_Data\Latest_Version\Main_Data\Main_Table_01072020.csv", index=False)

In [None]:
table.groupby(['BEA_Zone', 'State']).count()

In [None]:
# Postings with a positive annual salary form the Independent Study sample.
IS_main = table[['Job ID', 'Year', 'Job Title', 'Minimum EDU Requirements', 'Minimum Experience Requirements', 'Annual Salary', 'BEA_Zone']]
IS_main = IS_main[IS_main['Annual Salary']>0]

# Faculty flags restricted to the sample. These two lines were commented out even
# though IS_fac is required by the merge below, which raised NameError.
IS_fac = faculty_table[['Job ID', 'Faculty', 'Post-Doctoral']]
IS_fac = IS_fac.merge(IS_main[['Job ID']], on='Job ID', how='inner')

# Skills restricted to the sample.
IS_skill = skill_table[['Job ID', 'Skill Name']]
IS_skill = IS_skill.merge(IS_main[['Job ID']], on='Job ID', how='inner')

# Inner merge keeps only sample postings that also have a faculty-table row.
IS_df = IS_main.merge(IS_fac, on='Job ID', how='inner')
IS_df

In [None]:
IS_df = IS_df.drop(columns=['Faculty', 'Post-Doctoral'])
IS_df


In [None]:
IS_df.to_csv(r"C:\Users\Matt\Documents\IS_Data\IS_Main.csv")

In [None]:
IS_skill.to_csv(r"C:\Users\Matt\Documents\IS_Data\IS_Skill.csv")

In [None]:
dic = dict()
for state in states:
    print(state)
    print('Enter Region: ')
    region = input()
    dic[state] = region
print(dic)

In [None]:
# Region number (as a string) -> BEA region name. '7' previously read
# 'Rocky Mountiains', which did not match the 'Rocky Mountains' label used by
# the BEA lookup functions in this notebook.
num_to_reg = {'1' : 'New England', '2':'Mideast', '3':'Southeast', '4':'Great Lakes', '5':'Plains', '6':'Southwest', '7':'Rocky Mountains', '8':'Far West'}
state_to_num = {'Kansas': '5', 'Mississippi': '3', 'New Mexico': '6', 'Minnesota': '5', 'Georgia': '3', 'Tennessee': '3', 'Pennsylvania': '2', 'Ohio': '4', 'Wisconsin': '4', 'New Jersey': '2', 'Texas': '6', 'Arizona': '6', 'California': '8', 'Massachusetts': '1', 'Florida': '3', 'Rhode Island': '1', 'North Carolina': '3', 'Illinois': '4', 'Kentucky': '3', 'Nevada': '8', 'Colorado': '7', 'District of Columbia': '2', 'Virginia': '3', 'Iowa': '5', 'Utah': '7', 'Indiana': '4', 'New Hampshire': '1', 'Oregon': '8', 'Missouri': '5', 'Delaware': '2', 'Connecticut': '1', 'Michigan': '4', 'South Carolina': '3', 'Montana': '7', 'Oklahoma': '6', 'New York': '2', 'Idaho': '7', 'West Virginia': '3', 'Maryland': '2', 'Alaska': '8', 'Arkansas': '3', 'Nebraska': '5', 'Alabama': '3', 'Louisiana': '3', 'Washington': '8', 'Hawaii': '8', 'Vermont': '1', 'Maine': '1', 'South Dakota': '5', 'Wyoming': '7', 'North Dakota': '5', 'Puerto Rico': '3', 'Guam': '8', 'Virgin Islands of the U.S.': '3', 'Northern Mariana Islands': '8', 'American Samoa': '8', 'Marshall Islands': '8', 'Federated States of Micronesia': '8'}

In [None]:
dic = dict()
for state in states:
    dic[state] = num_to_reg[state_to_num[state]]
    
print(dic)

In [None]:
table = main_table.merge(faculty_table[['Job ID', 'Faculty', 'Post-Doctoral', 'Contingent']], on='Job ID', how='inner')

In [None]:
table['Career Area'].value_counts()

In [None]:
table[table['Post-Doctoral']==1]['Career Area'].value_counts()

In [None]:
table[(table['Post-Doctoral']==1)]['Career Area'].value_counts()

In [None]:
table[table['Occupation'].str.contains('administrator', case=False, na=False)]['Occupation'].value_counts()

In [None]:
main_table[main_table['Occupation']=='Economist']['Career Area'].value_counts()

In [None]:
len(table)

In [None]:
table[table['Career Area']=='Science and Research']

In [None]:
skill_tax = pd.read_excel(r"C:\Users\bob65\Documents\Summer2019_Research\PythonFiles\_Data\skillTaxonomy.xlsx")
print(len(skill_tax))

In [None]:
del(skill_table)

In [None]:
del(taxonomy)

In [None]:
# Map each skill name to its family name; a family that is not a string
# (e.g. a missing value) is stored as the placeholder 'na'.
taxonomy = {}
i = 0
for _, entry in skill_tax.iterrows():
    skill, family = entry['skillName'], entry['familyName']
    print(i, ":", skill, "|", family)
    i += 1
    taxonomy[skill] = family if type(family) is str else 'na'

In [None]:
skill_tax[7072:]

In [None]:
len(taxonomy)

In [None]:
for string in taxonomy:
    print(string, "|", taxonomy[string])

In [None]:
skill_tax['skillName'] = skill_tax['skillName'].astype('str')

In [None]:
skill_tax['familyName'].astype('str')

In [None]:
skill_tax.sort_values(by='skillName')

In [None]:
skill_table.groupby(['Skill Name', 'Skill Cluster Name']).count()

In [None]:
main_table.groupby(['Occupation', 'Career Area']).count()

In [None]:
# Replace every faculty row whose Job ID also appears in other_table with the
# other_table version. A single vectorised isin() filter stands in for the
# former row-by-row loop, which re-filtered the whole frame once per
# other_table row and printed each index as a progress indicator.
# NOTE(review): isin() also matches missing Job IDs against each other, unlike
# the former `!=` comparison - irrelevant as long as 'Job ID' is never null.
test = faculty_table[~faculty_table['Job ID'].isin(other_table['Job ID'])]

# pd.concat instead of DataFrame.append, which was removed in pandas 2.0.
test = pd.concat([test, other_table])
test['Job ID'].value_counts()

In [None]:
test.to_csv(r"C:\Users\bob65\Documents\Summer2019_Research\PythonFiles\_Data\Faculty_Potential_Fix.csv")

In [None]:
test[['Job ID', 'Year', 'Diversity and Inclusion', 'Post-Doctoral', 'Faculty', 'Agricultural sciences and natural resources', 'Biological and biomedical sciences', 'Health sciences', 'Chemistry', 'Geosciences, atmospheric, and ocean sciences', 'Physics and astronomy', 'Computer and information sciences', 'Mathematics and statistics', 'Psychology', 'Anthropology', 'Economics', 'Political science and government', 'Sociology', 'Other social sciences', 'Aerospace, aeronautical, and astronautical engineering', 'Bioengineering and biomedical engineering', 'Chemical engineering', 'Civil engineering', 'Electrical, electronics, and communications engineering', 'Industrial and manufacturing engineering', 'Materials science engineering', 'Mechanical engineering', 'Other engineering', 'Education administration', 'Education research', 'Teacher education', 'Teaching fields', 'Other education', 'Foreign languages and literature', 'History', 'Letters', 'Other humanities and arts', 'Business management and administration', 'Communication', 'Number of Detailed Fields of Study', 'FS_Life_sciences', 'FS_Physical_sciences_and_earth_sciences', 'FS_Mathematics_and_computer_sciences', 'FS_Psychology_and_social_sciences', 'FS_Engineering', 'FS_Education', 'FS_Humanities_and_arts', 'FS_Others', 'Tenured', 'Tenured_Track', 'Contingent', 'Full-time Contingent', 'Part-time Contingent']].sort_values(by=['Year', 'Job ID']).to_csv(r"C:\Users\bob65\Documents\Summer2019_Research\_SourceData\Brandeis_Dummy_Table_8-9-19.csv")

In [None]:
full_table = main_table[['Job ID', 'Job Title']].merge(new_faculty_table.drop(columns=['Year']), on='Job ID', how='inner')


In [None]:
# Radiology postings flagged as 'Communication': clear that flag and reduce
# their detailed-field count by one.
is_radiology_comm = (full_table['Job Title'].str.contains('radiology', case=False, na=False)
                     & (full_table['Communication']==1))

# Work on an explicit copy so the assignments below modify `segment` itself
# rather than a slice of full_table (SettingWithCopyWarning).
segment = full_table[is_radiology_comm].copy()
segment['Number of Detailed Fields of Study'] = segment['Number of Detailed Fields of Study'] - 1
segment['Communication'] = 0

# Keep the untouched rows and put the corrected ones at the end, as before.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
full_table = pd.concat([full_table[~is_radiology_comm], segment])

In [None]:
full_table[full_table['Job Title'].str.contains('radiology', case=False, na=False)&(full_table['Faculty']==1)&(full_table['Number of Detailed Fields of Study']>1)]

In [None]:
full_table = full_table.merge(main_table[['Job ID', 'Year']], on='Job ID', how='inner')
full_table

In [None]:
full_table = full_table[['Job ID', 'Year', 'Diversity and Inclusion', 'Post-Doctoral', 'Faculty', 'Agricultural sciences and natural resources', 'Biological and biomedical sciences', 'Health sciences', 'Chemistry', 'Geosciences, atmospheric, and ocean sciences', 'Physics and astronomy', 'Computer and information sciences', 'Mathematics and statistics', 'Psychology', 'Anthropology', 'Economics', 'Political science and government', 'Sociology', 'Other social sciences', 'Aerospace, aeronautical, and astronautical engineering', 'Bioengineering and biomedical engineering', 'Chemical engineering', 'Civil engineering', 'Electrical, electronics, and communications engineering', 'Industrial and manufacturing engineering', 'Materials science engineering', 'Mechanical engineering', 'Other engineering', 'Education administration', 'Education research', 'Teacher education', 'Teaching fields', 'Other education', 'Foreign languages and literature', 'History', 'Letters', 'Other humanities and arts', 'Business management and administration', 'Communication', 'Number of Detailed Fields of Study', 'FS_Life_sciences', 'FS_Physical_sciences_and_earth_sciences', 'FS_Mathematics_and_computer_sciences', 'FS_Psychology_and_social_sciences', 'FS_Engineering', 'FS_Education', 'FS_Humanities_and_arts', 'FS_Others', 'Tenured', 'Tenured_Track', 'Contingent', 'Full-time Contingent', 'Part-time Contingent']]


In [None]:
full_table.to_csv(r"C:\Users\bob65\Documents\Summer2019_Research\PythonFiles\_Data\Brandeis_Dummy_Table_WIP.csv")

In [None]:
main_table[main_table['IPEDS Sector Name'] == 'Administrative Unit']['IPEDS Insitution Name'].value_counts()

In [None]:
main_table[main_table['IPEDS Insitution Name'].str.contains('community college', case=False, na=False)]['IPEDS Sector Name'].value_counts()

In [None]:
main_table[(main_table['IPEDS Insitution Name']=="North Orange County Community College District")&(main_table['4-year']==1)]['Metropolitan Statistical Area'].value_counts()

In [None]:
full_table[full_table['IPEDS Sector Name']=='Private for-profit']

In [None]:
############################
# Public vs. Private Dummies
############################

# Almost all sectors can be parsed into public and private since this is in their name;
# the one problem area (other than Sector Unknown and NA) is Administrative Unit. I am
# not sure whether the Community Colleges that appear in this sample are public or private.

# Here is a list of those confirmed to be public or private as well as their 4/2 year
# classifications:

# Administrative Unit Community Colleges:
# Public 2-year
'''
Dallas County Community College District
Ventura County Community College System Office
San Bernardino Community College District
Grossmont-Cuyamaca Community College District
Kentucky Community and Technical College System
State Center Community College District
South Orange County Community College District 
Mississippi Community College Board 
Contra Costa Community College District Office
'''

# Public 4-year
'''
North Orange County Community College District
Yosemite Community College District Office
Delaware Technical Community College-Central Office 
'''

# Private for profit 2-year
'''
Carrington College-Administrative Office
'''

# Private not for profit, 4-year
'''
CUNY System Office
'''

# Generate a stable separate copy of main_table to make the changes on before saving
# onto the original file.
full_table = main_table.copy()

# NOTE: tilde symbol "~" is a not operator in the Pandas library

# An institution is Public if:
#     (1) The IPEDS Sector Name contains the word Public
#     (2) If it is one of the public Institutions within the Administrative Unit 
#         category (Easier to say not(private) since there are fewer of those in the
#         sample)
public = full_table['IPEDS Sector Name'].str.contains('public', case=False, na=False)
public = public + (full_table['IPEDS Sector Name'].str.contains('admin', case=False, na=False) & 
                        ~((full_table['IPEDS Insitution Name'] == 'Carrington College-Administrative Office')|
                         (full_table['IPEDS Insitution Name'] == 'CUNY System Office')))

# If you look carefully above, you will see that I am generating a 1D boolean array 
# that maps to the full_table instead of a 0-1 dummy vector. The way I convert this 
# to a dummy is by adding zero to the entire vector. Pandas then performs these 
# calculations: True + 0 = 1 and False + 0 = 0
full_table['Public'] = public + 0

# An institution is Private if:
#     (1) The IPEDS Sector Name contains the word Private
#     (2) If it is one of the Private institutions in Administrative Unit
private = full_table['IPEDS Sector Name'].str.contains('private', case=False, na=False)
private = private + (full_table['IPEDS Sector Name'].str.contains('admin', case=False, na=False) & 
                        ((full_table['IPEDS Insitution Name'] == 'Carrington College-Administrative Office')|
                         (full_table['IPEDS Insitution Name'] == 'CUNY System Office')))
full_table['Private'] = private + 0

# An institution is 4-year if:
#     (1) The IPEDS Sector Name contains the phrase 4-year
#     (2) If it is NOT one of the 2-year institutions in Administrtrative Unit     
four_year = full_table['IPEDS Sector Name'].str.contains('4-year', case=False, na=False)
four_year = four_year + (full_table['IPEDS Sector Name'].str.contains('admin', case=False, na=False) & 
                        ~((full_table['IPEDS Insitution Name'] == 'Dallas County Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'Ventura County Community College System Office')|
                         (full_table['IPEDS Insitution Name'] == 'San Bernardino Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'Grossmont-Cuyamaca Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'Kentucky Community and Technical College System')|
                         (full_table['IPEDS Insitution Name'] == 'State Center Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'South Orange County Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'Mississippi Community College Board')|
                         (full_table['IPEDS Insitution Name'] == 'Contra Costa Community College District Office')|
                         (full_table['IPEDS Insitution Name'] == 'Carrington College-Administrative Office')))
full_table['4-year'] = four_year + 0

# An institution is 2-year if:
#     (1) The IPEDS Sector Name contains the phrase 2-year
#     (2) If it is one of the 2-year institutions in Administrtrative Unit  
two_year = full_table['IPEDS Sector Name'].str.contains('2-year', case=False, na=False)
two_year = two_year + (full_table['IPEDS Sector Name'].str.contains('admin', case=False, na=False) & 
                        ((full_table['IPEDS Insitution Name'] == 'Dallas County Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'Ventura County Community College System Office')|
                         (full_table['IPEDS Insitution Name'] == 'San Bernardino Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'Grossmont-Cuyamaca Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'Kentucky Community and Technical College System')|
                         (full_table['IPEDS Insitution Name'] == 'State Center Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'South Orange County Community College District')|
                         (full_table['IPEDS Insitution Name'] == 'Mississippi Community College Board')|
                         (full_table['IPEDS Insitution Name'] == 'Contra Costa Community College District Office')|
                         (full_table['IPEDS Insitution Name'] == 'Carrington College-Administrative Office')))
full_table['2-year'] = two_year + 0


In [None]:
# Count postings for every (Public, Private, sector name) combination so each
# sector's mapping onto the two dummies can be checked by eye, then export it.
# NOTE(review): writing the legacy .xls format needs a matching Excel writer
# engine in this environment - confirm it is still available.
public_private = (
    full_table
    .groupby(['Public', 'Private', 'IPEDS Sector Name'])['Job ID']
    .count()
    .to_frame(name='Count')
)
public_private.to_excel(r"C:\Users\bob65\Documents\Summer2019_Research\PythonFiles\_Data\Public_Private_Validation.xls")

In [None]:
# Count postings for every (4-year, 2-year, sector name) combination so each
# sector's mapping onto the two dummies can be checked by eye, then export it.
four_two = (
    full_table
    .groupby(['4-year', '2-year', 'IPEDS Sector Name'])['Job ID']
    .count()
    .to_frame(name='Count')
)
four_two.to_excel(r"C:\Users\bob65\Documents\Summer2019_Research\PythonFiles\_Data\Four_Two_Validation.xls")

In [None]:
# Attach the Public / Private dummies to main_table, matching rows on Job ID.
# NOTE: this cell rebinds main_table, so running it a second time merges again
# and pandas will suffix the then-duplicated columns with _x / _y.
main_table = pd.merge(
    main_table,
    full_table.loc[:, ['Job ID', 'Public', 'Private']],
    how='inner',
    on='Job ID',
)
main_table

In [None]:
# Sanity check: list rows flagged as 4-year whose institution name mentions
# "community college" (case-insensitive; missing names never match).
main_table.loc[
    lambda table: table['4-year'].eq(1)
    & table['IPEDS Insitution Name'].str.contains('community college', case=False, na=False)
]

In [None]:
# Load the faculty dummy table ("Non_Excluded" export dated 08/11/2019).
faculty_table = pd.read_csv(
    filepath_or_buffer=r"C:\Users\bob65\Documents\Summer2019_Research\PythonFiles\_Data\Brandeis_Dummy_Table_Non_Excluded_08112019.csv"
)

In [None]:
######################
# Data Generation Code
######################

# Work on a copy so faculty_table itself is left unchanged by this cell.
full_table = faculty_table.copy()

# Every update below is assigned back to the column explicitly. Calling
# `full_table[col].where(..., inplace=True)` mutates a temporary Series and is not
# guaranteed to write through to the DataFrame (it does not under pandas
# Copy-on-Write), which would silently leave the columns unchanged.

# Tenure_Line / Contingent manipulation
# Clarify the Tenure Line variable: it is the sum of Tenured and Tenured_Track,
# with every value that is not below 2 replaced by 1. `where` keeps the values
# for which the condition holds and replaces all others.
tenure_line = full_table['Tenured'] + full_table['Tenured_Track']
full_table['Tenure_Line'] = tenure_line.where(tenure_line < 2, 1)

on_tenure_line = full_table['Tenure_Line'] > 0
off_tenure_line = full_table['Tenure_Line'] < 1

# Mutually exclude Tenure-Line and Contingent:
# if Tenure-Line is 1 AND Contingent is 0, OR Tenure-Line is 0 AND Contingent is 1,
# leave Contingent alone; otherwise it must be replaced with 0.
keep_contingent = (
    (on_tenure_line & (full_table['Contingent'] < 1))
    | (off_tenure_line & (full_table['Contingent'] > 0))
)
full_table['Contingent'] = full_table['Contingent'].where(keep_contingent, 0)

# Full-time / Part-time Contingent keep their value only when the row is off the
# tenure line AND still Contingent after the step above; otherwise they become 0.
is_contingent = full_table['Contingent'] > 0
for contingent_column in ['Full-time Contingent', 'Part-time Contingent']:
    full_table[contingent_column] = full_table[contingent_column].where(
        off_tenure_line & is_contingent, 0
    )

# Validation view: row counts for every combination of the tenure / contingent flags.
full_table.groupby(['Tenure_Line', 'Tenured', 'Tenured_Track', 'Contingent', 'Full-time Contingent',
                    'Part-time Contingent']).count()

In [None]:
# Actually saved as Brandeis_Dummy_Table_Updated_08272019
# The leftover 'Unnamed: 0' column is removed before the table is written out.
(
    full_table
    .drop('Unnamed: 0', axis=1)
    .to_csv(r"C:\Users\bob65\Documents\Summer2019_Research\PythonFiles\_Data\Brandeis_Dummy_Table_Updated_08112019.csv")
)

In [None]:
#########################

# Parse IPEDS Column:
sector_name = full_table['IPEDS Sector Name']

# na=False makes a missing sector name count as "no match", so the masks are plain
# booleans and the dummies are clean 0/1 integers (rows without a sector name get 0
# in both columns instead of NaN). The masks are combined with | (boolean or)
# rather than arithmetic +.

# 4-year: the sector name contains "4-year", or the row is an Administrative Unit
# (matched on the case-sensitive text "Admin").
four_year = (
    sector_name.str.contains('4-year', na=False)
    | sector_name.str.contains('Admin', na=False)
)
full_table['4-year'] = four_year.astype(int)

# 2-Year: only sector names containing the literal text ", 2-year" match.
two_year = sector_name.str.contains(', 2-year', na=False)
full_table['2-year'] = two_year.astype(int)

# (4) Drop now useless columns
full_table = full_table[['Job ID', 'Year', 'Faculty', 'R1', '4-year', '2-year']]

In [None]:
# Export only the Job ID key and the four classification dummies.
full_table.loc[:, ['Job ID', '4-year', '2-year', 'Public', 'Private']].to_csv(
    r"C:\Users\bob65\Documents\Summer2019_Research\PythonFiles\_Data\New_Classification_Dummies_07102019.csv"
)

In [None]:
# main_table.drop(columns=['4-year', '2-year'])
# Join the classification dummies onto main_table by Job ID, then remove the two
# leftover 'Unnamed' columns.
# NOTE(review): if main_table already carries any of these dummy columns, the merge
# will produce _x / _y suffixed duplicates - confirm the state of main_table first.
end_table = pd.merge(
    main_table,
    full_table.loc[:, ['Job ID', '4-year', '2-year', 'Public', 'Private']],
    how='inner',
    on='Job ID',
)
end_table = end_table.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'])


In [None]:
# Write the merged main table (with the classification dummies) to disk.
end_table.to_csv(
    path_or_buf=r"C:\Users\bob65\Documents\Summer2019_Research\PythonFiles\_Data\Brandeis_Main_Table_updated_7122019.csv"
)

In [None]:
# Remove the row whose index label is 3957167.
# NOTE(review): the reason for dropping this particular row is not recorded here -
# presumably it blocks the integer cast that follows; confirm. Running this cell a
# second time raises KeyError because the label is already gone.
faculty_table = faculty_table.drop(index=3957167)

In [None]:
# Cast every column of faculty_table to 64-bit integers.
faculty_table = faculty_table.astype(dtype='int64', copy=False)

In [None]:
# Display new_class for inspection.
new_class

In [None]:
# Build a lookup from Job ID to that job's row of new_class.
# orient='index' keys the mapping by Job ID ({job_id: {column: value}}); the default
# orientation keys it by column name, so a Job ID lookup would raise KeyError.
# The mapping also gets its own name so the builtin `dict` is not shadowed for the
# rest of the kernel session.
new_class_by_job = new_class.set_index('Job ID').to_dict(orient='index')

# For every faculty row, fetch the new_class record of its Job ID. A Job ID that is
# missing from new_class still raises KeyError, as before.
faculty_table_new = faculty_table['Job ID'].apply(lambda job_id: new_class_by_job[job_id])

In [None]:
# Display the per-job lookup result for inspection.
faculty_table_new

In [None]:
# List the column names of main_table, one per line.
for column_name in main_table.columns:
    print(column_name)

In [None]:
# List the column names of faculty_table, one per line.
for column_name in faculty_table.columns:
    print(column_name)

In [None]:
# List the column names of full_table, one per line.
for column_name in full_table.columns:
    print(column_name)