In [None]:
import os
import pandas as pd
# Widen pandas' display limits so wide / long frames are shown without truncation.
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)
# None means "never truncate cell text". The former -1 sentinel has been
# deprecated since pandas 1.0 in favour of None.
pd.set_option('display.max_colwidth', None)
import numpy as np
# plt.style.use('fivethirtyeight')

In [None]:
# Load the education records from disk.
education = pd.read_csv(
    r'B:\_DataBGTRes\Doctoral_Data\doc_education_info_with_indicator.csv'
)
# First line: number of rows. Second line: number of distinct BGTResID values.
print(len(education), education['BGTResID'].nunique(), sep='\n')

In [None]:
# Print pandas' summary of the loaded frame (DataFrame.info()).
education.info()

In [None]:
# Sum of the ind_doc column; this equals the number of flagged rows if the
# column is a 0/1 indicator, which is how the later cells filter on it.
education['ind_doc'].sum()

In [None]:
# Isolate degrees with '#' in any of the fields

columns = ['MajorCipCode', 'major', 'degreeLevel', 'CompletionDateRaw']

# BGTResIDs of every row that has a '#' in at least one inspected field.
# The per-column hits are gathered in a list and concatenated once instead of
# growing a frame inside the loop. Values are cast to text first so numeric or
# missing entries cannot break the .str accessor.
mask = pd.concat(
    [
        education.loc[
            education[column].astype(str).str.contains('#', na=False, regex=False),
            ['BGTResID'],
        ]
        for column in columns
    ]
).drop_duplicates()

# All education rows (not only the '#' rows) of the flagged BGTResIDs.
final = education.merge(mask, on='BGTResID', how='inner')

# Explicit copy: columns are added to final_doc further down, and writing into
# a filtered slice of `final` would trigger SettingWithCopyWarning.
final_doc = final[final['ind_doc'] == 1].copy()

In [None]:
# Number of distinct BGTResID values among the ind_doc == 1 rows kept in final_doc.
final_doc['BGTResID'].nunique()

In [None]:
def pounds(string):
    """Return how many '#' characters occur in the text form of `string`.

    The argument is passed through str() first, so non-string values
    (numbers, None, NaN) are accepted and simply contain no '#'.
    """
    return str(string).count('#')
        
# Add one '<column>_pound' column per inspected field, holding the number of '#'
# characters in that field. assign() returns a new frame rather than writing
# into final_doc in place: final_doc was produced by filtering another frame,
# and in-place column writes on such a slice trigger SettingWithCopyWarning.
final_doc = final_doc.assign(
    **{column + '_pound': final_doc[column].apply(pounds) for column in columns}
)

final_doc

In [None]:
# Non-null counts per combination of '#' counts across the four fields,
# ordered by the BGTResID count, largest first.
pound_columns = ['MajorCipCode_pound', 'major_pound', 'degreeLevel_pound', 'CompletionDateRaw_pound']
doctoral_rows = final_doc[final_doc['ind_doc'] == 1]
doctoral_rows.groupby(pound_columns).count().sort_values('BGTResID', ascending=False)

In [None]:
# Rows whose CompletionDateRaw contains 30 or more '#' characters.
final_doc.loc[final_doc['CompletionDateRaw_pound'] >= 30]

In [None]:
# Doctoral indicator dummy generation code

def define_doctoral(df):
    """Return the rows of `df` that are identified as doctoral degrees.

    A row is selected when either
      * its degreeLevel, read as text, contains '21', or
      * its DegreeType matches (case-insensitively) one of the regular
        expressions listed in the function body.

    For each rule the number of distinct BGTResIDs it picked up is printed.
    A row is only attributed to the first rule that catches it.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'degreeLevel', 'DegreeType' and 'BGTResID'.

    Returns
    -------
    pandas.DataFrame
        A new frame with the '21' rows first, followed by the DegreeType
        matches in pattern order. Original index labels are preserved.
    """
    # Compare as text so that values parsed as numbers are matched too (the '#'
    # scan earlier in this notebook casts the same column with astype(str)).
    is_21 = df['degreeLevel'].astype(str).str.contains('21', na=False)
    _21 = df[is_21]

    print('\tNumber of \'21\'s found:', _21['BGTResID'].nunique())

    wo_21 = df[~is_21]

    # Raw strings: '\.' is a regex escape, not a Python string escape.
    strings = [r'Doctor', r'ph\.', r'm\.d\.', r'j\.d\.', r'phd', r'dds', r'dml', r'ed\. D']

    # Collect the pieces and concatenate once at the end.
    pieces = [_21]

    for string in strings:
        target = wo_21[wo_21['DegreeType'].str.contains(string, case=False, na=False)]

        print('\tContains \'' + string + '\':', target['BGTResID'].nunique())

        # Remove the matches so later patterns cannot select the same row again.
        wo_21 = wo_21.drop(target.index)
        pieces.append(target)

    return pd.concat(pieces)

doc = define_doctoral(education)

doc['ind_doc'] = 1

# Flag the rows returned by define_doctoral with 1 and every other row with 0.
# Setting the whole column to 0 instead would leave `education` without any
# ind_doc == 1 rows, and the later cells that filter on ind_doc == 1 would then
# work on an empty frame and divide by zero.
education['ind_doc'] = education.index.isin(doc.index).astype(int)

# Doctoral rows first, then every remaining row (all of which carry ind_doc == 0).
doc = pd.concat([doc, education.loc[education.index.difference(doc.index)]], sort=False)

# NOTE(review): this writes to drive A: while the load cell near the top reads the
# same file name from drive B: -- confirm the differing drive letters are intended.
doc.to_csv(r'A:\_DataBGTRes\Doctoral_Data\doc_education_info_with_indicator.csv', index=False)

In [None]:
# Benchmarks of CIP data

def print_benchmarks(df):
    """Print how many distinct BGTResIDs in `df` have CIP-code / major data.

    Output, in order: the total number of distinct BGTResIDs, then, each as
    '<count> -> <percent of total>%' (rounded to two decimals), the number of
    distinct BGTResIDs with at least one row where
      * both MajorCipCode and major are present,
      * major is present but MajorCipCode is missing,
      * both MajorCipCode and major are missing.

    The checks are made per row, so one BGTResID can be counted in more than
    one group. When `df` contains no BGTResIDs the percentages print as 'nan'
    instead of raising ZeroDivisionError.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'BGTResID', 'MajorCipCode' and 'major'.
    """
    ids = df['BGTResID']
    has_cip = df['MajorCipCode'].notnull()
    has_major = df['major'].notnull()

    total = ids.nunique()

    def _share(count):
        # An empty selection has no meaningful share; avoid dividing by zero.
        if not total:
            return float('nan')
        return round(count / total * 100, 2)

    full_both = ids[has_cip & has_major].nunique()
    no_CIP = ids[~has_cip & has_major].nunique()
    null_both = ids[~has_cip & ~has_major].nunique()

    print(total, '\n\t' + str(full_both) + ' -> ' + str(_share(full_both)) + '%',
          '\n\t' + str(no_CIP) + ' -> ' + str(_share(no_CIP)) + '%',
          '\n\t' + str(null_both) + ' -> ' + str(_share(null_both)) + '%')
    
# Benchmarks for the ind_doc == 1 rows first, then for the ind_doc == 0 rows.
for indicator in (1, 0):
    print_benchmarks(education[education['ind_doc'] == indicator])

In [None]:
# Average number of rows per distinct BGTResID among the ind_doc == 1 rows.
df = education.loc[education['ind_doc'] == 1]
df.shape[0] / df['BGTResID'].nunique()

In [None]:
# Average number of rows per distinct BGTResID among the ind_doc == 0 rows.
df = education.loc[education['ind_doc'] == 0]
df.shape[0] / df['BGTResID'].nunique()

In [None]:
# Average number of rows per distinct BGTResID in `doc`.
doc.shape[0] / doc['BGTResID'].nunique()

In [None]:
# Rows where both the CIP code and the major text are present.
education[education['MajorCipCode'].notnull() & education['major'].notnull()]

In [None]:
# Frequency table of the major text on rows that have no CIP code.
missing_cip = education['MajorCipCode'].isnull()
no_cip = education.loc[missing_cip, 'major'].value_counts().to_frame()

print(len(no_cip))
display(no_cip)

In [None]:
# Number of distinct (non-null) DegreeType values on rows whose major is exactly 'Biology'.
education.loc[education['major'] == 'Biology', 'DegreeType'].nunique()

In [None]:
# Download the NCES CIP code table (CIPCode2010.csv). The same URL is read
# again as `cip_url` in the section further down, which also strips '=' and '"'
# characters from its CIPCode column.
cips = pd.read_csv(r'https://nces.ed.gov/ipeds/cipcode/Files/CIPCode2010.csv')

In [None]:
# CIP entries whose title mentions 'Biology', ignoring case.
cips.loc[cips['CIPTitle'].str.contains('Biology', case=False)]

In [None]:
# ELIAS CODE

In [None]:
import numpy as np
import pandas as pd

# Set data locations
# input_loc and output_loc are placeholders that must be replaced before this
# section can run. input_loc is prepended verbatim to the file names below
# (e.g. f'{input_loc}04_PhD_Jobs.csv'), so it has to end with a path separator.
# output_loc is only referenced by the commented-out to_csv calls at the end.
input_loc = '[SET TO INTERMEDIATE DATA LOCATION]'
output_loc = '[SET TO DESIRED OUTPUT LOCATION]'
# Reference tables fetched over HTTP: the O*NET-SOC 2010 -> 2018 SOC crosswalk,
# the 2017 NAICS code list (Excel workbook) and the CIP code table (CSV).
onet_url = 'https://www.onetcenter.org/taxonomy/2010/soc2018/2010_to_2018_SOC_Crosswalk.csv?fmt=csv'
naics_url = 'https://www.census.gov/eos/www/naics/2017NAICS/2-6%20digit_2017_Codes.xlsx'
cip_url = 'https://nces.ed.gov/ipeds/cipcode/Files/CIPCode2010.csv'

# Occupation crosswalk, with its columns renamed to the names used in the merges below.
onet_names = {
    'O*NET-SOC 2010 Code': 'ONETCode',
    'O*NET-SOC 2010 Title': 'ONETName',
    '2018 SOC Code': 'SOCCode',
    '2018 SOC Title': 'SOCName',
}
onet = pd.read_csv(onet_url).rename(columns = onet_names)

# Industry codes: rename the code / title columns and keep only those two.
naics_names = {
    '2017 NAICS US   Code': 'NAICS2',
    '2017 NAICS US Title': 'NAICSName',
}
naics = pd.read_excel(naics_url).rename(columns = naics_names)[['NAICS2', 'NAICSName']]

# Attach occupation and industry names to the job records. Both merges use
# pandas' default inner join, so jobs whose ONETCode or NAICS2 has no match in
# the lookup tables are dropped.
job_columns = [
    'BGTResID', 'StartDate', 'EndDate',
    'ONETCode', 'ONETName', 'SOCCode', 'SOCName',
    'NAICS2', 'NAICSName',
]
jobs = (
    pd.read_csv(f'{input_loc}04_PhD_Jobs.csv')
    .merge(onet, on = 'ONETCode')
    .merge(naics, on = 'NAICS2')
)[job_columns]

# CIP code -> title lookup. Every '=' and '"' character is stripped from CIPCode
# (presumably the file stores codes in a spreadsheet-safe form such as
# ="01.0101" -- TODO confirm against the downloaded file).
cip = pd.read_csv(cip_url)[['CIPCode', 'CIPTitle']]
# assign() returns a new frame; writing the column in place on the two-column
# selection above can trigger SettingWithCopyWarning.
cip = cip.assign(
    CIPCode = cip['CIPCode']
    .str.replace('=', '', regex = False)
    .str.replace('"', '', regex = False)
)

phds = pd.read_csv(f'{input_loc}03_PhD_CIP_codes.csv', index_col = 'BGTResID')

# Normalise the separators: every ';' and every space becomes '#'.
phd_cips = (
    phds['PhD_CIPs']
    .str.replace(';', '#', regex = False)
    .str.replace(' ', '#', regex = False)
)

# Remove these four codes wherever they sit next to a '#', first as 'code#',
# then as '#code'. An entry that consists of one of these codes alone has no
# adjacent '#' and is therefore left untouched.
for excluded_code in ['38.0001', '38.0101', '38.0199', '38.9999']:
    for fragment in (excluded_code + '#', '#' + excluded_code):
        phd_cips = phd_cips.str.replace(fragment, '', regex = False)

phds['PhD_CIPs'] = phd_cips

# Largest number of '#' separators found in any PhD_CIPs entry. Series.max()
# yields a NumPy scalar (a float when the column contains missing values), so
# cast to a plain int for range() and for a clean printout.
max_splits = int(phds['PhD_CIPs'].str.count('#').max())
print(f'The most Ph.D. CIPs associated with a resume is {max_splits + 1}')

# One column per CIP position (0 .. max_splits); shorter lists are padded with ''.
phds_split = (
    phds['PhD_CIPs']
    .str.split(pat = '#', expand = True)
    .fillna(value = '')
    .reset_index()
)

columns = ['BGTResID']
for code in range(max_splits + 1):
    code_col = f'CIPCode{code}'
    name_col = f'CIPName{code}'
    phds_split = phds_split.rename(columns = {code: code_col})
    # Sanity check: no split value may be longer than 7 characters.
    assert phds_split[code_col].str.len().max() <= 7
    # Rename a throw-away copy of the lookup for this position. `cip` itself is
    # left untouched, so this cell can be re-run without reloading `cip`.
    cip_lookup = cip.rename(columns = {'CIPCode': code_col, 'CIPTitle': name_col})
    phds_split = phds_split.merge(cip_lookup, how = 'left', on = code_col)
    columns.extend([code_col, name_col])
phds_split = phds_split[columns]

# phds_split.to_csv(f'{output_loc}PhD_CIP_codes.csv', index = False)
# jobs.to_csv(f'{output_loc}PhD_Jobs.csv', index = False)



