In [None]:
import os
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', None)
import numpy as np
import datetime as dt
# plt.style.use('fivethirtyeight')

In [None]:
# Load the doctoral education and job extracts; for each, print the row count
# and the number of distinct resume IDs (BGTResID).
# NOTE(review): absolute Windows paths are hardcoded. The indicator file read
# here is written by the "Doctoral indicator dummy generation code" cell further
# down, which targets drive A: rather than B: — confirm which is authoritative.
education = pd.read_csv(r'B:\_DataBGTRes\Doctoral_Data\doc_education_info_with_indicator.csv')
print(len(education))
print(education['BGTResID'].nunique())

jobs = pd.read_csv(r'B:\_DataBGTRes\Doctoral_Data\doc_job_info.csv')
print(len(jobs))
print(jobs['BGTResID'].nunique())

# Switch to pandas' nullable extension dtypes, so missing values are pd.NA.
# The `is pd.NA` checks in the date helpers below rely on this.
education = education.convert_dtypes()
jobs = jobs.convert_dtypes()

In [None]:
# Column dtypes and non-null counts of the education table after convert_dtypes().
education.info()

In [None]:
# Column dtypes and non-null counts of the jobs table after convert_dtypes().
jobs.info()

In [None]:
# Number of education rows flagged as doctoral (ind_doc == 1).
len(education[education['ind_doc']==1])

In [None]:
# Scratch check: shows that a NaN literal is a plain Python float.
type(float('nan'))

In [None]:
# Isolate degrees with '#' in any of the fields

# Fields that are scanned for '#' characters below.
columns = ['MajorCipCode', 'major', 'degreeLevel', 'CompletionDateRaw']

# Work on a copy so the *_pound helper columns are not added to `education`.
edu = education.copy()

def pounds(string):
    """Return how many '#' characters occur in the text form of ``string``.

    The argument is converted with ``str()`` first, so non-string values
    (``None``, ``NaN``, ``pd.NA``) are counted through their printed form.
    """
    return str(string).count('#')
        
# One "<field>_pound" column per scanned field, holding that field's '#' count.
for column in columns:
    edu[f'{column}_pound'] = edu[column].apply(pounds)

# Keep only the rows flagged as doctoral degrees.
docs = edu[edu['ind_doc'] == 1]

# Narrow down, field by field, to the rows with no '#' in any scanned field.
docs_no_pound = docs
for column in columns:
    has_no_pound = docs_no_pound[f'{column}_pound'] == 0
    docs_no_pound = docs_no_pound[has_no_pound]

# Everything else has a '#' in at least one of the scanned fields.
docs_pound = docs.drop(docs_no_pound.index)

# Row counts, then distinct resume counts, for the two groups.
print(len(docs_pound), len(docs_no_pound))

print(docs_pound['BGTResID'].nunique(), docs_no_pound['BGTResID'].nunique())

In [None]:
group_columns = ['MajorCipCode_pound', 'major_pound', 'degreeLevel_pound', 'CompletionDateRaw_pound']
possible_pound_combinations = docs_pound[docs_pound['ind_doc']==1].groupby(group_columns).count()[['BGTResID']]
# display(possible_pound_combinations)

# Temporary consumable DataFrame for speed advantage: rows that have already
# been accepted are dropped so later combinations scan a smaller frame.
df = docs_pound.copy()

# Accepted row groups are collected here and concatenated once at the end
# (calling pd.concat inside the loop re-copies the accumulated frame each time).
safe_pieces = []

# Iterate over all combinations of pound symbol divisions in the dataset
for _, row in possible_pound_combinations.reset_index().iterrows():
    # Distinct non-zero '#' counts across the four fields. The fields can be
    # disentangled only when every field that has '#' at all has the SAME number
    # of them, i.e. there is at most one distinct non-zero count. This is the
    # same rule as "more than two distinct values is unsafe; exactly two is
    # safe only if one of them is zero".
    # Values are looked up by column label: positional `row[0]` on a
    # label-indexed Series is deprecated in recent pandas.
    nonzero_counts = {row[name] for name in group_columns} - {0}
    if len(nonzero_counts) > 1:
        continue

    # Find all of the rows in the target table with this valid set of numbers
    matches = pd.Series(True, index=df.index)
    for name in group_columns:
        matches &= df[name] == row[name]
    temp = df[matches]

    safe_pieces.append(temp)

    # Throw out all observations that have been approved so they need not be compared to again
    df = df.drop(temp.index)

# Fall back to an empty frame with the right columns when nothing was safe, so
# the summary lines below do not fail on None.
output = pd.concat(safe_pieces) if safe_pieces else docs_pound.iloc[0:0]

del(df)
print(len(output))
print(output['BGTResID'].nunique())
display(output)

In [None]:
# Row count per '#'-count combination among the rows kept in `output`.
output.groupby(group_columns).count()[['BGTResID']]

In [None]:
# Inspect one example row whose '#' counts are 1 / 2 / 3 / 2 across the four
# fields and whose GPA is present; shows only the first 12 columns.
group_columns = ['MajorCipCode_pound', 'major_pound', 'degreeLevel_pound', 'CompletionDateRaw_pound']
docs_pound[(docs_pound['MajorCipCode_pound']==1)&(docs_pound['major_pound']==2)&(docs_pound['degreeLevel_pound']==3)&(docs_pound['CompletionDateRaw_pound']==2)&(~docs_pound['GPA'].isnull())].iloc[:, :12][:1]

In [None]:
# Non-null counts per column for the '#'-free doctoral rows, grouped by the
# four *_pound columns (all zero for these rows by construction).
docs_no_pound.groupby(group_columns).count()

In [None]:
# '#'-count combinations among the rows containing '#', most frequent first.
docs_pound.groupby(group_columns).count()[['BGTResID']].sort_values('BGTResID', ascending=False)

In [None]:
############################
# Purely iterative approach
# DON'T USE
############################

# empty = True
# output = None

# for index, row in docs_pound.iterrows():
#     i = 13
#     nums = list()
#     prev = None
#     for k in range(4):
#         prev = row[i+k]
#         if prev not in nums:
#             nums.append(prev)
#     if len(nums) > 2:
#         continue
#     elif len(nums) == 2:
#         if(nums[0] != 0 | nums[1] != 0):
#             continue
#     else:
#         if empty:
#             output = pd.DataFrame(row).transpose()
#             empty = False
#         else:
#             output = pd.concat([output, pd.DataFrame(row).transpose()])

# print(len(output))
# display(output)

In [None]:
################################
# Completion Date work
################################

# Independent copy of the education table for the completion-date parsing below.
dates = education.copy()

def pound_split(string):
    """Split ``string`` on '#'; a ``pd.NA`` input is returned unchanged."""
    if string is pd.NA:
        return pd.NA
    return string.split('#')
    
def num_dates(dates):
    """Return ``len(dates)``, or ``pd.NA`` when ``dates`` is ``pd.NA``."""
    return pd.NA if dates is pd.NA else len(dates)
    
def remove_unicode_escapes(string):
    """Strip characters that ``unicode_escape`` encoding would have to escape.

    Such characters are non-ASCII characters, control characters and the
    backslash itself. The function repeatedly locates the first one and
    removes it until none is left.

    Parameters
    ----------
    string : str, pd.NA or float
        Raw text value.

    Returns
    -------
    str, pd.NA or float
        ``pd.NA`` for ``pd.NA`` input, ``float('nan')`` for any float input,
        otherwise the cleaned string.
    """
    if string is pd.NA:
        return pd.NA
    # The previous guard was `string is type(float)`. `type(float)` is the
    # metaclass `type`, so the test was never true and a float NaN went on to
    # fail at `.encode` below.
    if isinstance(string, float):
        return float('nan')

    r_str = string.encode('unicode_escape')
    # Every character before the first escape encodes to exactly one byte, so
    # this byte offset is also the character index in `string`.
    loc = r_str.find(b'\\')

    if loc == -1:
        return string

    if loc + 1 < len(string):
        # NOTE(review): this replaces the offending character AND the character
        # after it with a single space (e.g. 'ab\xa0cd' -> 'ab d'). Kept as-is;
        # confirm that dropping the following character is intended.
        output = string.replace(string[loc]+string[loc+1], ' ')
    else:
        # Offending character is the last one: just remove it.
        output = string.replace(string[loc], '')

    # Each pass removes at least one character, so the recursion terminates.
    return remove_unicode_escapes(output)
    
# Cleaned version of the raw completion date text.
dates['CompletionDateProc'] = dates['CompletionDateRaw'].apply(remove_unicode_escapes)

# Rows whose cleaned date holds exactly one entry (no '#' separator).
# .copy() makes this an independent frame: a later cell assigns a new
# 'CompletionDate' column to it, which on a plain slice of `dates` triggers
# SettingWithCopyWarning and may not update the object being inspected.
single_dates = dates[dates['CompletionDateProc'].apply(pound_split).apply(num_dates)==1].copy()

# Show the rows that the cleaning step actually changed.
single_dates[single_dates['CompletionDateProc']!=single_dates['CompletionDateRaw']]

In [None]:
# Raw completion-date text of the first row that the cleaning step changed.
single_dates[single_dates['CompletionDateProc']!=single_dates['CompletionDateRaw']][:1]['CompletionDateRaw'].values[0]

In [None]:
def parse_slash_date(string):
    """Parse a '/'-separated numeric date into a ``(year, month, day)`` tuple.

    Commas, periods and question marks are removed first. The text is then
    split on '/' and every segment must convert with ``int()``; otherwise
    ``pd.NA`` is returned.

    Segment layouts:
      * three segments -> month / day / year
      * two segments   -> month / year
      * one segment    -> year
    Components that are not supplied are ``None``; any other segment count
    yields ``(None, None, None)``. A ``pd.NA`` input returns ``pd.NA``.
    """
    if string is pd.NA:
        return pd.NA

    for junk in (',', '.', '?'):
        string = string.replace(junk, '')

    # Ignore all strings that are not composed entirely of numbers inside the slashes
    try:
        numbers = [int(part) for part in string.split('/')]
    except ValueError:
        return pd.NA

    year = month = day = None
    count = len(numbers)

    if count == 3:
        month, day, year = numbers
    elif count == 2:
        month, year = numbers
    elif count == 1:
        year = numbers[0]

    return (year, month, day)
        

# w_slash = single_dates[single_dates['CompletionDateRaw'].str.contains('/', na=False)]

# w_slash['CompletionDate'] = w_slash['CompletionDateRaw'].apply(parse_slash_date)


def parse_word_month(string):
    """Parse a date written with a month (or season) word into ``(year, month, day)``.

    Punctuation and ordinal suffixes are replaced by spaces and the text is
    split on spaces. The caller is expected to pass lower-case text, since the
    month table is lower-case.

    Accepted layouts after splitting:
      * ``<month> <year>``        -> day defaults to 1
      * ``<month> <day> <year>``
      * ``<year>``                -> month defaults to june, day to 1
      * one glued token such as ``june2010`` -> day is ``None``

    ``<month>`` may be a month name, a season name, or a number. Anything
    else returns ``pd.NA``, as does a ``pd.NA`` input.

    Changes from the previous version:
      * "21st" / "31st" are no longer split into "2 1" / "3 1".
      * For glued tokens the longest month name is tried first, so
        "june2010" matches 'june' rather than 'jun' (which left "e2010").
      * The leftover debugging ``print(string)`` was removed.
    """
    import re  # local import keeps this cell self-contained

    months = {
        'jan':1,
        'january':1,
        'feb':2,
        'february':2,
        'mar':3,
        'march':3,
        'apr':4,
        'april':4,
        'may':5,
        'jun':6,
        'june':6,
        'jul':7,
        'july':7,
        'aug':8,
        'august':8,
        'sep':9,
        'sept':9,
        'september':9,
        'oct':10,
        'october':10,
        'nov':11,
        'november':11,
        'dec':12,
        'december':12,
        # Seasons map to a single month each.
        'spring':6,
        'fall':12,
        'summer':9,
        'winter':1
    }

    if string is pd.NA:
        return pd.NA

    string = string.replace(',', ' ')
    string = string.replace('.', ' ')
    string = string.replace('\'', ' ')
    string = string.replace('-', ' ')
    string = string.replace('/', ' ')
    # Drop the "st" suffix after ANY run of digits and pad with spaces, so
    # "1st" -> " 1 " as before and "21st" -> " 21 " instead of "2 1".
    string = re.sub(r'(\d+)st', r' \1 ', string)
    string = string.replace('nd', ' ')
    string = string.replace('rd', ' ')
    string = string.replace('th', ' ')
    string = string.replace('?', '')
    string = string.replace('of', ' ')

    pieces = [p for p in string.split(' ') if len(p) > 0]

    year = None
    month = None
    day = None

    try:
        if len(pieces) == 2:
            month = pieces[0]
            day = 1
            year = int(pieces[1])

        elif len(pieces) == 3:
            month = pieces[0]
            day = int(pieces[1])
            year = int(pieces[2])

        elif len(pieces) == 1:
            year = int(pieces[0])
            month = 'june'
            day = 1

        else:
            return pd.NA

    except ValueError:
        # Only a single glued token (e.g. "june2010") gets a second chance.
        if len(pieces) != 1:
            return pd.NA

        # Longest names first so 'june' wins over 'jun', 'sept' over 'sep', etc.
        month = next(
            (name for name in sorted(months, key=len, reverse=True) if name in string),
            None,
        )
        if month is None:
            return pd.NA

        try:
            year = int(string.replace(month, ''))
        except ValueError:
            return pd.NA

    if month in months:
        month = months[month]
    else:
        # Not a known word: accept a numeric month, reject everything else.
        try:
            month = int(month)
        except ValueError:
            return pd.NA

    return (year, month, day)

In [None]:
def parse_date(string):
    """Convert one completion-date string to a ``datetime.date``, or ``pd.NA``.

    The text is lower-cased and parsed with ``parse_slash_date`` when it
    contains '/', falling back to ``parse_word_month`` if that fails; text
    without '/' goes straight to ``parse_word_month``. The resulting
    ``(year, month, day)`` is then normalised:

      * a missing year rejects the value;
      * the year's sign is dropped;
      * a "month" above 1926 is swapped with the year;
      * a year below 100 becomes 20xx when below 26, else 19xx;
      * a "month" between 13 and 31 is swapped with the day;
      * a missing day defaults to 1 and a missing month to 6.

    Parameters
    ----------
    string : str
        Date text. Any non-string value (``pd.NA``, ``NaN``, ``None``)
        returns ``pd.NA`` instead of raising on ``.lower()``.

    Returns
    -------
    datetime.date or pd.NA
    """
    if not isinstance(string, str):
        return pd.NA

    string = string.lower()

    if string.find('/') >= 0:
        date = parse_slash_date(string)
        if date is pd.NA:
            date = parse_word_month(string)
    else:
        date = parse_word_month(string)

    if date is pd.NA:
        return pd.NA

    year, month, day = date

    # Without a year nothing useful can be built (this also covers the
    # all-None tuple returned for an unsupported number of '/' segments).
    if year is None:
        return pd.NA

    year = abs(year)

    # NOTE(review): 1926 looks like the earliest plausible year in this data
    # set — confirm. A larger "month" is taken to be the year.
    if month is not None and month > 1926:
        month, year = year, month

    # Two-digit years. NOTE(review): the pivot 26 presumably tracks the current
    # two-digit year and needs updating over time — confirm.
    if year < 100:
        y = str(year)
        if year < 26:
            year = int('20' + f"{y:0>2}")
        else:
            year = int('19' + f"{y:0>2}")

    # A "month" of 13..31 must be a day, so swap. The upper bound is inclusive:
    # the previous `month < 31` rejected every date falling on the 31st.
    if month is not None and 12 < month <= 31:
        month, day = day, month

    if day is None:
        day = 1

    # Reachable after the swap above when the day was missing.
    if month is None:
        month = 6

    try:
        return dt.date(year, month, day)
    except (ValueError, OverflowError):
        # ValueError: out-of-range component. OverflowError: an integer too
        # large for the date constructor (e.g. a long digit string as "year").
        return pd.NA
    
# Parse every cleaned single-entry completion date; parse_date returns pd.NA
# for values it cannot turn into a date.
single_dates['CompletionDate'] = single_dates['CompletionDateProc'].apply(parse_date)

In [None]:
# Rows whose completion date could not be parsed.
single_dates[single_dates['CompletionDate'].isnull()]

In [None]:
# Doctoral rows (ind_doc == 1) whose completion date parsed successfully.
single_dates[~single_dates['CompletionDate'].isnull()&(single_dates['ind_doc']==1)]

In [None]:
# Full single-entry date table, including the parsed CompletionDate column.
# NOTE(review): this renders the whole frame; display.max_rows was raised to
# 1000 in the first cell.
single_dates

In [None]:
# Attach each doctoral completion date to that resume's job rows.
# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): presumably that is only BGTResID — confirm `jobs` has no
# CompletionDate column, or pass on='BGTResID' explicitly.
temp = jobs.merge(single_dates[single_dates['ind_doc']==1][['BGTResID', 'CompletionDate']], how='inner')

# Coerce all three date columns to datetime64; unparseable values become NaT.
temp['jobISOStartDate'] = pd.to_datetime(temp['jobISOStartDate'], errors='coerce')
temp['jobISOEndDate'] = pd.to_datetime(temp['jobISOEndDate'], errors='coerce')
temp['CompletionDate'] = pd.to_datetime(temp['CompletionDate'], errors='coerce')

In [None]:
# NOTE(review): DataFrame.info() prints its summary and returns None, so
# display() additionally renders "None" here.
display(temp.info())

# Job rows that started strictly after the doctoral completion date.
greater = temp[temp['jobISOStartDate']>temp['CompletionDate']]

display(greater)

# Number of distinct resumes with at least one such job.
greater['BGTResID'].nunique()



In [None]:
# First 100 (resume, completion date, job start date) groups with per-column counts.
temp.groupby(['BGTResID', 'CompletionDate', 'jobISOStartDate']).count()[:100]

In [None]:

# Module-level month-name lookup, printed below for inspection.
# NOTE(review): parse_word_month builds its own, larger table (it also has
# 'sept' and the four seasons) and does not read this one.
months = {
        'jan':1,
        'january':1,
        'feb':2,
        'february':2,
        'mar':3,
        'march':3,
        'apr':4,
        'april':4,
        'may':5,
        'jun':6,
        'june':6,
        'jul':7,
        'july':7,
        'aug':8,
        'august':8,
        'sep':9,
        'september':9,
        'oct':10,
        'october':10,
        'nov':11,
        'november':11,
        'dec':12,
        'december':12
}

print(months.keys())

In [None]:
# Column dtypes and non-null counts of the single-entry date table.
single_dates.info()

In [None]:
# Accumulates every space-separated token that is not an integer.
container = set()

def find_non_numbers(string):
    """Add each non-integer, space-separated token of ``string`` to ``container``.

    ``pd.NA`` input is ignored. Always returns ``None``; the result is the
    side effect on the module-level ``container`` set.
    """
    if string is pd.NA:
        return None
    for token in string.split(' '):
        try:
            int(token)
        except ValueError:
            container.add(token)
    return None

# Run over the raw (uncleaned) completion dates purely for the side effect.
single_dates['CompletionDateRaw'].apply(find_non_numbers)

container

In [None]:
# Print the collected non-integer tokens that contain '/', with '?' removed.
for s in container:
    if s.find('/') >= 0:
        print(s.replace('?', ''))

In [None]:
# NOTE(review): `string` is not assigned at top level by any earlier cell in
# this notebook, so this scratch call presumably relied on leftover kernel
# state and raises NameError on a fresh Restart & Run All — confirm and remove
# or give it an explicit input.
remove_unicode_escapes(string)

In [None]:
# Scratch check: position of the first literal backslash in `string`.
# r'{}'.format(...) does not re-escape anything; it just yields str(string).
# NOTE(review): depends on the same undefined top-level `string` as the
# previous cell.
raw_s = r'{}'.format(string)

print(raw_s.find('\\'))

In [None]:
# Rows of `output` with no '#' in any of the four fields. `output` is built
# from docs_pound, which excludes exactly those rows, so this should be empty.
output[(output['MajorCipCode_pound']==0)&(output['major_pound']==0)&(output['degreeLevel_pound']==0)&(output['CompletionDateRaw_pound']==0)]

In [None]:
# Rows with 30 or more '#' characters in the raw completion date.
# NOTE(review): `final_doc` is not defined anywhere in the visible notebook —
# presumably leftover kernel state; this cell fails on a fresh run.
final_doc[final_doc['CompletionDateRaw_pound']>=30]

In [None]:
# Doctoral indicator dummy generation code

def define_doctoral(df):
    """Return the rows of ``df`` identified as doctoral degrees.

    A row qualifies when its ``degreeLevel`` contains the text '21', or, failing
    that, when its ``DegreeType`` matches one of a list of case-insensitive
    regular expressions ('Doctor', 'ph.', 'm.d.', 'j.d.', 'phd', 'dds', 'dml',
    'ed. D'). Patterns are tried in order and a row is assigned to the first
    one it matches. Distinct ``BGTResID`` counts are printed per step.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``BGTResID``, ``degreeLevel`` and ``DegreeType``.

    Returns
    -------
    pandas.DataFrame
        The '21' rows first, then the matches of each pattern in pattern order;
        original index labels are preserved.
    """
    _21 = df[df['degreeLevel'].str.contains('21', na=False)]

    print('\tNumber of \'21\'s found:', _21['BGTResID'].nunique())

    # Collected pieces are concatenated once at the end rather than growing a
    # frame with pd.concat on every iteration.
    pieces = [df.drop(df.index.difference(_21.index))]
    wo_21 = df.drop(_21.index)

    # Raw strings: these are regular expressions, and '\.' in a normal string
    # literal is an invalid escape sequence (a SyntaxWarning on recent Python).
    # The runtime values are unchanged.
    strings = ['Doctor', r'ph\.', r'm\.d\.', r'j\.d\.', 'phd', 'dds', 'dml', r'ed\. D']

    for string in strings:
        target = wo_21[wo_21['DegreeType'].str.contains(string, case=False, na=False)]

        print('\tContains \'' + string + '\':', target['BGTResID'].nunique())

        # Remove the matches so later patterns cannot pick up the same row.
        wo_21 = wo_21.drop(target.index)
        pieces.append(target)

    return pd.concat(pieces)

doc = define_doctoral(education)

doc['ind_doc'] = 1

# Give the non-doctoral rows ind_doc = 0 on a NEW frame. The previous
# `education['ind_doc'] = 0` overwrote the indicator on `education` itself, so
# every later `education['ind_doc'] == 1` selection in this notebook came back
# empty (and the per-resume ratios divided by zero). The frame written to disk
# is unchanged.
non_doc = education.loc[education.index.difference(doc.index)].assign(ind_doc=0)

doc = pd.concat([doc, non_doc], sort=False)

# NOTE(review): this writes to drive A:, while the first cell reads the same
# file name from drive B: — confirm the intended location.
doc.to_csv(r'A:\_DataBGTRes\Doctoral_Data\doc_education_info_with_indicator.csv', index=False)

In [None]:
# Benchmarks of CIP data

def print_benchmarks(df):
    """Print CIP-code coverage, counted in distinct resumes (``BGTResID``).

    Prints the total number of distinct resumes, followed by three
    "count -> percent%" lines for resumes having at least one row with:
      1. both ``MajorCipCode`` and ``major`` present,
      2. ``major`` present but ``MajorCipCode`` missing,
      3. both missing.
    A resume can fall into more than one group, because the test is per row.

    When ``df`` holds no resumes, all percentages are reported as 0.0 instead
    of raising ``ZeroDivisionError``.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns ``BGTResID``, ``MajorCipCode`` and ``major``.
    """
    total = df['BGTResID'].nunique()

    cip_missing = df['MajorCipCode'].isnull()
    major_missing = df['major'].isnull()

    full_both = df[~cip_missing & ~major_missing]['BGTResID'].nunique()

    no_CIP = df[cip_missing & ~major_missing]['BGTResID'].nunique()

    null_both = df[cip_missing & major_missing]['BGTResID'].nunique()

    def share(count):
        # Percentage of `total`, rounded to 2 decimals; 0.0 for an empty frame.
        if total == 0:
            return 0.0
        return round((float(count)/float(total)) * 100, 2)

    print(total, '\n\t' + str(full_both) + ' -> ' + str(share(full_both)) + '%',
         '\n\t' + str(no_CIP) + ' -> ' + str(share(no_CIP)) + '%',
         '\n\t' + str(null_both) + ' -> ' + str(share(null_both)) + '%')
    
# CIP coverage for doctoral rows (ind_doc == 1), then for all other rows.
# NOTE(review): the preceding indicator cell sets education['ind_doc'] = 0 for
# every row, so when run in notebook order the first selection is empty.
print_benchmarks(education[education['ind_doc']==1])
print_benchmarks(education[education['ind_doc']==0])

In [None]:
# Average number of doctoral education rows per distinct resume.
# NOTE(review): `df` is a generic name reused by several cells; this division
# raises ZeroDivisionError if the selection is empty.
df = education[education['ind_doc']==1]
len(df)/df['BGTResID'].nunique()

In [None]:
# Average number of non-doctoral education rows per distinct resume.
df = education[education['ind_doc']==0]
len(df)/df['BGTResID'].nunique()

In [None]:
# Average number of rows per distinct resume in the combined indicator frame `doc`.
len(doc)/doc['BGTResID'].nunique()

In [None]:
# Education rows that have both a CIP code and a major.
education[~education['MajorCipCode'].isnull()&~education['major'].isnull()]

In [None]:
# Frequency table of `major` values among rows with no CIP code.
no_cip = pd.DataFrame(education[(education['MajorCipCode'].isnull())]['major'].value_counts())

# Number of distinct majors lacking a CIP code, then the table itself.
print(len(no_cip))
display(no_cip)

In [None]:
# Number of distinct DegreeType values among rows whose major is exactly 'Biology'.
len(education[education['major']=='Biology']['DegreeType'].value_counts())

In [None]:
# Download the NCES CIP 2010 code list (needs network access).
cips = pd.read_csv(r'https://nces.ed.gov/ipeds/cipcode/Files/CIPCode2010.csv')

In [None]:
# CIP entries whose title mentions "biology" (case-insensitive).
cips[cips['CIPTitle'].str.contains('Biology', case=False)]

In [None]:
# ELIAS CODE

In [None]:
import numpy as np
import pandas as pd

# Set data locations
# input_loc / output_loc are placeholders to be filled in before running. They
# are prepended directly to file names in the f-strings below, so each must end
# with a path separator.
input_loc = '[SET TO INTERMEDIATE DATA LOCATION]'
output_loc = '[SET TO DESIRED OUTPUT LOCATION]'
onet_url = 'https://www.onetcenter.org/taxonomy/2010/soc2018/2010_to_2018_SOC_Crosswalk.csv?fmt=csv'
naics_url = 'https://www.census.gov/eos/www/naics/2017NAICS/2-6%20digit_2017_Codes.xlsx'
cip_url = 'https://nces.ed.gov/ipeds/cipcode/Files/CIPCode2010.csv'

# O*NET-SOC 2010 -> SOC 2018 crosswalk, with columns renamed to the names used
# in the jobs data.
onet = pd.read_csv(onet_url)
onet.rename(columns = {
    'O*NET-SOC 2010 Code': 'ONETCode', 
    'O*NET-SOC 2010 Title': 'ONETName', 
    '2018 SOC Code': 'SOCCode',
    '2018 SOC Title': 'SOCName'
    }, inplace = True)
# 2017 NAICS code list; only the code and title columns are kept.
# NOTE: the source header really contains three spaces in '2017 NAICS US   Code'.
naics = pd.read_excel(naics_url)
naics.rename(columns = {
    '2017 NAICS US   Code': 'NAICS2',
    '2017 NAICS US Title': 'NAICSName'
    }, inplace = True)
naics = naics[['NAICS2', 'NAICSName']]

# NOTE(review): this rebinds `jobs`, which the first cell of the notebook uses
# for a different extract.
jobs = pd.read_csv(f'{input_loc}04_PhD_Jobs.csv')

# merge() defaults to an inner join, so job rows without a matching O*NET or
# NAICS code are dropped here.
jobs = jobs.merge(onet, on = 'ONETCode')
jobs = jobs.merge(naics, on = 'NAICS2')
jobs = jobs[[
    'BGTResID', 'StartDate', 'EndDate', 
    'ONETCode', 'ONETName', 'SOCCode', 'SOCName',
    'NAICS2', 'NAICSName'
    ]]

# CIP code/title lookup; '=' and '"' characters are stripped from the codes.
cip = pd.read_csv(cip_url)
cip = cip[['CIPCode', 'CIPTitle']]
cip['CIPCode'] = cip['CIPCode'].str.replace('=', '', regex = False)
cip['CIPCode'] = cip['CIPCode'].str.replace('"', '', regex = False)

# One row per resume; PhD_CIPs holds that resume's CIP codes in a single string.
# Normalise ';' and ' ' separators to '#'.
phds = pd.read_csv(f'{input_loc}03_PhD_CIP_codes.csv', index_col = 'BGTResID')
phds['PhD_CIPs'] = phds['PhD_CIPs'].str.replace(';', '#', regex = False)
phds['PhD_CIPs'] = phds['PhD_CIPs'].str.replace(' ', '#', regex = False)
# Remove these four 38.xxxx codes wherever they sit next to a '#'.
# NOTE(review): a value consisting of only one such code has no '#' and is
# therefore left in place — confirm that is intended.
for string in ['38.0001', '38.0101', '38.0199', '38.9999']:
    pat = string + '#'
    phds['PhD_CIPs'] = phds['PhD_CIPs'].str.replace(pat, '', regex = False)
    pat = '#' + string
    phds['PhD_CIPs'] = phds['PhD_CIPs'].str.replace(pat, '', regex = False)

# Largest number of '#' separators in any row; codes per row is one more.
max_splits = phds['PhD_CIPs'].str.count('#').max()
print(f'The most Ph.D. CIPs associated with a resume is {max_splits + 1}')
# Spread the codes over integer-named columns 0..max_splits; missing slots become ''.
phds_split = phds['PhD_CIPs'].str.split(pat = '#', expand = True
    ).fillna(value = '')
phds_split = phds_split.reset_index()
# The lookup's columns are renamed to the slot currently being merged
# (CIPCode0/CIPName0 first) and advanced by one at the end of each iteration,
# so statement order inside the loop matters.
cip.rename(columns = {
    'CIPCode': 'CIPCode0',
    'CIPTitle': 'CIPName0',
    }, inplace = True)
columns = ['BGTResID']
# NOTE(review): .astype(int) assumes max_splits is a NumPy scalar — confirm for
# the dtype that str.count() returns on this column.
for code in range(max_splits.astype(int) + 1):
    phds_split.rename(columns = {
        code: f'CIPCode{code}',
        }, inplace = True)
    # Sanity check: no code in this slot is longer than 7 characters.
    assert phds_split[f'CIPCode{code}'].str.len().max() <= 7
    # Left join keeps every resume; slots without a matching code get NaN names.
    phds_split = phds_split.merge(cip, how = 'left', on = f'CIPCode{code}')
    cip.rename(columns = {
        f'CIPCode{code}': f'CIPCode{code + 1}',
        f'CIPName{code}': f'CIPName{code + 1}'
        }, inplace = True)
    columns.append(f'CIPCode{code}')
    columns.append(f'CIPName{code}')
# Final column order: BGTResID, then CIPCode{n} / CIPName{n} pairs.
phds_split = phds_split[columns]

# phds_split.to_csv(f'{output_loc}PhD_CIP_codes.csv', index = False)
# jobs.to_csv(f'{output_loc}PhD_Jobs.csv', index = False)



