In [87]:
import os
import pandas as pd
pd.set_option('display.max_columns', 500)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', None)
import numpy as np
import datetime as dt
# plt.style.use('fivethirtyeight')

In [5]:
education = pd.read_csv(r'B:\_DataBGTRes\Doctoral_Data\doc_education_info_with_indicator.csv')
print(len(education))
print(education['BGTResID'].nunique())

education = education.convert_dtypes()

1268712
434072


In [6]:
education.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1268712 entries, 0 to 1268711
Data columns (total 13 columns):
 #   Column             Non-Null Count    Dtype 
---  ------             --------------    ----- 
 0   BGTResID           1268712 non-null  Int64 
 1   degreeLevel        994482 non-null   string
 2   Instituition       1268096 non-null  string
 3   InstitutionCity    828473 non-null   string
 4   InstitutionState   770074 non-null   string
 5   degreePosition     1268712 non-null  Int64 
 6   DegreeType         1057725 non-null  string
 7   ipeds_unit_id      768126 non-null   Int64 
 8   major              761359 non-null   string
 9   MajorCipCode       474082 non-null   string
 10  CompletionDateRaw  800666 non-null   string
 11  GPA                157619 non-null   string
 12  ind_doc            1268712 non-null  Int64 
dtypes: Int64(4), string(9)
memory usage: 130.7 MB


In [None]:
len(education[education['ind_doc']==1])

In [None]:
# Isolate degrees with '#' in any of the fields

columns = ['MajorCipCode', 'major', 'degreeLevel', 'CompletionDateRaw']

edu = education.copy()

def pounds(string):
    string = str(string)
    return len(string.split('#')) - 1
        
for column in columns:
    edu[column + '_pound'] = edu[column].apply(pounds)

docs = edu[edu['ind_doc']==1]

docs_no_pound = None
first = True
for column in columns:
    if(first):
        docs_no_pound = docs[docs[column + '_pound']==0]
        first = False
    else:
        docs_no_pound = docs_no_pound[docs_no_pound[column + '_pound']==0]
        
docs_pound = docs.drop(docs_no_pound.index)

print(len(docs_pound), len(docs_no_pound))

print(docs_pound['BGTResID'].nunique(), docs_no_pound['BGTResID'].nunique())

In [None]:
group_columns = ['MajorCipCode_pound', 'major_pound', 'degreeLevel_pound', 'CompletionDateRaw_pound']
possible_pound_combinations = docs_pound[docs_pound['ind_doc']==1].groupby(group_columns).count()[['BGTResID']]
# display(possible_pound_combinations)

# Iterate over the aggregate
empty = True
output = None

# Temporary consumable DataFrame for speed advantage
df = docs_pound.copy()

# Iterate over all combinations of pound symbol divisions in the dataset
for index, row in possible_pound_combinations.reset_index().iterrows():
    nums = set()
    safe = True
    
    # Collect the unique values for number of pound symbols
    for i in range(len(group_columns)):
        nums.add(row[i])
    
    # If there are 3 or more of them, then the columns cannot be disentangled
    if len(nums) > 2:
        safe = False
        
    # If there are exactly two unique values, at least one must be zero to be disentangled
    elif len(nums) == 2:
        if ((nums.pop() != 0) & (nums.pop() != 0)):
            safe = False
               
    # If all of the numbers of divisions are the same OR if some of them have no divisons
    if safe:
        # Find all of the rows in the target table with this valid set of numbers
        columns = row.index
        temp = df[(df[columns[0]]==row[0])&(df[columns[1]]==row[1])&(df[columns[2]]==row[2])&(df[columns[3]]==row[3])]
        
        # Put them in a DataFrame together
        if empty:
            output = temp
            empty = False
        else:
            output = pd.concat([output, temp])

        # Throw out all observations that have been approved so they need not be compared to again
        df = df.drop(temp.index)
        
del(df)
print(len(output))
print(output['BGTResID'].nunique())
display(output)

In [None]:
output.groupby(group_columns).count()[['BGTResID']]

In [None]:
group_columns = ['MajorCipCode_pound', 'major_pound', 'degreeLevel_pound', 'CompletionDateRaw_pound']
docs_pound[(docs_pound['MajorCipCode_pound']==1)&(docs_pound['major_pound']==2)&(docs_pound['degreeLevel_pound']==3)&(docs_pound['CompletionDateRaw_pound']==2)&(~docs_pound['GPA'].isnull())].iloc[:, :12][:1]

In [None]:
docs_no_pound.groupby(group_columns).count()

In [None]:
docs_pound.groupby(group_columns).count()[['BGTResID']].sort_values('BGTResID', ascending=False)

In [None]:
############################
# Purely iterative approach
# DON'T USE
############################

# empty = True
# output = None

# for index, row in docs_pound.iterrows():
#     i = 13
#     nums = list()
#     prev = None
#     for k in range(4):
#         prev = row[i+k]
#         if prev not in nums:
#             nums.append(prev)
#     if len(nums) > 2:
#         continue
#     elif len(nums) == 2:
#         if(nums[0] != 0 | nums[1] != 0):
#             continue
#     else:
#         if empty:
#             output = pd.DataFrame(row).transpose()
#             empty = False
#         else:
#             output = pd.concat([output, pd.DataFrame(row).transpose()])

# print(len(output))
# display(output)

In [86]:
################################
# Completion Date work
################################

education['CompletionDateRaw'].value_counts()

def pound_split(string):
    if string is not pd.NA:
        return string.split('#')
    else: 
        return pd.NA
    
def num_dates(dates):
    if dates is not pd.NA:
        return len(dates)
    else:
        return pd.NA
    
def remove_unicode_escapes(string):
    if string is pd.NA:
        return pd.NA
    
    r_str = string.encode('unicode_escape')
    loc = r_str.find(b'\\')
    
    if loc == -1:
        return string
    
    output = ''
    for s in string.split(string[loc]):
        if output == '':
            output = s
        else:
            output = output + ' ' + s
        
    return remove_unicode_escapes(output)
    
education['CompletionDateRaw'] = education['CompletionDateRaw'].apply(remove_unicode_escapes)

single_dates = education[education['CompletionDateRaw'].apply(pound_split).apply(num_dates)<=1]

single_dates

Unnamed: 0,BGTResID,degreeLevel,Instituition,InstitutionCity,InstitutionState,degreePosition,DegreeType,ipeds_unit_id,major,MajorCipCode,CompletionDateRaw,GPA,ind_doc
2,5567150,21,DETROIT COLLEGE OF LAW,Detroit,MI,1,Juris Doctor,-999,,,1987,,1
4,5567196,21,University of Miami,,,2,J.D,135726,,,2002,,1
5,5594991,21,"The Ohio State University, Moritz College of Law",Columbus,OH,2,Juris Doctorate,-999,,,June 2013,3.0,1
7,5594750,21,COLUMBIA LAW SCHOOL,Chicago,IL,2,Juris Doctorate,-999,,,May 2003,,1
8,5600064,21,Mississippi State University,Starkville,MS,1,Ph.D,176080,Industrial and Systems Engineering,,December 2011,,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1268704,77169541,,Purchase St,,,6,B.A.,192749,Chemistry,40.0501,May 1985,,0
1268705,77169541,,,,,7,,-999,,,2010,,0
1268708,77169541,,Purchase St,,,7,B.A.,-999,Chemistry,40.0501,May 1985,,0
1268709,77169541,,,,,8,,-999,,,2010,,0


In [154]:
def parse_slash_date(string):
    if string is pd.NA:
        return pd.NA
    
    segments = string.split('/')
    
    # Ignore all strings that are not composed entirely of numbers inside the slashes
    for s in segments:
        try:
            int(s)
        except(ValueError):
            return pd.NA
    
    l = len(segments)
    
    day = None
    month = None
    year = None
    
    if l == 3:
        month = int(segments[0])
        day = int(segments[1])
        year = int(segments[2])
    
    if l == 2:
        month = int(segments[0])
        year = int(segments[1])
    
    if l == 1:
        year = int(segments[0])
        
    if (year is None) & (month is None) & (day is None):
        return pd.NA
        
    if month > 1926:
        temp = month
        month = year
        year = temp
        del(temp)    
    
    if year < 100:
        y = str(year)
        if year < 26:
            year = int('20' + f"{y:0>2}")
        else:
            year = int('19' + f"{y:0>3}")
        
    if (month > 12) & (month < 31):
        temp = day
        day = month
        month = temp
        del(temp)
    
    if day is None: 
        day = 1
    
    if month is None:
        month = 6
    
    if year is None:
        return pd.NA
    
    try:
        date = dt.date(year, month, day)
        return date
    except(ValueError):
        print(string, '->', month, day, year)
        return pd.NA
        
            
w_slash = single_dates[single_dates['CompletionDateRaw'].str.contains('/', na=False)]

w_slash['CompletionDate'] = w_slash['CompletionDateRaw'].apply(parse_slash_date)

6/87 -> 6 1 19087
5/98 -> 5 1 19098
6/94 -> 6 1 19094
5/94 -> 5 1 19094
12/84 -> 12 1 19084
08/95 -> 8 1 19095
05/95 -> 5 1 19095
01/76 -> 1 1 19076
12/98 -> 12 1 19098
05/98 -> 5 1 19098
03/95 -> 3 1 19095
5/84 -> 5 1 19084
6/20016/2003 -> 6 20016 2003
12/96 -> 12 1 19096
10/87 -> 10 1 19087
12/98 -> 12 1 19098
9/99 -> 9 1 19099
8/93 -> 8 1 19093
12/96 -> 12 1 19096
6/95 -> 6 1 19095
01/97 -> 1 1 19097
10/87 -> 10 1 19087
08/97 -> 8 1 19097
5/97 -> 5 1 19097
6/87 -> 6 1 19087
12/98 -> 12 1 19098
5/83 -> 5 1 19083
7/96 -> 7 1 19096
9/99 -> 9 1 19099
5/94 -> 5 1 19094
5/13/90 -> 5 13 19090
5/95 -> 5 1 19095
6/97 -> 6 1 19097
8/97 -> 8 1 19097
8/81 -> 8 1 19081
5/94 -> 5 1 19094
08/95 -> 8 1 19095
6/89 -> 6 1 19089
6/92 -> 6 1 19092
5/99 -> 5 1 19099
5/95 -> 5 1 19095
5/97 -> 5 1 19097
6/85 -> 6 1 19085
5/83 -> 5 1 19083
12/93 -> 12 1 19093
1/94 -> 1 1 19094
12/94 -> 12 1 19094
6/98 -> 6 1 19098
6/79 -> 6 1 19079
8/97 -> 8 1 19097
8/94 -> 8 1 19094
7/01/93 -> 7 1 19093
12/83 -> 12 1 1908

5/97 -> 5 1 19097
1/95 -> 1 1 19095
12/99 -> 12 1 19099
1/95 -> 1 1 19095
08/86 -> 8 1 19086
8/87 -> 8 1 19087
5/93 -> 5 1 19093
08/99 -> 8 1 19099
6/98 -> 6 1 19098
01/94 -> 1 1 19094
6/98 -> 6 1 19098
5/76 -> 5 1 19076
1997/98 -> 98 1 1997
12/96 -> 12 1 19096
5/86 -> 5 1 19086
6/99 -> 6 1 19099
09/98 -> 9 1 19098
9/98 -> 9 1 19098
3/97 -> 3 1 19097
12/68 -> 12 1 19068
6/98 -> 6 1 19098
12/86 -> 12 1 19086
12/95 -> 12 1 19095
8/95 -> 8 1 19095
6/75 -> 6 1 19075
5/93 -> 5 1 19093
9/86 -> 9 1 19086
12/94 -> 12 1 19094
05/90 -> 5 1 19090
05/98 -> 5 1 19098
8/99 -> 8 1 19099
5/99 -> 5 1 19099
5/81 -> 5 1 19081
6/86 -> 6 1 19086
5/94 -> 5 1 19094
8/97 -> 8 1 19097
5/99 -> 5 1 19099
5/99 -> 5 1 19099
6/94 -> 6 1 19094
05/99 -> 5 1 19099
8/69 -> 8 1 19069
7/94 -> 7 1 19094
09/97 -> 9 1 19097
5/92 -> 5 1 19092
03/93 -> 3 1 19093
5/85 -> 5 1 19085
6/12/99 -> 6 12 19099
8/19/99 -> 8 19 19099
12 /97 -> 12 1 19097
08/98 -> 8 1 19098
12/98 -> 12 1 19098
4/99 -> 4 1 19099
5/93 -> 5 1 19093
12/99 ->

12/95 -> 12 1 19095
5/90 -> 5 1 19090
6/85 -> 6 1 19085
07/28/98 -> 7 28 19098
5/99 -> 5 1 19099
01/97 -> 1 1 19097
06/93 -> 6 1 19093
5/87 -> 5 1 19087
5/92 -> 5 1 19092
5/85 -> 5 1 19085
5/84 -> 5 1 19084
12/97 -> 12 1 19097
02/91 -> 2 1 19091
12/94 -> 12 1 19094
5/98 -> 5 1 19098
12/98 -> 12 1 19098
5/90 -> 5 1 19090
09/96 -> 9 1 19096
5/92 -> 5 1 19092
05/98 -> 5 1 19098
1/97 -> 1 1 19097
10/90 -> 10 1 19090
2/73 -> 2 1 19073
05/97 -> 5 1 19097
11/70 -> 11 1 19070
12/99 -> 12 1 19099
5/66 -> 5 1 19066
5/92 -> 5 1 19092
5/85 -> 5 1 19085
02/99 -> 2 1 19099
12/98 -> 12 1 19098
6 / 80 -> 6 1 19080
6 / 74 -> 6 1 19074
5/86 -> 5 1 19086
5/96 -> 5 1 19096
5/95 -> 5 1 19095
12/97 -> 12 1 19097
5/97 -> 5 1 19097
5/92 -> 5 1 19092
12/89 -> 12 1 19089
03/26 -> 3 1 19026
02/94 -> 2 1 19094
12/95 -> 12 1 19095
6/91 -> 6 1 19091
5/87 -> 5 1 19087
8/94 -> 8 1 19094
8/93 -> 8 1 19093
4/82 -> 4 1 19082
9/97 -> 9 1 19097
9/90 -> 9 1 19090
5/98 -> 5 1 19098
12/92 -> 12 1 19092
4/89 -> 4 1 19089
5/99

5/91 -> 5 1 19091
1/96 -> 1 1 19096
5/94 -> 5 1 19094
5/91 -> 5 1 19091
5/91 -> 5 1 19091
5/86 -> 5 1 19086
4/84 -> 4 1 19084
02/85 -> 2 1 19085
05/86 -> 5 1 19086
08/93 -> 8 1 19093
8/97 -> 8 1 19097
8/98 -> 8 1 19098
06/91 -> 6 1 19091
6/93 -> 6 1 19093
6/96 -> 6 1 19096
5/95 -> 5 1 19095
5/98 -> 5 1 19098
6/74 -> 6 1 19074
6/72 -> 6 1 19072
7/96 -> 7 1 19096
77/90 -> 77 1 19090
8/90 -> 8 1 19090
12/86 -> 12 1 19086
10/16/98 -> 10 16 19098
9/90 -> 9 1 19090
5/73 -> 5 1 19073
8/81 -> 8 1 19081
7/79 -> 7 1 19079
5/81 -> 5 1 19081
2/86 -> 2 1 19086
12/96 -> 12 1 19096
6/88 -> 6 1 19088
12/98 -> 12 1 19098
9/98 -> 9 1 19098
12/84 -> 12 1 19084
5/80 -> 5 1 19080
5/77 -> 5 1 19077
5/74 -> 5 1 19074
1/54 -> 1 1 19054
08/98 -> 8 1 19098
07/84 -> 7 1 19084
7/95 -> 7 1 19095
6/95 -> 6 1 19095
8/93 -> 8 1 19093
5/91 -> 5 1 19091
12/91 -> 12 1 19091
7/86 -> 7 1 19086
12/92 -> 12 1 19092
2/98 -> 2 1 19098
5/89 -> 5 1 19089
12/97 -> 12 1 19097
6/96 -> 6 1 19096
12/98 -> 12 1 19098
09/94 -> 9 1 190

5/99 -> 5 1 19099
05/97 -> 5 1 19097
5/82 -> 5 1 19082
5/80 -> 5 1 19080
03/99 -> 3 1 19099
07/96 -> 7 1 19096
5/77 -> 5 1 19077
6/99 -> 6 1 19099
6/93 -> 6 1 19093
6/97 -> 6 1 19097
7/95 -> 7 1 19095
5/83 -> 5 1 19083
5/81 -> 5 1 19081
12/86 -> 12 1 19086
5/85 -> 5 1 19085
10/1/96 -> 10 1 19096
05/87 -> 5 1 19087
05/85 -> 5 1 19085
12/95 -> 12 1 19095
07/98 -> 7 1 19098
07/95 -> 7 1 19095
03/90 -> 3 1 19090
08/87 -> 8 1 19087
5/94 -> 5 1 19094
5/91 -> 5 1 19091
5/91 -> 5 1 19091
12/84 -> 12 1 19084
12/90 -> 12 1 19090
6/73 -> 6 1 19073
6/99 -> 6 1 19099
12/99 -> 12 1 19099
5/98 -> 5 1 19098
5/94 -> 5 1 19094
1/74 -> 1 1 19074
5/94 -> 5 1 19094
12/94 -> 12 1 19094
8/82 -> 8 1 19082
5/81 -> 5 1 19081
12/93 -> 12 1 19093
6/71 -> 6 1 19071
12/93 -> 12 1 19093
2/97 -> 2 1 19097
12/84 -> 12 1 19084
12/81 -> 12 1 19081
12/98 -> 12 1 19098
06/92 -> 6 1 19092
6/71 -> 6 1 19071
05/98 -> 5 1 19098
6/96 -> 6 1 19096
12/93 -> 12 1 19093
12/94 -> 12 1 19094
12/97 -> 12 1 19097
5/94 -> 5 1 19094
10/

5/95 -> 5 1 19095
2/95 -> 2 1 19095
8/28/98 -> 8 28 19098
5/8/92 -> 5 8 19092
9/80 -> 9 1 19080
9/91 -> 9 1 19091
07/98 -> 7 1 19098
07/97 -> 7 1 19097
09/90 -> 9 1 19090
8/97 -> 8 1 19097
05/95 -> 5 1 19095
5/99 -> 5 1 19099
5/97 -> 5 1 19097
6/73 -> 6 1 19073
2/94 -> 2 1 19094
6/66 -> 6 1 19066
6/87 -> 6 1 19087
05/90 -> 5 1 19090
04/87 -> 4 1 19087
01/99 -> 1 1 19099
1/86 -> 1 1 19086
9/82 -> 9 1 19082
9/81 -> 9 1 19081
9/77 -> 9 1 19077
6/86 -> 6 1 19086
6/84 -> 6 1 19084
6/86 -> 6 1 19086
6/84 -> 6 1 19084
12/90 -> 12 1 19090
6/98 -> 6 1 19098
5/83 -> 5 1 19083
5/97 -> 5 1 19097
5/73 -> 5 1 19073
8/88 -> 8 1 19088
5/84 -> 5 1 19084
12/83 -> 12 1 19083
5/86 -> 5 1 19086
12/83 -> 12 1 19083
12/97 -> 12 1 19097
5/93 -> 5 1 19093
6/99 -> 6 1 19099
10/79 -> 10 1 19079
6/77 -> 6 1 19077
5/87 -> 5 1 19087
6/83 -> 6 1 19083
08/99 -> 8 1 19099
12/09/98 -> 12 9 19098
1/80 -> 1 1 19080
5/30 -> 5 1 19030
06/92 -> 6 1 19092
05/89 -> 5 1 19089
2/51 -> 2 1 19051
5/97 -> 5 1 19097
1/82 -> 1 1 190

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  w_slash['CompletionDate'] = w_slash['CompletionDateRaw'].apply(parse_slash_date)


In [144]:
w_slash[w_slash['CompletionDate'].isnull()]

Unnamed: 0,BGTResID,degreeLevel,Instituition,InstitutionCity,InstitutionState,degreePosition,DegreeType,ipeds_unit_id,major,MajorCipCode,CompletionDateRaw,GPA,ind_doc,CompletionDate
1632,5581481,21,University of Phoenix,,,1,PhD Doctor of Management,380465.0,Information Systems and Technology,52.1201,6./12,,1,
4966,945558,21,Temple University#School of Law,,,1,J.D,,,,May/2009,,1,
5055,951978,21,"Temple University, Beasley School of Law",,,1,J.D,,,,May/2009,,1,
6473,8317970,21,Rensselaer Polytechnic Institute,Troy,NY,1,Ph. D,194824.0,Polymer Chemistry,,May/August 2008,,1,
6943,5273516,21#18#16,University of Wisconsin#Technical University#Technical University#Technical University,Darmstadt#Darmstadt#Darmstadt,##,1,Ph.D#M.S#B.S,,Paper Science and Technology#Mechanical Engineering,14.1901,6/20016/2003,,1,
12142,5931264,21,Carnegie Mellon University,Pittsburgh,PA,1,PHD,211440.0,,,Dec/1995,,1,
12800,3772931,21,The Ohio State University,Columbus,OH,1,Doctor of Philosophy,-999.0,Industrial Welding and Systems Engineering#Manufacturing,14.2700;15.0614,Dec./2007,3.5,1,
13029,6505315,21,Warren National University,Cheyenne,WY,1,Ph.D,,International Business Administration,,April/2009,,1,
13891,3363030,21,Pennsylvania State University,,,1,Ph.D,-999.0,Computer Science and Engineering,11.0701,9/92 8/96,,1,
15234,3890216,21,Miami University,Oxford,OH,1,PhD,204024.0,Analytical Chemistry,40.0502,08/98*08/03,,1,


In [155]:
w_slash.sample(100)

Unnamed: 0,BGTResID,degreeLevel,Instituition,InstitutionCity,InstitutionState,degreePosition,DegreeType,ipeds_unit_id,major,MajorCipCode,CompletionDateRaw,GPA,ind_doc,CompletionDate
1089207,192513247,16,MS VALLEY STATE UNIV.,,,1,Bachelor's Degree,,Computer Science,11.0701,12/15/2005,,0,2005-12-15
975396,42048715,16,Shandong Institute of Architectural and Civil,,,3,BS,,Civil Engineering,14.0801,07/1996,,0,1996-07-01
750782,17609632,16,Central State University,Prescott Valley,AZ,4,Bachelor of Science Degree,201690.0,Accounting,52.0301,08/01/1986,,0,1986-08-01
466117,403121,16,St. Petersburg State University of Telecommunications,East Lansing,MI,3,B.Eng,,Telecommunications Engineering,,06/2004,,0,2004-06-01
478930,8360522,18,Wheelock College,Minneapolis,MN,2,Master of Arts,168290.0,Social Work,,12/05,,0,2005-12-01
626880,9612560,,University of Florida,,,20,,134130.0,,,8/20,,0,2020-08-01
250138,57510635,21,University of South Carolina,,,2,Juris Doctorate Degree,218335.0,,,12/ 08,,1,2008-12-01
1198098,216511938,16,Louisiana State University,Tallahassee,,4,B.S,-999.0,"Baton Rouge, Speech and Journalism Education",09.0400;09.0401,8/1971,,0,1971-08-01
247247,52691526,21,"Chinese University of Hong Kong, Hong Kong",,,1,Ph.D,,GeoInformation Science,,07/2009,,1,2009-07-01
421253,41566704,,University of Phoenix,Tempe,AZ,1,Doctor of Education,380465.0,Education/Educational Leadership,,11/2015,,1,2015-11-01


'012'

In [132]:
single_dates.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660966 entries, 2 to 1268711
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   BGTResID           660966 non-null  Int64 
 1   degreeLevel        561415 non-null  string
 2   Instituition       660726 non-null  string
 3   InstitutionCity    442593 non-null  string
 4   InstitutionState   418919 non-null  string
 5   degreePosition     660966 non-null  Int64 
 6   DegreeType         593077 non-null  string
 7   ipeds_unit_id      433110 non-null  Int64 
 8   major              415359 non-null  string
 9   MajorCipCode       259024 non-null  string
 10  CompletionDateRaw  660966 non-null  object
 11  GPA                95167 non-null   string
 12  ind_doc            660966 non-null  Int64 
dtypes: Int64(4), object(1), string(8)
memory usage: 73.1+ MB


In [111]:
container = set()

def find_non_numbers(string):
    if string is not pd.NA:
        for s in string.split(' '):
            try:
                int(s)
            except(ValueError):
                container.add(s)
                
single_dates['CompletionDateRaw'].apply(find_non_numbers)

container

{'',
 '9.90',
 '8/1986',
 '12/29/17',
 '10/11/',
 '08/01/1978',
 '1997/07',
 '03.2001',
 'May-2005',
 '05/21/1993',
 '05/10/2011',
 'Feb.1990',
 'February',
 '5/10/2016',
 '3/06',
 'April)',
 '6/13',
 '10-2012',
 '09/2015',
 '4/27/96',
 '07/23/1996',
 '4/1997',
 '12/2010',
 '1/15',
 '10/1/96',
 '6/20/1980',
 '05/1968',
 '05/31/2007',
 '31,1970',
 '05/1971',
 '8/10/1984',
 '2005September',
 '1995.',
 '5/22/88',
 '09/11',
 '08/27/2009',
 'May2010',
 'Aug.2013',
 '1/2013',
 '11/84',
 '9/3/09',
 '01/1999',
 '8/1989',
 '05/1993',
 'May,1997',
 '16,2008',
 '02/96',
 '02/02',
 '03/1999',
 "February'",
 '4/23/2010',
 '10/1988',
 '83.82%',
 '2/5/1980',
 '05/03/72',
 'MAY,',
 'Spring2017',
 '5/1/1992',
 '8/18/2016',
 '7/91',
 '9th,',
 '01/2004',
 'December-1996',
 '11/10',
 '2/1986',
 '7/1/1987',
 "12'98",
 'Jan./1990',
 '06/01/2005',
 '1/1991',
 '02/1988',
 'May/1976',
 '5/1988',
 '3/13',
 '07/1997',
 '11/2013',
 '9/89',
 '08/19/1996',
 '19/67',
 '1982......',
 '________________________________

In [113]:
for s in container:
    if s.find('/') >= 0:
        print(s.replace('?', ''))

8/1986
12/29/17
10/11/
08/01/1978
1997/07
05/21/1993
05/10/2011
5/10/2016
3/06
6/13
09/2015
4/27/96
07/23/1996
4/1997
12/2010
1/15
10/1/96
6/20/1980
05/1968
05/31/2007
05/1971
8/10/1984
5/22/88
09/11
08/27/2009
1/2013
11/84
9/3/09
01/1999
8/1989
05/1993
02/96
02/02
03/1999
4/23/2010
10/1988
2/5/1980
05/03/72
5/1/1992
8/18/2016
7/91
01/2004
11/10
2/1986
7/1/1987
Jan./1990
06/01/2005
1/1991
02/1988
May/1976
5/1988
3/13
07/1997
11/2013
9/89
08/19/1996
19/67
1/93
08/2019
12/09/1998
5/4/75
05/14/2011
01/15
5/2012
05/27/2011
5/17/2011
9/11
5/2009
06/01/1989
08/20/2000
04/24/2009
2016/02
9/25/2018
5/30/1983
9/1988
2/28
5/27/2007
7/2000
5/24/2003
09/10
10/31/2009
June/July
01/1979
11/08/2011
12/11/2009
5/2003
12/31/2012
5/1993
10/2011
09/04/2008
8/06
07/11/2001
8/10/1980
2/15/03
2/26/2018
5/24/1997
8.27/10
1/3%
May/2006
6/1992
02/1977
04/1988
03/10/04
8/12
6/20
01/88
2/1981
01/26/2018
5/13/2001
05/88
06/75
01/
08/86
02/15/2015
06/30/15
1/95
5/13/2006
n06/2014
3/26/2013
11/2018
04/1983
6/11/201

05/98
11/18/1993
01/2000
8.33/10
5/01
07/30
1/6/1980
03/01/2001
7/20/2001
6/1995
07/1975
6/77
8/31/2018
05/07/2009
4/2014
08/94
n08/2002
06/2002
9/19/2010
7/22/1980
9/79
10/19/2006
03/85
08/200005/2001
10/04/1995
12/1/2015
06/15/2005
01/91
07/94
02/18/10
06/15/2001
04/2000
01/10/2009
1/31/2014
10/86
2/99
01/14/2000
6/19/1980
08/95
02/91
11/1994
06/16/2013
6/11
3/2009
07/1969
06/2012
/2009
9/00
8/29/2004
8/21/82
1/1982
3/03
03/2014
04/98
6/18/1980
10/08/2011
June/2004
May/1984
12/18/2017
2/2/1980
6/15/07
06/2006
1/8/1980
Spring/Summer,
8/1968
02/84
6/03
06/28/2001
23/172,
7/01/96
7/23/2001
01/2009
08/2002
9/18
06/10/2005
5/24/1990
5/62
12/31
07/1999
3/2003
10/31/1999
03/2002
9/03
5/18
Fall/2016
11/99
11/2019
12/2008
08/2008
10/1993
06/20/1995
5/1979
9/14
20/58
06/18/08
4/28/2001
6/01
9/98-12/2000
4/94
10/23/1980
04/25/2004
6/19
12/85
04/11
5/8/93
8/1967
09/01/1998
1/13/2013
04/05/2004
11/1984
3/2002
3/01
02/89
16/20
05/05/2017
9/2001
05/21/2012
12/31/2016
7/1978
04/1990
5/16/2008
12/197

In [81]:
remove_unicode_escapes(string)

['December', '2006#Fall\t2004']

December
2006#Fall	2004


['December 2006#Fall', '2004']

December 2006#Fall
2004


'December 2006#Fall 2004'

In [36]:
raw_s = r'{}'.format(string)

print(raw_s.find('\\'))

-1


In [None]:
output[(output['MajorCipCode_pound']==0)&(output['major_pound']==0)&(output['degreeLevel_pound']==0)&(output['CompletionDateRaw_pound']==0)]

In [None]:
final_doc[final_doc['CompletionDateRaw_pound']>=30]

In [None]:
# Doctoral indicator dummy generation code

def define_doctoral(df):

    _21 = df[df['degreeLevel'].str.contains('21', na=False)]
    
    print('\tNumber of \'21\'s found:', _21['BGTResID'].nunique())
        
    doc = df.drop(df.index.difference(_21.index))
    wo_21 = df.drop(_21.index)

    strings = ['Doctor', 'ph\.', 'm\.d\.', 'j\.d\.', 'phd', 'dds', 'dml', 'ed\. D']

    for string in strings:
        target = wo_21[wo_21['DegreeType'].str.contains(string, case=False, na=False)]

        print('\tContains \'' + string + '\':', target['BGTResID'].nunique())

        wo_21 = wo_21.drop(target.index)
        doc = pd.concat([doc, target])
        
    
    return doc

doc = define_doctoral(education)

doc['ind_doc'] = 1
education['ind_doc'] = 0

doc = pd.concat([doc, education.loc[education.index.difference(doc.index)]], sort=False)

doc.to_csv(r'A:\_DataBGTRes\Doctoral_Data\doc_education_info_with_indicator.csv', index=False)

In [None]:
# Benchmarks of CIP data

def print_benchmarks(df):
    total = df['BGTResID'].nunique()

    full_both = df[~df['MajorCipCode'].isnull()&~df['major'].isnull()]['BGTResID'].nunique()

    no_CIP = df[df['MajorCipCode'].isnull()&~df['major'].isnull()]['BGTResID'].nunique()

    null_both = df[df['MajorCipCode'].isnull()&df['major'].isnull()]['BGTResID'].nunique()

    print(total, '\n\t' + str(full_both) + ' -> ' + str(round((float(full_both)/float(total)) * 100, 2)) + '%',
         '\n\t' + str(no_CIP) + ' -> ' + str(round((float(no_CIP)/float(total)) * 100, 2)) + '%',
         '\n\t' + str(null_both) + ' -> ' + str(round((float(null_both)/float(total)) * 100, 2)) + '%')
    
print_benchmarks(education[education['ind_doc']==1])
print_benchmarks(education[education['ind_doc']==0])

In [None]:
df = education[education['ind_doc']==1]
len(df)/df['BGTResID'].nunique()

In [None]:
df = education[education['ind_doc']==0]
len(df)/df['BGTResID'].nunique()

In [None]:
len(doc)/doc['BGTResID'].nunique()

In [None]:
education[~education['MajorCipCode'].isnull()&~education['major'].isnull()]

In [None]:
no_cip = pd.DataFrame(education[(education['MajorCipCode'].isnull())]['major'].value_counts())

print(len(no_cip))
display(no_cip)

In [None]:
len(education[education['major']=='Biology']['DegreeType'].value_counts())

In [None]:
cips = pd.read_csv(r'https://nces.ed.gov/ipeds/cipcode/Files/CIPCode2010.csv')

In [None]:
cips[cips['CIPTitle'].str.contains('Biology', case=False)]

In [None]:
# ELIAS CODE

In [None]:
import numpy as np
import pandas as pd

# Set data locations
input_loc = '[SET TO INTERMEDIATE DATA LOCATION]'
output_loc = '[SET TO DESIRED OUTPUT LOCATION]'
onet_url = 'https://www.onetcenter.org/taxonomy/2010/soc2018/2010_to_2018_SOC_Crosswalk.csv?fmt=csv'
naics_url = 'https://www.census.gov/eos/www/naics/2017NAICS/2-6%20digit_2017_Codes.xlsx'
cip_url = 'https://nces.ed.gov/ipeds/cipcode/Files/CIPCode2010.csv'

onet = pd.read_csv(onet_url)
onet.rename(columns = {
    'O*NET-SOC 2010 Code': 'ONETCode', 
    'O*NET-SOC 2010 Title': 'ONETName', 
    '2018 SOC Code': 'SOCCode',
    '2018 SOC Title': 'SOCName'
    }, inplace = True)
naics = pd.read_excel(naics_url)
naics.rename(columns = {
    '2017 NAICS US   Code': 'NAICS2',
    '2017 NAICS US Title': 'NAICSName'
    }, inplace = True)
naics = naics[['NAICS2', 'NAICSName']]

jobs = pd.read_csv(f'{input_loc}04_PhD_Jobs.csv')

jobs = jobs.merge(onet, on = 'ONETCode')
jobs = jobs.merge(naics, on = 'NAICS2')
jobs = jobs[[
    'BGTResID', 'StartDate', 'EndDate', 
    'ONETCode', 'ONETName', 'SOCCode', 'SOCName',
    'NAICS2', 'NAICSName'
    ]]

cip = pd.read_csv(cip_url)
cip = cip[['CIPCode', 'CIPTitle']]
cip['CIPCode'] = cip['CIPCode'].str.replace('=', '', regex = False)
cip['CIPCode'] = cip['CIPCode'].str.replace('"', '', regex = False)

phds = pd.read_csv(f'{input_loc}03_PhD_CIP_codes.csv', index_col = 'BGTResID')
phds['PhD_CIPs'] = phds['PhD_CIPs'].str.replace(';', '#', regex = False)
phds['PhD_CIPs'] = phds['PhD_CIPs'].str.replace(' ', '#', regex = False)
for string in ['38.0001', '38.0101', '38.0199', '38.9999']:
    pat = string + '#'
    phds['PhD_CIPs'] = phds['PhD_CIPs'].str.replace(pat, '', regex = False)
    pat = '#' + string
    phds['PhD_CIPs'] = phds['PhD_CIPs'].str.replace(pat, '', regex = False)

max_splits = phds['PhD_CIPs'].str.count('#').max()
print(f'The most Ph.D. CIPs associated with a resume is {max_splits + 1}')
phds_split = phds['PhD_CIPs'].str.split(pat = '#', expand = True
    ).fillna(value = '')
phds_split = phds_split.reset_index()
cip.rename(columns = {
    'CIPCode': 'CIPCode0',
    'CIPTitle': 'CIPName0',
    }, inplace = True)
columns = ['BGTResID']
for code in range(max_splits.astype(int) + 1):
    phds_split.rename(columns = {
        code: f'CIPCode{code}',
        }, inplace = True)
    assert phds_split[f'CIPCode{code}'].str.len().max() <= 7
    phds_split = phds_split.merge(cip, how = 'left', on = f'CIPCode{code}')
    cip.rename(columns = {
        f'CIPCode{code}': f'CIPCode{code + 1}',
        f'CIPName{code}': f'CIPName{code + 1}'
        }, inplace = True)
    columns.append(f'CIPCode{code}')
    columns.append(f'CIPName{code}')
phds_split = phds_split[columns]

# phds_split.to_csv(f'{output_loc}PhD_CIP_codes.csv', index = False)
# jobs.to_csv(f'{output_loc}PhD_Jobs.csv', index = False)



