In [1]:
import pandas as pd

In [11]:
physicians = pd.read_csv('../../Data/MasterfileCOre/physician.csv')

In [12]:
physicians['degree_1'] = ['MD' if x == 1 else 'DO' for x in physicians.degree_type]

In [42]:
physicians['first_name'] = [str(x).upper().strip() for x in physicians.first_name]
physicians['last_name'] = [str(x).upper().strip() for x in physicians.last_name]

In [34]:
directors = pd.read_csv('../../Data/MasterfileCore/person.csv')
directors = directors.fillna('None')
directors['first_name'] = [x.upper().strip() for x in directors.first_name]
directors['last_name'] = [x.upper().strip() for x in directors.last_name]

In [40]:
physicians.last_name

0             STANFORD
1               MARTIN
2              JACKSON
3              MAISIAK
4                 GALA
              ...     
1387222            LEE
1387223          MONTI
1387224    SNARRENBERG
1387225      BARTOSIAK
1387226           ZOSS
Name: last_name, Length: 1387227, dtype: object

In [72]:
def get_unique(directors):
    identifying_fields = ['last_name','first_name','middle_name','degree_1','degree_2','degree_3']
    unique_directors = directors.drop_duplicates(identifying_fields).sort_values('last_name')
    unique_directors = unique_directors[identifying_fields]
    unique_directors['person_id'] = list(range(len(unique_directors)))
    directors = pd.merge(directors, unique_directors, on = identifying_fields)
    
    return directors, unique_directors

def get_matches(unique_directors, physicians):
    all_match = pd.merge(physicians, unique_directors, on=['first_name', 'last_name'], suffixes=('_physician', '_residency'))
    pure_match = all_match.drop_duplicates('person_id', keep=False)
    
    return all_match, pure_match

def create_duplicate_matches(all_match, pure_match, directors):
    duplicate_matches = all_match[~all_match.person_id.isin(pure_match.person_id)]
    duplicates = directors[directors.person_id.isin(duplicate_matches.person_id)]
    duplicate_matches = duplicate_matches.fillna('None')

    return duplicate_matches, duplicates

def filter_out_duplicates(duplicates, duplicate_matches):
    matched_dict_list = []

    for row in duplicates.itertuples():
        new_df = merge_filtered_dataframe(row, duplicate_matches)
        if len(new_df) == 1:
            matched_dict_list.append({'person_id':row.person_id, 'medical_education_number': list(new_df.medical_education_number)[0]})

    return pd.DataFrame(matched_dict_list)

def merge_filtered_dataframe(row, duplicate_matches):
    new_df = duplicate_matches[duplicate_matches.person_id == row.person_id]

    if row.degree_1 != 'None' and row.degree_1 != 'MPH':
        new_df = new_df[new_df.degree_1_physician == row.degree_1]

    if len(new_df) > 1 and row.middle_name != 'None':
        if len(row.middle_name) == 1:
            new_df['middle'] = [x[0] for x in new_df.middle_name_physician]
            new_df = new_df[new_df.middle == row.middle_name]
        else:
            new_df = new_df[new_df.middle_name_physician == row.middle_name.upper()]

    return new_df

def get_all_links(pure_match, new_match, directors):
    linking_data = pd.concat([pure_match[['medical_education_number', 'person_id']], new_match])

    return pd.merge(linking_data, directors, on='person_id')[['medical_education_number','program']]

In [60]:
directors, unique_directors = get_unique(directors)

In [61]:
all_match, pure_match = get_matches(unique_directors, physicians)

In [63]:
duplicate_matches, duplicates = create_duplicate_matches(all_match, pure_match, directors)

In [73]:
new_match = filter_out_duplicates(duplicates, duplicate_matches)

In [74]:
physician_directors = get_all_links(pure_match, new_match, directors)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.




In [75]:
physician_directors

Unnamed: 0,medical_education_number,program
0,102000056,2202122109
1,102000595,4450421028
2,102010647,1554812150
3,102020821,1401221107
4,102030966,401121036
...,...,...
8837,4813800158,9994800133
8838,3601640421,9994800168
8839,3501020833,9994900142
8840,5605060601,9995100193


In [78]:
duplicates

Unnamed: 0,id,program,personnel_type,aamc_id,first_name,middle_name,last_name,suffix_name,degree_1,degree_2,degree_3,phone_number,email,last_update_date,person_id
0,020012110912247369,200121109,D,12247369,THOMAS,P,ATKINSON,,MD,PhD,,2.05639e+09,patkinson@peds.uab.edu,6/10/2020,425
2,020030000210668447,200300002,D,10668447,JOHN,C,LEWIS,,MD,,,4.80301e+09,lux.kristy@mayo.edu,7/2/2020,5797
5,020051200313884902,200512003,D,13884902,JAVED,,SHEIKH,,MD,,,8.77574e+09,socal.residency@kp.org,7/22/2020,9069
12,020053100210638206,200531002,D,10638206,ANDREW,,WHITE,,MD,,,8.58765e+09,white.andrew@scrippshealth.org,7/14/2020,10587
16,020082109910703209,200821099,D,10703209,FLORENCE,I,HSU,,MD,,,2.03785e+09,madeleine.michaud@yale.edu,7/22/2020,4343
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11433,999480013310865501,9994800133,D,10865501,LYNN,R,CAMPBELL,,MD,,,5.12325e+09,lrcampbell@ascension.org,7/14/2020,1377
11435,999480016811212262,9994800168,D,11212262,DONALD,K,NELMS,,MD,,,8.17927e+09,dnelms@jpshealth.org,7/22/2020,7115
11439,999490014210457847,9994900142,D,10457847,JOHN,C,CHRISTENSEN,,MD,,,8.00549e+09,john.christensen2@imail.org,7/22/2020,1736
11441,999510019311319888,9995100193,D,11319888,JAMES,K,CLARK,,MD,,,7.57953e+09,james.k.clark52.mil@mail.mil,1970-01-01 00:00:00,1803
