In [None]:
import getpass
import os
from functools import reduce

import pandas as pd
import pyodbc
import usaddress
from fuzzywuzzy import fuzz

import useful_functions as use
from datamart import fix_me

#### Approach: first look for a phone/address combination on which at least two of the sources (IQVIA, Symphony, DHC, Data.gov) agree. If no two sources agree, default to the value present in IQVIA, provided it differs from what we already have, in particular the POLO.

In [None]:
#POLOs
# Load the filtered "older POLO" cohort. The DHC, Data.gov, IQVIA and Symphony frames
# built later in this notebook are all filtered down to the ME numbers in this frame.
older_polos = pd.read_csv('../../Data/POLO_Filter/Older_Filtered_POLOs_2021-08-26.csv', low_memory=False)
# Normalise the ME column with the project helper.
# NOTE(review): presumably string conversion / zero padding — confirm in useful_functions.
older_polos['ME'] = use.fix_me(older_polos.ME)
# IQVIA_ME is the first 10 characters of ME; it is the join key used against the
# IQVIA and Symphony extracts.
older_polos['IQVIA_ME'] = [x[0:10] for x in older_polos.ME]

In [None]:
#dhc
# Load the DHC extract, rename its address columns to the shared *_DHC scheme and
# keep only physicians that are in the POLO cohort.
dhc = pd.read_csv('../../Data/DHC/DHC_2021-07.csv', low_memory=False)
dhc = dhc.rename(columns={
    'Zip_Code':'ZIP_DHC',
    'State':'STATE_DHC',
    'City':'CITY_DHC',
    'Address': 'ADDRESS_1_DHC',
    'Address1': 'ADDRESS_2_DHC'
})
# Missing values become the string sentinel 'None' that the matching code compares against.
dhc = dhc.fillna('None')
dhc['ME'] = use.fix_me(dhc.ME)
# .copy() makes the filtered frame independent, so adding DHC_PHONE below does not
# write to a slice of the unfiltered frame (SettingWithCopyWarning).
dhc = dhc[dhc.ME.isin(older_polos.ME)].copy()
# Leave the 'None' sentinel untouched, exactly as the Data.gov and IQVIA cells do;
# otherwise a cleaned-up placeholder could later be compared as if it were a real number.
dhc['DHC_PHONE'] = [use.fix_phone(x) if x != 'None' else x for x in dhc['Phone_Number']]

In [None]:
#data.gov
# Load the Data.gov extract, keep only physicians in the POLO cohort, clean the
# phone numbers and rename the address columns to the shared *_GOV scheme.
gov = pd.read_csv('../../Data/DataGov/All_Data_210826.csv', low_memory=False)
# Missing values become the string sentinel 'None' that the matching code compares against.
gov = gov.fillna('None')
gov['ME'] = use.fix_me(gov.ME)
# .copy() makes the filtered frame independent, so adding GOV_PHONE below does not
# write to a slice of the unfiltered frame (SettingWithCopyWarning).
gov = gov[gov.ME.isin(older_polos.ME)].copy()
# The 'None' sentinel is passed through unchanged; only real values are cleaned.
gov['GOV_PHONE'] = [use.fix_phone(x) if x !='None' else x for x in gov['phone']]
gov = gov.rename(columns={
    'zip':'ZIP_GOV',
    'st':'STATE_GOV',
    'cty':'CITY_GOV',
    'adr_ln_2': 'ADDRESS_2_GOV',
    'adr_ln_1': 'ADDRESS_1_GOV'
})

In [None]:
#iqvia
# Credentials must never be stored in the notebook. They are read from the
# ODS_USERNAME / ODS_PASSWORD environment variables, falling back to an interactive
# prompt (getpass does not echo the password).
# NOTE(review): a password was previously committed in this cell — it should be rotated.
username = os.environ.get('ODS_USERNAME') or input('ODS username: ')
password = os.environ.get('ODS_PASSWORD') or getpass.getpass('ODS password: ')
q = "DSN=eprdods; UID={}; PWD={}".format(username, password)
# This connection is reused by the Symphony query further down.
ODS = pyodbc.connect(q)
# One row per distinct (business address/phone, professional, affiliation) combination,
# restricted to rows flagged as the current batch in all three IMS tables.
iqvia_query = \
        """
        SELECT DISTINCT 
        B.PHONE,
        B.PHYSICAL_ADDR_1,
        B.PHYSICAL_ADDR_2,
        B.PHYSICAL_CITY,
        B.PHYSICAL_STATE,
        B.PHYSICAL_ZIP,
        P.ME,
        T.AFFIL_TYPE_DESC,
        A.AFFIL_IND,
        A.AFFIL_RANK
        FROM 
        ODS.ODS_IMS_BUSINESS B, ODS.SAS_ODS_IMS_PROVIDER_AFFIL A, ODS.ODS_IMS_PROFESSIONAL P, ODS.ODS_IMS_AFFILIATION_TYPE T
        WHERE  
        B.IMS_ORG_ID = A.IMS_ORG_ID
        AND
        A.PROFESSIONAL_ID = P.PROFESSIONAL_ID
        AND
        A.AFFIL_TYPE_ID = T.AFFIL_TYPE_ID
        AND
        P.CURRENT_BATCH_FLAG='Y'
        AND
        A.CURRENT_BATCH_FLAG='Y'
        AND
        B.CURRENT_BATCH_FLAG='Y'
        """
iqvia = pd.read_sql(con=ODS, sql=iqvia_query)
iqvia.head()

In [None]:
# Missing values become the string sentinel 'None' that the matching code compares against.
iqvia = iqvia.fillna('None')
# Keep only physicians in the POLO cohort. .copy() makes the filtered frame independent,
# so adding IQVIA_PHONE below does not write to a slice of the query result
# (SettingWithCopyWarning).
iqvia = iqvia[iqvia.ME.isin(older_polos.IQVIA_ME)].copy()
# The 'None' sentinel is passed through unchanged; only real values are cleaned.
iqvia['IQVIA_PHONE'] = [use.fix_phone(x) if x !='None' else x for x in iqvia['PHONE']]
# Rename the physical-address columns to the shared *_IQVIA scheme.
iqvia = iqvia.rename(columns={
    'PHYSICAL_ZIP':'ZIP_IQVIA',
    'PHYSICAL_STATE':'STATE_IQVIA',
    'PHYSICAL_CITY':'CITY_IQVIA',
    'PHYSICAL_ADDR_1': 'ADDRESS_1_IQVIA',
    'PHYSICAL_ADDR_2': 'ADDRESS_2_IQVIA'
})

In [None]:
#symphony
sym_query = \
        """
        SELECT
        d.ADDR_LINE_2_TXT AS MAILING_LINE_1,
        d.ADDR_LINE_1_TXT AS MAILING_LINE_2,
        d.ADDR_CITY_NAM AS CITY,
        d.ADDR_ST_CDE AS STATE,
        d.ADDR_ZIP_CDE AS ZIP,
        d.ADDR_FRST_TLPHN_NBR AS TELEPHONE,
        l.OTHER_ID AS SYM_ME
        FROM
        ODS.PRACTITIONER_DEMOGRAPHIC_LAYOUT d, ODS.PRACTITIONER_ADDL_IDS_LAYOUT l
        WHERE
        d.DS_PRCTR_ID = l.DS_PRCTR_ID
        and
        l.ID_QLFR_TYP_CDE = 38
        """  
symphony = pd.read_sql(con=ODS, sql=sym_query)
symphony.head()

In [None]:
# Missing values become the string sentinel 'None' that the matching code compares against.
symphony = symphony.fillna('None')
# Keep only physicians in the POLO cohort. .copy() makes the filtered frame independent,
# so adding SYM_PHONE below does not write to a slice of the query result
# (SettingWithCopyWarning).
symphony = symphony[symphony.SYM_ME.isin(older_polos.IQVIA_ME)].copy()
# Leave the 'None' sentinel untouched, exactly as the Data.gov and IQVIA cells do;
# otherwise a cleaned-up placeholder could later be compared as if it were a real number.
symphony['SYM_PHONE'] = [use.fix_phone(x) if x != 'None' else x for x in symphony['TELEPHONE']]
# The query aliases the address lines swapped (ADDR_LINE_1_TXT AS MAILING_LINE_2), so
# MAILING_LINE_2 is the first address line and MAILING_LINE_1 the second.
symphony = symphony.rename(columns={
    'ZIP':'ZIP_SYMPHONY',
    'STATE':'STATE_SYMPHONY',
    'CITY':'CITY_SYMPHONY',
    'MAILING_LINE_1': 'ADDRESS_2_SYMPHONY',
    'MAILING_LINE_2': 'ADDRESS_1_SYMPHONY'
})

In [None]:
# One row per distinct combination of cleaned phone numbers for each cohort physician.
# Symphony and IQVIA join on the 10-character IQVIA_ME, DHC and Data.gov on the full ME.
# Sources with no row for a physician contribute the 'None' sentinel.
THIS = (
    older_polos[['ME', 'IQVIA_ME']]
    .merge(symphony, how='left', left_on='IQVIA_ME', right_on='SYM_ME')
    .merge(iqvia, how='left', left_on='IQVIA_ME', right_on='ME', suffixes=['', '_iqvia'])
    .merge(dhc, how='left', on='ME')
    .merge(gov, how='left', on='ME')
    [['ME', 'IQVIA_ME', 'SYM_PHONE', 'IQVIA_PHONE', 'DHC_PHONE', 'GOV_PHONE']]
    .drop_duplicates()
    .fillna('None')
)

In [None]:
THIS

In [None]:
# Consensus phone per row of THIS.
# Sources are examined in priority order Symphony, DHC, IQVia, DataGov. The first source
# whose (non-'None') number is repeated by at least one later source is the "anchor":
# its number is kept, MATCHES lists the anchor plus every later source that agrees, and
# MATCHED is the number of agreeing sources beyond the anchor.
source_names = ('Symphony', 'DHC', 'IQVia', 'DataGov')
dict_list = []
for record in THIS.itertuples():
    numbers = (record.SYM_PHONE, record.DHC_PHONE, record.IQVIA_PHONE, record.GOV_PHONE)
    # DataGov is last in the order, so it can confirm a number but never anchor one.
    for anchor in range(len(numbers) - 1):
        candidate = numbers[anchor]
        if candidate == 'None':
            continue
        agreeing = [source_names[anchor]] + [
            source_names[later]
            for later in range(anchor + 1, len(numbers))
            if candidate == numbers[later]
        ]
        if len(agreeing) > 1:
            dict_list.append({
                'ME': record.ME,
                'IQVIA_ME': record.IQVIA_ME,
                'PHONE': candidate,
                'MATCHED': len(agreeing) - 1,
                'MATCHES': ', '.join(agreeing),
            })
            break

In [None]:
phone_matches = pd.DataFrame(dict_list)

In [None]:
phones = phone_matches.drop_duplicates().sort_values('MATCHED').drop_duplicates('ME',keep='last')

In [None]:
def clean_address_two(add_1):
    """Format a second address line for appending to the first line.

    Args:
        add_1: The second address line. Any value is accepted; it is converted
            with ``str`` before being inspected.

    Returns:
        ``' '`` when the value is blank or a missing-value placeholder
        ('None' / 'NaN' in any letter case), otherwise ``','`` followed by the
        stripped text, ready to be concatenated directly after address line 1.
    """
    text = str(add_1).strip()
    # Placeholders are compared case-insensitively, and a blank line is treated the
    # same way so that it does not produce a dangling comma in the assembled address.
    if text == '' or text.upper() in ('NONE', 'NAN'):
        return ' '
    return ',' + text

def is_a_match(thing_1, thing_2, threshold=75):
    """Decide whether two strings refer to the same thing, allowing fuzzy agreement.

    Args:
        thing_1: First string.
        thing_2: Second string.
        threshold: ``fuzz.ratio`` score that must be exceeded for a fuzzy match.

    Returns:
        True when the strings are equal, when one contains the other, or when
        their ``fuzz.ratio`` is above ``threshold``; otherwise False.
    """
    if thing_1 == thing_2:
        return True
    # An empty string is a substring of every string, so without this guard a blank
    # value would "match" anything it is compared with.
    if not thing_1 or not thing_2:
        return False
    if thing_1 in thing_2 or thing_2 in thing_1:
        return True
    return fuzz.ratio(thing_1, thing_2) > threshold

def is_match(thing_1, thing_2):
    """Decide whether two strings agree exactly or by containment (no fuzzy scoring).

    Args:
        thing_1: First string.
        thing_2: Second string.

    Returns:
        True when the strings are equal or one contains the other; otherwise False.
    """
    if thing_1 == thing_2:
        return True
    # An empty string is a substring of every string, so without this guard a blank
    # value would "match" anything it is compared with.
    if not thing_1 or not thing_2:
        return False
    return thing_1 in thing_2 or thing_2 in thing_1

def error_handle(parsed_string):
    """Collapse a sequence of (token, label) pairs into one token per label.

    When a label occurs more than once, the longest token seen for it is kept;
    on a tie in length the earlier token wins.

    Args:
        parsed_string: Iterable of indexable pairs, token at index 0 and label
            at index 1.

    Returns:
        dict mapping each label to its retained token, in order of first appearance.
    """
    longest_by_label = {}
    for pair in parsed_string:
        token, label = pair[0], pair[1]
        # Replace only when strictly longer, so ties keep the token stored first.
        if label not in longest_by_label or len(token) > len(longest_by_label[label]):
            longest_by_label[label] = token
    return longest_by_label

# def get_all_keys(moar):
#     all_keys =[]
#     for row in moar.itertuples():
#         addr_2 = clean_address_two(row.ADDRESS_2)
#         address = f'{row.ADDRESS_1}{addr_2}, {row.CITY}, {row.STATE}'
#         try:
#             new_dict = usaddress.tag(address)[0]
#         except usaddress.RepeatedLabelError as e:
#             print(e.original_string)
#             new_dict = error_handle(e.parsed_string)
#             print('')
#         le_keys = list(new_dict.keys())
#         for key in le_keys:
#             if key not in all_keys:
#                 all_keys.append(key)
#         return (all_keys)
    
# def parse_address(moar):
#     dict_list = []
#     for row in moar.itertuples():
#         new_dict = {}
#         new_dict['ME'] = row.ME
#         new_dict['IQVIA_ME'] = row.IQVIA_ME
#         new_dict['PHONE'] = row.PHONE
#         new_dict['ZIP'] = row.ZIP
#         addr_2 = clean_address_two(row.ADDRESS_2)
#         address = f'{row.ADDRESS_1}{addr_2}, {row.CITY}, {row.STATE}'
#         try:
#             address_dict = usaddress.tag(address)[0]
#         except usaddress.RepeatedLabelError as e:
#             print(e.original_string)
#             address_dict = error_handle(e.parsed_string)
#             print('')
#         dict_list.append(new_dict)
#     return (dict_list)

# LE_KEYS = get_all_keys(all_file)
# new_list = parse_address(all_file, LE_KEYS)

In [None]:
def count_matches(THIS):
    """Find, per row, the first pair of sources whose first address lines agree.

    Address line 1 of each source is upper-cased and stripped, then compared with
    ``is_a_match`` in the priority order Symphony, DHC, IQVia, DataGov. Rows in
    which no two sources agree produce no output.

    Args:
        THIS: DataFrame with string columns ADDRESS_1_SYMPHONY, ADDRESS_1_DHC,
            ADDRESS_1_IQVIA and ADDRESS_1_GOV ('None' marks a missing value), plus
            ME and IQVIA_ME. STATE_PPMA is copied to the output when present.

    Returns:
        list of dicts with keys ME, IQVIA_ME, ADDRESS (the agreed address),
        MATCHED (number of agreeing sources beyond the first), ADDRESS_MATCHES
        (comma-separated source names) and STATE_PPMA.
    """
    dict_list = []
    for row in THIS.itertuples():
        count = 0
        address = 'None'
        symph_addr = row.ADDRESS_1_SYMPHONY.upper().strip()
        dhc_addr = row.ADDRESS_1_DHC.upper().strip()
        iqvia_addr = row.ADDRESS_1_IQVIA.upper().strip()
        gov_addr = row.ADDRESS_1_GOV.upper().strip()
        # The cheap sentinel test comes first so the fuzzy comparison is skipped
        # for missing values; the outcome of each condition is unchanged.
        if symph_addr != 'NONE' and is_a_match(symph_addr, dhc_addr):
            MATCHES = 'Symphony, DHC'
            address = symph_addr
            if is_a_match(dhc_addr, iqvia_addr):
                count = 2
                MATCHES = 'Symphony, DHC, IQVia'
                if is_a_match(dhc_addr, gov_addr):
                    count = 3
                    MATCHES = 'Symphony, DHC, IQVia, DataGov'
            elif is_a_match(dhc_addr, gov_addr):
                count = 2
                MATCHES = 'Symphony, DHC, DataGov'
            else:
                count = 1
        elif symph_addr != 'NONE' and is_a_match(symph_addr, iqvia_addr):
            address = symph_addr
            MATCHES = 'Symphony, IQVia'
            count = 1
            if is_a_match(symph_addr, gov_addr):
                count = 2
                MATCHES = 'Symphony, IQVia, DataGov'
        elif symph_addr != 'NONE' and is_a_match(symph_addr, gov_addr):
            address = symph_addr
            MATCHES = 'Symphony, DataGov'
            count = 1
        elif dhc_addr != 'NONE' and is_a_match(dhc_addr, iqvia_addr):
            address = iqvia_addr
            MATCHES = 'DHC, IQVia'
            count = 1
            # Same matcher as every other comparison here; a plain == would miss
            # DataGov addresses that differ only slightly in spelling.
            if is_a_match(dhc_addr, gov_addr):
                MATCHES = 'DHC, IQVia, DataGov'
                count = 2
        elif dhc_addr != 'NONE' and is_a_match(dhc_addr, gov_addr):
            address = gov_addr
            MATCHES = 'DHC, DataGov'
            count = 1
        elif gov_addr != 'NONE' and is_a_match(gov_addr, iqvia_addr):
            address = gov_addr
            MATCHES = 'IQVia, DataGov'
            count = 1

        if address != 'None':
            dict_list.append({
                'ME': row.ME,
                'IQVIA_ME': row.IQVIA_ME,
                'ADDRESS': address,
                'MATCHED': count,
                'ADDRESS_MATCHES': MATCHES,
                # The row's actual state, not the literal column name; 'None' when
                # the input frame carries no STATE_PPMA column.
                'STATE_PPMA': getattr(row, 'STATE_PPMA', 'None'),
            })
    return dict_list

In [None]:
this = older_polos[['ME','IQVIA_ME','STATE_PPMA','STATE_POLO']]
this = pd.merge(this, symphony, left_on='IQVIA_ME', right_on='SYM_ME', how='left')
this = pd.merge(this, iqvia, left_on='IQVIA_ME', right_on='ME', how='left', suffixes = ['','_iqvia'])
this = pd.merge(this, dhc, on='ME', how='left')
this = pd.merge(this, gov, on='ME', how='left').drop_duplicates()

In [None]:
this = this.fillna('None')
ADD_LIST = count_matches(this)

In [None]:
ADDRESSES = pd.DataFrame(ADD_LIST)

In [None]:
addresses = ADDRESSES.sort_values('MATCHED').drop_duplicates('ME', keep='last')

In [None]:
alls = pd.merge(addresses, phones, on=['ME','IQVIA_ME'], how='outer', suffixes=['_ADDRESS','_PHONE']).drop_duplicates()

In [None]:
alls

In [None]:
alls.sort_values('MATCHED_PHONE', ascending=False)

In [None]:
TEES = pd.merge(alls, THIS, on=['ME','IQVIA_ME']).sort_values('MATCHED_PHONE', ascending=False)

In [None]:
XX = pd.merge(TEES, this, on=['ME','IQVIA_ME']).sort_values('MATCHED_PHONE', ascending=False)

In [None]:
fuzzed_xx = pd.merge(fuzzed_cont, this, on=['ME','IQVIA_ME'])

In [None]:
this[['ME', 'IQVIA_ME', 'ADDRESS_2_SYMPHONY', 'ZIP_SYMPHONY',  'ADDRESS_1_IQVIA', 
       'ZIP_IQVIA', 'ZIP_DHC', 'ADDRESS_1_DHC', 'First Name', 'Physician Name',
       'Middle Name', 'Primary Specialty', 'Primary Hospital Affiliation',
       'Last Name', 'ADDRESS_2_DHC', 'DHC_PHONE', 'NPI_y', ' Ind_PAC_ID',
       ' Ind_enrl_ID', ' lst_nm', ' frst_nm', ' mid_nm', ' suff', ' gndr',
       ' Cred', ' Med_sch', ' Grd_yr', ' pri_spec', ' sec_spec_1',
       ' sec_spec_2', ' sec_spec_3', ' sec_spec_4', ' sec_spec_all', ' org_nm',
       ' org_pac_id', ' num_org_mem', 'ADDRESS_1_GOV', 'ADDRESS_2_GOV',
       ' ln_2_sprs', 'CITY_GOV', 'STATE_GOV', 'ZIP_GOV', ' phn_numbr',
       ' hosp_afl_1', ' hosp_afl_lbn_1', ' hosp_afl_2', ' hosp_afl_lbn_2',
       ' hosp_afl_3', ' hosp_afl_lbn_3', ' hosp_afl_4', ' hosp_afl_lbn_4',
       ' hosp_afl_5', ' hosp_afl_lbn_5', ' ind_assgn', ' grp_assgn',
       ' adrs_id', 'PARTY_ID', 'GOV_PHONE']]

In [None]:
iqvia[iqvia.ME.isin(TEES.IQVIA_ME)==False].drop_duplicates('ME')

In [None]:
TEES.drop_duplicates('ME')

In [None]:
phone_matches.drop_duplicates('ME')

In [None]:
8161+5294

In [None]:
pd.merge(testt, older_polos, on=['ME','IQVIA_ME'])[['ADDRESS','STATE_DHC','STATE_IQVIA','STATE_SYMPHONY','STATE_GOV','STATE_POLO','STATE_PPMA','ADDRESS_MATCHES']].drop_duplicates()

In [None]:
older_polos.columns

In [None]:
len(older_polos)

In [None]:
testt = XX[['ME', 'IQVIA_ME', 'ADDRESS', 'MATCHED_ADDRESS', 'ADDRESS_MATCHES',
       'PHONE_x', 'MATCHED_PHONE', 'MATCHES', 'STATE_SYMPHONY', 'STATE_IQVIA', 'STATE_DHC',
       'STATE_GOV']].drop_duplicates()

In [None]:
testt

In [None]:
XX.columns

In [None]:
XX = XX.fillna('None')

In [None]:
XX.drop_duplicates()

In [None]:
def find_all_matches(frame=None):
    """Pick the full address and phone from the source that supplied the agreed address.

    For every row whose ADDRESS_MATCHES is not 'None', the sources named in
    ADDRESS_MATCHES are checked in the order IQVia, Symphony, DataGov; the first
    one whose upper-cased, stripped ADDRESS_1 equals the agreed ADDRESS supplies
    address line 2, city, state and ZIP. The phone is the consensus PHONE_x, or
    that source's own phone when no consensus phone exists. Rows in which none of
    the named sources carries the agreed address are skipped. Once an ME has been
    emitted with a phone, later rows for that ME are skipped.

    Args:
        frame: DataFrame to scan. Defaults to the notebook-level ``XX`` so existing
            zero-argument calls keep working.

    Returns:
        tuple ``(mes, dictss)``: the list of ME numbers emitted with a phone, and
        the list of result dicts (one per emitted row).
    """
    if frame is None:
        frame = XX
    # (name used in ADDRESS_MATCHES, column suffix of the source, its phone column)
    source_fields = (
        ('IQVia', 'IQVIA', 'IQVIA_PHONE_x'),
        ('Symphony', 'SYMPHONY', 'SYM_PHONE_x'),
        ('DataGov', 'GOV', 'GOV_PHONE_x'),
    )
    dictss = []
    mes = []
    resolved = set()  # same content as mes, kept for constant-time membership tests
    for row in frame.itertuples():
        if row.ADDRESS_MATCHES == 'None':
            continue
        if row.ME in resolved:
            continue
        sources = row.ADDRESS_MATCHES
        address = row.ADDRESS.upper().strip()
        for label, suffix, phone_column in source_fields:
            if label in sources and getattr(row, f'ADDRESS_1_{suffix}').upper().strip() == address:
                break
        else:
            # No named source carries the agreed address on this row, so there is
            # nothing reliable to copy city/state/ZIP from.
            continue
        phone = row.PHONE_x
        phone_source = 'Multiple'
        if phone == 'None':
            phone = getattr(row, phone_column)
            phone_source = label
        dictss.append({
            'ME': row.ME,
            'ME_IQVIA': row.IQVIA_ME,
            'ADDRESS_MATCHES': row.ADDRESS_MATCHES,
            'PHONE_MATCHES': row.MATCHES,
            'ADDRESS_1': address,
            # Line 2 comes from the same source as line 1.
            'ADDRESS_2': getattr(row, f'ADDRESS_2_{suffix}'),
            'CITY': getattr(row, f'CITY_{suffix}'),
            'STATE': getattr(row, f'STATE_{suffix}'),
            'ZIPCODE': getattr(row, f'ZIP_{suffix}'),
            'PHONE': phone,
            'PHONE_SOURCE': phone_source,
            'ADDRESS_SOURCE': label,
        })
        if phone != 'None':
            mes.append(row.ME)
            resolved.add(row.ME)
    # Returned only after every row has been examined.
    return mes, dictss

In [None]:
# find_all_matches() builds its results in local variables, so they have to be
# captured from its return value before they can be used here.
mes, dictss = find_all_matches()
# Keep one row per (physician, address) pair.
sample = pd.DataFrame(dictss).drop_duplicates(['ME','ADDRESS_1'])

In [None]:
sample

In [None]:
XX[XX.ADDRESS_MATCHES == 'None'][['ME','MATCHES',  'ADDRESS_1_SYMPHONY', 'CITY_SYMPHONY', 'STATE_SYMPHONY', 'ADDRESS_1_IQVIA',
       'CITY_IQVIA', 'STATE_IQVIA', 'STATE_DHC',
       'CITY_DHC', 'ADDRESS_1_DHC', 'ADDRESS_1_GOV', 'CITY_GOV', 'STATE_GOV']]

In [None]:
no_address = XX[XX.ADDRESS_MATCHES == 'None']

In [None]:
def count_fuzzy_matches(THIS):
    """Find, per row, the first pair of sources whose first address lines agree.

    Address line 1 of each source is upper-cased and stripped, then compared with
    ``is_a_match`` in the priority order Symphony, DHC, IQVia, DataGov. Rows in
    which no two sources agree produce no output. Unlike ``count_matches``, the
    output also carries every source's normalised address for inspection.

    Args:
        THIS: DataFrame with string columns ADDRESS_1_SYMPHONY, ADDRESS_1_DHC,
            ADDRESS_1_IQVIA and ADDRESS_1_GOV ('None' marks a missing value), plus
            ME and IQVIA_ME.

    Returns:
        list of dicts with keys ME, IQVIA_ME, ADDRESS (the agreed address),
        DHC_ADDRESS, SYM_ADDRESS, IQV_ADDRESS, GOV_ADDRESS, MATCHED (number of
        agreeing sources beyond the first) and ADDRESS_MATCHES.
    """
    dict_list = []
    for row in THIS.itertuples():
        count = 0
        address = 'None'
        symph_addr = row.ADDRESS_1_SYMPHONY.upper().strip()
        dhc_addr = row.ADDRESS_1_DHC.upper().strip()
        iqvia_addr = row.ADDRESS_1_IQVIA.upper().strip()
        gov_addr = row.ADDRESS_1_GOV.upper().strip()
        # The cheap sentinel test comes first so the fuzzy comparison is skipped
        # for missing values; the outcome of each condition is unchanged.
        if symph_addr != 'NONE' and is_a_match(symph_addr, dhc_addr):
            MATCHES = 'Symphony, DHC'
            address = symph_addr
            if is_a_match(dhc_addr, iqvia_addr):
                count = 2
                MATCHES = 'Symphony, DHC, IQVia'
                if is_a_match(dhc_addr, gov_addr):
                    count = 3
                    MATCHES = 'Symphony, DHC, IQVia, DataGov'
            elif is_a_match(dhc_addr, gov_addr):
                count = 2
                MATCHES = 'Symphony, DHC, DataGov'
            else:
                count = 1
        elif symph_addr != 'NONE' and is_a_match(symph_addr, iqvia_addr):
            address = symph_addr
            MATCHES = 'Symphony, IQVia'
            count = 1
            if is_a_match(symph_addr, gov_addr):
                count = 2
                MATCHES = 'Symphony, IQVia, DataGov'
        elif symph_addr != 'NONE' and is_a_match(symph_addr, gov_addr):
            address = symph_addr
            MATCHES = 'Symphony, DataGov'
            count = 1
        elif dhc_addr != 'NONE' and is_a_match(dhc_addr, iqvia_addr):
            address = iqvia_addr
            MATCHES = 'DHC, IQVia'
            count = 1
            if is_a_match(iqvia_addr, gov_addr):
                MATCHES = 'DHC, IQVia, DataGov'
                count = 2
        elif dhc_addr != 'NONE' and is_a_match(dhc_addr, gov_addr):
            address = gov_addr
            MATCHES = 'DHC, DataGov'
            count = 1
        # IQVia/DataGov is the last pair in the priority order; without this branch
        # rows where only those two sources agree would be dropped.
        elif iqvia_addr != 'NONE' and is_a_match(iqvia_addr, gov_addr):
            address = iqvia_addr
            MATCHES = 'IQVia, DataGov'
            count = 1

        if address != 'None':
            dict_list.append({
                'ME': row.ME,
                'IQVIA_ME': row.IQVIA_ME,
                'ADDRESS': address,
                'DHC_ADDRESS': dhc_addr,
                'SYM_ADDRESS': symph_addr,
                'IQV_ADDRESS': iqvia_addr,
                'GOV_ADDRESS': gov_addr,
                'MATCHED': count,
                'ADDRESS_MATCHES': MATCHES,
            })
    return dict_list

In [None]:
boom = count_fuzzy_matches(no_address)

In [None]:
pd.DataFrame(boom).sort_values('MATCHED').drop_duplicates('ME', keep='last').to_csv("../../Data/POLO_Filter/Match_Test.csv", index=False)

In [None]:
FUZZED = pd.DataFrame(boom).sort_values('MATCHED').drop_duplicates('ME', keep='last')

In [None]:
THIS

In [None]:
fuzzed_cont = pd.merge(FUZZED, phone_matches, on=['ME','IQVIA_ME'], suffixes=['_ADDRESSES','_PHONES']).drop_duplicates()

In [None]:
#no phones?
#still no phones
#phones match but addresses do not
#100 mile check
#license check

In [None]:
# Second pass over the fuzzy-matched rows: for each row, take address line 2, city,
# state and ZIP from the source whose ADDRESS_1 equals the agreed address.
dicts_2 = []
mes_2 = []
# (name used in ADDRESS_MATCHES, column suffix of the source, its phone column)
# NOTE(review): the *_PHONE_x columns are only read when PHONE_x is 'None' — confirm
# they exist under these names in fuzzed_xx.
source_fields = (
    ('IQVia', 'IQVIA', 'IQVIA_PHONE_x'),
    ('Symphony', 'SYMPHONY', 'SYM_PHONE_x'),
    ('DataGov', 'GOV', 'GOV_PHONE_x'),
)
for row in fuzzed_xx.itertuples():
    if row.ADDRESS_MATCHES == 'None':
        print('Addres mismatch')
        continue
    # Skip physicians already emitted with a phone in this pass.
    # NOTE(review): the original tested an undefined name `mes`; mes_2 is assumed to
    # be the intended list — confirm.
    if row.ME in mes_2:
        print('already found')
        continue
    sources = row.ADDRESS_MATCHES
    address = row.ADDRESS.upper().strip()
    for label, suffix, phone_column in source_fields:
        if label in sources and getattr(row, f'ADDRESS_1_{suffix}').upper().strip() == address:
            break
    else:
        # No named source carries the agreed address on this row; without this guard
        # the fields below would be undefined or left over from the previous row.
        continue
    phone = row.PHONE_x
    phone_source = 'Multiple'
    if phone == 'None':
        phone = getattr(row, phone_column)
        phone_source = label
    NEW_DICT = {
        'ME': row.ME,
        'ME_IQVIA': row.IQVIA_ME,
        'ADDRESS_MATCHES': row.ADDRESS_MATCHES,
        'PHONE_MATCHES': row.MATCHES,
        'ADDRESS_1': address,
        # Line 2 comes from the same source as line 1.
        'ADDRESS_2': getattr(row, f'ADDRESS_2_{suffix}'),
        'CITY': getattr(row, f'CITY_{suffix}'),
        'STATE': getattr(row, f'STATE_{suffix}'),
        'ZIPCODE': getattr(row, f'ZIP_{suffix}'),
        'PHONE': phone,
        'PHONE_SOURCE': phone_source,
        'ADDRESS_SOURCE': label
    }
    dicts_2.append(NEW_DICT)
    if phone!='None':
        mes_2.append(row.ME)

In [None]:
pd.DataFrame(dicts_2).drop_duplicates()

In [None]:
dhc

In [None]:
dfs = [dhc, gov, older_polos]
df_final = reduce(lambda left,right: pd.merge(left,right,on='ME'), dfs)

In [None]:
df1.merge(df2,on='name').merge(df3,on='name')

df_final

In [None]:
THIS = older_polos[['ME','IQVIA_ME']]
THIS = pd.merge(THIS, symphony, left_on='IQVIA_ME', right_on='SYM_ME', how='left')
THIS = pd.merge(THIS, iqvia, left_on='IQVIA_ME', right_on='ME', how='left', suffixes = ['','_iqvia'])
THIS = pd.merge(THIS, dhc, on='ME', how='left')
THIS = pd.merge(THIS, gov, on='ME', how='left')[['ME','IQVIA_ME', 'SYM_PHONE','IQVIA_PHONE', 'DHC_PHONE', 'GOV_PHONE']].drop_duplicates()

In [None]:
gov.columns

In [None]:
def universalize_columns(df, source):
    """Add source-independent copies of a source's address/phone columns, in place.

    Copies ``ADDRESS_1_<source>``, ``ADDRESS_2_<source>``, ``CITY_<source>``,
    ``STATE_<source>``, ``ZIP_<source>`` and ``<source>_PHONE`` into ADDRESS_1,
    ADDRESS_2, CITY, STATE, ZIPCODE and PHONE, and adds a KEY column made of the
    row position followed by the source name.

    Args:
        df: DataFrame to modify; it is mutated and nothing is returned.
        source: Source name used in the column names (e.g. 'DHC').
    """
    generic_to_specific = (
        ('ADDRESS_1', f'ADDRESS_1_{source}'),
        ('ADDRESS_2', f'ADDRESS_2_{source}'),
        ('CITY', f'CITY_{source}'),
        ('STATE', f'STATE_{source}'),
        ('ZIPCODE', f'ZIP_{source}'),
        ('PHONE', f'{source}_PHONE'),
    )
    for generic, specific in generic_to_specific:
        df[generic] = df[specific]
    # Positional (not index-based) row number, so keys are unique within a source.
    df['KEY'] = [str(position) + source for position in range(len(df))]

In [None]:
source_list = [
    {'SOURCE':'DHC',
    'DATA':dhc},
    {'SOURCE':'IQVIA',
    'DATA':iqvia},
    {'SOURCE':'SYMPHONY',
    'DATA':symphony},
    {'SOURCE':'GOV',
    'DATA':gov}
]

In [None]:
symphony['SYMPHONY_PHONE']=symphony.SYM_PHONE
symphony['IQVIA_ME'] = symphony.SYM_ME
iqvia['IQVIA_ME'] = iqvia.ME
dhc['IQVIA_ME'] = [x[0:10] for x in fix_me(dhc.ME)]
gov['IQVIA_ME'] = [x[0:10] for x in fix_me(gov.ME)]

In [None]:
for source_dict in source_list:
    universalize_columns(source_dict['DATA'], source_dict['SOURCE'])

In [None]:
def fix(component):
    """Return ``component`` with surrounding whitespace removed and letters upper-cased."""
    return component.strip().upper()

In [None]:
def fix_zipcode(num):
    """Normalise a ZIP code to its 5-digit form.

    Args:
        num: ZIP code as a string or number; floats such as ``2101.0`` (a ZIP that
            was read as a number) and ZIP+4 values are accepted.

    Returns:
        The 5-digit ZIP as a string, left-padded with zeros. Values longer than
        five digits have their last four digits (the +4 part) removed first.
        Returns ``''`` when the value contains no digits.
    """
    text = str(num).strip()
    # Drop a float-style fraction only when it is a trailing run of zeros;
    # removing '.0' anywhere in the string would delete real digits.
    whole, dot, fraction = text.partition('.')
    if dot and fraction.strip('0') == '':
        text = whole
    digits = ''.join(filter(str.isdigit, text))
    if len(digits) > 5:
        digits = digits[:-4]
    # Restore leading zeros lost when the ZIP was stored as a number.
    if digits:
        digits = digits.zfill(5)
    return digits

In [None]:
def parse_address(moar, source):
    """Tag every address in a universalised source frame with ``usaddress``.

    For each row the string "ADDRESS_1[,ADDRESS_2], CITY, STATE" is assembled and
    tagged. When tagging raises ``RepeatedLabelError`` the offending string is
    printed and the repeated labels are collapsed with ``error_handle``. KEY,
    PHONE, ZIPCODE and a normalised ZIP are added to each tagged record.

    Args:
        moar: DataFrame with ADDRESS_1, ADDRESS_2, CITY, STATE, ZIPCODE, PHONE, KEY
            and IQVIA_ME columns.
        source: Source name appended (as ``_<source>``) to every output column
            except IQVIA_ME.

    Returns:
        DataFrame with one row per input row, columns that are entirely empty
        removed, suffixed column names, and an unsuffixed IQVIA_ME column.
    """
    tagged_rows = []
    me_numbers = []
    for record in moar.itertuples():
        second_line = clean_address_two(record.ADDRESS_2)
        full_address = '{}{}, {}, {}'.format(
            fix(record.ADDRESS_1), second_line, fix(record.CITY), fix(record.STATE)
        )
        try:
            tagged = usaddress.tag(full_address)[0]
        except usaddress.RepeatedLabelError as repeated:
            print(repeated.original_string)
            tagged = error_handle(repeated.parsed_string)
            print('')
        tagged.update({
            'KEY': record.KEY,
            'PHONE': record.PHONE,
            'ZIPCODE': record.ZIPCODE,
            'ZIP': fix_zipcode(record.ZIPCODE),
        })
        tagged_rows.append(tagged)
        me_numbers.append(record.IQVIA_ME)
    parsed_df = pd.DataFrame(tagged_rows).dropna(how='all', axis=1)
    parsed_df = parsed_df.add_suffix(f'_{source}')
    # Added after the suffixing so the join key keeps its plain name.
    parsed_df['IQVIA_ME'] = me_numbers
    return parsed_df

In [None]:
for source_dict in source_list:
    source_dict['PARSED_DATA'] = parse_address(source_dict['DATA'], source_dict['SOURCE'])
all_the_data = source_list[0]['PARSED_DATA'].merge(source_list[1]['PARSED_DATA'],on='IQVIA_ME',how='outer').merge(source_list[2]['PARSED_DATA'],on='IQVIA_ME',how='outer').merge(source_list[3]['PARSED_DATA'],on='IQVIA_ME',how='outer')
all_the_data['ROW_KEY'] = list(range(0,len(all_the_data)))
all_the_data = all_the_data.fillna('None')

In [None]:
# Consensus phone per row of all_the_data.
# Sources are examined in priority order Symphony, DHC, IQVia, DataGov. The first source
# whose (non-'None') number is repeated by at least one later source is the "anchor":
# its number is kept, MATCHES lists the anchor plus every later source that agrees, and
# MATCHED is the number of agreeing sources beyond the anchor.
source_names = ('Symphony', 'DHC', 'IQVia', 'DataGov')
dict_list = []
for record in all_the_data.itertuples():
    numbers = (record.PHONE_SYMPHONY, record.PHONE_DHC, record.PHONE_IQVIA, record.PHONE_GOV)
    # DataGov is last in the order, so it can confirm a number but never anchor one.
    for anchor in range(len(numbers) - 1):
        candidate = numbers[anchor]
        if candidate == 'None':
            continue
        agreeing = [source_names[anchor]] + [
            source_names[later]
            for later in range(anchor + 1, len(numbers))
            if candidate == numbers[later]
        ]
        if len(agreeing) > 1:
            dict_list.append({
                'ROW_KEY': record.ROW_KEY,
                'IQVIA_ME': record.IQVIA_ME,
                'PHONE': candidate,
                'MATCHED': len(agreeing) - 1,
                'MATCHES': ', '.join(agreeing),
            })
            break

In [None]:
phone_matches = pd.DataFrame(dict_list)

In [None]:
phone_matches

In [None]:
all_the_data

In [None]:
def count_matches(THIS):
    """Find, per row, the first pair of sources whose parsed street addresses are equal.

    Each source's address is ``AddressNumber_<SRC> + StreetName_<SRC>``; a source
    with both parts missing yields 'NoneNone' and cannot start a match. Sources are
    examined in the priority order Symphony, DHC, IQVia, DataGov: the first one
    whose address is repeated by a later source is kept, together with the names
    of every later source that agrees.

    Args:
        THIS: DataFrame with AddressNumber_* and StreetName_* string columns for
            SYMPHONY, DHC, IQVIA and GOV, plus ROW_KEY and IQVIA_ME.

    Returns:
        list of dicts with keys ROW_KEY, IQVIA_ME, ADDRESS, MATCHED (number of
        agreeing sources beyond the first) and ADDRESS_MATCHES.
    """
    source_names = ('Symphony', 'DHC', 'IQVia', 'DataGov')
    results = []
    for record in THIS.itertuples():
        combined = (
            record.AddressNumber_SYMPHONY + record.StreetName_SYMPHONY,
            record.AddressNumber_DHC + record.StreetName_DHC,
            record.AddressNumber_IQVIA + record.StreetName_IQVIA,
            record.AddressNumber_GOV + record.StreetName_GOV,
        )
        # DataGov is last in the order, so it can confirm an address but never anchor one.
        for anchor in range(len(combined) - 1):
            candidate = combined[anchor]
            if candidate == 'NoneNone':
                continue
            agreeing = [source_names[anchor]] + [
                source_names[later]
                for later in range(anchor + 1, len(combined))
                if candidate == combined[later]
            ]
            if len(agreeing) > 1:
                # An agreed value that is literally 'None' is not reported.
                if candidate != 'None':
                    results.append({
                        'ROW_KEY': record.ROW_KEY,
                        'IQVIA_ME': record.IQVIA_ME,
                        'ADDRESS': candidate,
                        'MATCHED': len(agreeing) - 1,
                        'ADDRESS_MATCHES': ', '.join(agreeing),
                    })
                break
    return results

In [None]:
gah = count_matches(all_the_data)

In [None]:
gah_match = pd.DataFrame(gah)

In [None]:
gah_match

In [None]:
def is_a_match(thing_1, thing_2, threshold=90):
    """Decide whether two strings refer to the same thing, allowing fuzzy agreement.

    Args:
        thing_1: First string.
        thing_2: Second string.
        threshold: ``fuzz.ratio`` score that must be exceeded for a fuzzy match.

    Returns:
        True when the strings are equal, when one contains the other, or when
        their ``fuzz.ratio`` is above ``threshold``; otherwise False.
    """
    if thing_1 == thing_2:
        return True
    # An empty string is a substring of every string, so without this guard a blank
    # value would "match" anything it is compared with.
    if not thing_1 or not thing_2:
        return False
    if thing_1 in thing_2 or thing_2 in thing_1:
        return True
    return fuzz.ratio(thing_1, thing_2) > threshold

def count_fuzzy_matches(THIS):
    """Find, per row, which sources agree on an address key.

    Each source's key is its address number concatenated with its street
    name. Source pairs are tested with ``is_a_match`` in a fixed order
    (Symphony/DHC first, IQVia/DataGov last); the first agreeing pair decides
    which key is reported.

    Args:
        THIS: DataFrame with ROW_KEY, IQVIA_ME and the AddressNumber_* /
            StreetName_* columns for SYMPHONY, DHC, IQVIA and GOV.

    Returns:
        list of dict: one record per row with an agreement, holding ROW_KEY,
        IQVIA_ME, ADDRESS (the agreed key), MATCHED (a counter between 1 and
        3) and ADDRESS_MATCHES (comma-separated source labels).
    """

    def _agreement(symph, dhc, iqvia, gov):
        # Returns (key, counter, label); key stays 'None' when nothing agrees.
        # A source whose key is 'NoneNone' has no address and cannot anchor a match.
        if is_a_match(symph, dhc) and symph != 'NoneNone':
            if is_a_match(dhc, iqvia):
                if is_a_match(dhc, gov):
                    return symph, 3, 'Symphony, DHC, IQVia, DataGov'
                return symph, 2, 'Symphony, DHC, IQVia'
            if is_a_match(dhc, gov):
                return symph, 2, 'Symphony, DHC, DataGov'
            return symph, 1, 'Symphony, DHC'
        if is_a_match(symph, iqvia) and symph != 'NoneNone':
            if is_a_match(symph, gov):
                return symph, 2, 'Symphony, IQVia, DataGov'
            return symph, 1, 'Symphony, IQVia'
        if is_a_match(symph, gov) and symph != 'NoneNone':
            return symph, 1, 'Symphony, DataGov'
        if is_a_match(dhc, iqvia) and dhc != 'NoneNone':
            # The IQVia key is reported for DHC/IQVia agreements.
            if is_a_match(iqvia, gov):
                return iqvia, 2, 'DHC, IQVia, DataGov'
            return iqvia, 1, 'DHC, IQVia'
        if is_a_match(dhc, gov) and dhc != 'NoneNone':
            return gov, 1, 'DHC, DataGov'
        if is_a_match(iqvia, gov) and iqvia != 'NoneNone':
            return iqvia, 1, 'IQVia, DataGov'
        return 'None', 0, None

    records = []
    for entry in THIS.itertuples():
        symph_key = entry.AddressNumber_SYMPHONY + entry.StreetName_SYMPHONY
        dhc_key = entry.AddressNumber_DHC + entry.StreetName_DHC
        iqvia_key = entry.AddressNumber_IQVIA + entry.StreetName_IQVIA
        gov_key = entry.AddressNumber_GOV + entry.StreetName_GOV
        agreed_key, agreeing, labels = _agreement(symph_key, dhc_key, iqvia_key, gov_key)
        if agreed_key != 'None':
            records.append({
                'ROW_KEY': entry.ROW_KEY,
                'IQVIA_ME': entry.IQVIA_ME,
                'ADDRESS': agreed_key,
                'MATCHED': agreeing,
                'ADDRESS_MATCHES': labels
            })
    return records

In [None]:
# Apply the fuzzy address matcher to every row of all_the_data.
bah = count_fuzzy_matches(all_the_data)

In [None]:
# One row per record returned by count_fuzzy_matches
# (ROW_KEY, IQVIA_ME, ADDRESS, MATCHED, ADDRESS_MATCHES).
add_match = pd.DataFrame(bah)

In [None]:
# Outer join keeps rows that appear in only one of the two frames; column names
# present in both frames receive the _PHONE / _ADDRESS suffixes.
bopbopbop = pd.merge(phone_matches, add_match, on=['ROW_KEY','IQVIA_ME'],suffixes=['_PHONE','_ADDRESS'], how='outer')

In [None]:
# iqvia rows whose IQVIA_ME does not occur in bopbopbop, one row per ME.
iqvia[iqvia.IQVIA_ME.isin(bopbopbop.IQVIA_ME)==False].drop_duplicates('ME')

In [None]:
len(bopbopbop)

In [None]:
# NOTE(review): hard-coded scratch arithmetic; the origin of these two numbers is not shown in this section — confirm or remove.
15750+3514

In [None]:
# Attach the all_the_data columns to each match row (inner join on ROW_KEY and
# IQVIA_ME), then keep one row per (IQVIA_ME, PHONE, ADDRESS) combination.
REAL_DEAL = pd.merge(bopbopbop, all_the_data, on=['ROW_KEY', 'IQVIA_ME']).drop_duplicates(['IQVIA_ME','PHONE','ADDRESS'])

In [None]:
# Spot check: first all_the_data row whose ROW_KEY is 0.
all_the_data[all_the_data.ROW_KEY==0].iloc[0]

In [None]:
# NOTE(review): within this section KEY is only assigned by loops in later cells, so this cell appears to rely on out-of-order execution — confirm or remove.
KEY

In [None]:
# Spot check: CITY of the iqvia row with KEY '23190IQVIA'.
iqvia[iqvia.KEY=='23190IQVIA']['CITY'].values[0]

In [None]:
count =0
for row in bopbopbop.itertuples():
    if row.MATCHES != row.ADDRESS_MATCHES:
        count+=1

In [None]:
count

In [None]:
REAL_DEAL = REAL_DEAL.fillna('None')

In [None]:
# Build one output record for each REAL_DEAL row that has an address agreement.
# The full address is copied from the first source, in the order IQVia, DataGov,
# Symphony, that is named in ADDRESS_MATCHES and whose street name appears in
# the agreed address key. When the row has no agreed phone, the phone of that
# same source is used instead.
source_priority = [
    # (label used in ADDRESS_MATCHES, column suffix in REAL_DEAL, source frame)
    ('IQVia', 'IQVIA', iqvia),
    ('DataGov', 'GOV', gov),
    ('Symphony', 'SYMPHONY', symphony),
]
dictss = []
no_address_agreement = 0
unresolved_source = 0
for row in REAL_DEAL.itertuples():
    if row.ADDRESS_MATCHES == 'None':
        no_address_agreement += 1
        continue
    sources = row.ADDRESS_MATCHES
    address = row.ADDRESS
    chosen = None
    for src_label, src_suffix, src_frame in source_priority:
        if src_label in sources and getattr(row, 'StreetName_' + src_suffix) in address:
            chosen = (src_label, src_suffix, src_frame)
            break
    if chosen is None:
        # No listed source can supply this address: skip the row instead of
        # emitting a record assembled from another row's values.
        unresolved_source += 1
        continue
    src_label, src_suffix, src_frame = chosen
    KEY = getattr(row, 'KEY_' + src_suffix)
    # Look the source row up once; raises IndexError if KEY is absent.
    source_rows = src_frame[src_frame.KEY == KEY]
    phone = row.PHONE
    phone_source = 'Multiple'
    if phone == 'None':
        phone = getattr(row, 'PHONE_' + src_suffix)
        phone_source = src_label
    dictss.append({
        'ROW_KEY': row.ROW_KEY,
        'ME_IQVIA': row.IQVIA_ME,
        'ADDRESS_MATCHES': row.ADDRESS_MATCHES,
        'PHONE_MATCHES': row.MATCHES,
        'ADDRESS_1': source_rows['ADDRESS_1'].values[0],
        'ADDRESS_2': source_rows['ADDRESS_2'].values[0],
        'CITY': source_rows['CITY'].values[0],
        'STATE': getattr(row, 'StateName_' + src_suffix),
        'ZIPCODE_FULL': getattr(row, 'ZIPCODE_' + src_suffix),
        'ZIPCODE': getattr(row, 'ZIP_' + src_suffix),
        'PHONE': phone,
        'PHONE_SOURCE': phone_source,
        'ADDRESS_SOURCE': src_label,
        'ADDRESS_KEY': address,
        'DATA_KEY': KEY
    })
# One summary line instead of one print per skipped row.
print('Address mismatch: {} rows; no usable address source: {} rows'.format(
    no_address_agreement, unresolved_source))

In [None]:
# One row per record collected in dictss.
good_data = pd.DataFrame(dictss)

In [None]:
# Records that ended up with a phone number.
# NOTE(review): extra_good_data is not read again in this section — confirm it is still needed.
extra_good_data = good_data[good_data.PHONE!='None']

In [None]:
# REAL_DEAL rows whose IQVIA_ME has no record in good_data.
no_adds = REAL_DEAL[REAL_DEAL.IQVIA_ME.isin(good_data.ME_IQVIA)==False]

In [None]:
no_adds

In [None]:
# Build one output record for each no_adds row. The phone is the agreed phone
# from the row; the address is copied from the first source, in the order
# IQVia, DataGov, Symphony, that is named in the row's MATCHES label.
source_priority = [
    # (label used in MATCHES, column suffix in no_adds, source frame)
    ('IQVia', 'IQVIA', iqvia),
    ('DataGov', 'GOV', gov),
    ('Symphony', 'SYMPHONY', symphony),
]
phone_dictss = []
unresolved_source = 0
for row in no_adds.itertuples():
    sources = row.MATCHES
    chosen = None
    for src_label, src_suffix, src_frame in source_priority:
        if src_label in sources:
            chosen = (src_label, src_suffix, src_frame)
            break
    if chosen is None:
        # None of the three address sources is named for this row: skip it
        # instead of emitting a record assembled from another row's values.
        unresolved_source += 1
        continue
    src_label, src_suffix, src_frame = chosen
    KEY = getattr(row, 'KEY_' + src_suffix)
    # Look the source row up once; raises IndexError if KEY is absent.
    source_rows = src_frame[src_frame.KEY == KEY]
    phone_dictss.append({
        'ROW_KEY': row.ROW_KEY,
        'ME_IQVIA': row.IQVIA_ME,
        'ADDRESS_MATCHES': row.ADDRESS_MATCHES,
        'PHONE_MATCHES': row.MATCHES,
        'ADDRESS_1': source_rows['ADDRESS_1'].values[0],
        'ADDRESS_2': source_rows['ADDRESS_2'].values[0],
        'CITY': source_rows['CITY'].values[0],
        'STATE': getattr(row, 'StateName_' + src_suffix),
        'ZIPCODE_FULL': getattr(row, 'ZIPCODE_' + src_suffix),
        'ZIPCODE': getattr(row, 'ZIP_' + src_suffix),
        'PHONE': row.PHONE,
        'PHONE_SOURCE': 'Multiple',
        'ADDRESS_SOURCE': src_label,
        # Same key layout as the matchers: address number followed by street name.
        'ADDRESS_KEY': getattr(row, 'AddressNumber_' + src_suffix) + getattr(row, 'StreetName_' + src_suffix),
        'DATA_KEY': KEY
    })
if unresolved_source:
    print('No usable address source: {} rows'.format(unresolved_source))

In [None]:
# Stack the address-agreement records on top of the phone-agreement records.
more_good_data = pd.concat([good_data, pd.DataFrame(phone_dictss)])

In [None]:
# Number of distinct ME_IQVIA values per PHONE_MATCHES label (first row per ME is counted).
more_good_data.drop_duplicates('ME_IQVIA').groupby('PHONE_MATCHES').count()[['ROW_KEY']]

In [None]:
# iqvia rows with a phone whose IQVIA_ME has no record in more_good_data;
# for each ME the row with the largest AFFIL_RANK is kept.
necessary_iqvia = iqvia[(iqvia.IQVIA_ME.isin(more_good_data.ME_IQVIA)==False)&(iqvia.PHONE!='None')].sort_values('AFFIL_RANK', ascending=False).drop_duplicates('ME', keep='first')

In [None]:
['ROW_KEY', 'ME_IQVIA', 'ADDRESS_MATCHES', 'PHONE_MATCHES', 'ADDRESS_1',
       'ADDRESS_2', 'CITY', 'STATE', 'ZIPCODE_FULL', 'ZIPCODE', 'PHONE',
       'PHONE_SOURCE', 'ADDRESS_SOURCE', 'ADDRESS_KEY', 'DATA_KEY']
necessary_iqvia.columns
necessary_iqvia['ROW_KEY']='None'


In [None]:
necessary_iqvia = pd.merge(necessary_iqvia, all_the_data, left_on='KEY',right_on='KEY_IQVIA').drop_duplicates('ME')

In [None]:
# Fill in the output columns for the IQVia-only rows in a single step:
# no agreement labels, placeholder ZIP codes, IQVia as the source of both
# phone and address, and keys taken from the merged (_x / _y) columns.
necessary_iqvia = necessary_iqvia.assign(
    ADDRESS_MATCHES='None',
    PHONE_MATCHES='None',
    ZIPCODE_FULL='None',
    ZIPCODE='None',
    PHONE_SOURCE='IQVia',
    ADDRESS_SOURCE='IQVia',
    ADDRESS_KEY=necessary_iqvia.AddressNumber_IQVIA_y + necessary_iqvia.StreetName_IQVIA_y,
    DATA_KEY=necessary_iqvia.KEY_IQVIA_x,
    ROW_KEY=necessary_iqvia.ROW_KEY_x,
    ME_IQVIA=necessary_iqvia.IQVIA_ME_x,
)

In [None]:
# Inspect the address keys built from the merged IQVia columns.
necessary_iqvia.AddressNumber_IQVIA_y + necessary_iqvia.StreetName_IQVIA_y

In [None]:
len(older_polos)

In [None]:
# Number of distinct ME values in iqvia.
len(iqvia.drop_duplicates('ME'))

In [None]:
more_good_data.drop_duplicates('ME_IQVIA')

In [None]:
# Replace the ZIP columns with the IQVia ZIP values carried through the merge.
necessary_iqvia['ZIPCODE_FULL'] = necessary_iqvia['ZIPCODE_IQVIA_x']
necessary_iqvia['ZIPCODE'] = necessary_iqvia['ZIP_IQVIA']

In [None]:
more_good_data.columns

In [None]:
# Stack the IQVia-only rows (restricted to the more_good_data columns) above more_good_data.
ALL_OF_IT = pd.concat([necessary_iqvia[more_good_data.columns], more_good_data])

In [None]:
# Spot check: the iqvia row(s) with KEY '11213IQVIA'.
iqvia[iqvia.KEY=='11213IQVIA']

In [None]:
more_good_data

In [None]:
# List every column of necessary_iqvia, one per line.
for col in necessary_iqvia.columns:
    print(col)

In [None]:
# Compare the _x / _y copies of the IQVia key and ME columns produced by the merge.
necessary_iqvia[['KEY_IQVIA_x','KEY_IQVIA_y','IQVIA_ME_x','IQVIA_ME_y']]

In [None]:
# Keep the last row for each ME_IQVIA; more_good_data rows come after the
# necessary_iqvia rows in ALL_OF_IT.
eek = ALL_OF_IT.drop_duplicates('ME_IQVIA', keep='last')

In [None]:
def get_present_employment():
    """Return the lookup from PE_CD code to its present-employment description.

    Returns:
        dict: integer code -> description string, in ascending code order.
    """
    code_descriptions = (
        (11, 'Self-Employed Solo Practice'),
        (13, 'Two Physician Practice-Full Or Part Owner'),
        (21, 'Other-Patient Care'),
        (22, 'Locum Tenens'),
        (30, 'Group Practice'),
        (35, 'HMO'),
        (40, 'Medical School'),
        (50, 'Non-Government Hospital'),
        (63, 'City/County/State Government-Hospital'),
        (64, 'City/County/State Government-Other Than Hospital'),
        (81, 'Federal Government-Hospital/Army'),
        (82, 'Federal Government-Hospital/Navy'),
        (83, 'Federal Government-Hospital/Air Force'),
        (84, 'Federal Government-Hospital/Usphs'),
        (85, 'Federal Government-Hospital/Vet Admin'),
        (86, 'Federal Government-Hospital/Other Agency'),
        (101, 'Other/Non-Patient Care'),
        (110, 'No Classification'),
    )
    return dict(code_descriptions)

def humach_samplify(data):
    """Reshape a physician frame into the Humach sample layout.

    A DESCRIPTION column is derived from PE_CD through
    ``get_present_employment``, the Humach columns are selected in a fixed
    order, and the address/phone columns are renamed to their POLO_* /
    TELEPHONE_NUMBER names.

    Args:
        data: DataFrame containing ME, the name columns, ADDRESS_1, ADDRESS_2,
            CITY, STATE, ZIPCODE, PHONE, PRIM_SPEC_CD, PE_CD and FAX_NUMBER.
            It is not modified.

    Returns:
        DataFrame: a new frame with the Humach columns.

    Raises:
        KeyError: if a PE_CD value is not a known employment code, or a
            required column is missing.
    """
    present_employment_key = get_present_employment()
    # Work on a copy so the caller's frame does not silently gain a
    # DESCRIPTION column.
    described = data.assign(
        DESCRIPTION=[present_employment_key[x] for x in data.PE_CD]
    )
    new_columns = {
        'ADDRESS_1':'POLO_MAILING_LINE_1',
        'ADDRESS_2':'POLO_MAILING_LINE_2',
        'CITY':'POLO_CITY',
        'STATE':'POLO_STATE',
        'ZIPCODE':'POLO_ZIP',
        'PHONE':'TELEPHONE_NUMBER'}
    humach_columns = [
        'ME',
        'FIRST_NAME',
        'MIDDLE_NAME',
        'LAST_NAME',
        'SUFFIX',
        'ADDRESS_1',
        'ADDRESS_2',
        'CITY',
        'STATE',
        'ZIPCODE',
        'PHONE',
        'PRIM_SPEC_CD',
        'DESCRIPTION',
        'PE_CD',
        'FAX_NUMBER'
    ]
    humach_data = described[humach_columns].rename(columns = new_columns)
    return humach_data

In [None]:
more_good_data.columns

In [None]:
# Columns taken from older_polos for the join with the matched data:
# the POLO address line, both ME identifiers, and the fields humach_samplify selects.
PPD = older_polos[['ADDR_1_POLO','IQVIA_ME','ME',
        'FIRST_NAME',
        'MIDDLE_NAME',
        'LAST_NAME',
        'SUFFIX',
        'PRIM_SPEC_CD',
        'PE_CD',
        'FAX_NUMBER']]

In [None]:
#check zip phone area
#check license
#check 100 miles 
#check match

In [None]:
XX = pd.merge(PPD, eek, left_on='IQVIA_ME', right_on='ME_IQVIA')

In [None]:
BA = humach_samplify(xx)

In [None]:
OTHER = pd.read_csv('../../Data/POLO_Filter/Filtered_POLOs_Humach_Sample_2021-08-26.csv', low_memory=False)

In [None]:
XX['FAX_NUMBER'] = [use.fix_phone(x) for x in XX.FAX_NUMBER]

In [None]:
# NOTE(review): this calls the fix_me imported from datamart, while other cells call use.fix_me — confirm the two are interchangeable.
OTHER['ME'] = fix_me(OTHER.ME)

In [None]:
# Save the filtered POLO sample as an Excel workbook next to its CSV.
OTHER.to_excel('../../Data/POLO_Filter/Filtered_POLOs_Humach_Sample_2021-08-26.xlsx', index=False)

In [None]:
# Save the older-POLO Humach sample as an Excel workbook.
BA.to_excel('../../Data/POLO_Filter/Older_POLOs_Humach_Sample_2021-08-26.xlsx', index=False)

In [None]:
OTHER.drop_duplicates('ME')

In [None]:
more_good_data.drop_duplicates('ME_IQVIA')

In [None]:
# Summary of the approach:
# - Robust matching on disparate address elements plus phone.
# - Data is included only if it is corroborated by at least two sources.
# - If addresses agreed and phones did not, the phone was taken from IQVia, DataGov, or Symphony, in that order of preference.
# - If phones agreed and addresses did not, the address was taken from IQVia, DataGov, or Symphony, in that order of preference.
# - If no sources agreed, the best IQVia affiliation was used.