In [53]:
#Import dependencies

import pandas as pd
import os
import statistics
import numpy as np
import pgeocode

In [56]:
#Define path to PPD

path = "..\PhysicianProfessionalDataFile_20190826_105342714"

In [58]:
#Name columns

cols = [
        'ME',
        'RECORD_ID',
        'UPDATE_TYPE',
        'ADDRESS_TYPE',
        'MAILING_NAME',
        'LAST_NAME',
        'FIRST_NAME',
        'MIDDLE_NAME',
        'SUFFIX',
        'MAILING_LINE_1',
        'MAILING_LINE_2',
        'CITY',
        'STATE',
        'ZIP',
        'SECTOR',
        'CARRIER_ROUTE',
        'ADDRESS_UNDELIVERABLE_FLAG',
        'FIPS_COUNTY',
        'FIPS_STATE',
        'PRINTER_CONTROL_CODE',
        'PC_ZIP',
        'PC_SECTOR',
        'DELIVERY_POINT_CODE',
        'CHECK_DIGIT',
        'PRINTER_CONTROL_CODE_2',
        'REGION',
        'DIVISION',
        'GROUP',
        'TRACT',
        'SUFFIX_CENSUS',
        'BLOCK_GROUP',
        'MSA_POPULATION_SIZE',
        'MICRO_METRO_IND',
        'CBSA',
        'CBSA_DIV_IND',
        'MD_DO_CODE',
        'BIRTH_YEAR',
        'BIRTH_CITY',
        'BIRTH_STATE',
        'BIRTH_COUNTRY',
        'GENDER',
        'TELEPHONE_NUMBER',
        'PRESUMED_DEAD_FLAG',
        'FAX_NUMBER',
        'TOP_CD',
        'PE_CD',
        'PRIM_SPEC_CD',
        'SEC_SPEC_CD',
        'MPA_CD',
        'PRA_RECIPIENT',
        'PRA_EXP_DT',
        'GME_CONF_FLG',
        'FROM_DT',
        'TO_DT',
        'YEAR_IN_PROGRAM',
        'POST_GRADUATE_YEAR',
        'GME_SPEC_1',
        'GME_SPEC_2',
        'TRAINING_TYPE',
        'GME_INST_STATE',
        'GME_INST_ID',
        'MEDSCHOOL_STATE',
        'MEDSCHOOL_ID',
        'MEDSCHOOL_GRAD_YEAR',
        'NO_CONTACT_IND',
        'NO_WEB_FLAG',
        'PDRP_FLAG',
        'PDRP_START_DT',
        'POLO_MAILING_LINE_1',
        'POLO_MAILING_LINE_2',
        'POLO_CITY',
        'POLO_STATE',
        'POLO_ZIP',
        'POLO_SECTOR',
        'POLO_CARRIER_ROUTE',
        'MOST_RECENT_FORMER_LAST_NAME',
        'MOST_RECENT_FORMER_MIDDLE_NAME',
        'MOST_RECENT_FORMER_FIRST_NAME',
        'NEXT_MOST_RECENT_FORMER_LAST',
        'NEXT_MOST_RECENT_FORMER_MIDDLE',
        'NEXT_MOST_RECENT_FORMER_FIRST'
    ]

In [59]:
#Read ppd into dataframe

ppd = pd.read_csv(path, names=cols, sep='|', encoding='IBM437', index_col=False, dtype=object)

In [126]:
#Read insurance address updates into dataframe

insurance = pd.read_csv("address_updates_2019_dates.csv")

In [134]:
#Merge ppd and insurance data

giant_merged = pd.merge(insurance.astype('str'), ppd, on='ME', suffixes=("_insurance", "ppd"))

In [138]:
#Write dict from line of dataframe:
def make_new_line(row):
    new_dict = {}
    new_dict['ME']=row.ME
    new_dict['ADDR']=row.ADDR_1
    new_dict['CITY']=row.CITY_insurance.replace("  ", "")
    new_dict['STATE']=row.STATE_CD
    new_dict['ZIP']=row.ZIP_insurance
                            
    new_dict['ADDR_office']=row.MAILING_LINE_2
    new_dict['CITY_office']=row.CITYppd.replace("  ", "")
    new_dict['STATE_office']=row.STATE
    new_dict['ZIP_office']=row.ZIPppd
                            
    new_dict['ADDR_mailing']=row.POLO_MAILING_LINE_2
    new_dict['CITY_mailing']=row.POLO_CITY.replace("  ", "")
    new_dict['STATE_mailing']=row.POLO_STATE
    new_dict['ZIP_mailing']=row.POLO_ZIP
    new_dict['NAME']=str(row.FIRST_NAME) + " " + str(row.LAST_NAME)

    return(new_dict)

In [178]:
#Count all the address differences and write results to dataframes:
def get_differences_summary(df):
    
    office_add = 0
    mailing_add = 0
    both_add = 0
    office_zip = 0
    mailing_zip = 0
    both_zip = 0
    office_city = 0
    mailing_city = 0
    both_city = 0
    office_state = 0
    mailing_state = 0
    both_state = 0
    
    total=df.shape[0]
    
    dict_list = []
    
    for row in df.itertuples():
        ma = oa = mz = oz = oc = mc = os = ms = False
        if str(row.ADDR_1).lower() != str(row.MAILING_LINE_2).lower():
            mailing_add += 1
            ma=True
        if str(row.ADDR_1).lower() != str(row.POLO_MAILING_LINE_2).lower():
            office_add += 1
            oa=True
        if ma and oa == True:
            both_add += 1
        if row.ZIP_insurance!= row.ZIPppd:
            mailing_zip += 1
            mz = True
        if row.ZIP_insurance!= row.POLO_ZIP:
            office_zip += 1
            oz = True
        if mz and oz == True:
            both_zip += 1
        if str(row.CITY_insurance).lower().replace("  ", "") != str(row.CITYppd).lower().replace("  ", ""):
            mailing_city += 1
            mc = True
        if str(row.CITY_insurance).lower().replace("  ", "") != str(row.POLO_CITY).lower().replace("  ", ""):
            office_city += 1
            oc = True
        if mc and oc == True:
            both_city += 1
        if row.STATE_CD != row.STATE:
            mailing_state += 1
            ms = True
        if row.STATE_CD != row.POLO_STATE:
            office_state += 1  
            os = True
        if ms and os == True:
            both_state += 1
        if ma and oa and mz and oz and oc and mc and os and ms == True:
            new_dict = make_new_line(row)
            dict_list.append(new_dict)
    summary_dictionary = [
        {
         "Mailing": mailing_add, 
         "Office": office_add,
         "Both": both_add},
        {
        "Mailing": mailing_zip,
        "Office": office_zip,
        "Both": both_zip},
        {
        "Mailing": mailing_city,
        "Office": office_city,
        "Both": both_city},
        {
         "Mailing": mailing_state,
        "Office": office_state,
        "Both": both_state}
    ]
    summary=pd.DataFrame(summary_dictionary, index =["Address", "Zipcode", "City", "State"])
    differences = pd.DataFrame(dict_list)
    return(differences, summary, total)

In [179]:
#Run function on merged dataframe
MISMATCHES, summary_df, total = get_differences_summary(giant_merged)

In [180]:
#Get percentages
summary_df.apply(lambda x: x/total*100)

Unnamed: 0,Both,Mailing,Office
Address,64.682161,79.437458,75.846657
Zipcode,50.615566,66.857973,64.416025
City,56.04697,66.259675,66.177397
State,20.692562,22.582939,33.802288


In [108]:
#Define function to get distances between zipcodes in dataframe
def get_zip_distances(df, zip_type):    
    distances = []
    TOTAL = df.shape[0]
    print(TOTAL)
    count =0
    DIST=pgeocode.GeoDistance('US')
    dict_list = []
    num_list =[]
    for row in df.itertuples():
        new_dict={}
        new_dict["ME"]=row.ME
        count += 1
        if int(count/TOTAL*100) in [10,20,30,40,50,60,70,80,90] and int(count/TOTAL*100) not in num_list:
            num_list.append(int(count/TOTAL*100))
            print (f'{int(count/TOTAL*100)}% done!')
        try:
            if zip_type == 'ppma':
            elif zip_type == 'polo'
            elif zip_type=='office':
                mf = int(row.ZIP_mailing)
            else:
                mf = int(row.ZIP_office)
            ins = int(row.ZIP)
        except:
            mf='nan'
            ins='nan'
    #     print(f'{mf} {ins}')
        try:
            ZIP_distance = DIST.query_postal_code(mf, ins)*0.621371
    #         print(ZIP_distance)
            if np.isnan(ZIP_distance)==False:
                distances.append(ZIP_distance)
        except:
            ZIP_distance = 'nan'
        new_dict['ZIP_distance']=ZIP_distance
        dict_list.append(new_dict)
    # print(distances2)
    avg = sum(distances)/len(distances)
    print(f'Average distance between insurance and {zip_type} addressses is {avg} miles.')
    return(dict_list, distances)

In [109]:
#Get distances for mailing addresses
get_zip_distances(MISMATCHES, "mailing")

5
20% done!
40% done!
60% done!
80% done!
Average distance between insurance and mailing addressses is 522.391564460491 miles.


([{'ME': 56601860185, 'ZIP_distance': 991.7332892788583},
  {'ME': 40921080028, 'ZIP_distance': 421.0830430632709},
  {'ME': 70402120086, 'ZIP_distance': 507.7425881957216},
  {'ME': 42201136452, 'ZIP_distance': 178.27651522089172},
  {'ME': 60501060367, 'ZIP_distance': 513.1223865437134}],
 [991.7332892788583,
  421.0830430632709,
  507.7425881957216,
  178.27651522089172,
  513.1223865437134])

In [22]:
#Print mailing average distance
avg1

898.0105076280563

In [23]:
#Print office average distance
avg2

880.4454576399766

In [30]:
#Print office median
statistics.median(distances1)

715.6965555238103

In [36]:
#Merge office and mailing zip distance dataframes
office = pd.DataFrame(dict_list1)
mailing = pd.DataFrame(dict_list2)
zip_distances = pd.merge(office, mailing, on='ME', suffixes=('_office', "_mailing"))

In [38]:
#Put distances in mismatch dataframe
MISMATCH_DISTANCES = pd.merge(MISMATCHES, zip_distances, on='ME')

In [43]:
#Print mailing median
mailing.median(skipna=True)

ME              4.964208e+10
ZIP_distance    6.885383e+02
dtype: float64

In [203]:
#Define function to bucket distances:
def bucket_zip_distances(df, column_name):
    bins=[0,50,100,150,200,250,300,350,400,450,500,550,600,10000]
    labels = ["<50", "50-100", "100-150", "150-200","200-250","250-300","300-350","350-400","400-450","450-500", "500-550","550-600", ">600"]
    new_column_name = column_name + "_bins"
    df[new_column_name]=pd.cut(df[column_name], bins=bins, labels=labels)
    buckets = df[[new_column_name, column_name]].groupby(new_column_name).count()
    return(buckets)

In [193]:
#Leave only rows where zip distance is greater than 200 miles
far_places = MISMATCH_DISTANCES[MISMATCH_DISTANCES.ZIP_distance_mailing>200]
far_places = far_places[far_places.ZIP_distance_office>200]

In [197]:
#Get percentage of total
far_places.shape[0]/giant_merged.shape[0]*100

9.731223208662618