## Import Dependencies

In [1]:
import pandas as pd

## Read csvs

In [126]:
ekata_disconnected = pd.read_csv('ekata_disconnected.csv')
ekata_connected = pd.read_csv('ekata_connected.csv')
disconnected = pd.read_csv('disconnected_final.csv')
connected = pd.read_csv('connected_final.csv')

## Merge the disconnected dataframes

In [127]:
disconnected_df = pd.merge(disconnected, ekata_disconnected, on = 'OFFICE_TELEPHONE', suffixes = ('_GetPhoneInfo', '_Ekata'))
disconnected_df = disconnected_df.fillna("None")
disconnected_df.to_csv("combination_disconnected.csv")


## Merge the connected dataframes and print to screen

In [120]:
connected_df = pd.merge(connected, ekata_connected, on = 'OFFICE_TELEPHONE', suffixes = ('_GetPhoneInfo', '_Ekata'))
connected_df

Unnamed: 0,Address_GetPhoneInfo,City_GetPhoneInfo,Date_GetPhoneInfo,Name_GetPhoneInfo,Notes,OFFICE_TELEPHONE,PhoneType_GetPhoneInfo,Provider,QualityScore,State_GetPhoneInfo,...,City_Ekata,Commercial,Date_Ekata,Error,Industry,LineType,Name_Ekata,PhoneType_Ekata,State_Ekata,Valid
0,2755 S HIGHWAY 14 STE 2500,GREER,12/23/2008,MEDICAL GROUP OF THE CAROLINASDIVISION OF,"IsMailable,IsConnected,IsPorted",8648499555,BUSINESS,NUVOX COMMUNICATIONS,HIGH,SC,...,Greer,True,2/28/2019,,Ambulatory Health Care Services,Landline,"Cash, Allyson L N.P.",Business,SC,True
1,1501 HOUSTON ST,CASTROVILLE,10/12/2017,LITTLE ALSACE URGENT CARE CENTER CHILD CARE,"IsMailable,IsConnected,IsPorted",8305383550,BUSINESS,BANDWIDTH.COM - TX,HIGH,TX,...,Castroville,True,11/18/2016,,Ambulatory Health Care Services,NonFixedVOIP,"Fritz, Rebecca PA",Business,TX,True
2,,BURLINGTON,,BRUCE TRONIC,,7817448046,RESIDENTIAL,VERIZON NEW ENGLAND,LOW,MA,...,Burlington,True,8/28/2018,,Ambulatory Health Care Services,Landline,"Bilic, Masha MD",Business,MA,True
3,475 IRVING AVE STE 210,SYRACUSE,5/28/2015,MADISON IRVING PEDIATRICS DEVELOPMENTAL HEALTH,"IsMailable,IsConnected,IsPorted",3154712646,BUSINESS,NORTHLAND NETWORKS,HIGH,NY,...,Syracuse,True,2/28/2019,,Ambulatory Health Care Services,Landline,Madison Irving Pediatrics Pc,Business,NY,True
4,160 OVERLOOK AVE STE 1A,HACKENSACK,1/25/2018,ANDREW FINK MD PHYSICIANS SURGEONS,"IsMailable,IsConnected,IsPorted",2014883131,BUSINESS,"AIRUS, INC. - NJ",HIGH,NJ,...,Hackensack,True,2/28/2019,,Ambulatory Health Care Services,NonFixedVOIP,"Fink, Andrew MD",Business,NJ,True
5,1981 STATE HILL RD,READING,12/8/2015,DR TIMOTHY C STRINGER PHYSICIANS GENERAL,"IsMailable,IsConnected,IsPorted",6106702277,BUSINESS,COMCAST PHONE - PA,HIGH,PA,...,Reading,True,11/19/2016,,Ambulatory Health Care Services,FixedVOIP,Timothy C Stringer DPM,Business,PA,True
6,2854 BELL ST,ZANESVILLE,7/15/2016,DREW LAYNE,"IsMailable,IsConnected,IsPorted",7404543273,RESIDENTIAL,TIME WARNER CABLE OH,HIGH,OH,...,Zanesville,True,2/28/2019,,Ambulatory Health Care Services,FixedVOIP,"Toth, Randall S",Business,OH,True
7,3901 CENTRAL PIKE STE 351,HERMITAGE,9/7/2018,ALLERGY ENT ASSOCIATES OF MIDDLE TENNESSEE PC,"IsMailable,IsConnected,IsPorted",6158898802,BUSINESS,COMCAST PHONE - TN,HIGH,TN,...,Hermitage,True,11/20/2016,,Ambulatory Health Care Services,FixedVOIP,Gowda Ear Nose & Throat Clinic,Business,TN,True
8,6601 LYNDALE AVE S STE 220,MINNEAPOLIS,10/27/2016,KIDNEY SPECIALISTS OF MINNESOTA MAIN INFO,"IsMailable,IsPorted",6128238001,BUSINESS,COMCAST PHONE - MN,LOW,MN,...,Minneapolis,True,11/17/2016,,Ambulatory Health Care Services,FixedVOIP,"Esten, Andrew MD",Business,MN,True
9,1215 S COULTER ST STE 403,AMARILLO,2/17/2012,RAJ SARALAYA MD PHYSICIANS SURGEONS,"IsMailable,IsConnected,IsPorted",8066772002,BUSINESS,PATHWAYZ COMM - TX,HIGH,TX,...,Amarillo,True,11/17/2016,,Ambulatory Health Care Services,Landline,Amarillo Medical Spec Llp,Business,TX,True


## Define the function that determines if the phone owner's name contains medical words

In [63]:
def is_healthy(name):
    med_words = ['cancer', 'cardiology', 'neurology', 'family care', 'pulmonary', 'anesthesia', 'orthope', 'urgent care', 'allergy', 'kidney', 'surgery', 'hosp', 'mri', 'throat', 'dentist', 'med', 'clinic', 'health', 'gastroenter','anesthesiologist','patient','physician','surgeon','doctor','hospital', 'md', 'medical', 'pediatrics', 'm.d.']
    healthy = False
    for word in med_words:
        if word in str(name).lower():
            healthy = True   
    return(healthy)

## Define the function that calculates aggregate numbers for variabes of interest

In [153]:
def analyze_numbers(dataframe):
    
    name_match = 0
    healthy = 0
    address_match = 0
    city_match = 0
    commercial = 0
    no_address = 0
    no_date = 0
    industry = 0
    landline = 0
    business = 0

    for row in dataframe.itertuples():
        address_matched = False
        if str(row.PHYSICIAN_LAST_NAME).lower() in str(row.Name_Ekata).lower():
            name_match +=1
        if is_healthy(row.Name_Ekata) == True:
            healthy += 1
        if str(row.OFFICE_ADDRESS_LINE_2).lower() in str(row.Address_Ekata).lower():
            address_match +=1
            address_matched = True
        if address_matched == False:
            print (f'{row.OFFICE_ADDRESS_LINE_2} is not {row.Address_Ekata}')
        if str(row.OFFICE_ADDRESS_CITY).lower() in str(row.City_Ekata).lower():
            city_match += 1
        if row.Commercial == True:
            commercial += 1
        if 'Ambulatory Health Care Services' in str(row.Industry):
            industry += 1
        if row.Industry == 'Nursing and Residential Care Facilities':
            industry += 1
        if row.Industry == 'Hospitals':
            industry += 1
        if row.LineType == 'Landline':
            landline += 1

        if row.Address_Ekata == 'None':
            no_address += 1
        if row.Date_Ekata == 'None':
            no_date += 1
        if row.PhoneType_Ekata == 'Business':
            business += 1


    new_dict = {
        'Address Matches': address_match,
        'City Matches': city_match,
        'Exact Name Matches': name_match,
        'Relevant Names': healthy,
        'No Address': no_address,
        'No Date': no_date,
        'Business Phone': business,
        'Commercial': commercial,
        'Correct Industry': industry,
        'Landline': landline
        }
        
    return(new_dict)

## Call the function on the merged connected dataframe

In [154]:
analyze_numbers(connected_df)

385 PROSPECT AVE  STE 200 is not 385 Prospect Ave Ste 20            
1981 STATE HILL ROAD is not 320 Abington Dr Ste 4
465 COLUMBUS AVE is not 645 Marble Ave
540 E JEFFERSON ST STE 105 is not 613 E Bloomington St # 100
PO BOX 93358 is not 657 N Town Center Dr
1920 E BASELINE RD is not None
1401 AVACODA AVE STE 602 is not 1401 Avocado Ave Ste 602
59 BAY 29 ST is not 59 Bay 29th St
1100 RED PARKWEY is not 1100 Reid Pkwy
150 DERGEN ST STE H245 is not 150 Bergen St
4802 E JOHNSON AVE is not 1111 Windover Rd
9880 ANGIES WAY STE 205 is not 3999 Dutchmans Ln Ste 6f
275 SANWICH ST is not 275 Sandwich St


{'Address Matches': 27,
 'City Matches': 33,
 'Exact Name Matches': 8,
 'Relevant Names': 30,
 'No Address': 1,
 'No Date': 1,
 'Business Phone': 40,
 'Commercial': 40,
 'Correct Industry': 39,
 'Landline': 15}

## Save to a dictionary

In [133]:
disconnected_dict = analyze_numbers(disconnected_df)

## Turn dictionaries into summary dataframes

In [137]:
pd.DataFrame([connected_dict, disconnected_dict])

Unnamed: 0,Address Matches,Business Phone,City Matches,Commercial,Correct Industry,Exact Name Matches,Landline,No Address,No Date,Relevant Names
0,27,40,33,40,39,8,15,1,1,30
1,8,29,21,29,25,9,27,10,10,20


## Call function on the connected df

In [157]:
analyze_numbers(connected_df)

385 PROSPECT AVE  STE 200 is not 385 Prospect Ave Ste 20            
1981 STATE HILL ROAD is not 320 Abington Dr Ste 4
465 COLUMBUS AVE is not 645 Marble Ave
540 E JEFFERSON ST STE 105 is not 613 E Bloomington St # 100
PO BOX 93358 is not 657 N Town Center Dr
1920 E BASELINE RD is not None
1401 AVACODA AVE STE 602 is not 1401 Avocado Ave Ste 602
59 BAY 29 ST is not 59 Bay 29th St
1100 RED PARKWEY is not 1100 Reid Pkwy
150 DERGEN ST STE H245 is not 150 Bergen St
4802 E JOHNSON AVE is not 1111 Windover Rd
9880 ANGIES WAY STE 205 is not 3999 Dutchmans Ln Ste 6f
275 SANWICH ST is not 275 Sandwich St


{'Address Matches': 27,
 'City Matches': 33,
 'Exact Name Matches': 8,
 'Relevant Names': 30,
 'No Address': 1,
 'No Date': 1,
 'Business Phone': 40,
 'Commercial': 40,
 'Correct Industry': 39,
 'Landline': 15}

## Define med words

In [141]:
med_words = ['cancer', 'cardiology', 'neurology', 'family care', 'pulmonary', 'anesthesia', 'orthope', 'urgent care', 'allergy', 'kidney', 'surgery', 'hosp', 'mri', 'throat', 'dentist', 'med', 'clinic', 'health', 'gastroenter','anesthesiologist','patient','physician','surgeon','doctor','hospital', 'md', 'medical', 'pediatrics', 'm.d.']

## Define function that analyzes GetPhoneInfo results

In [146]:
def analyze_numbers_2(dataframe):
    
    
    address_match = 0
    city_match = 0
    zipcode_match = 0
    healthcount = 0
    mailcount = 0
    connectcount = 0
    portcount = 0
    novalidcount = 0
    unknowncount = 0
    disconnectedcount = 0
    voipcount = 0
    wirelesscount = 0
    no_address_count = 0
    no_date_count = 0
    business_count = 0
    residential_count = 0
    low_count = 0
    high_count = 0
    med_count = 0


    for row in dataframe.itertuples():
        if row.Address_GetPhoneInfo == row.OFFICE_ADDRESS_LINE_2:
            address_match += 1
        if row.City_GetPhoneInfo == row.OFFICE_ADDRESS_CITY:
            city_match += 1
        if row.Zipcode == row.OFFICE_ADDRESS_ZIP:
            zipcode_match += 1

        healthy = False
        for word in med_words:
            if word in str(row.Name_GetPhoneInfo).lower():
                healthy = True
        if healthy == True:    
            healthcount += 1

        if 'IsMailable' in str(row.Notes):
            mailcount += 1
        if 'IsConnected' in str(row.Notes):
            connectcount += 1
        if 'IsPorted' in str(row.Notes):
            portcount += 1
        if 'NotValid' in str(row.Notes):
            novalidcount += 1
        if 'IsUnknownContact' in str(row.Notes):
            unknowncount += 1
        if 'IsPossibleDisconnected' in str(row.Notes):
            disconnectedcount += 1
        if 'IsPossiblePortableVOIP' in str(row.Notes):
            voipcount += 1
        if 'IsWireless' in str(row.Notes):
            wirelesscount += 1

        if row.Address_GetPhoneInfo == 'None':
            no_address_count += 1
        if row.Date_GetPhoneInfo == 'None':
            no_date_count += 1
        if row.PhoneType_GetPhoneInfo == 'BUSINESS':
            business_count += 1
        if row.PhoneType_GetPhoneInfo == 'RESIDENTIAL':
            residential_count += 1
        if row.QualityScore == 'LOW':
            low_count += 1
        if row.QualityScore == 'HIGH':
            high_count += 1
        if row.QualityScore == 'MED':
            med_count += 1

    new_dict = {
        'Address Matches': address_match,
        'City Matches': city_match,
        'ZipCode Matches': zipcode_match,
        'No Address': no_address_count,
        'No Date': no_date_count,
        'Relevant Name': healthcount,
        'Business Phone': business_count,
        'Residential Phone': residential_count,
        'Low Quality': low_count,
        'Medium Quality': med_count,
        'High Quality': high_count,
        'Mailable': mailcount,
        'Connected': connectcount,
        'Ported': portcount,
        'Not Valid': novalidcount,
        'Unknown Contact': unknowncount,
        'Possibly Disconnected': disconnectedcount,
        'Possibly Portable VOIP': voipcount,
        'Wireless': wirelesscount,
        }
        
    return(new_dict)

## Call function

In [147]:
analyze_numbers_2(connected_df)

{'Address Matches': 19,
 'City Matches': 32,
 'ZipCode Matches': 35,
 'No Address': 0,
 'No Date': 0,
 'Relevant Name': 30,
 'Business Phone': 34,
 'Residential Phone': 6,
 'Low Quality': 12,
 'Medium Quality': 0,
 'High Quality': 28,
 'Mailable': 36,
 'Connected': 28,
 'Ported': 32,
 'Not Valid': 0,
 'Unknown Contact': 0,
 'Possibly Disconnected': 2,
 'Possibly Portable VOIP': 4,
 'Wireless': 0}

In [149]:
analyze_numbers_2(disconnected_df)

{'Address Matches': 4,
 'City Matches': 15,
 'ZipCode Matches': 13,
 'No Address': 18,
 'No Date': 20,
 'Relevant Name': 15,
 'Business Phone': 27,
 'Residential Phone': 8,
 'Low Quality': 15,
 'Medium Quality': 6,
 'High Quality': 14,
 'Mailable': 23,
 'Connected': 20,
 'Ported': 20,
 'Not Valid': 0,
 'Unknown Contact': 5,
 'Possibly Disconnected': 9,
 'Possibly Portable VOIP': 1,
 'Wireless': 1}