## Import Dependencies

In [2]:
key = 'WS77-PWJ1-MKS5'
import pandas as pd
import requests
import xmltodict
import json
import matplotlib.pyplot as plt

## Read csv

In [3]:
wslive = pd.read_csv('Data/wslive_with_results_2019-01-04_to_2020-01-24.csv')
wslive = wslive.fillna('None')
sample_df = wslive[(wslive.COMMENTS == 'FAIL') & (wslive.OFFICE_ADDRESS_LINE_2!='None') & (wslive.OFFICE_ADDRESS_LINE_1!='None')].sample(500)

  interactivity=interactivity, compiler=compiler, result=result)


In [42]:
sample_df['OFFICE_TELEPHONE'] = sample_df['OFFICE_TELEPHONE'].astype('int64')

## Define Valid phone function

In [4]:
def is_valid_phone(phone):
    phone = str(phone)
    return len(phone) == 10 and phone.isdigit() and phone[0] != '0' and len(set(phone)) > 2

## Define reverse lookup function

In [20]:
def reverse_lookup(phone_number):
    info_dict = {}
    base_url = "https://ws.serviceobjects.com/gppl2/api.svc/GetPhoneInfo"
    parameters = {'PhoneNumber':phone_number, 'LicenseKey': key}
    response =  requests.get(base_url, params=parameters)
    results = xmltodict.parse(response.content)
    try:
        phone_results = results["PhoneInfoResponse"]["PhoneInfo"]
    except:
        print(f'{phone_number} did not return results')
    info_dict["Number"] = phone_number
    try:
        info_dict["Name"] = phone_results["Contacts"]['Contact']["Name"]
    except:
        info_dict["Name"] = 'None'
    try:
        info_dict["Address"] = phone_results["Contacts"]['Contact']["Address"]
    except:
        info_dict["Address"] = 'None'
    try:
        info_dict["Zipcode"] = phone_results["Contacts"]['Contact']['PostalCode']
    except:
        info_dict["Zipcode"] = 'None'
    try:
        info_dict["City"] = phone_results["Contacts"]['Contact']["City"]
    except:
        info_dict["City"] = 'None'
    try:
        info_dict["State"] = phone_results["Contacts"]['Contact']["State"]
    except:
        info_dict["State"] = 'None'
    try:
        info_dict["PhoneType"] = phone_results["Contacts"]['Contact']["PhoneType"]
    except:
        info_dict["PhoneType"] = 'None'
    try:
        info_dict["QualityScore"] = phone_results["Contacts"]['Contact']["QualityScore"]
    except:
        info_dict["QualityScore"] = 'None'
    try:
        info_dict["Date"] = phone_results["DateOfPorting"]
    except:
        info_dict["Date"] = 'None'
    try:
        info_dict["Notes"] = phone_results["NoteDescriptions"]
    except:
        info_dict["Notes"] = 'None'
    try:
        info_dict["Provider"] = phone_results['Provider']['Name']
    except:
        info_dict["Provider"] = 'None'
        
    try:
        info_dict["SICCode"] = phone_results["Contacts"]['Contact']['SICCode']
    except:
        info_dict["SICCode"] = 'None'
        
    try:
        info_dict["SICDesc"] = phone_results["Contacts"]['Contact']['SICDesc']
    except:
        info_dict["SICDesc"] = 'None'
        
    return(info_dict, results)

## Define test numbers function

In [21]:
def test_numbers(dataframe,filename):
    results_dict_list = []
    fun_massive_list = []
    count = 0
    for row in dataframe.itertuples():
        
        new_dict = {}
        phone = row.OFFICE_TELEPHONE
        
        if is_valid_phone(phone) == False:
            print(f'Entry {count}: {phone} is not a valid phone number')
            break
        try:
            new_dict, phone_results = reverse_lookup(phone)
            results_dict_list.append(new_dict)
            fun_massive_list.append(phone_results)
        except:
            print(f'Reverse Phone lookup did not work')
            print(new_dict)
            print(phone_results)
            break
        count += 1
        print(f'{phone} complete!\n {count} numbers tested.')
    try: 
        new_df = pd.DataFrame(results_dict_list)
    except: 
        return(fun_massive_list, {})
    try:
        with open(f'{filename}_data.txt', 'w') as outfile:
            json.dump(fun_massive_list, outfile)
    except:
        pass
    return(fun_massive_list, new_df)

## Test numbers

In [49]:
massive_list_2, final_phone_df_2 = test_numbers(sample_df[5:], 'GetPhone_NoContact_Test_2-20-02-05')

9046440092 complete!
 1 numbers tested.
5028523876 complete!
 2 numbers tested.
4043650966 complete!
 3 numbers tested.
9497284323 complete!
 4 numbers tested.
6268516634 complete!
 5 numbers tested.
4083633039 complete!
 6 numbers tested.
7082458975 complete!
 7 numbers tested.
2152141754 complete!
 8 numbers tested.
2037397038 complete!
 9 numbers tested.
8058983400 complete!
 10 numbers tested.
4066577000 complete!
 11 numbers tested.
6104366696 complete!
 12 numbers tested.
6036223623 complete!
 13 numbers tested.
2534037277 complete!
 14 numbers tested.
6304699200 complete!
 15 numbers tested.
4804126336 complete!
 16 numbers tested.
8034194949 complete!
 17 numbers tested.
9043548766 complete!
 18 numbers tested.
5182625756 complete!
 19 numbers tested.
9089945000 complete!
 20 numbers tested.
3252245641 complete!
 21 numbers tested.
6503214121 complete!
 22 numbers tested.
2122380100 complete!
 23 numbers tested.
8436449696 complete!
 24 numbers tested.
4153532273 complete!
 25 

9525951100 complete!
 198 numbers tested.
9724422300 complete!
 199 numbers tested.
5854732200 complete!
 200 numbers tested.
5802521136 complete!
 201 numbers tested.
9207293100 complete!
 202 numbers tested.
5092287111 complete!
 203 numbers tested.
5182434317 complete!
 204 numbers tested.
7046371123 complete!
 205 numbers tested.
5136366758 complete!
 206 numbers tested.
7877405349 complete!
 207 numbers tested.
7349953764 complete!
 208 numbers tested.
3372898400 complete!
 209 numbers tested.
3017910600 complete!
 210 numbers tested.
5738822296 complete!
 211 numbers tested.
6022628900 complete!
 212 numbers tested.
4408916500 complete!
 213 numbers tested.
9207293000 complete!
 214 numbers tested.
2147483647 complete!
 215 numbers tested.
7325493000 complete!
 216 numbers tested.
2059346383 complete!
 217 numbers tested.
4154763134 complete!
 218 numbers tested.
8586423972 complete!
 219 numbers tested.
6126246666 complete!
 220 numbers tested.
2123057060 complete!
 221 numbers 

6024064376 complete!
 394 numbers tested.
8645606193 complete!
 395 numbers tested.
6142936526 complete!
 396 numbers tested.
4158857478 complete!
 397 numbers tested.
2164446601 complete!
 398 numbers tested.
7168597540 complete!
 399 numbers tested.
9195721868 complete!
 400 numbers tested.
5034137162 complete!
 401 numbers tested.
3108257105 complete!
 402 numbers tested.
6612603021 complete!
 403 numbers tested.
7863158918 complete!
 404 numbers tested.
8587594765 complete!
 405 numbers tested.
5083635000 complete!
 406 numbers tested.
4148055495 complete!
 407 numbers tested.
6015532000 complete!
 408 numbers tested.
4137949338 complete!
 409 numbers tested.
8593359041 complete!
 410 numbers tested.
8643424000 complete!
 411 numbers tested.
9732679393 complete!
 412 numbers tested.
7242246700 complete!
 413 numbers tested.
7247446167 complete!
 414 numbers tested.
9419229312 complete!
 415 numbers tested.
8045244674 complete!
 416 numbers tested.
6505724702 complete!
 417 numbers 

In [54]:
pd.concat([final_phone_df,final_phone_df_2]).to_csv('GetPhone_NoContact_Sample.csv', index=False)
sample_df.to_csv('WSLive_NoContact_Sample.csv',index=False)

## Export results to csv 

In [77]:
final_phone_df.to_csv('TestingServiceObjects.csv', index = False)

## Print all data

In [78]:
massive_list

[OrderedDict([('PhoneInfoResponse',
               OrderedDict([('@xmlns', 'http://www.serviceobjects.com'),
                            ('@xmlns:i',
                             'http://www.w3.org/2001/XMLSchema-instance'),
                            ('PhoneInfo',
                             OrderedDict([('Provider',
                                           OrderedDict([('Name',
                                                         'CIN BELL ANY DIST OH'),
                                                        ('City', 'CINCINNATI'),
                                                        ('State', 'OHIO'),
                                                        ('Latitude', '39.17'),
                                                        ('Longitude',
                                                         '-84.4929'),
                                                        ('LineType',
                                                         'LANDLINE')])),
               

## Read csv 

In [None]:
phone_df = pd.read_csv('TestingServiceObjects.csv)
phone_df.fillna('None')            

In [8]:
phone_df

Unnamed: 0,Address,City,Date,Name,Notes,Number,PhoneType,Provider,QualityScore,State,WSLIVE_Status,RealPhone_Status,Realphone Stats
0,3285 WESTBOURNE DR STE 1,CINCINNATI,4/15/2019,HUGHES ARTHUR L MD,"IsMailable,IsPorted,IsPossibleDisconnected,INF",5134516200,BUSINESS,CIN BELL ANY DIST OH,LOW,OH,COMPLETE,disconnected-70,FN
1,1215 DUNN AVE,JACKSONVILLE,12/22/2017,RALPH BOONE,"IsMailable,IsConnected,IsPorted",9047571998,RESIDENTIAL,COMCAST PHONE - FL,HIGH,FL,COMPLETE,disconnected-70,FN
2,3113 N SEPULVEDA BLVD,MANHATTAN BEACH,,CHARLES SONG MD CLINICS MEDICAL,"IsMailable,IsConnected",2037318725,BUSINESS,FRONTIER CALIFORNIA,HIGH,CA,COMPLETE,disconnected-70,FN
3,530 PARK AVE E STE 207,PRINCETON,8/10/2011,ASSOCIATED GASTROENTEROLOGY CONSULTANTS SC,"IsMailable,IsPorted,IsPossiblePortableVOIP",8158758666,BUSINESS,LEVEL 3 COMM - IL,LOW,IL,COMPLETE,connected,TP
4,1101 W CLAIREMONT AVE STE 2C,EAU CLAIRE,11/22/2011,EAU CLAIRE ANESTHESIOLOGISTS CLINICS MEDICAL,"IsMailable,IsConnected,IsPorted",7158348721,BUSINESS,CHARTER FIBERLINK-WI,HIGH,WI,COMPLETE,connected,TP
5,901 N MAPLE ST STE B,EFFINGHAM,5/31/2011,COMMUNITY MED CENTER,"IsMailable,IsPorted",2173472900,BUSINESS,MEDIACOM TEL OF IL,LOW,IL,COMPLETE,connected,TP
6,,EUGENE,,PHMG,,5416851794,,QWEST CORPORATION,LOW,OR,COMPLETE,connected,TP
7,7150 CLEARVISTA DR,INDIANAPOLIS,4/7/2016,COMMUNITY HEALTH NETWORK GENERAL PATIENT,"IsMailable,IsConnected,IsPorted,IsPossiblePort...",3176216262,BUSINESS,LEVEL 3 COMM - IN,HIGH,IN,COMPLETE,connected,TP
8,275 N BREIEL BLVD,MIDDLETOWN,9/18/2014,JON SULENTIC DO PHYSICIANS SURGEONS,"IsMailable,IsConnected,IsPorted",5134247711,BUSINESS,TIME WARNER CABLE OH,HIGH,OH,COMPLETE,connected,TP
9,4202 S S SVC,SYRACUSE,9/23/2000,ST VINCENT FAMILY CLINIC DOCTORS GENERAL PRACTICE,"IsConnected,IsPorted",5015624838,BUSINESS,WINDSTREAM COMM AR,HIGH,NY,COMPLETE,connected,TP


## Define medical words

In [53]:
med_words = ['med', 'clinic', 'health', 'gastroenter','anesthesiologist','patient','physician','surgeon','doctor','hospital', 'md', 'medical']

## Count names that contain medical words

In [66]:
complete_healthcount = 0
incomplete_healthcount = 0
complete_unhealthcount = 0
incomplete_unhealthcount = 0
for row in phone_df.itertuples():
    print (row.Name)
    healthy = False
    for word in med_words:
        if word in row.Name.lower():
            healthy = True
    if healthy == True:    
        print("yes")
        if row.WSLIVE_Status == 'COMPLETE':
            complete_healthcount += 1
        else:
            incomplete_healthcount += 1
    else:
        if row.WSLIVE_Status == 'COMPLETE':
            complete_unhealthcount += 1
        else:
            incomplete_unhealthcount += 1
print(f'TP:{complete_healthcount} TN:{incomplete_unhealthcount} FP:{incomplete_healthcount} FN:{complete_unhealthcount} ')
        

HUGHES ARTHUR L MD
yes
RALPH BOONE
CHARLES SONG MD CLINICS MEDICAL
yes
ASSOCIATED GASTROENTEROLOGY CONSULTANTS SC
yes
EAU CLAIRE ANESTHESIOLOGISTS CLINICS MEDICAL
yes
COMMUNITY MED CENTER
yes
PHMG
COMMUNITY HEALTH NETWORK GENERAL PATIENT
yes
JON SULENTIC DO PHYSICIANS SURGEONS
yes
ST VINCENT FAMILY CLINIC DOCTORS GENERAL PRACTICE
yes
ROYBAL EDWARD R COMPREHENSIVE HEALTH CENTER
yes
UNIVERSITY OF ILLINOIS MEDICAL CENTER AT CHICAGO
yes
CROUSE HOSPITAL ALCOHOL SUBSTANCE ABUSE
yes
ARCH WIRELESS O
KAISER PERMANEN
UNITED STATES GOVERNMENT
SILVER LAKES HOME THEATER ELECTRONICS DEALERS
DIGESTIVE SPECIALISTS DOCTORS GASTROENTEROLOGISTS
yes
HQ GLOBAL WORKPLACES
None
TP:8 TN:6 FP:4 FN:2 


## Count missing addresses

In [81]:
none_address_phone_df = phone_df[phone_df.Address == 'None']
none_address_phone_df = none_address_phone_df.groupby(['WSLIVE_Status']).count()
complete_address = none_address_phone_df.loc['COMPLETE', 'Address']
not_address = none_address_phone_df.loc['NOT IN SERVICE', 'Address']

## Count missing dates

In [82]:
none_date_phone_df = phone_df[phone_df.Date == 'None']
none_date_phone_df = none_date_phone_df.groupby(['WSLIVE_Status']).count()
complete_date = none_date_phone_df.loc['COMPLETE', 'Address']
not_date = none_date_phone_df.loc['NOT IN SERVICE', 'Address']

## Count business phone types

In [91]:
business_phone_df = phone_df[phone_df.PhoneType== 'BUSINESS']
business_phone_df = business_phone_df.groupby(['WSLIVE_Status']).count()
complete_busi = business_phone_df.loc['COMPLETE', 'Address']
not_busi = business_phone_df.loc['NOT IN SERVICE', 'Address']

## Count low quality scores

In [92]:
quality_phone_df = phone_df[phone_df.QualityScore== 'LOW']
quality_phone_df = quality_phone_df.groupby(['WSLIVE_Status']).count()
complete_quality = quality_phone_df.loc['COMPLETE', 'Address']
not_quality = quality_phone_df.loc['NOT IN SERVICE', 'Address']

## Count note descriptions

In [70]:
complete_mailcount = 0
complete_connectcount = 0
complete_portcount = 0
incomplete_mailcount = 0
incomplete_connectcount = 0
incomplete_portcount = 0
for row in phone_df.itertuples():
    print (row.Name)
    if row.WSLIVE_Status == 'COMPLETE':
        if 'IsMailable' in row.Notes:
            complete_mailcount += 1
        if 'IsConnected' in row.Notes:
            complete_connectcount += 1
        if 'IsPorted' in row.Notes:
            complete_portcount += 1
    else:
        if 'IsMailable' in row.Notes:
            incomplete_mailcount += 1
        if 'IsConnected' in row.Notes:
            incomplete_connectcount += 1
        if 'IsPorted' in row.Notes:
            incomplete_portcount += 1
print(f' {complete_mailcount} {complete_connectcount} {complete_portcount} {incomplete_mailcount} {incomplete_connectcount} {incomplete_portcount}')

HUGHES ARTHUR L MD
RALPH BOONE
CHARLES SONG MD CLINICS MEDICAL
ASSOCIATED GASTROENTEROLOGY CONSULTANTS SC
EAU CLAIRE ANESTHESIOLOGISTS CLINICS MEDICAL
COMMUNITY MED CENTER
PHMG
COMMUNITY HEALTH NETWORK GENERAL PATIENT
JON SULENTIC DO PHYSICIANS SURGEONS
ST VINCENT FAMILY CLINIC DOCTORS GENERAL PRACTICE
ROYBAL EDWARD R COMPREHENSIVE HEALTH CENTER
UNIVERSITY OF ILLINOIS MEDICAL CENTER AT CHICAGO
CROUSE HOSPITAL ALCOHOL SUBSTANCE ABUSE
ARCH WIRELESS O
KAISER PERMANEN
UNITED STATES GOVERNMENT
SILVER LAKES HOME THEATER ELECTRONICS DEALERS
DIGESTIVE SPECIALISTS DOCTORS GASTROENTEROLOGISTS
HQ GLOBAL WORKPLACES
None
 8 6 8 6 6 4


## Make dictionary of values 

In [95]:
stat_dict_list = [{'Variable': 'No Address',
                   'Complete': complete_address,
                   'Not In Service': not_address},
                 {'Variable': 'No Date',
                   'Complete': complete_date,
                   'Not In Service': not_date},
                 {'Variable': 'Illogical Name',
                   'Complete': complete_unhealthcount,
                   'Not In Service': incomplete_unhealthcount},
                 {'Variable': 'BusinessPhone',
                   'Complete': complete_busi,
                   'Not In Service': not_busi},
                 {'Variable': 'LowQuality',
                   'Complete':complete_quality,
                   'Not In Service': not_quality},
                 {'Variable': 'Mailable',
                   'Complete': complete_mailcount,
                   'Not In Service': incomplete_mailcount},
                 {'Variable': 'Connected',
                   'Complete': complete_connectcount,
                   'Not In Service': incomplete_connectcount},
                 {'Variable': 'Ported',
                   'Complete': complete_portcount,
                   'Not In Service': incomplete_portcount}
                 ]

## Write to dataframe

In [96]:
pd.DataFrame(stat_dict_list)

Unnamed: 0,Complete,Not In Service,Variable
0,1,4,No Address
1,2,6,No Date
2,2,6,Illogical Name
3,8,9,BusinessPhone
4,4,3,LowQuality
5,8,6,Mailable
6,6,6,Connected
7,8,4,Ported


## Run new numbers

In [8]:
connected= pd.read_csv('connected_rpv_getphone.csv')
disconnected = pd.read_csv('disconnected_rpv_getphone.csv')

In [12]:
short_results, short_df = test_numbers(connected[0:5], 'getphoneinfo_short_connected')
short_df

1
2
3
4
5


Unnamed: 0,Address,City,Date,Name,Notes,Number,PhoneType,Provider,QualityScore,State,Zipcode
0,,SAN ANGELO,,WEST TEXAS MEDICAL ASSOCIATES,IsConnected,3252245871,BUSINESS,FRONTIER COMM OF TX,MED,TX,76901
1,,CHICAGO,2019-04-01,NW MEMORIAL HOS,IsPorted,3126952857,BUSINESS,TELEPORT COMM AM-IL,LOW,IL,60611
2,2500 METROHEALTH DR,CLEVELAND,2019-09-12,METROHEALTH MEDICAL CENTER,"IsMailable,IsPorted",2167784174,BUSINESS,TELEPORT COMM AM-OH,LOW,OH,44109-1900
3,,GUAYNABO,2005-06-03,RADIATION THERAPY & CANCER INSTITUTE,IsPorted,7877745555,BUSINESS,"AT&T, INC. - PR",LOW,PR,00968
4,699 ELM ST,BUFFALO,2010-03-02,ROSWELL PARK CANCER INSTITUTE CHARITABLE,"IsConnected,IsPorted",7168452300,BUSINESS,PAETEC COMM - NY,HIGH,NY,14263


In [15]:
connected_results, connected_df = test_numbers(connected[5:], 'getphoneinfo_connected')
connected_df

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136


Unnamed: 0,Address,City,Date,Name,Notes,Number,PhoneType,Provider,QualityScore,State,Zipcode
0,,,,SANTODMNGO DR,IsConnected,8095626767,BUSINESS,CODETEL,MED,,
1,,,2010-08-02,,"IsPorted,IsUnknownContact",7877853398,,"AT&T, INC. - PR",,,
2,2597 SCHOENERSVILLE RD,BETHLEHEM,2008-01-28,LVPG ORTHOPEDICS AND SPORTS MEDICINE,"IsMailable,IsPorted,IsPossibleDisconnected,INF",4848845580,BUSINESS,"CTSI, INC. - PA",LOW,PA,18017-7325
3,,SAN ANTONIO,2016-01-15,S TEX ONCOLOGY,"IsConnected,IsPorted",2105932503,BUSINESS,TELEPORT COM AM - TX,MED,TX,78229
4,9311 MASON-MONTGOMER RD,CINCINNATI,2010-08-27,ALLIANCE PRIMARY CARE MASON OH,"IsPorted,IsPossibleDisconnected,INF",5135846898,BUSINESS,CBTS TECH SOL LLC-OH,LOW,OH,45219
5,,,,,IsUnknownContact,4126053020,,VERIZON PENNSYLVANIA,,,
6,200 HAWKINS DR,IOWA CITY,,UNIVERSITY OF IOWA HOSPITAL,IsMailable,3193849067,BUSINESS,QWEST CORPORATION,LOW,IA,52242-1009
7,739 N JEFFERSON ST,MASCOUTAH,,RURAL FAMILY MEDICINE ASSOCIATES PHYSICIANS,"IsMailable,IsConnected",6185668810,BUSINESS,FRONTIER NORTH - IL,HIGH,IL,62258-1447
8,4311 11TH AVE NE STE 600,SEATTLE,2013-09-10,MIKA SINANAN,"IsMailable,IsPorted",2065435511,RESIDENTIAL,MAGNA5 LLC,LOW,WA,98105-6369
9,,,,,IsUnknownContact,7195622360,,QWEST CORPORATION,,,


In [16]:
disconnected_results, disconnected_df = test_numbers(disconnected, 'getphoneinfo_disconnected')
disconnected_df

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
2032352505 did not return results
75
8066652120 did not return results
76
9852620475 did not return results
77
3014962128 did not return results
78
7574523479 did not return results
79
9104885714 did not return results
80
5704532555 did not return results
81
5033704950 did not return results
82
9286807707 did not return results
83
4408453940 did not return results
84
4134422402 did not return results
85
9738271961 did not return results
86
8183643001 did not return results
87
9105798363 did not return results
88
9192862287 did not return results
89
6104977700 did not return results
90
7043411122 did not return results
91
6177829210 did not return results
92
4056319315 did not return results
93
3256282654 did not return results
94
9723840273 did not return results
95
8607631568

Unnamed: 0,Address,City,Date,Name,Notes,Number,PhoneType,Provider,QualityScore,State,Zipcode
0,725 S ATLANTIC BLVD APT B,MONTEREY PARK,,THOMAS M LIN MD PHYSICIANS SURGEONS,"IsMailable,IsConnected",6265769929,BUSINESS,PACIFIC BELL,HIGH,CA,91754-3856
1,,PHOENIX,,PHX CHILDRENS H,IsConnected,6025460935,BUSINESS,QWEST CORPORATION,MED,AZ,85012
2,8400 PERRY HWY,PITTSBURGH,,NARAYANAN KRISHNA MD,"IsMailable,IsPossibleDisconnected,INF",4123643117,BUSINESS,VERIZON PENNSYLVANIA,LOW,PA,15237-5235
3,10184 SANDSTONE TRL,COLUMBIA STATION,,PHYSICIANS EDGE,"IsMailable,IsPossibleDisconnected,INF",4402368484,BUSINESS,"WINDSTREAM OHIO, LLC",LOW,OH,44028-9823
4,2026 E KLEINDALE RD,TUCSON,,MATTHEW TROUARD,"IsMailable,IsConnected",5203225418,RESIDENTIAL,QWEST CORPORATION,HIGH,AZ,85719-2439
5,1900 E TAHQUITZ CANYON WAY STE A2,PALM SPRINGS,,FRAGEN RONALD A MD A MEDICAL CORP,"IsMailable,IsConnected",7603271226,BUSINESS,FRONTIER CALIFORNIA,HIGH,CA,92262-7060
6,2002 N STOCKTON HILL RD STE 102,KINGMAN,,JOE DR GOCHOCO PLLC,"IsMailable,IsConnected",9287184375,BUSINESS,FRONTIER UTIL RURAL,HIGH,AZ,86401-4698
7,1511 TAMIAMI TRL S STE 201,VENICE,,DONALD A MCEACHERN,"IsMailable,IsPossibleDisconnected,INF",9414972138,RESIDENTIAL,FRONTIER COMM OF FL,LOW,FL,34285-5578
8,800 W AIRPORT FWY,IRVING,,ADAPT OF TEXAS,"IsMailable,IsPossibleDisconnected,INF",9725547131,BUSINESS,FRONTIER COMM OF TX,LOW,TX,75062-6312
9,1020 OAKWOOD DR,WESTMONT,,SAEED M AWAN,"IsMailable,IsPossibleDisconnected,INF",6307898030,RESIDENTIAL,AMERITECH ILLINOIS,LOW,IL,60559-1040


In [18]:
disconnected_df[0:73]

Unnamed: 0,Address,City,Date,Name,Notes,Number,PhoneType,Provider,QualityScore,State,Zipcode
0,725 S ATLANTIC BLVD APT B,MONTEREY PARK,,THOMAS M LIN MD PHYSICIANS SURGEONS,"IsMailable,IsConnected",6265769929,BUSINESS,PACIFIC BELL,HIGH,CA,91754-3856
1,,PHOENIX,,PHX CHILDRENS H,IsConnected,6025460935,BUSINESS,QWEST CORPORATION,MED,AZ,85012
2,8400 PERRY HWY,PITTSBURGH,,NARAYANAN KRISHNA MD,"IsMailable,IsPossibleDisconnected,INF",4123643117,BUSINESS,VERIZON PENNSYLVANIA,LOW,PA,15237-5235
3,10184 SANDSTONE TRL,COLUMBIA STATION,,PHYSICIANS EDGE,"IsMailable,IsPossibleDisconnected,INF",4402368484,BUSINESS,"WINDSTREAM OHIO, LLC",LOW,OH,44028-9823
4,2026 E KLEINDALE RD,TUCSON,,MATTHEW TROUARD,"IsMailable,IsConnected",5203225418,RESIDENTIAL,QWEST CORPORATION,HIGH,AZ,85719-2439
5,1900 E TAHQUITZ CANYON WAY STE A2,PALM SPRINGS,,FRAGEN RONALD A MD A MEDICAL CORP,"IsMailable,IsConnected",7603271226,BUSINESS,FRONTIER CALIFORNIA,HIGH,CA,92262-7060
6,2002 N STOCKTON HILL RD STE 102,KINGMAN,,JOE DR GOCHOCO PLLC,"IsMailable,IsConnected",9287184375,BUSINESS,FRONTIER UTIL RURAL,HIGH,AZ,86401-4698
7,1511 TAMIAMI TRL S STE 201,VENICE,,DONALD A MCEACHERN,"IsMailable,IsPossibleDisconnected,INF",9414972138,RESIDENTIAL,FRONTIER COMM OF FL,LOW,FL,34285-5578
8,800 W AIRPORT FWY,IRVING,,ADAPT OF TEXAS,"IsMailable,IsPossibleDisconnected,INF",9725547131,BUSINESS,FRONTIER COMM OF TX,LOW,TX,75062-6312
9,1020 OAKWOOD DR,WESTMONT,,SAEED M AWAN,"IsMailable,IsPossibleDisconnected,INF",6307898030,RESIDENTIAL,AMERITECH ILLINOIS,LOW,IL,60559-1040


In [24]:
connected_df = connected_df.rename(columns={'Number':'OFFICE_TELEPHONE'})
disconnected_df = disconnected_df.rename(columns={'Number':'OFFICE_TELEPHONE'})
CONNECTED = pd.merge(connected, connected_df, on= 'OFFICE_TELEPHONE')
DISCONNECTED = pd.merge(disconnected, disconnected_df[0:73], on= 'OFFICE_TELEPHONE')
CONNECTED.to_csv('getphone_connected_rpv.csv',index=False)
DISCONNECTED.to_csv('getphone_disconnected_rpv.csv',index=False)

In [26]:
CONNECTED.shape

(136, 24)

In [27]:
DISCONNECTED.shape

(73, 24)

In [50]:
count = 0
for result in connected_results:
    result['OFFICE_TELEPHONE']=str(connected_df['OFFICE_TELEPHONE'][count])
    count += 1
connected_results

[OrderedDict([('PhoneInfoResponse',
               OrderedDict([('@xmlns', 'http://www.serviceobjects.com'),
                            ('@xmlns:i',
                             'http://www.w3.org/2001/XMLSchema-instance'),
                            ('PhoneInfo',
                             OrderedDict([('Provider',
                                           OrderedDict([('Name', 'CODETEL'),
                                                        ('City',
                                                         'SANTO DOMINGO'),
                                                        ('State', 'DR'),
                                                        ('Latitude', None),
                                                        ('Longitude', None),
                                                        ('LineType',
                                                         'UNKNOWN')])),
                                          ('Contacts',
                                       

In [49]:
count = 0
for result in short_results:
    result['OFFICE_TELEPHONE']=str(short_df['Number'][count])
    count += 1
short_results

[OrderedDict([('PhoneInfoResponse',
               OrderedDict([('@xmlns', 'http://www.serviceobjects.com'),
                            ('@xmlns:i',
                             'http://www.w3.org/2001/XMLSchema-instance'),
                            ('PhoneInfo',
                             OrderedDict([('Provider',
                                           OrderedDict([('Name',
                                                         'FRONTIER COMM OF TX'),
                                                        ('City', 'SAN ANGELO'),
                                                        ('State', 'TEXAS'),
                                                        ('Latitude', '31.455'),
                                                        ('Longitude',
                                                         '-100.451'),
                                                        ('LineType',
                                                         'LANDLINE')])),
              

In [40]:
short_df

Unnamed: 0,Address,City,Date,Name,Notes,Number,PhoneType,Provider,QualityScore,State,Zipcode
0,,SAN ANGELO,,WEST TEXAS MEDICAL ASSOCIATES,IsConnected,3252245871,BUSINESS,FRONTIER COMM OF TX,MED,TX,76901
1,,CHICAGO,2019-04-01,NW MEMORIAL HOS,IsPorted,3126952857,BUSINESS,TELEPORT COMM AM-IL,LOW,IL,60611
2,2500 METROHEALTH DR,CLEVELAND,2019-09-12,METROHEALTH MEDICAL CENTER,"IsMailable,IsPorted",2167784174,BUSINESS,TELEPORT COMM AM-OH,LOW,OH,44109-1900
3,,GUAYNABO,2005-06-03,RADIATION THERAPY & CANCER INSTITUTE,IsPorted,7877745555,BUSINESS,"AT&T, INC. - PR",LOW,PR,00968
4,699 ELM ST,BUFFALO,2010-03-02,ROSWELL PARK CANCER INSTITUTE CHARITABLE,"IsConnected,IsPorted",7168452300,BUSINESS,PAETEC COMM - NY,HIGH,NY,14263


In [48]:
count = 0
for result in disconnected_results:
    result['OFFICE_TELEPHONE']=str(disconnected_df['OFFICE_TELEPHONE'][count])
    count += 1
disconnected_results

[OrderedDict([('PhoneInfoResponse',
               OrderedDict([('@xmlns', 'http://www.serviceobjects.com'),
                            ('@xmlns:i',
                             'http://www.w3.org/2001/XMLSchema-instance'),
                            ('PhoneInfo',
                             OrderedDict([('Provider',
                                           OrderedDict([('Name',
                                                         'PACIFIC BELL'),
                                                        ('City', 'ALHAMBRA'),
                                                        ('State',
                                                         'CALIFORNIA'),
                                                        ('Latitude',
                                                         '34.0884'),
                                                        ('Longitude',
                                                         '-118.13'),
                                              

In [44]:
short_results + connected_results

[OrderedDict([('PhoneInfoResponse',
               OrderedDict([('@xmlns', 'http://www.serviceobjects.com'),
                            ('@xmlns:i',
                             'http://www.w3.org/2001/XMLSchema-instance'),
                            ('PhoneInfo',
                             OrderedDict([('Provider',
                                           OrderedDict([('Name',
                                                         'FRONTIER COMM OF TX'),
                                                        ('City', 'SAN ANGELO'),
                                                        ('State', 'TEXAS'),
                                                        ('Latitude', '31.455'),
                                                        ('Longitude',
                                                         '-100.451'),
                                                        ('LineType',
                                                         'LANDLINE')])),
              

In [51]:
all_connect = connected_results + short_results
with open(f'getphone_connected_data.txt', 'w') as outfile:
            json.dump(all_connect, outfile)

In [52]:
with open(f'getphone_disconnected_data.txt', 'w') as outfile:
            json.dump(disconnected_results, outfile)

In [103]:
disconnected_results

[OrderedDict([('PhoneInfoResponse',
               OrderedDict([('@xmlns', 'http://www.serviceobjects.com'),
                            ('@xmlns:i',
                             'http://www.w3.org/2001/XMLSchema-instance'),
                            ('PhoneInfo',
                             OrderedDict([('Provider',
                                           OrderedDict([('Name',
                                                         'PACIFIC BELL'),
                                                        ('City', 'ALHAMBRA'),
                                                        ('State',
                                                         'CALIFORNIA'),
                                                        ('Latitude',
                                                         '34.0884'),
                                                        ('Longitude',
                                                         '-118.13'),
                                              

In [104]:
disconnected_PROVIDER_LIST =[]
for dictionary in disconnected_results:
    NEW_DICT = {}
    NEW_DICT['OFFICE_TELEPHONE'] = dictionary['OFFICE_TELEPHONE']
    try:
        NEW_DICT['carrier'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['Name']
        NEW_DICT['carrier_city'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['City']
        NEW_DICT['carrier_state'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['State']
        NEW_DICT['linetype'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['LineType']
    except:
        NEW_DICT['carrier'] = 'None'
        NEW_DICT['carrier_city'] = 'None'
        NEW_DICT['carrier_state'] = 'None'
        NEW_DICT['linetype'] = 'None'
    disconnected_PROVIDER_LIST.append(NEW_DICT)

In [67]:
PROVIDER_LIST =[]
for dictionary in all_connect:
    NEW_DICT = {}
    NEW_DICT['OFFICE_TELEPHONE'] = dictionary['OFFICE_TELEPHONE']
    NEW_DICT['carrier'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['Name']
    NEW_DICT['carrier_city'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['City']
    NEW_DICT['carrier_state'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['State']
    NEW_DICT['linetype'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['LineType']
    PROVIDER_LIST.append(NEW_DICT)

In [106]:
disproviders = pd.DataFrame(disconnected_PROVIDER_LIST)
disproviders

Unnamed: 0,OFFICE_TELEPHONE,carrier,carrier_city,carrier_state,linetype
0,6265769929,PACIFIC BELL,ALHAMBRA,CALIFORNIA,LANDLINE
1,6025460935,QWEST CORPORATION,PHOENIX,ARIZONA,LANDLINE
2,4123643117,VERIZON PENNSYLVANIA,PITTSBURGH,PENNSYLVANIA,LANDLINE
3,4402368484,"WINDSTREAM OHIO, LLC",COLUMBIA STATION,OHIO,LANDLINE
4,5203225418,QWEST CORPORATION,TUCSON,ARIZONA,LANDLINE
5,7603271226,FRONTIER CALIFORNIA,PALM SPRINGS,CALIFORNIA,LANDLINE
6,9287184375,FRONTIER UTIL RURAL,KINGMAN,ARIZONA,LANDLINE
7,9414972138,FRONTIER COMM OF FL,VENICE,FLORIDA,LANDLINE
8,9725547131,FRONTIER COMM OF TX,IRVING,TEXAS,LANDLINE
9,6307898030,AMERITECH ILLINOIS,HINSDALE,ILLINOIS,LANDLINE


In [76]:
RPV = pd.read_csv('../RPV_archive.csv')
RPV.dtypes

phone            int64
status          object
error_text      object
iscell          object
carrier         object
date_checked    object
dtype: object

In [107]:
RPV = RPV.astype(str)
dismerged = pd.merge(RPV, disproviders[0:72], left_on='phone', right_on='OFFICE_TELEPHONE', suffixes=['_RPV','_GetPhone'])

In [114]:
total=dismerged.shape[0]
count = 0 
other_count = 0
for row in dismerged.itertuples():
    RPV_list = row.carrier_RPV.lower().split(' ')
    GetPhone_list = row.carrier_GetPhone.lower().split(' ')
    mismatched = True
    for word in RPV_list:
        if word in GetPhone_list:
            mismatched = False
    if str(row.carrier_RPV).lower() not in str(row.carrier_GetPhone).lower() and str(row.carrier_GetPhone).lower() not in str(row.carrier_RPV).lower():
        count += 1
    if mismatched==True:
        other_count += 1
        print(f'{row.carrier_RPV} is not {row.carrier_GetPhone}')
print(count/total)
print(other_count/total)

Cablevision Corp is not CABLEVSN LGHTPATH NJ
Cincinnati Bell Tel is not CBTS TECH SOL LLC-OH
Integra  Oregon is not ALLSTREAM - OR
CTE Services is not CTSI, INC. - PA
CenturyLink is not CENTRAL TEL CO NV
Mid Continent Comm is not MIDCONTINENT COM-KS.
McLeod USA is not MCLEODUSA TEL - UT
Charter Fiber is not SOUTHWESTERN BELL
0.4305555555555556
0.1111111111111111


In [110]:
dismerged = dismerged.drop_duplicates('phone')
merged = merged.drop_duplicates('phone')

In [None]:
PROVIDER_LIST =[]
for dictionary in all_connect:
    NEW_DICT = {}
    NEW_DICT['OFFICE_TELEPHONE'] = dictionary['OFFICE_TELEPHONE']
    NEW_DICT['carrier'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['Name']
    NEW_DICT['carrier_city'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['City']
    NEW_DICT['carrier_state'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['State']
    NEW_DICT['linetype'] = dictionary['PhoneInfoResponse']['PhoneInfo']['Provider']['LineType']
    PROVIDER_LIST.append(NEW_DICT)

In [151]:
def count_this_shit(df):
    total = len(df)
    lat_count = 0
    address_count = 0
    unclear_count = 0 
    for dictionary in df:
        try:
            lat = dictionary['PhoneInfoResponse']['PhoneInfo']['Contacts']['Contact']['Latitude']
        except:
            lat='nan'
        try:
            address = dictionary['PhoneInfoResponse']['PhoneInfo']['Contacts']['Contact']['Address']
        except:
            address='nan'
        try:
            unclear = dictionary['PhoneInfoResponse']['PhoneInfo']['Contacts']['Contact']['SICDesc']
        except:
            unclear='nan'
        if pd.notna(address):
            address_count += 1
            if pd.notna(lat):
                lat_count += 1
            if pd.notna(unclear):
                unclear_count += 1
    print(f'{lat_count/address_count *100} coordinates missing. {address_count/total *100} addresses missing. {unclear_count/address_count *100} SICDescs missing.')
    

In [133]:
total_connected = len(all_connect)

In [134]:
total_disconnected = len(disconnected_results)

In [138]:
all_connect[0]

OrderedDict([('PhoneInfoResponse',
              OrderedDict([('@xmlns', 'http://www.serviceobjects.com'),
                           ('@xmlns:i',
                            'http://www.w3.org/2001/XMLSchema-instance'),
                           ('PhoneInfo',
                            OrderedDict([('Provider',
                                          OrderedDict([('Name', 'CODETEL'),
                                                       ('City',
                                                        'SANTO DOMINGO'),
                                                       ('State', 'DR'),
                                                       ('Latitude', None),
                                                       ('Longitude', None),
                                                       ('LineType',
                                                        'UNKNOWN')])),
                                         ('Contacts',
                                          OrderedDict(

In [141]:
this = 'this'
pd.notna(this)

True

In [152]:
count_this_shit(disconnected_results)

80.48780487804879 coordinates missing. 84.5360824742268 addresses missing. 67.07317073170732 SICDescs missing.


In [148]:
count_this_shit(all_connect)

44.680851063829785 coordinates missing. 72.3404255319149 addresses missing. 44.680851063829785 SICDescs missing.
