# RPV + GetPhoneInfo Model

In [1]:
import pandas as pd
import pickle as pk
import numpy as np

from datetime import datetime
from dateutil.parser import parse

In [None]:
wslive_path = 'U:\\Source Files\\Data Analytics\\Derek\\SAS_DATA\\SURVEY\\wslive_results.sas7bdat'

In [None]:
wslive_df = pd.read_sas(wslive_path, encoding='latin')

In [None]:
wslive_df.head()

In [None]:
wslive_df.columns

In [None]:
wslive_df['Source'].drop_duplicates()

In [None]:
wslive_df = wslive_df[wslive_df['WSLIVE_FILE_DT'] >= '2019-08-01']

In [None]:
len(wslive_df)

In [2]:
wslive_df = pk.load(open('wslive_2019.pk','rb'))  # pickled the above wslive_2019

In [3]:
wslive_df.head()

Unnamed: 0,PHYSICIAN_ME_NUMBER,PHYSICIAN_FIRST_NAME,PHYSICIAN_MIDDLE_NAME,PHYSICIAN_LAST_NAME,SUFFIX,DEGREE,OFFICE_ADDRESS_LINE_1,OFFICE_ADDRESS_LINE_2,OFFICE_ADDRESS_CITY,OFFICE_ADDRESS_STATE,...,PRESENT_EMPLOYMENT_UPDATED,COMMENTS,Source,WSLIVE_SOURCE,WSLIVE_FILE_DT,MATCH_ADDR,MATCH_PHONE,MATCH_ADDR_LONG,SPECIALTY,SPECIALTY_UPDATED
1830990,74820970041,RONALD,ANTHONY,FELIPE,,,,2 HILLVIEW DR,NEWTOWN,PA,...,1,ANSWERING SERVICE,Z,OTHERS,2019-01-04,7482097004218940,74820970042155791774,,IM,1
1830991,1902961581,ANGELA,MARIE,ROWDEN,,,,9157 HUEBNER RD,SAN ANTONIO,TX,...,1,"MOVED, NO FORWARDING INFO",Z,OTHERS,2019-01-04,190296158915778240,01902961589128394949,,OPH,1
1830992,6703650013,DENIS,J,BLAIS,,,,780 MAIN ST STE 2C,S WEYMOUTH,MA,...,1,"MOVED, NO FORWARDING INFO",Z,OTHERS,2019-01-04,67036500178002190,06703650017813314600,,U,1
1830993,2507980956,MOHAMMED,OVAIS,PERACHA,,,,1500 ASSOCIATES DR,DUBUQUE,IA,...,1,"MOVED, NO FORWARDING INFO",Z,OTHERS,2019-01-04,250798095150052002,02507980957088913330,,OPH,1
1830994,64903840097,ALEJANDRO,G,HINOJOSA,,,ALEJANDRO HINOJOSA-VALENCIA M D INC,340 FOURTH AVE STE 8,CHULA VISTA,CA,...,1,"MOVED, NO FORWARDING INFO",Z,OTHERS,2019-01-04,649038400934091910,64903840096192167546,,IM,1


In [4]:
wsl = wslive_df[wslive_df['WSLIVE_FILE_DT'] >= '2019-07-01']
len(wsl)

76676

In [5]:
gpi = pd.read_csv('GetPhoneInfoAPIResults.csv', dtype=object)
gpi.drop(columns='WSLive_Status', axis=1, inplace=True)

for col in gpi:
    gpi[col] = gpi[col].astype(str)

print('Sample size:', len(gpi))


to_del = []

# encoding 'Notes'

gpi['gpi_isConnected'] = gpi['Notes'].apply(lambda x: 1 if 'IsConnected' in x else 0)
gpi['gpi_isPorted'] = gpi['Notes'].apply(lambda x: 1 if 'IsPorted' in x else 0)
gpi['gpi_isMailable'] = gpi['Notes'].apply(lambda x: 1 if 'IsMailable' in x else 0)
gpi['gpi_isWireless'] = gpi['Notes'].apply(lambda x: 1 if 'IsWireless' in x else 0)
gpi['gpi_isPossibleDisconnected'] = gpi['Notes'].apply(lambda x: 1 if 'IsPossibleDisconnected' in x else 0)
gpi['gpi_isPossiblePortableVOIP'] = gpi['Notes'].apply(lambda x: 1 if 'IsPossiblePortableVOIP' in x else 0)
gpi['gpi_INF'] = gpi['Notes'].apply(lambda x: 1 if 'INF' in x else 0)

to_del.append('Notes')

# encoding 'QualityScore'
quality = {
    'HIGH': 3,
    'MED': 2,
    'LOW': 1,
    'nan': 1,
    None: 1
}
gpi['gpi_qs_null'] = gpi['QualityScore'].apply(lambda x: 1 if x in ['nan', None] else 0)
gpi['gpi_qs'] = gpi['QualityScore'].apply(lambda x: quality[x] if x in quality else 1)

to_del.append('QualityScore')

# encoding name
med_terms = [
    'PHYSICIAN',
    'SURGEON',
    'HOSPITAL',
    'CLINIC',
    'HEALTH',
    ' MD',
    'MEDICAL',
    'NEUROLOGY',
    'CANCER',
    'MEDICINE',
    'FAMILY',
    'PRACTICE',
    'MEMORIAL'
]
gpi['gpi_name_meddy'] = gpi['Name'].apply(lambda x: 1 if any([t in x for t in med_terms]) else 0)

to_del.append('Name')


# standardizing Zipcode to first 5 digits

gpi['gpi_zip'] = gpi['Zipcode'].apply(lambda x: x[:5] if len(x)>=5 else x)  # reduce extended Zipcodes
gpi['gpi_zip'] = gpi['gpi_zip'].apply(lambda x: ('000' + x)[-5:] if x.isdigit() else x)

to_del.append('Zipcode')


# standardizing PhoneType

gpi['gpi_phonetype'] = gpi['PhoneType'].apply(lambda x: x if x in ['BUSINESS', 'RESIDENTIAL'] else 'UNKNOWN')

to_del.append('PhoneType')


# parse 'Date' as Datetime Date
gpi['gpi_date'] = gpi['Date'].apply(lambda x: parse(x) if x not in ['nan', 'None', None] else x)
to_del.append('Date')
# get date diff
gpi['gpi_date_diff'] = gpi['gpi_date'].apply(lambda x: (datetime.now() - x).days if x not in ['nan','None'] else x)
notnull_date_diffs = [d for d in gpi['gpi_date_diff'].values if d not in ['nan', 'None', None]]
gpi['gpi_date_diff'] = gpi['gpi_date_diff'].apply(lambda x: x if x not in ['nan', 'None', None] else 2*max(notnull_date_diffs))
gpi['gpi_date_missing'] = gpi['gpi_date'].apply(lambda x: 1 if x in ['nan', 'None', None] else 0)

to_del.append('gpi_date')


gpi.drop(columns=to_del, axis=1, inplace=True)


# area and prefix features
gpi['gpi_area_code'] = gpi['OFFICE_TELEPHONE'].apply(lambda x: x[:3])
gpi['gpi_area+prefix'] = gpi['OFFICE_TELEPHONE'].apply(lambda x: x[:6])


# renaming address fields
gpi.rename(columns={'Address': 'gpi_address',
                   'City': 'gpi_city',
                   'State':'gpi_state'}, inplace=True)


# categorize
gpi_categoricals = [
    'gpi_city',
    'Provider',
    'gpi_state',
    'gpi_zip',
    'gpi_phonetype',
    'gpi_area_code',
    'gpi_area+prefix'
]

for col in gpi_categoricals:
    gpi[col] = gpi[col].astype('category')

gpi.head()

Sample size: 459


Unnamed: 0,OFFICE_TELEPHONE,gpi_address,gpi_city,Provider,gpi_state,gpi_isConnected,gpi_isPorted,gpi_isMailable,gpi_isWireless,gpi_isPossibleDisconnected,...,gpi_INF,gpi_qs_null,gpi_qs,gpi_name_meddy,gpi_zip,gpi_phonetype,gpi_date_diff,gpi_date_missing,gpi_area_code,gpi_area+prefix
0,2013586776,,HILLSDALE,CABLEVSN LGHTPATH NJ,NJ,1,1,0,0,0,...,0,0,2,0,7642,BUSINESS,618,0,201,201358
1,2014874088,20 PROSPECT AVE,HACKENSACK,VERIZON NEW JERSEY,NJ,1,0,1,0,0,...,0,0,3,1,7601,BUSINESS,13720,1,201,201487
2,2014883131,160 OVERLOOK AVE STE 1A,HACKENSACK,"AIRUS, INC. - NJ",NJ,1,1,1,0,0,...,0,0,3,1,7601,BUSINESS,610,0,201,201488
3,2018714346,401 S VAN BRUNT ST STE 405,ENGLEWOOD,MONMOUTH TEL&TEL NJ,NJ,1,1,1,0,0,...,0,0,3,1,7631,BUSINESS,1978,0,201,201871
4,2019962403,30 PROSPECT AVE,HACKENSACK,TELEPORT COM NY - NJ,NJ,0,1,1,0,0,...,0,0,1,0,7601,BUSINESS,6615,0,201,201996


In [6]:
gpi.dtypes

OFFICE_TELEPHONE                object
gpi_address                     object
gpi_city                      category
Provider                      category
gpi_state                     category
gpi_isConnected                  int64
gpi_isPorted                     int64
gpi_isMailable                   int64
gpi_isWireless                   int64
gpi_isPossibleDisconnected       int64
gpi_isPossiblePortableVOIP       int64
gpi_INF                          int64
gpi_qs_null                      int64
gpi_qs                           int64
gpi_name_meddy                   int64
gpi_zip                       category
gpi_phonetype                 category
gpi_date_diff                    int64
gpi_date_missing                 int64
gpi_area_code                 category
gpi_area+prefix               category
dtype: object

In [7]:
# get RPV archive

rpv_archive=pd.read_csv('U:\\Source Files\\Data Analytics\\Data-Science\\Data\\RPV\output\\_archive\\RPV_archive.csv', dtype=object).drop_duplicates()
print('Number of archived RPV results:', len(rpv_archive))

# encode RPV status
rpv_statuses = {
    'disconnected': 4,
    'disconnected-70': 3,
    'connected-75': 2,
    'connected': 1
}
rpv_statuses_inv = {
    4: 'disconnected',
    3: 'disconnected-70',
    2: 'connected-75',
    1: 'connected',
    0: 'other (error)'
}
rpv_archive['rpv_status'] = rpv_archive['status'].apply(lambda x: rpv_statuses[x] if x in rpv_statuses else 0)

# encode rpv iscell
rpv_archive['rpv_iscell_V'] = rpv_archive['iscell'].apply(lambda x: 1 if x=='V' else 0)
rpv_archive['rpv_iscell_Y'] = rpv_archive['iscell'].apply(lambda x: 1 if x=='Y' else 0)

# categorize carrier
rpv_archive['carrier'] = rpv_archive['carrier'].astype('category')

rpv_archive.drop(columns=['status','error_text','iscell','date_checked'], axis=1, inplace=True)


# rename carrier
rpv_archive.rename(columns={'carrier': 'rpv_carrier'},inplace=True)


Number of archived RPV results: 5076


In [8]:
rpv_archive.head()

Unnamed: 0,phone,rpv_carrier,rpv_status,rpv_iscell_V,rpv_iscell_Y
0,2012000318,Comcast of MD,1,0,0
1,2012002626,MCImetro Former MCI,1,0,0
2,2012040004,Level 3,1,0,0
3,2012071052,Verizon Wireless,1,0,1
4,2012074846,Verizon Wireless,1,0,1


In [9]:
# add RPV results to gpi

gpi = gpi.merge(rpv_archive, left_on='OFFICE_TELEPHONE', right_on='phone', how='inner')
gpi.drop(columns='phone', axis=1, inplace=True)
len(gpi)

412

In [10]:
gpi.dtypes

OFFICE_TELEPHONE                object
gpi_address                     object
gpi_city                      category
Provider                      category
gpi_state                     category
gpi_isConnected                  int64
gpi_isPorted                     int64
gpi_isMailable                   int64
gpi_isWireless                   int64
gpi_isPossibleDisconnected       int64
gpi_isPossiblePortableVOIP       int64
gpi_INF                          int64
gpi_qs_null                      int64
gpi_qs                           int64
gpi_name_meddy                   int64
gpi_zip                       category
gpi_phonetype                 category
gpi_date_diff                    int64
gpi_date_missing                 int64
gpi_area_code                 category
gpi_area+prefix               category
rpv_carrier                   category
rpv_status                       int64
rpv_iscell_V                     int64
rpv_iscell_Y                     int64
dtype: object

In [None]:
## get disconnect model predictions
#scores = pd.read_csv('scored_ppd_recall.csv')
#scores['ppd_telephone_number'] = scores['ppd_telephone_number'].astype(str)
#scores = scores.sort_values(by='pred_probability').groupby('ppd_telephone_number').first().reset_index()


In [None]:
#scores = scores[['ppd_telephone_number', 'pred_probability']]

In [None]:
#scores.dtypes

In [None]:
#gpi = gpi.merge(scores, left_on='OFFICE_TELEPHONE', right_on='ppd_telephone_number', how='inner')
#len(gpi)

In [None]:
#m_categoricals = [
#    'phone_src',
#    'ppd_address_type',
#    'ppd_region',
#    'ppd_division',
#    'ppd_group',
#    'ppd_msa_population_size',
#    'ppd_micro_metro_ind',
#    'ppd_polo_state',
#    'pe_description',
#    'ppd_top_cd',
#    'ppd_pe_cd',
#    'ppd_prim_spec_cd'
#]
#for c in m_categoricals:
#    gpi[c] = scores[c].astype('category')

In [11]:
gpi.dtypes

OFFICE_TELEPHONE                object
gpi_address                     object
gpi_city                      category
Provider                      category
gpi_state                     category
gpi_isConnected                  int64
gpi_isPorted                     int64
gpi_isMailable                   int64
gpi_isWireless                   int64
gpi_isPossibleDisconnected       int64
gpi_isPossiblePortableVOIP       int64
gpi_INF                          int64
gpi_qs_null                      int64
gpi_qs                           int64
gpi_name_meddy                   int64
gpi_zip                       category
gpi_phonetype                 category
gpi_date_diff                    int64
gpi_date_missing                 int64
gpi_area_code                 category
gpi_area+prefix               category
rpv_carrier                   category
rpv_status                       int64
rpv_iscell_V                     int64
rpv_iscell_Y                     int64
dtype: object

In [13]:
wsl['COMMENTS'].value_counts()

FAIL                         29975
COMPLETE                     25838
2ND ATTEMPT                   7096
MOVED, NO FORWARDING INFO     5677
NOT IN SERVICE                3294
FAX MODEM                     1443
WRONG NUMBER                  1248
REFUSAL                        972
RETIRED                        879
ANSWERING SERVICE               77
LANGUAGE/HEARING                63
DO NOT CALL                     55
DECEASED                        26
RESPONDED TO SURVEY - AMA        2
DUPLICATE                        1
Name: COMMENTS, dtype: int64

In [14]:
wsl['isDisconnected'] = wsl['COMMENTS'].apply(lambda x: 1 if x == 'NOT IN SERVICE' else 0)
wsl['isWrong'] = wsl['COMMENTS'].apply(lambda x: 1 if x in ['MOVED, NO FORWARDING INFO','FAX MODEM','WRONG NUMBER'] else 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [15]:
wsl = wsl[['OFFICE_TELEPHONE','isDisconnected', 'isWrong']]
gpi = gpi.merge(wsl, on='OFFICE_TELEPHONE', how='inner')

In [16]:
gpi.dtypes

OFFICE_TELEPHONE                object
gpi_address                     object
gpi_city                      category
Provider                      category
gpi_state                     category
gpi_isConnected                  int64
gpi_isPorted                     int64
gpi_isMailable                   int64
gpi_isWireless                   int64
gpi_isPossibleDisconnected       int64
gpi_isPossiblePortableVOIP       int64
gpi_INF                          int64
gpi_qs_null                      int64
gpi_qs                           int64
gpi_name_meddy                   int64
gpi_zip                       category
gpi_phonetype                 category
gpi_date_diff                    int64
gpi_date_missing                 int64
gpi_area_code                 category
gpi_area+prefix               category
rpv_carrier                   category
rpv_status                       int64
rpv_iscell_V                     int64
rpv_iscell_Y                     int64
isDisconnected           

In [20]:


Xy = gpi.set_index('OFFICE_TELEPHONE')
del Xy['gpi_address']
#del Xy['ppd_telephone_number']


target = Xy['isDisconnected']
del Xy['isDisconnected']
#del Xy['isWrong']

In [21]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [32]:
X_train, X_test, y_train, y_test = train_test_split(Xy, target, test_size=0.4)

train_dataset = lgb.Dataset(X_train, y_train)
test_dataset = lgb.Dataset(X_test, y_test)


# T - no. of total samples
# P - no. of positive samples
T = len(target)
P = sum(target)
pos = ((1.0 * T)/P) - 1


# experiment with different parameters
param = {
    'num_leaves': 71, 
    'objective': 'binary',
    'metric': 'auc',
    'learning_rate': 0.2,
    'scale_pos_weight': pos
}
n_iter = 500

bst = lgb.train(param, train_dataset, n_iter, valid_sets=test_dataset, early_stopping_rounds=50)

y_pred = bst.predict(X_test)
y_pred_c = [1 if p >= 0.5 else 0 for p in y_pred]

print()
print('Precision:', metrics.precision_score(y_true=y_test, y_pred=y_pred_c))
print('Recall:', metrics.recall_score(y_true=y_test, y_pred=y_pred_c))

[1]	valid_0's auc: 0.79498
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.805575
[3]	valid_0's auc: 0.814308
[4]	valid_0's auc: 0.828928
[5]	valid_0's auc: 0.83208
[6]	valid_0's auc: 0.843017
[7]	valid_0's auc: 0.854599
[8]	valid_0's auc: 0.859117
[9]	valid_0's auc: 0.862687
[10]	valid_0's auc: 0.861852
[11]	valid_0's auc: 0.861472
[12]	valid_0's auc: 0.858776
[13]	valid_0's auc: 0.860674
[14]	valid_0's auc: 0.86113
[15]	valid_0's auc: 0.857978
[16]	valid_0's auc: 0.85851
[17]	valid_0's auc: 0.859649
[18]	valid_0's auc: 0.859269
[19]	valid_0's auc: 0.860333
[20]	valid_0's auc: 0.860409
[21]	valid_0's auc: 0.859497
[22]	valid_0's auc: 0.860409
[23]	valid_0's auc: 0.860181
[24]	valid_0's auc: 0.862231
[25]	valid_0's auc: 0.862231
[26]	valid_0's auc: 0.859649
[27]	valid_0's auc: 0.862155
[28]	valid_0's auc: 0.862231
[29]	valid_0's auc: 0.859877
[30]	valid_0's auc: 0.858396
[31]	valid_0's auc: 0.861586
[32]	valid_0's auc: 0.859155
[33]	valid_0's auc: 0.8

# Conclusion

Much better than RPV alone,
still more to potentially gain from additional feature engineering and training data.