Import packages required

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import pylab as pl
import datetime
from sklearn.model_selection import train_test_split
from shutil import copyfile


In [3]:
import datetime
import os
import sys
import tkinter as tk
from tkinter import filedialog
import warnings

import settings

In [4]:
from process_model_data import convert_data_types, create_new_phone_vars, clean_model_data 
from process_model_data import create_explore_plots
from create_phone_model_input_data import create_model_initial_data
from get_entity_ppd_info import clean_ent_comm_data, clean_phn_data, clean_ent_usg_data
from get_entity_ppd_info import clean_fone_zr_data, create_ent_me_data



In [6]:
from get_input_date_range import get_input_date_range
import datalabs.curate.dataframe as df
from remove_col_name_float_pt import remove_col_name_float_pt
from get_wslive_res_init_ppd_info import get_latest_uniq_wslive


Suppress warnings that do not affect execution

In [7]:
# NOTE(review): sklearn.exceptions is imported here but nothing in this cell
# references it; the filter below is a blanket one, not category-specific.
import sklearn.exceptions
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")

# Disable pandas' chained-assignment (SettingWithCopy) check.
pd.options.mode.chained_assignment = None


# ------------------------------ Define Common Functions ---------------------------------------

In [8]:
def get_target_dist(data_df, target_var_name):
    """Print the count and share of 0 and 1 values in a binary target column.

    Only values equal to 0 or 1 are counted; any other value (including NaN)
    is excluded from both the counts and the percentage denominator.

    Parameters
    ----------
    data_df : pandas.DataFrame
        Frame that contains the target column.
    target_var_name : str
        Name of the target column in ``data_df``.

    Returns
    -------
    None
        The four summary lines are written to stdout.
    """
    target = data_df[target_var_name]
    # Vectorised counts, cast to plain ints so the output formatting is stable.
    num_target_0 = int((target == 0).sum())
    num_target_1 = int((target == 1).sum())
    total = num_target_0 + num_target_1

    # With no 0/1 values at all (e.g. an empty frame) there is no meaningful
    # share; report NaN instead of raising ZeroDivisionError.
    if total > 0:
        perc_target_0 = num_target_0 / total
        perc_target_1 = num_target_1 / total
    else:
        perc_target_0 = perc_target_1 = float('nan')

    print('Number of {} 0: {}'.format(target_var_name, num_target_0))
    print('Number of {} 1: {}'.format(target_var_name, num_target_1))
    print('Percent of {} with value 0: {}'.format(target_var_name, perc_target_0))
    print('Percent of {} with value 1: {}'.format(target_var_name, perc_target_1))

# ------------------------------ Assign Common Variables ----------------------------------------

Get paths required

In [9]:
# Get files needed
#ddb_info_file = filedialog.askopenfilename(initialdir = "C:\\",
#                                       title = "Choose txt file with database login information...")

# Create and hide the Tk root explicitly. Without this, the first dialog
# creates a default root that is left on screen as an empty window.
dialog_root = tk.Tk()
dialog_root.withdraw()


def _require_selection(selection, description):
    """Return ``selection`` unchanged, or raise if the dialog was cancelled.

    The tkinter dialogs return an empty string/tuple on cancel. Failing here
    gives a clear message instead of an obscure read error later, and stops
    an empty save directory from silently becoming the drive root.
    """
    if not selection:
        raise ValueError('No selection made for: {}'.format(description))
    return selection


init_wslive_dir = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\WSLive\\'
wslive_results_file = _require_selection(
    filedialog.askopenfilename(initialdir=init_wslive_dir,
                               title="Choose wslive file with results encoded..."),
    'WSLive results file')

init_ppd_dir = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\PPD\\'
ppd_file_lst = _require_selection(
    filedialog.askopenfilenames(initialdir=init_ppd_dir,
                                title="Choose the PPD files used to generate the WSLive samples..."),
    'PPD files')

init_save_path = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\Phone_Quality_Model\\'
base_save_path = _require_selection(
    filedialog.askdirectory(initialdir=init_save_path,
                            title="Choose base save directory..."),
    'base save directory')
# tkinter returns forward slashes; the rest of the notebook builds paths with
# backslashes and expects a trailing separator.
base_save_path = base_save_path.replace("/", "\\")
base_save_path += "\\"

init_ent_comm_dir = 'C:\\'
ent_comm_file = _require_selection(
    filedialog.askopenfilename(initialdir=init_ent_comm_dir,
                               title="Choose the entity_comm_at data csv file..."),
    'entity_comm_at csv file')

ent_comm_usg_file = _require_selection(
    filedialog.askopenfilename(title="Choose the entity_comm_usg_at data csv file..."),
    'entity_comm_usg_at csv file')

phone_file = _require_selection(
    filedialog.askopenfilename(title="Choose the phone_at data csv file..."),
    'phone_at csv file')

license_file = _require_selection(
    filedialog.askopenfilename(title="Choose the license_lt data csv file..."),
    'license_lt csv file')

ent_key_file = _require_selection(
    filedialog.askopenfilename(title="Choose the entity_key_et data csv file..."),
    'entity_key_et csv file')

fone_zr_file = _require_selection(
    filedialog.askopenfilename(title="Choose the fone_zr data csv file..."),
    'fone_zr csv file')


Define variables needed throughout code

In [10]:
# Columns selected from the converted dataset to form the modelling frame:
# the target ('correct') followed by the predictor columns.
model_vars = ['correct', 'lic_state_match', 'dpc', 'res', 'pcp', 'yop', 
                       'doctor_age_yrs', 'polo_ind', 'ppd_address_type', 'ppd_region', 'ppd_division', 
                       'ppd_group', 'ppd_msa_population_size', 'ppd_md_do_code', 'ppd_micro_metro_ind', 
                       'ppd_gender', 'phone_age_yrs', 'yop_yrs', 'ppd_top_cd', 'ppd_pe_cd', 
                       'ppd_prim_spec_cd', 'ppd_polo_state', 'ent_comm_src_cat_code', 'ent_comm_comm_type',
                       'hist_ent_id_phn_count', 'hist_ent_all_phn_count', 'curr_ent_id_phn_count', 
                       'curr_ent_all_phn_count', 'curr_usg_all_phn_count', 'area_state_match']

# Informational columns that are carried alongside the model data and written
# to the *_Info.csv outputs. Four names also appear in model_vars
# ('ppd_polo_state', 'ppd_prim_spec_cd', 'ppd_pe_cd', 'ent_comm_comm_type').
info_vars = ['ppd_me', 'ppd_first_name', 'ppd_middle_name', 'ppd_last_name', 'ppd_suffix', 
             'ppd_polo_mailing_line_1', 'ppd_polo_mailing_line_2', 'ppd_polo_city', 
             'ppd_polo_state', 'ppd_polo_zip',
             'ppd_telephone_number', 'ppd_prim_spec_cd', 'ppd_pe_cd', 'ppd_fax_number', 
             'INIT_POLO_MAILING_LINE_1', 'INIT_POLO_MAILING_LINE_2', 'INIT_POLO_CITY', 
             'INIT_POLO_STATE', 'INIT_POLO_ZIP', 'INIT_TELEPHONE_NUMBER', 'INIT_FAX_NUMBER',
             'INIT_SAMPLE_MAX_PERFORM_MONTH', 'INIT_SAMPLE_SENT_MONTH', 'INIT_SAMPLE_DATE', 
             'OFFICE_ADDRESS_LINE_2', 'OFFICE_ADDRESS_LINE_1', 'OFFICE_ADDRESS_CITY', 
             'OFFICE_ADDRESS_STATE', 'OFFICE_ADDRESS_ZIP', 'OFFICE_TELEPHONE', 
             'OFFICE_FAX', 'WS_MONTH', 'COMMENTS', 'WSLIVE_FILE_DT', 'SOURCE', 'PPD_DATE',
             'PHONE_STATUS', 'aims_phone', 'ent_comm_comm_type', 'ent_comm_begin_dt', 
             'ent_comm_end_dt', 'ent_comm_comm_id', 'ent_comm_entity_id']

# Name of the binary target column and the stem used in every output file name.
target_name = 'correct'
file_base_name = 'PhoneQualityModel'


# Seed and test fraction passed to train_test_split; the split cell uses the
# whole dataset for both train and test when testing_size is not > 0.
rand_state = 45
testing_size = 0.2


Get current time and create string

In [11]:
# Run start time; also the reference point for the elapsed-time report in the last cell.
current_time = datetime.datetime.now()
# Date stamp (YYYY-MM-DD) used to prefix the output folder and file names.
start_time_str = current_time.strftime("%Y-%m-%d")

Create model output directory based on date

In [12]:
# Dated output folder layout: <base>\Data\<YYYY-MM-DD>_Model_Data\
data_path = base_save_path + 'Data\\'
data_save_path = data_path + start_time_str + '_Model_Data\\'

# makedirs also creates any missing parent folders, and exist_ok avoids the
# check-then-create gap of testing os.path.exists before os.mkdir.
os.makedirs(data_path, exist_ok=True)
os.makedirs(data_save_path, exist_ok=True)


Create exploratory output directory based on date

In [13]:
# Folder for exploratory plots and the correlation matrix, inside the dated
# model-data folder.
explore_path = data_save_path + 'Exploratory\\'

# makedirs also creates any missing parent folders, and exist_ok makes re-runs
# on the same day safe without a separate existence check.
os.makedirs(explore_path, exist_ok=True)


# ------------------------------- Import Data ------------------------------------------

Get time frame of WSLive data to use

In [14]:
# Interactive: prompts for start/end year, month and day (see the recorded
# prompts below). start_date and end_date bound the WSLive filter further down.
start_date, end_date, date_range_str = get_input_date_range()

Enter starting year (4 digit format): 2019
Enter starting month (numeric value, Jan = 1): 3
Enter starting day (default = 1): 
Enter ending year (4 digit format): 2019
Enter ending month (numeric value, Jan = 1): 8
Enter ending day (default = last day of month): 


Get samples sent to Humach for dates chosen

In [15]:
# Starting folder for the dialog: the standard samples that were sent out.
init_sample_dir = (
    'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\WSLive\\Model_Init_Samples\\'
)
# Multi-select dialog; the chosen paths are passed to create_model_initial_data.
init_sample_file_lst = filedialog.askopenfilenames(
    initialdir=init_sample_dir,
    title="Choose the WSLive standard samples sent to Humach for dates chosen...",
)


Get WSLive data for time frame indicated

In [16]:
# Load the encoded WSLive results with every column as text, then upper-case
# the column names.
wslive_results_df = df.rename_in_upper_case(
    pd.read_csv(wslive_results_file, sep=",", index_col=None, header=0, dtype=str)
)


In [17]:
# Parse the file date, then keep rows whose date lies in the requested range
# (both end points inclusive).
wslive_results_df['WSLIVE_FILE_DT'] = pd.to_datetime(wslive_results_df['WSLIVE_FILE_DT'])
in_date_range = wslive_results_df['WSLIVE_FILE_DT'].between(start_date, end_date)
wslive_date_df = wslive_results_df[in_date_range]


In [18]:
# Keep only rows with a usable phone outcome: Confirmed, Updated or Known Bad
wslive_date_df = df.rename_in_upper_case(wslive_date_df)
kept_statuses = ['Confirmed', 'Updated', 'Known Bad']
wslive_res_df = wslive_date_df[wslive_date_df['PHONE_STATUS'].isin(kept_statuses)]


In [19]:
# Project helper. Judging by its name it keeps the latest WSLive result per
# ME number - TODO confirm in get_wslive_res_init_ppd_info.
wslive_uniq_me_df = get_latest_uniq_wslive(wslive_res_df)


In [20]:
wslive_uniq_me_df.shape

(74294, 37)

Read in entity data

In [21]:
def _read_text_csv(csv_path):
    """Read a comma-delimited file with a header row, keeping every column as text."""
    return pd.read_csv(csv_path, delimiter=",", index_col=None, header=0, dtype=str)


# The two communication tables are narrowed to rows with comm_cat 'P'.
ent_comm_df = _read_text_csv(ent_comm_file).loc[lambda tbl: tbl['comm_cat'] == 'P']
ent_comm_usg_df = _read_text_csv(ent_comm_usg_file).loc[lambda tbl: tbl['comm_cat'] == 'P']

# The remaining tables are loaded unfiltered.
phone_df = _read_text_csv(phone_file)
license_df = _read_text_csv(license_file)
ent_key_df = _read_text_csv(ent_key_file)
fone_zr_df = _read_text_csv(fone_zr_file)


In [22]:
# Run each raw table through its cleaning helper from get_entity_ppd_info;
# each name is rebound to the helper's return value.
# NOTE(review): the helpers are defined outside this notebook, so what each
# one changes is not visible here.
ent_comm_df = clean_ent_comm_data(ent_comm_df)    
phone_df = clean_phn_data(phone_df)
ent_comm_usg_df = clean_ent_usg_data(ent_comm_usg_df)
ent_key_df = create_ent_me_data(ent_key_df)
fone_zr_df = clean_fone_zr_data(fone_zr_df)


Call function to compile data from WSLive, entity tables, and PPD

In [23]:
# Combine the WSLive results, the selected initial-sample and PPD files, and
# the entity tables into one frame. The recorded run gives 56290 rows x 152 columns.
orig_db_data = create_model_initial_data(wslive_uniq_me_df, init_sample_file_lst, 
                                         ppd_file_lst, ent_comm_df, ent_comm_usg_df, 
                                         phone_df, license_df, ent_key_df, fone_zr_df)


Display size of original dataset

In [24]:
orig_db_data.shape

(56290, 152)

In [25]:
orig_db_data.head(1).T

Unnamed: 0,0
ppd_address_type,1
ADDRESS_UNDELIVERABLE_FLAG,
ADDR_STATUS,Updated
BIRTH_CITY,MONROEVILLE
BIRTH_COUNTRY,US1
BIRTH_STATE,AL
ppd_birth_year,1975
BLOCK_GROUP,1
CARRIER_ROUTE,C044
CBSA,33660


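Create new target variable: 1 if the phone number was confirmed, or was marked Updated but the office telephone equals the initial sample telephone; 0 otherwise (updated to a different number, or known bad)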

In [26]:
# A row counts as correct when the phone was Confirmed, or when it was marked
# Updated but the office number equals the number originally sampled.
phone_status = orig_db_data['PHONE_STATUS']
phone_unchanged = orig_db_data['INIT_TELEPHONE_NUMBER'].eq(orig_db_data['OFFICE_TELEPHONE'])
target_one_ndx = phone_status.eq('Confirmed') | (phone_status.eq('Updated') & phone_unchanged)

# Default every row to 0, then flag the correct rows.
orig_db_data[target_name] = 0
orig_db_data.loc[target_one_ndx, target_name] = 1


In [27]:
get_target_dist(orig_db_data, target_name)

Number of correct 0: 23008
Number of correct 1: 33282
Percent of correct with value 0: 0.40874045123467756
Percent of correct with value 1: 0.5912595487653224


In [28]:
# Save the compiled dataset (with the new target column) before type conversion.
orig_db_data_name = '{}{}_{}_OrigDbData.csv'.format(data_save_path, start_time_str, file_base_name)
orig_db_data.to_csv(orig_db_data_name, sep=',', header=True, index=True)


Set Jupyter so that all output is displayed (not abbreviated with ...)

In [29]:
# Raise the pandas/numpy display limits above the dataset size so frames and
# arrays print in full.
num_rows = orig_db_data.shape[0] + 1
num_cols = orig_db_data.shape[1] + 1
# Use the fully qualified option names: the short forms 'max_rows' and
# 'max_columns' also match the styler.render.* options in newer pandas and
# then raise OptionError (pattern matched multiple keys).
pd.set_option('display.max_rows', num_rows)
pd.set_option('display.max_columns', num_cols)
np.set_printoptions(threshold=np.inf)

# ------------------------------- Convert Data ------------------------------------------

Convert dataframe data types to correct ones

In [30]:
# Project helper from process_model_data; the result is bound to a new name.
# NOTE(review): whether orig_db_data is also modified in place is not visible here.
orig_data = convert_data_types(orig_db_data)

# ----------------------------- Create New Model Variables ---------------------------------------

Create new training variables

In [31]:
# Project helper from process_model_data; presumably adds derived columns
# that appear in model_vars - TODO confirm which ones.
orig_data = create_new_phone_vars(orig_data)

In [32]:
get_target_dist(orig_data, target_name)

Number of correct 0: 23008
Number of correct 1: 33282
Percent of correct with value 0: 0.40874045123467756
Percent of correct with value 1: 0.5912595487653224


In [33]:
# Save the converted dataset with its new variables, before NaN cleaning.
orig_data_name = '{}{}_{}_OrigDataB4Clean.csv'.format(data_save_path, start_time_str, file_base_name)
orig_data.to_csv(orig_data_name, sep=',', header=True, index=True)


# ---------------------------- Create and Clean Model Data --------------------------------------

In [34]:
# Keep only the modelling columns listed in model_vars (target plus predictors).
model_data = orig_data.loc[:, model_vars]

In [35]:
model_data.shape

(56290, 30)

Clean model data to remove NaN values

In [41]:
# Project helper; the recorded run drops model_data from 56290 to 50634 rows.
# Presumably the same rows are removed from orig_data so the two frames stay
# index-aligned for the info_vars joins below - TODO confirm.
model_data, orig_data = clean_model_data(model_data, orig_data)

In [42]:
model_data.shape

(50634, 30)

Verify NaN values were removed (should be none in the dataset)

In [43]:
# Per-column NaN counts, narrowed to the columns that still contain NaN.
na_cnt = model_data.isna().sum()
na_ndx = na_cnt.gt(0)
na_cnt_df = na_cnt.loc[na_ndx]

# Both prints are empty when cleaning removed every NaN.
print(na_cnt_df)
print(na_cnt_df.index.values)

Series([], dtype: int64)
[]


Display descriptive statistics for model dataset

In [44]:
model_data.describe(include = "all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
correct,50634,,,,0.590473,0.491751,0.0,0.0,1.0,1.0,1.0
lic_state_match,50634,2.0,1,46254.0,,,,,,,
dpc,50634,2.0,1,50631.0,,,,,,,
res,50634,1.0,0,50634.0,,,,,,,
pcp,50634,2.0,0,28901.0,,,,,,,
yop,50634,4.0,4,42967.0,,,,,,,
doctor_age_yrs,50634,,,,54.9957,10.9459,28.8438,45.8548,54.8603,62.8658,119.904
polo_ind,50634,2.0,1,50360.0,,,,,,,
ppd_address_type,50634,3.0,2,25975.0,,,,,,,
ppd_region,50634,5.0,3,18288.0,,,,,,,


Print number of 0 and 1 values of target variable

In [45]:
get_target_dist(model_data, target_name)


Number of correct 0: 20736
Number of correct 1: 29898
Percent of correct with value 0: 0.40952719516530395
Percent of correct with value 1: 0.5904728048346961


# ---------------------------- Perform Exploratory Analysis --------------------------------------

Generate figures of the distributions of each variable and save off for further analysis

In [46]:
# Project helper from process_model_data; receives the exploratory output
# folder, presumably as the location for the saved figures - TODO confirm.
create_explore_plots(model_data, explore_path)


Generate plots of independent variables and the target correctness (crosstab and scatter plots)

In [47]:
model_columns = model_data.columns.values

# One figure per column: a crosstab bar chart for categorical/object columns,
# a scatter against the target for everything else.
for col_name in model_columns:
    col_data = model_data[col_name]
    # Use an explicit figure/axes so each plot is drawn, saved and closed on
    # its own objects rather than relying on pyplot's "current figure".
    fig, ax = plt.subplots()

    if str(col_data.dtype) in ('category', 'object'):
        target_crosstab = pd.crosstab(col_data, model_data[target_name])
        target_crosstab.plot(kind="bar", ax=ax)
        ax.set_xlabel(col_name + ' by ' + target_name)
        ax.set_ylabel('Count')
        ax.set_title('Crosstab of ' + col_name + ' by ' + target_name)
        # The legend sits outside the axes; bbox_inches='tight' in savefig
        # below keeps it from being cropped out of the saved image.
        ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    else:
        # Plot only the rows where this column has a value.
        has_value = col_data.notnull()
        ax.scatter(x=model_data.loc[has_value, target_name], y=col_data[has_value])
        ax.set_xlabel(target_name + ' value')
        ax.set_ylabel(col_name)
        ax.set_title(col_name + ' by ' + target_name)

    fig_path = explore_path + col_name + '_by_' + target_name + '.png'
    fig.savefig(fig_path, bbox_inches='tight')
    # Close this figure explicitly so open figures do not pile up across the loop.
    plt.close(fig)

# ---------------------------- Create Dummy Variables --------------------------------------

Save off data before dummy variable creation

In [48]:
# Take a real copy: model_data[:] is only a slice of the original frame, and
# columns are assigned onto the result in the next statement.
model_info_df = model_data.copy()
# Attach the informational columns (aligned on index). The four names shared
# with model_vars are overwritten with the orig_data versions.
model_info_df[info_vars] = orig_data[info_vars]


In [49]:
# Save the model data plus informational columns, before dummy encoding.
model_info_name = '{}{}_{}_BeforeDummy.csv'.format(data_save_path, start_time_str, file_base_name)
model_info_df.to_csv(model_info_name, sep=',', header=True, index=True)


In [50]:
# Save the full dataset as returned by the cleaning step.
orig_data_name = '{}{}_{}_CleanedOrigData.csv'.format(data_save_path, start_time_str, file_base_name)
orig_data.to_csv(orig_data_name, sep=',', header=True, index=True)


Create dummy variables for all categorical/object columns

In [51]:
# One-hot encode the non-numeric columns; the recorded run expands the
# 30 model columns to 397.
model_data_all = pd.get_dummies(model_data)
model_data_all.shape

(50634, 397)

In [52]:
# Project helper; judging by its name it strips floating-point suffixes from
# the dummy column names - TODO confirm in remove_col_name_float_pt.
model_data_all = remove_col_name_float_pt(model_data_all)

Create and save pearson correlation matrix of all variables

In [53]:
# Pearson correlation across every encoded column (target included), written
# to the exploratory folder.
corr_file = '{}All_Variable_Correlation.csv'.format(explore_path)
all_var_corr = model_data_all.corr(method="pearson")
all_var_corr.to_csv(corr_file, sep=",")


# ------------------------------ Create Test and Train Data ----------------------------------------

Create target dataset

In [54]:
# Target values as a Series (despite the _df suffix), sharing the index of model_data_all.
target_df = model_data_all[target_name]
target_df.shape

(50634,)

Create training dataset

In [55]:
# Predictor matrix: every encoded column except the target.
train_df = model_data_all.drop(columns=[target_name])
train_df.shape

(50634, 396)

Add informational columns back into data so they will be split accordingly

In [56]:
# Take a real copy: train_df[:] is only a slice of train_df, and train_df is
# later saved on its own as the unsplit data, so it must not be touched here.
train_df_all = train_df.copy()
# Attach the informational columns (aligned on index) so they are split into
# train/test together with the predictors.
train_df_all[info_vars] = orig_data[info_vars]
train_df_all.shape

(50634, 439)

Create random test/train split with testing set size given by variable at the top

In [57]:
# Column order of the combined (predictors + info) frame, used again when the
# split outputs are re-wrapped as dataframes.
train_columns = list(train_df_all.columns.values)

# If testing_size is > 0, split data into testing/training sets, stratified
# on the target so both sets keep the same 0/1 proportions.
if testing_size > 0:
    data_train, data_test, target_train, target_test = train_test_split(
        train_df_all, target_df, test_size=testing_size,
        random_state=rand_state, stratify=target_df)
# Else use entire dataset for both testing and training. This must be
# train_df_all (not train_df): the cells below rebuild the frames with
# train_columns and then pull out info_vars, which train_df does not contain,
# so every informational column would come out as NaN.
else:
    data_train = train_df_all
    data_test = train_df_all
    target_train = target_df
    target_test = target_df
    

Rebuild the split outputs as dataframes (with the target as a one-column dataframe) for saving

In [58]:
# Rebuild the training split as a dataframe with the train_columns column order.
# NOTE(review): the split is fed pandas objects, so data_train is presumably
# already a DataFrame and this only selects/reorders columns - TODO confirm.
data_train_df = pd.DataFrame(data_train, columns = train_columns)
train_indices = data_train_df.index.values
# One-column target frame built on the same index values as the training rows.
target_train_df = pd.DataFrame(target_train, index = train_indices, columns = [target_name])

# Same treatment for the testing split.
data_test_df = pd.DataFrame(data_test, columns = train_columns)
test_indices = data_test_df.index.values
target_test_df = pd.DataFrame(target_test, index = test_indices, columns = [target_name])


In [59]:
data_train_df.shape

(40507, 439)

In [60]:
data_test_df.shape

(10127, 439)

Extract informational columns from training and testing datasets

In [61]:
# Training split: set the informational columns aside, then drop them from
# the predictors.
data_train_info_df = data_train_df[info_vars]
data_train_df = data_train_df.drop(columns=info_vars)

# Testing split: same separation.
data_test_info_df = data_test_df[info_vars]
data_test_df = data_test_df.drop(columns=info_vars)


In [62]:
data_train_df.shape

(40507, 396)

In [63]:
data_test_df.shape

(10127, 396)

Print number of 0s and 1s in training dataset target variable

In [64]:
get_target_dist(target_train_df, target_name)


Number of correct 0: 16589
Number of correct 1: 23918
Percent of correct with value 0: 0.4095341545905646
Percent of correct with value 1: 0.5904658454094354


Print number of 0s and 1s in testing dataset target variable

In [65]:
get_target_dist(target_test_df, target_name)


Number of correct 0: 4147
Number of correct 1: 5980
Percent of correct with value 0: 0.40949935815147626
Percent of correct with value 1: 0.5905006418485238


# -------------------------- Save Testing and Training Data -------------------------------------

Save training data and target to csv files

In [66]:
# Training predictors and training target go to separate files with a shared
# dated prefix.
train_data_name = '{}{}_{}_TrainData.csv'.format(data_save_path, start_time_str, file_base_name)
train_target_name = '{}{}_{}_TrainTarget.csv'.format(data_save_path, start_time_str, file_base_name)
data_train_df.to_csv(train_data_name, sep=',', header=True, index=True)
target_train_df.to_csv(train_target_name, sep=',', header=True, index=True)


Save testing data and target to csv files

In [67]:

# Testing predictors and testing target go to separate files with a shared
# dated prefix.
test_data_name = '{}{}_{}_TestData.csv'.format(data_save_path, start_time_str, file_base_name)
test_target_name = '{}{}_{}_TestTarget.csv'.format(data_save_path, start_time_str, file_base_name)
data_test_df.to_csv(test_data_name, sep=',', header=True, index=True)
target_test_df.to_csv(test_target_name, sep=',', header=True, index=True)


Save information dataframes

In [68]:
# Informational columns for the training and testing rows, one file each.
train_data_info_name = '{}{}_{}_TrainData_Info.csv'.format(data_save_path, start_time_str, file_base_name)
test_data_info_name = '{}{}_{}_TestData_Info.csv'.format(data_save_path, start_time_str, file_base_name)

data_train_info_df.to_csv(train_data_info_name, sep=',', header=True, index=True)
data_test_info_df.to_csv(test_data_info_name, sep=',', header=True, index=True)


Save unsplit data to csv files (to use to duplicate previous modeling)

In [69]:
# Full (unsplit) predictor matrix and target, saved so earlier modelling runs
# can be reproduced.
unsplit_test_data_name = '{}{}_{}_UnsplitData.csv'.format(
    data_save_path, start_time_str, file_base_name)
unsplit_test_target_name = '{}{}_{}_UnsplitTarget.csv'.format(
    data_save_path, start_time_str, file_base_name)
train_df.to_csv(unsplit_test_data_name, sep=',', header=True, index=True)
target_df.to_csv(unsplit_test_target_name, sep=',', header=True, index=True)


Save unsplit data information dataframes

In [70]:
# Informational columns for every row of the unsplit data.
unsplit_test_data_info_name = '{}{}_{}_UnsplitData_Info.csv'.format(
    data_save_path, start_time_str, file_base_name)
orig_data.loc[:, info_vars].to_csv(unsplit_test_data_info_name, sep=',', header=True, index=True)

Report elapsed run time

In [71]:
# Time since current_time was captured (after the file-selection dialogs).
end_time = datetime.datetime.now()
elapsed_time = end_time - current_time
# Convert to a plain number of seconds first. Dividing the timedelta itself
# by 60 yields another timedelta, which printed a clock-style value under the
# "min" and "hrs" labels instead of minutes and hours.
elapsed_sec = elapsed_time.total_seconds()
print('Elapsed time (sec): {}'.format(elapsed_sec))
print('Elapsed time (min): {}'.format(elapsed_sec / 60))
print('Elapsed time (hrs): {}'.format(elapsed_sec / 3600))

Elapsed time (sec): 0:01:50.893240
Elapsed time (min): 0:00:01.848221
Elapsed time (hrs): 0:00:00.030804
