Import packages required

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import pylab as pl
import datetime
from sklearn.model_selection import train_test_split
from shutil import copyfile


In [3]:
import datetime
import os
import sys
import tkinter as tk
from tkinter import filedialog
import warnings

import settings

In [4]:
from process_model_data import convert_data_types, create_new_addr_vars, clean_model_data 
from process_model_data import create_explore_plots
from create_addr_model_input_data import create_model_initial_data
from get_wslive_ppd_entity_info import get_latest_uniq_wslive


In [6]:
from get_input_date_range import get_input_date_range
import datalabs.curate.dataframe as df
from remove_col_name_float_pt import remove_col_name_float_pt


Suppress warnings that do not affect execution

In [7]:
import sklearn.exceptions
warnings.filterwarnings("ignore")

pd.options.mode.chained_assignment = None


# ------------------------------ Define Common Functions ---------------------------------------

In [8]:
def get_target_dist(data_df, target_var_name):
    num_target_0 = sum(data_df[target_var_name] == 0)
    num_target_1 = sum(data_df[target_var_name] == 1)
    perc_target_0 = num_target_0/(num_target_0 + num_target_1)
    perc_target_1 = num_target_1/(num_target_0 + num_target_1)

    print('Number of {} 0: {}'.format(target_var_name, num_target_0))
    print('Number of {} 1: {}'.format(target_var_name, num_target_1))
    print('Percent of {} with value 0: {}'.format(target_var_name, perc_target_0))
    print('Percent of {} with value 1: {}'.format(target_var_name, perc_target_1))

# ------------------------------ Assign Common Variables ----------------------------------------

Get paths required

In [9]:
# Get files needed
#ddb_info_file = filedialog.askopenfilename(initialdir = "C:\\",
#                                       title = "Choose txt file with database login information...")

init_wslive_dir = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\WSLive\\'
wslive_results_file = filedialog.askopenfilename(initialdir = init_wslive_dir,
                                         title = "Choose wslive file with results encoded...")

init_ppd_dir = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\PPD\\'
ppd_file_lst = filedialog.askopenfilenames(initialdir = init_ppd_dir,
                                        title = \
                                        "Choose the PPD files used to generate the WSLive samples...")

init_save_path = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\Polo_Rank_Model\\'
base_save_path = filedialog.askdirectory(initialdir = init_save_path,
                                         title = "Choose base save directory...")
base_save_path = base_save_path.replace("/", "\\")
base_save_path += "\\"

init_ent_comm_dir = 'C:\\'
ent_comm_file = filedialog.askopenfilename(initialdir = init_ent_comm_dir,
                                         title = \
                                         "Choose the entity_comm_at data csv file...")

ent_comm_usg_file = filedialog.askopenfilename(title = \
                                         "Choose the entity_comm_usg_at data csv file...")

post_addr_file = filedialog.askopenfilename(title = \
                                            "Choose the post_addr_at data csv file...")

license_file = filedialog.askopenfilename(title = \
                                            "Choose the license_lt data csv file...")

ent_key_file = filedialog.askopenfilename(title = \
                                            "Choose the entity_key_et data csv file...")


Define variables needed throughout code

In [10]:
model_vars = ['correct', 'lic_state_match', 'pcp', 'ent_comm_src_cat_code', 'ent_comm_comm_type',
              'addr_age_yrs', 'yop_yrs', 'doctor_age_yrs','ppd_address_type', 'ppd_region', 
              'ppd_division', 'ppd_group', 'ppd_msa_population_size', 'ppd_md_do_code', 
              'ppd_micro_metro_ind', 'ppd_gender', 'ppd_top_cd', 'ppd_pe_cd', 'ppd_prim_spec_cd', 
              'ppd_polo_state', 
              'hist_ent_id_addr_count', 'hist_ent_all_addr_count', 'curr_ent_id_addr_count', 
              'curr_ent_all_addr_count', 'curr_usg_all_addr_count']

info_vars = ['ppd_me', 'ppd_first_name', 'ppd_middle_name', 'ppd_last_name', 'ppd_suffix', 
             'ppd_polo_mailing_line_1', 'ppd_polo_mailing_line_2', 'ppd_polo_city', 
             'ppd_polo_state', 'ppd_polo_zip',
             'ppd_telephone_number', 'ppd_prim_spec_cd', 'ppd_pe_cd', 'ppd_fax_number', 
             'post_addr_line1', 'post_addr_line2', 'post_city_cd', 'post_state_cd',
             'post_zip',
             'INIT_POLO_MAILING_LINE_1', 'INIT_POLO_MAILING_LINE_2', 'INIT_POLO_CITY',
             'INIT_POLO_STATE', 'INIT_POLO_ZIP', 'ADDR_STATUS', 'COMMENTS',
             'ent_comm_comm_type', 'ent_comm_begin_dt', 'ent_comm_end_dt']

target_name = 'correct'
file_base_name = 'PoloRankModel'


rand_state = 45
testing_size = 0.2


Get current time and create string

In [11]:
current_time = datetime.datetime.now()
start_time_str = current_time.strftime("%Y-%m-%d")

Create model output directory based on date

In [12]:
data_path = base_save_path + 'Data\\'

if not os.path.exists(data_path):
    os.mkdir(data_path)
    
data_save_path = data_path + start_time_str + '_Model_Data\\'

if not os.path.exists(data_save_path):
    os.mkdir(data_save_path)


Create exploratory output directory based on date

In [13]:
explore_path = data_save_path + 'Exploratory\\'

if not os.path.exists(explore_path):
    os.mkdir(explore_path)


# ------------------------------- Import Data ------------------------------------------

Get time frame of WSLive data to use

In [14]:
start_date, end_date, date_range_str = get_input_date_range()

Enter starting year (4 digit format): 2019
Enter starting month (numeric value, Jan = 1): 3
Enter starting day (default = 1): 
Enter ending year (4 digit format): 2019
Enter ending month (numeric value, Jan = 1): 8
Enter ending day (default = last day of month): 


Get samples sent to Humach for dates chosen

In [15]:
init_sample_dir = \
    'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\WSLive\\Model_Init_Samples\\'
init_sample_file_lst = filedialog.askopenfilenames(initialdir = init_sample_dir,
                       title = \
                       "Choose the WSLive standard samples sent to Humach for dates chosen...")


Get WSLive data for time frame indicated

In [16]:
# Read in wslive data
wslive_results_df = pd.read_csv(wslive_results_file, delimiter = ",", index_col = None,
                                header = 0, dtype = str)
wslive_results_df = df.rename_in_upper_case(wslive_results_df)


In [17]:
# Get data for date range specified
wslive_results_df['WSLIVE_FILE_DT'] = pd.to_datetime(wslive_results_df['WSLIVE_FILE_DT'])
wslive_date_df = wslive_results_df[(wslive_results_df['WSLIVE_FILE_DT'] >= start_date) & \
                                  (wslive_results_df['WSLIVE_FILE_DT'] <= end_date)]


In [18]:
# Use only Confirmed data
wslive_date_df = df.rename_in_upper_case(wslive_date_df)
wslive_res_df = wslive_date_df[(wslive_date_df['ADDR_STATUS'] == 'Confirmed') | \
                               (wslive_date_df['ADDR_STATUS'] == 'Updated')] 


In [19]:
wslive_uniq_me_df = get_latest_uniq_wslive(wslive_res_df)


In [20]:
wslive_uniq_me_df.shape

(47661, 37)

Read in entity data

In [21]:
ent_comm_df = pd.read_csv(ent_comm_file, delimiter = ",", index_col = None, header = 0, dtype = str)
ent_comm_df = ent_comm_df[ent_comm_df['comm_cat'] == 'A']

ent_comm_usg_df = pd.read_csv(ent_comm_usg_file, delimiter = ",", index_col = None, header = 0, dtype = str)
ent_comm_usg_df = ent_comm_usg_df[ent_comm_usg_df['comm_cat'] == 'A']

post_addr_df = pd.read_csv(post_addr_file, delimiter = ",", index_col = None, header = 0, dtype = str)

license_df = pd.read_csv(license_file, delimiter = ",", index_col = None, header = 0, dtype = str)

ent_key_df = pd.read_csv(ent_key_file, delimiter = ",", index_col = None, header = 0, dtype = str)


In [23]:
ppd_file_lst


('C:/Data-Science/Data/PPD/ppd_data_20190216.csv',
 'C:/Data-Science/Data/PPD/ppd_data_20190322.csv',
 'C:/Data-Science/Data/PPD/ppd_data_20190430.csv',
 'C:/Data-Science/Data/PPD/ppd_data_20190531.csv',
 'C:/Data-Science/Data/PPD/ppd_data_20190629.csv',
 'C:/Data-Science/Data/PPD/ppd_data_20190713.csv')

Call function to compile data from WSLive, entity tables, and PPD

In [24]:
orig_db_data = create_model_initial_data(wslive_uniq_me_df, init_sample_file_lst, 
                                         ppd_file_lst, ent_comm_df, ent_comm_usg_df, 
                                         post_addr_df, license_df, ent_key_df)


Display size of original dataset

In [25]:
orig_db_data.shape

(30328, 152)

In [26]:
orig_db_data.head(1).T

Unnamed: 0,0
ppd_address_type,2
ADDRESS_UNDELIVERABLE_FLAG,
ADDR_STATUS,Updated
BIRTH_CITY,STARKVILLE
BIRTH_COUNTRY,US1
BIRTH_STATE,MS
ppd_birth_year,1970
BLOCK_GROUP,3
CARRIER_ROUTE,R007
CBSA,32820


Create new target variable with 1 if phone number was confirmed and 0 if it was updated or known bad

In [27]:
orig_db_data[target_name] = 0
target_one_ndx = (orig_db_data['ent_addr_key'] == orig_db_data['WSLIVE_ADDR_KEY']) & \
    ((orig_db_data['ADDR_STATUS'] == 'Confirmed') | ((orig_db_data['ADDR_STATUS'] == 'Updated') & \
    (orig_db_data['INIT_SMPL_ADDR_KEY'] == orig_db_data['WSLIVE_ADDR_KEY'])))
orig_db_data.loc[target_one_ndx, target_name] = 1


In [28]:
get_target_dist(orig_db_data, target_name)

Number of correct 0: 8940
Number of correct 1: 21388
Percent of correct with value 0: 0.2947771036665787
Percent of correct with value 1: 0.7052228963334213


In [29]:
orig_db_data_name = data_save_path + start_time_str + '_' + file_base_name + '_OrigDbData.csv'
orig_db_data.to_csv(orig_db_data_name, sep = ',', header = True, index = True)


Set Jupyter so that all output is displayed (not abbreviated with ...)

In [30]:
num_rows = orig_db_data.shape[0] + 1
num_cols = orig_db_data.shape[1] + 1
pd.set_option('max_rows', num_rows)
pd.set_option('max_columns', num_cols)
np.set_printoptions(threshold=np.inf)

# ------------------------------- Convert Data ------------------------------------------

Convert dataframe data types to correct ones

In [31]:
orig_data = convert_data_types(orig_db_data)

# ----------------------------- Create New Model Variables ---------------------------------------

Create new training variables

In [32]:
orig_data = create_new_addr_vars(orig_data)

In [33]:
get_target_dist(orig_data, target_name)

Number of correct 0: 8940
Number of correct 1: 21388
Percent of correct with value 0: 0.2947771036665787
Percent of correct with value 1: 0.7052228963334213


In [34]:
orig_data_name = data_save_path + start_time_str + '_' + file_base_name + '_OrigDataB4Clean.csv'
orig_data.to_csv(orig_data_name, sep = ',', header = True, index = True)


# ---------------------------- Create and Clean Model Data --------------------------------------

In [35]:
model_data = orig_data.loc[:, model_vars]

In [36]:
model_data.shape

(30328, 25)

Clean model data to remove NaN values

In [37]:
model_data, orig_data = clean_model_data(model_data, orig_data)

In [38]:
model_data.shape

(27404, 25)

Verify NaN values were removed (should be none in the dataset)

In [39]:
na_cnt = model_data.isna().sum()
na_ndx = na_cnt > 0
na_cnt_df = na_cnt[na_ndx]
print(na_cnt_df)
print(na_cnt_df.index.values)

Series([], dtype: int64)
[]


Display descriptive statistics for model dataset

In [40]:
model_data.describe(include = "all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
correct,27404,,,,0.705773,0.455703,0.0,0.0,1.0,1.0,1.0
lic_state_match,27404,2.0,1,26080.0,,,,,,,
pcp,27404,2.0,0,16717.0,,,,,,,
ent_comm_src_cat_code,27404,44.0,GROUP,9135.0,,,,,,,
ent_comm_comm_type,27404,3.0,OF,19410.0,,,,,,,
addr_age_yrs,27404,,,,9.20008,12.3851,0.205479,2.68493,7.69041,10.4082,119.83
yop_yrs,27404,,,,29.0397,11.1862,4.75342,20.7644,28.7699,36.7753,68.7973
doctor_age_yrs,27404,,,,56.2682,10.8809,28.7699,47.7836,56.789,64.7945,100.819
ppd_address_type,27404,3.0,1,16423.0,,,,,,,
ppd_region,27404,5.0,3,9815.0,,,,,,,


Print number of 0 and 1 valus of target variable

In [41]:
get_target_dist(model_data, target_name)


Number of correct 0: 8063
Number of correct 1: 19341
Percent of correct with value 0: 0.2942271201284484
Percent of correct with value 1: 0.7057728798715516


# ---------------------------- Perform Exploratory Analysis --------------------------------------

Generate figures of the distributions of each variable and save off for further analysis

In [42]:
create_explore_plots(model_data, explore_path)


Generate plots of independent variables and the target correctness (crosstab and scatter plots)

In [43]:
model_columns = model_data.columns.values

for i in range(len(model_columns)):
    if str(model_data[model_columns[i]].dtypes) == 'category' or str(model_data[model_columns[i]].dtypes) == 'object':
        obs_race_cross = pd.crosstab(model_data[model_columns[i]], model_data[target_name])
        obs_race_cross.plot(kind="bar")
        x_lab = model_columns[i] + ' by ' + target_name
        plt.xlabel(x_lab)
        plt.ylabel('Count')
        title = 'Crosstab of ' + model_columns[i] + ' by ' + target_name
        plt.title(title)
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    else:
        temp_ndx = model_data[model_columns[i]].isnull()
        temp_var = model_data[model_columns[i]][~temp_ndx]
        temp_correct = model_data[target_name][~temp_ndx]
        
        plt.scatter(x=temp_correct, y=temp_var)
        x_title = target_name + ' value'
        plt.xlabel(x_title)
        plt.ylabel(model_columns[i])
        title = model_columns[i] + ' by ' + target_name
        plt.title(title)    

    fig_path = explore_path + model_columns[i] + '_by_' + target_name + '.png'
    plt.savefig(fig_path)
    plt.close()

# ---------------------------- Create Dummy Variables --------------------------------------

Save off data before dummy variable creation

In [44]:
model_info_df = model_data[:]
model_info_df[info_vars] = orig_data[info_vars]


In [45]:
model_info_name = data_save_path + start_time_str + '_' + file_base_name + '_BeforeDummy.csv'
model_info_df.to_csv(model_info_name, sep = ',', header = True, index = True)


In [46]:
orig_data_name = data_save_path + start_time_str + '_' + file_base_name + '_CleanedOrigData.csv'
orig_data.to_csv(orig_data_name, sep = ',', header = True, index = True)


Create dummy variables for all categorical/object columns

In [47]:
model_data_all = pd.get_dummies(model_data)
model_data_all.shape

(27404, 361)

In [48]:
model_data_all = remove_col_name_float_pt(model_data_all)

Create and save pearson correlation matrix of all variables

In [49]:
all_var_corr = model_data_all.corr(method = "pearson")

corr_file = explore_path + 'All_Variable_Correlation.csv'
all_var_corr.to_csv(corr_file, sep=",")


# ------------------------------ Create Test and Train Data ----------------------------------------

Create target dataset

In [50]:
target_df = model_data_all[target_name]
target_df.shape

(27404,)

Create training dataset

In [51]:
train_df = model_data_all[:]
train_df = train_df.drop(target_name, axis = 1) 
train_df.shape

(27404, 360)

Add informational columns back into data so they will be split accordingly

In [52]:
train_df_all = train_df[:]
train_df_all[info_vars] = orig_data[info_vars]
train_df_all.shape

(27404, 389)

Create random test/train split with testing set size given by variable at the top

In [53]:
# If testing_size is > 0, split data into testing/training sets
train_columns = list(train_df_all.columns.values)

if testing_size > 0:
    data_train, data_test, target_train, target_test = \
    train_test_split(train_df_all, target_df, test_size = testing_size, 
                     random_state = rand_state, stratify = target_df)
# Else use entire dataset for both testing and training
else:
    data_train = train_df
    data_test = train_df
    target_train = target_df
    target_test = target_df
    

Convert numpy outputs back into dataframes for saving

In [54]:
data_train_df = pd.DataFrame(data_train, columns = train_columns)
train_indices = data_train_df.index.values
target_train_df = pd.DataFrame(target_train, index = train_indices, columns = [target_name])

data_test_df = pd.DataFrame(data_test, columns = train_columns)
test_indices = data_test_df.index.values
target_test_df = pd.DataFrame(target_test, index = test_indices, columns = [target_name])


In [55]:
data_train_df.shape

(21923, 389)

In [56]:
data_test_df.shape

(5481, 389)

Extract informational columns from training and testing datasets

In [57]:
data_train_info_df = data_train_df.loc[:, info_vars]
data_train_df = data_train_df.drop(info_vars, axis = 1)

data_test_info_df = data_test_df.loc[:, info_vars]
data_test_df = data_test_df.drop(info_vars, axis = 1)


In [58]:
data_train_df.shape

(21923, 360)

In [59]:
data_test_df.shape

(5481, 360)

Print number of 0s and 1s in training dataset target variable

In [60]:
get_target_dist(target_train_df, target_name)


Number of correct 0: 6450
Number of correct 1: 15473
Percent of correct with value 0: 0.29421155863704784
Percent of correct with value 1: 0.7057884413629522


Print number of 0s and 1s in testing dataset target variable

In [61]:
get_target_dist(target_test_df, target_name)


Number of correct 0: 1613
Number of correct 1: 3868
Percent of correct with value 0: 0.2942893632548805
Percent of correct with value 1: 0.7057106367451195


# -------------------------- Save Testing and Training Data -------------------------------------

Save training data and target to csv files

In [62]:
train_data_name = data_save_path + start_time_str + '_' + file_base_name + '_TrainData.csv'
data_train_df.to_csv(train_data_name, sep = ',', header = True, index = True)
train_target_name = data_save_path + start_time_str + '_' + file_base_name + '_TrainTarget.csv'
target_train_df.to_csv(train_target_name, sep = ',', header = True, index = True)


Save testing data and target to csv files

In [63]:

test_data_name = data_save_path + start_time_str + '_' + file_base_name + '_TestData.csv'
data_test_df.to_csv(test_data_name, sep = ',', header = True, index = True)
test_target_name = data_save_path + start_time_str + '_' + file_base_name + '_TestTarget.csv'
target_test_df.to_csv(test_target_name, sep = ',', header = True, index = True)


Save information dataframes

In [64]:
train_data_info_name = data_save_path + start_time_str + '_' + file_base_name + '_TrainData_Info.csv'
data_train_info_df.to_csv(train_data_info_name, sep = ',', header = True, index = True)

test_data_info_name = data_save_path + start_time_str + '_' + file_base_name + '_TestData_Info.csv'
data_test_info_df.to_csv(test_data_info_name, sep = ',', header = True, index = True)


Save unsplit data to csv files (to use to duplicate previous modeling)

In [65]:
unsplit_test_data_name = data_save_path + start_time_str + '_' + file_base_name + '_UnsplitData.csv'
train_df.to_csv(unsplit_test_data_name, sep = ',', header = True, index = True)
unsplit_test_target_name = data_save_path + start_time_str + '_' + file_base_name + \
    '_UnsplitTarget.csv'
target_df.to_csv(unsplit_test_target_name, sep = ',', header = True, index = True)


Save unsplit data information dataframes

In [66]:
unsplit_test_data_info_name = data_save_path + start_time_str + '_' + file_base_name + \
    '_UnsplitData_Info.csv'
orig_data[info_vars].to_csv(unsplit_test_data_info_name, sep = ',', header = True, index = True)

Save results

In [67]:
end_time = datetime.datetime.now()
elapsed_time = end_time - current_time
print('Elapsed time (sec): {}'.format(elapsed_time))
print('Elapsed time (min): {}'.format((elapsed_time / 60)))
print('Elapsed time (hrs): {}'.format(((elapsed_time / 60) / 60)))

Elapsed time (sec): 0:34:57.442066
Elapsed time (min): 0:00:34.957368
Elapsed time (hrs): 0:00:00.582623
