Import packages required

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import pylab as pl
import datetime
from sklearn.model_selection import train_test_split
from shutil import copyfile


In [3]:
import os
import sys
import tkinter as tk
from tkinter import filedialog

import settings
from process_model_data import convert_data_types, create_new_phone_vars, clean_model_data, create_explore_plots
from create_phone_model_input_data import create_model_initial_data



In [4]:
from get_input_date_range import get_input_date_range
import datalabs.curate.dataframe as df


Suppress warnings that do not affect execution

In [5]:
import warnings, sklearn.exceptions
warnings.filterwarnings("ignore")

pd.options.mode.chained_assignment = None


# ------------------------------ Define Common Functions ---------------------------------------

In [6]:
def get_target_dist(data_df, target_var_name):
    num_target_0 = sum(data_df.disconnect == 0)
    num_target_1 = sum(data_df.disconnect == 1)
    perc_target_0 = num_target_0/(num_target_0 + num_target_1)
    perc_target_1 = num_target_1/(num_target_0 + num_target_1)

    print('Number of {} 0: {}'.format(target_var_name, num_target_0))
    print('Number of {} 1: {}'.format(target_var_name, num_target_1))
    print('Percent of {} with value 0: {}'.format(target_var_name, perc_target_0))
    print('Percent of {} with value 1: {}'.format(target_var_name, perc_target_1))

# ------------------------------ Assign Common Variables ----------------------------------------

Get paths required

In [7]:
# Get files needed
ddb_info_file = filedialog.askopenfilename(initialdir = "C:\\",
                                         title = "Choose txt file with database login information...")

init_wslive_dir = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\WSLive\\'
wslive_results_file = filedialog.askopenfilename(initialdir = init_wslive_dir,
                                         title = "Choose wslive file with results encoded...")

init_ppd_dir = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\PPD\\'
ppd_file = filedialog.askopenfilename(initialdir = init_ppd_dir,
                                         title = "Choose PPD file used for the wslive file chosen...")

init_save_path = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\Phone_Disconnect_Model\\'
base_save_path = filedialog.askdirectory(initialdir = init_save_path,
                                         title = "Choose base save directory...")
base_save_path = base_save_path.replace("/", "\\")
base_save_path += "\\"


Define variables needed throughout code

In [106]:
model_vars = ['disconnect', 'lic_state_match', 'pcp', 'ent_comm_src_cat_code', 'phone_age_yrs', 'yop_yrs', 
             'doctor_age_yrs','ppd_address_type', 'ppd_region', 'ppd_division', 'ppd_group', 
             'ppd_msa_population_size', 'ppd_md_do_code', 'ppd_micro_metro_ind', 'ppd_gender', 'ppd_top_cd',  
             'ppd_pe_cd', 'ppd_prim_spec_cd', 'ppd_polo_state', 'num_oldphones']

info_vars = ['ppd_me', 'ppd_first_name', 'ppd_middle_name', 'ppd_last_name', 'ppd_suffix', 
             'ppd_polo_mailing_line_1', 'ppd_polo_mailing_line_2', 'ppd_polo_city', 'ppd_polo_state', 'ppd_polo_zip',
             'ppd_telephone_number', 'ppd_prim_spec_cd', 'ppd_pe_cd', 'ppd_fax_number', 
             'PHONE_STATUS', 'COMMENTS']

target_name = 'disconnect'
file_base_name = 'PhoneDisconnectModel'


rand_state = 45
testing_size = 0.2


Get current time and create string

In [9]:
current_time = datetime.datetime.now()
start_time_str = current_time.strftime("%Y-%m-%d")

Create model output directory based on date

In [10]:
data_path = base_save_path + 'Data\\'

if not os.path.exists(data_path):
    os.mkdir(data_path)
    
data_save_path = data_path + start_time_str + '_Model_Data\\'

if not os.path.exists(data_save_path):
    os.mkdir(data_save_path)


Create exploratory output directory based on date

In [11]:
explore_path = data_save_path + 'Exploratory\\'

if not os.path.exists(explore_path):
    os.mkdir(explore_path)


# ------------------------------- Import Data ------------------------------------------

Get time frame of WSLive data to use

In [12]:
start_date, end_date, date_range_str = get_input_date_range()

Enter starting year (4 digit format): 2019
Enter starting month (numeric value, Jan = 1): 6
Enter starting day (default = 1): 
Enter ending year (4 digit format): 2019
Enter ending month (numeric value, Jan = 1): 7
Enter ending day (default = last day of month): 25


Get WSLive data for time frame indicated

In [13]:
# Read in wslive data
wslive_results_df = pd.read_csv(wslive_results_file, delimiter = ",", index_col = None, header = 0, dtype = str)
wslive_results_df = df.rename_in_upper_case(wslive_results_df)


In [14]:
# Get data for date range specified
wslive_results_df['WSLIVE_FILE_DT'] = pd.to_datetime(wslive_results_df['WSLIVE_FILE_DT'])
wslive_date_df = wslive_results_df[(wslive_results_df['WSLIVE_FILE_DT'] >= start_date) & \
                                  (wslive_results_df['WSLIVE_FILE_DT'] <= end_date)]


In [15]:
# If source type for MF is a list so need to treat differently
source_code = ['C', 'Z']
wslive_source_df = wslive_date_df[wslive_date_df['SOURCE'].isin(source_code)]


In [16]:
wslive_uniq_me_df = wslive_source_df.sort_values('WSLIVE_FILE_DT', 
                        ascending = False).groupby('PHYSICIAN_ME_NUMBER').first().reset_index()


In [17]:
wslive_discnx_df = wslive_uniq_me_df[wslive_uniq_me_df['COMMENTS'] == 'NOT IN SERVICE']
wslive_discnx_df.shape

(1978, 37)

In [18]:
wslive_uniq_me_df.shape

(56274, 37)

In [19]:
wslive_source_df.shape

(56983, 37)

Read in ppd data

In [20]:
ppd_df = pd.read_csv(ppd_file, delimiter = ",", index_col = None, header = 0, dtype = str)


Get database login information

Call function to pull required information from databases

In [63]:
orig_db_data = create_model_initial_data(wslive_uniq_me_df, ppd_df, 
                                                         ddb_login_dict['AIMS']['username'], 
                                                         ddb_login_dict['AIMS']['password'])


In [64]:
orig_db_data.shape

(53379, 115)

In [65]:
orig_unclean_data = orig_db_data[(orig_db_data['PHONE_STATUS'] == 'Confirmed') | (orig_db_data['PHONE_STATUS'] == 'Updated') | \
                        (orig_db_data['PHONE_STATUS'] == 'Known Bad')]

In [66]:
orig_unclean_data.shape

(27578, 115)

In [67]:
orig_discnx_df = orig_unclean_data[orig_unclean_data['COMMENTS'] == 'NOT IN SERVICE']
orig_discnx_df.shape

(1875, 115)

Set Jupyter so that all output is displayed (not abbreviated with ...)

In [68]:
num_rows = orig_unclean_data.shape[0] + 1
num_cols = orig_unclean_data.shape[1] + 1
pd.set_option('max_rows', num_rows)
pd.set_option('max_columns', num_cols)
np.set_printoptions(threshold=np.inf)

Display size of original dataset

In [69]:
orig_unclean_data.shape

(27578, 115)

# ------------------------------- Convert Data ------------------------------------------

Convert dataframe data types to correct ones

In [70]:
orig_data = convert_data_types(orig_unclean_data)

# ----------------------------- Create New Model Variables ---------------------------------------

Create new training variables

In [71]:
orig_data = create_new_phone_vars(orig_data)

Create new target variable with 1 if phone number was confirmed and 0 if it was updated or known bad

In [72]:
disconnect = np.zeros((orig_data.shape[0], 1))
disconnect_ndx = orig_data['COMMENTS'] == 'NOT IN SERVICE'
disconnect[disconnect_ndx] = 1
orig_data[target_name] = disconnect


# ---------------------------- Create and Clean Model Data --------------------------------------

In [107]:
model_data = orig_data.loc[:, model_vars]

In [108]:
model_data.shape

(22950, 20)

Clean model data to remove NaN values

In [109]:
model_data, orig_data = clean_model_data(model_data, orig_data)

In [110]:
model_data.shape

(22950, 20)

Verify NaN values were removed (should be none in the dataset)

In [111]:
na_cnt = model_data.isna().sum()
na_ndx = na_cnt > 0
na_cnt_df = na_cnt[na_ndx]
print(na_cnt_df)
print(na_cnt_df.index.values)

Series([], dtype: int64)
[]


Display descriptive statistics for model dataset

In [112]:
model_data.describe(include = "all").T

Unnamed: 0,count,unique,top,freq,mean,std,min,25%,50%,75%,max
disconnect,22950,,,,0.0687146,0.252974,0.0,0.0,0.0,0.0,1.0
lic_state_match,22950,,,,0.969891,0.170891,0.0,1.0,1.0,1.0,1.0
pcp,22950,,,,0.358344,0.479524,0.0,0.0,0.0,1.0,1.0
ent_comm_src_cat_code,22950,32.0,PHNSURV,5367.0,,,,,,,
phone_age_yrs,22950,,,,7.0647,6.89084,0.0109589,1.11233,4.8,11.9349,29.063
yop_yrs,22950,,,,27.7377,10.9648,4.69863,18.7068,27.7151,35.7205,72.7452
doctor_age_yrs,22950,,,,55.0887,10.727,29.7151,46.726,54.7315,62.737,119.775
ppd_address_type,22950,3.0,1,11534.0,,,,,,,
ppd_region,22950,5.0,3,8105.0,,,,,,,
ppd_division,22950,10.0,5,4339.0,,,,,,,


Print number of 0 and 1 valus of target variable

In [113]:
get_target_dist(model_data, target_name)


Number of disconnect 0: 21373
Number of disconnect 1: 1577
Percent of disconnect with value 0: 0.931285403050109
Percent of disconnect with value 1: 0.06871459694989107


# ---------------------------- Perform Exploratory Analysis --------------------------------------

Generate figures of the distributions of each variable and save off for further analysis

In [114]:
create_explore_plots(model_data, explore_path)


Generate plots of independent variables and the target correctness (crosstab and scatter plots)

In [115]:
model_columns = model_data.columns.values

for i in range(len(model_columns)):
    if str(model_data[model_columns[i]].dtypes) == 'category' or str(model_data[model_columns[i]].dtypes) == 'object':
        obs_race_cross = pd.crosstab(model_data[model_columns[i]], model_data[target_name])
        obs_race_cross.plot(kind="bar")
        x_lab = model_columns[i] + ' by ' + target_name
        plt.xlabel(x_lab)
        plt.ylabel('Count')
        title = 'Crosstab of ' + model_columns[i] + ' by ' + target_name
        plt.title(title)
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    else:
        temp_ndx = model_data[model_columns[i]].isnull()
        temp_var = model_data[model_columns[i]][~temp_ndx]
        temp_correct = model_data[target_name][~temp_ndx]
        
        plt.scatter(x=temp_correct, y=temp_var)
        x_title = target_name + ' value'
        plt.xlabel(x_title)
        plt.ylabel(model_columns[i])
        title = model_columns[i] + ' by ' + target_name
        plt.title(title)    

    fig_path = explore_path + model_columns[i] + '_by_' + target_name + '.png'
    plt.savefig(fig_path)
    plt.close()

# ---------------------------- Create Dummy Variables --------------------------------------

Save off data before dummy variable creation

In [116]:
model_info_df = model_data[:]
model_info_df[info_vars] = orig_data[info_vars]


In [117]:
model_info_name = data_save_path + start_time_str + '_' + file_base_name + '_BeforeDummy.csv'
model_info_df.to_csv(model_info_name, sep = ',', header = True, index = True)


Create dummy variables for all categorical/object columns

In [118]:
model_data_all = pd.get_dummies(model_data)
model_data_all.shape

(22950, 342)

Create and save pearson correlation matrix of all variables

In [119]:
all_var_corr = model_data_all.corr(method = "pearson")

corr_file = explore_path + 'All_Variable_Correlation.csv'
all_var_corr.to_csv(corr_file, sep=",")


# ------------------------------ Create Test and Train Data ----------------------------------------

Create target dataset

In [120]:
target_df = model_data_all[target_name]
target_df.shape

(22950,)

Create training dataset

In [121]:
train_df = model_data_all[:]
train_df = train_df.drop(target_name, axis = 1) 
train_df.shape

(22950, 341)

Add informational columns back into data so they will be split accordingly

In [122]:
train_df_all = train_df[:]
train_df_all[info_vars] = orig_data[info_vars]
train_df_all.shape

(22950, 357)

Create random test/train split with testing set size given by variable at the top

In [123]:
# If testing_size is > 0, split data into testing/training sets
train_columns = list(train_df_all.columns.values)

if testing_size > 0:
    data_train, data_test, target_train, target_test = train_test_split(train_df_all, target_df, test_size = testing_size, 
                                                                        random_state = rand_state, stratify = target_df)
# Else use entire dataset for both testing and training
else:
    data_train = train_df
    data_test = train_df
    target_train = target_df
    target_test = target_df
    

Convert numpy outputs back into dataframes for saving

In [124]:
data_train_df = pd.DataFrame(data_train, columns = train_columns)
train_indices = data_train_df.index.values
target_train_df = pd.DataFrame(target_train, index = train_indices, columns = [target_name])

data_test_df = pd.DataFrame(data_test, columns = train_columns)
test_indices = data_test_df.index.values
target_test_df = pd.DataFrame(target_test, index = test_indices, columns = [target_name])


In [125]:
data_train_df.shape

(18360, 357)

In [126]:
data_test_df.shape

(4590, 357)

Extract informational columns from training and testing datasets

In [127]:
data_train_info_df = data_train_df.loc[:, info_vars]
data_train_df = data_train_df.drop(info_vars, axis = 1)

data_test_info_df = data_test_df.loc[:, info_vars]
data_test_df = data_test_df.drop(info_vars, axis = 1)


In [128]:
data_train_df.shape

(18360, 341)

In [129]:
data_test_df.shape

(4590, 341)

Print number of 0s and 1s in training dataset target variable

In [130]:
get_target_dist(target_train_df, target_name)


Number of disconnect 0: 17098
Number of disconnect 1: 1262
Percent of disconnect with value 0: 0.9312636165577342
Percent of disconnect with value 1: 0.06873638344226579


Print number of 0s and 1s in testing dataset target variable

In [131]:
get_target_dist(target_test_df, target_name)


Number of disconnect 0: 4275
Number of disconnect 1: 315
Percent of disconnect with value 0: 0.9313725490196079
Percent of disconnect with value 1: 0.06862745098039216


# -------------------------- Save Testing and Training Data -------------------------------------

Save training data and target to csv files

In [132]:
train_data_name = data_save_path + start_time_str + '_' + file_base_name + '_TrainData.csv'
data_train_df.to_csv(train_data_name, sep = ',', header = True, index = True)
train_target_name = data_save_path + start_time_str + '_' + file_base_name + '_TrainTarget.csv'
target_train_df.to_csv(train_target_name, sep = ',', header = True, index = True)


Save testing data and target to csv files

In [133]:

test_data_name = data_save_path + start_time_str + '_' + file_base_name + '_TestData.csv'
data_test_df.to_csv(test_data_name, sep = ',', header = True, index = True)
test_target_name = data_save_path + start_time_str + '_' + file_base_name + '_TestTarget.csv'
target_test_df.to_csv(test_target_name, sep = ',', header = True, index = True)


Save information dataframes

In [134]:
train_data_info_name = data_save_path + start_time_str + '_' + file_base_name + '_TrainData_Info.csv'
data_train_info_df.to_csv(train_data_info_name, sep = ',', header = True, index = True)

test_data_info_name = data_save_path + start_time_str + '_' + file_base_name + '_TestData_Info.csv'
data_test_info_df.to_csv(test_data_info_name, sep = ',', header = True, index = True)


Save unsplit data to csv files (to use to duplicate previous modeling)

In [135]:
unsplit_test_data_name = data_save_path + start_time_str + '_' + file_base_name + '_UnsplitData.csv'
train_df.to_csv(unsplit_test_data_name, sep = ',', header = True, index = True)
unsplit_test_target_name = data_save_path + start_time_str + '_' + file_base_name + '_UnsplitTarget.csv'
target_df.to_csv(unsplit_test_target_name, sep = ',', header = True, index = True)


Save unsplit data information dataframes

In [136]:
unsplit_test_data_info_name = data_save_path + start_time_str + '_' + file_base_name + '_UnsplitData_Info.csv'
orig_data[info_vars].to_csv(unsplit_test_data_info_name, sep = ',', header = True, index = True)

Save results

In [137]:
end_time = datetime.datetime.now()
elapsed_time = end_time - current_time
print('Elapsed time (sec): {}'.format(elapsed_time))
print('Elapsed time (min): {}'.format((elapsed_time / 60)))
print('Elapsed time (hrs): {}'.format(((elapsed_time / 60) / 60)))

Elapsed time (sec): 1:38:32.131017
Elapsed time (min): 0:01:38.535517
Elapsed time (hrs): 0:00:01.642259
