In [None]:
"""
The purpose of this module is to create a dataset of doctor data that mimics what we might expect to find
among our clients' databases. The content can then be used to trial analysis methods and generate example dashboard
outputs to showcase the kind of data analytics we can offer.
"""

In [38]:
import random as rn
import pandas as pd
from pandas import Series, DataFrame

In [34]:
# dataframes / lookup tables
#
# These names are assigned at module (notebook) level, so they are already
# global and the selector functions below can read them directly. A `global`
# statement is not needed here (and placing one after the assignment is
# rejected by current Python versions).

# Number of doctors in each state - input to assumption that all practices in any state have the same ratio
dfDrByState1 = pd.read_csv('DoctorsByState3.csv')
# Same table re-indexed by the 'Place' column so rows can be looked up by label
dfDrByState2 = dfDrByState1.set_index('Place')

# Insurance coverage in each state - input to assumption that all practices in any state have the same ratio
# https://www.kff.org/other/state-indicator/total-population
# determines what patient coverages are for practice based on state data
df2 = pd.read_csv('InsuranceCoverage2017v3.csv')
# Indexed by the 'Location' column so rows can be looked up by label
dfCoverageByState = df2.set_index('Location')

In [39]:
# Functions

# selector - fictitious assumption: Doctor's age is indicative of how likely they are to prescribe Drug X
def DrAge():
    """Draw a random age for one doctor.

    Returns:
        int: a value chosen uniformly from 30 to 70, both ends included.
    """
    youngest, oldest = 30, 70
    return rn.randint(youngest, oldest)

# selector - fictitious assumption: Doctor's gender is indicative of how likely they are to prescribe Drug X
def DrGender():
    """Draw a random gender label for one doctor.

    Returns:
        str: either 'Male' or 'Female', each equally likely.
    """
    options = ['Male', 'Female']
    # choices() always hands back a list, even for a single draw
    (picked,) = rn.choices(options, k=1)
    return picked

# Journal of Medical Regulation Vol. 103 , No. 2 - A Census of Actively Licensed Physicians in the United States, 2016, Young et al
# this uses the number of doctors in state as a weighting for how likely the random doctor will be located in any particular US state
def DrLocationSelector():
    """Pick the location of one random doctor, weighted by physician count.

    Reads the module-level ``dfDrByState2`` table (indexed by 'Place') and
    uses its '#Physicians' column as the relative weight of each place.

    Returns:
        One label taken from the 'Place' index of ``dfDrByState2``.
    """
    # Take labels and weights from the same frame so they cannot drift out of
    # alignment, and pass plain lists so random.choices indexes by position
    # rather than depending on the frame's integer index labels.
    places = dfDrByState2.index.tolist()
    physicianCounts = dfDrByState2['#Physicians'].tolist()
    return rn.choices(places, weights=physicianCounts, k=1)[0]  # choices returns a list

# selector - determines practice type based on US national distribution
# https://www.ama-assn.org/about/research/physician-practice-benchmark-survey
def PracticeTypeSelector():
    """Draw a practice type using fixed relative weights.

    Returns:
        str: one of 'Solo', 'Group - Single Specialty',
        'Group - Multi-specialty' or 'Employed Physician'.
    """
    # relative weight of each practice type (the four values sum to 100)
    typeShares = {
        'Solo': 16,
        'Group - Single Specialty': 47,
        'Group - Multi-specialty': 28,
        'Employed Physician': 9,
    }
    labels = list(typeShares)
    shares = list(typeShares.values())
    return rn.choices(labels, weights=shares, k=1)[0]

# selector - practice speciality
# Guesses and approximations have been used to set what practice types can have what specialties
# Guesses have been used to determine what ratio each specialty type appears in compared to the others
"""
Solo - 'Family', 'Internal', 'Orthopedic', 'General Surgery'
Single Specialty - 'Emergency', 'Family', 'Internal', 'Orthopedic', 'Oncology', 'General Surgery', 'Endocrinology', 'Cardiology', 'Gastroenterology', 'Pulmonology', 'Ambulatory'
Multi-specialty - 'Internal', 'Orthopedic', 'Oncology', 'General Surgery', 'Endocrinology', 'Cardiology', 'Gastroenterology', 'Pulmonology', 'Ambulatory'
Employed Physician - 'Emergency', 'Internal', 'Orthopedic', 'Oncology', 'General Surgery', 'Endocrinology', 'Cardiology', 'Gastroenterology', 'Pulmonology', 'Ambulatory'
"""
def PracticeSpecialtySelector(pracType):
    """Draw a specialty that is allowed for the given practice type.

    Each practice type has its own specialty -> relative-weight table. Any
    practice type that is not 'Solo', 'Group - Single Specialty' or
    'Group - Multi-specialty' uses the employed-physician table.

    Args:
        pracType: practice type label, as returned by PracticeTypeSelector().

    Returns:
        str: the specialty that was drawn.
    """
    soloMix = {'Family': 35, 'Internal': 25, 'Orthopedic': 20, 'General Surgery': 20}
    singleSpecialtyMix = {
        'Emergency': 8, 'Family': 12, 'Internal': 6, 'Orthopedic': 4,
        'Oncology': 4, 'General Surgery': 10, 'Endocrinology': 3,
        'Cardiology': 4, 'Gastroenterology': 5, 'Pulmonology': 4,
        'Ambulatory': 9,
    }
    multiSpecialtyMix = {
        'Internal': 12, 'Orthopedic': 11, 'Oncology': 7,
        'General Surgery': 14, 'Endocrinology': 6, 'Cardiology': 7,
        'Gastroenterology': 4, 'Pulmonology': 6, 'Ambulatory': 11,
    }
    employedMix = {
        'Emergency': 11, 'Internal': 10, 'Orthopedic': 9, 'Oncology': 3,
        'General Surgery': 14, 'Endocrinology': 5, 'Cardiology': 9,
        'Gastroenterology': 7, 'Pulmonology': 7, 'Ambulatory': 10,
    }

    mixByType = {
        'Solo': soloMix,
        'Group - Single Specialty': singleSpecialtyMix,
        'Group - Multi-specialty': multiSpecialtyMix,
    }
    # anything not listed above is treated as an employed physician
    mix = mixByType.get(pracType, employedMix)

    return rn.choices(list(mix), weights=list(mix.values()), k=1)[0]

# fictitious assumption: uses a made-up factor based on practice type and doctor/patient ratio to determine the number of visitors
def PracticeSizeSelector(locn, pracType):
    """Compute a (fictitious) practice size for a location and practice type.

    The result is a per-practice-type multiplier times a per-location figure
    read from the module-level ``dfDrByState2`` table.

    Args:
        locn: row label in ``dfDrByState2`` (a 'Place' value).
        pracType: one of the four practice type labels used as keys below.

    Returns:
        The multiplier for ``pracType`` times the location figure.

    Raises:
        KeyError: if ``pracType`` is not a known practice type or ``locn`` is
            not in the table's index.
    """
    typeFactor = {'Solo': 1, 'Group - Single Specialty': 3, 'Group - Multi-specialty': 6, 'Employed Physician': 18}
    # Third column of the row, selected by position. .iloc is used because a
    # plain [2] on a label-indexed Series relies on pandas' deprecated
    # positional fallback.
    # NOTE(review): presumably the doctor/patient ratio column - confirm
    # against the CSV header.
    locFactor = dfDrByState2.loc[locn].iloc[2]
    return typeFactor[pracType] * locFactor

# https://www.kff.org/other/state-indicator/total-population
# determines what patient coverages are for practice based on state data
# returns medicare%
def CoverageSelector(locn):
    """Look up the insurance-coverage figure for a location.

    Args:
        locn: row label in the module-level ``dfCoverageByState`` table
            (a 'Location' value).

    Returns:
        The value in the fourth column of that location's row.
        NOTE(review): presumably the Medicare share - confirm against the
        CSV header.

    Raises:
        KeyError: if ``locn`` is not in the table's index.
    """
    # Positional column lookup made explicit with .iloc; a plain [3] on a
    # label-indexed Series relies on pandas' deprecated positional fallback.
    return dfCoverageByState.loc[locn].iloc[3]

# selector - fictitious assumption: how many medical society dinners a Dr attends is indicative of how likely they are to prescribe Drug X
def SocietyDinners():
    """Draw a random number of medical society dinners attended.

    Returns:
        int: a value chosen uniformly from 0 to 5, both ends included.
    """
    fewest, most = 0, 5
    return rn.randint(fewest, most)

# selector - fictitious assumption: how many CME events a Dr attends is indicative of how likely they are to prescribe Drug X
def ContinuingMedEd():
    """Draw a random number of CME events attended.

    Returns:
        int: a value chosen uniformly from 0 to 3, both ends included.
    """
    eventRange = (0, 3)
    return rn.randint(*eventRange)

# selector - fictitious assumption: difference in how many prescriptions a Dr writes is a success measure for marketing Drug X
def ScriptsWritten():
    """Draw a random number of prescriptions written.

    Returns:
        int: a value chosen uniformly from 0 to 50, both ends included.
    """
    minScripts = 0
    maxScripts = 50
    scripts = rn.randint(minScripts, maxScripts)
    return scripts

# selector - fictitious assumption: how many times Dr visits web page is indicative of how likely they are to prescribe Drug X
def WebViews():
    """Draw a random number of web page views.

    Returns:
        int: a value chosen uniformly from 0 to 5, both ends included.
    """
    return rn.randint(0, 5)

# selector - fictitious assumption: how many branded e-mails a Dr opens is indicative of how likely they are to prescribe Drug X
def BrandedEMopenRate():
    """Draw a random branded e-mail open rate.

    Returns:
        float: a value from random.random(), i.e. in the range [0.0, 1.0).
    """
    openRate = rn.random()
    return openRate

# selector - fictitious assumption: how many times Dr is contacted by Drug X rep is indicative of how likely they are to prescribe Drug X
def RepTouches():
    """Draw a random number of contacts by a Drug X rep.

    Returns:
        int: a value chosen uniformly from 0 to 3, both ends included.
    """
    fewestTouches, mostTouches = 0, 3
    return rn.randint(fewestTouches, mostTouches)

In [42]:
# dummy table value populator
#
# One list is kept per output column. Each pass through the loop generates the
# values for a single doctor and appends one entry to every list, so all lists
# end up with numDoctors entries in matching order. The lists are merged into
# a DataFrame in a later cell.

numDoctors = 5

doctorAges = []
doctorGenders = []
doctorLocations = []
practiceTypes = []
practiceSpecialties = []
practiceSizes = []
practiceInsCoverage = []
societyDinners = []
cmeCount = []
scriptCount = []
webViewCount = []
emailOpenRates = []
repTouchCounts = []

for dr in range(numDoctors):
    doctorAges.append(DrAge())
    doctorGenders.append(DrGender())

    # location and practice type are drawn first because the specialty,
    # practice size and coverage values below are derived from them
    location = DrLocationSelector()
    doctorLocations.append(location)

    practiceType = PracticeTypeSelector()
    practiceTypes.append(practiceType)

    practiceSpecialty = PracticeSpecialtySelector(practiceType)
    practiceSpecialties.append(practiceSpecialty)

    practiceSize = PracticeSizeSelector(location, practiceType)
    practiceSizes.append(practiceSize)

    practiceInsCoverage.append(CoverageSelector(location))

    # the remaining values are drawn independently of the ones above
    societyDinners.append(SocietyDinners())
    cmeCount.append(ContinuingMedEd())
    scriptCount.append(ScriptsWritten())
    webViewCount.append(WebViews())
    emailOpenRates.append(BrandedEMopenRate())
    repTouchCounts.append(RepTouches())

In [43]:
# Sanity check: show the ages generated by the populator loop.
print(doctorAges)

[64, 35, 41, 34, 55]


In [44]:
# Make an empty DataFrame; its columns are attached one at a time afterwards.
dfDrDataSet1 = pd.DataFrame()

In [45]:
# populate empty DF with values from dummy generator
# (the mapping is walked in the order written, so columns are added in this order)
columnSources = {
    'Age': doctorAges,
    'Gender': doctorGenders,
    'Location': doctorLocations,
    'Prac Type': practiceTypes,
    'Specialty': practiceSpecialties,
    'Prac Size': practiceSizes,
    'Medicare': practiceInsCoverage,
    'Soc Dinners': societyDinners,
    'CMEs': cmeCount,
    'Scripts': scriptCount,
    'Web views': webViewCount,
    'e-m opens': emailOpenRates,
    'Rep touches': repTouchCounts,
}
for columnName, columnValues in columnSources.items():
    dfDrDataSet1[columnName] = columnValues

In [46]:
# Display the assembled dataset (kept as a bare expression so the notebook renders it as the cell output).
dfDrDataSet1

Unnamed: 0,Age,Gender,Location,Prac Type,Specialty,Prac Size,Medicare,Soc Dinners,CMEs,Scripts,Web views,e-m opens,Rep touches
0,64,Female,PA,Group - Multi-specialty,Internal,2646,0.16,3,3,8,3,0.208841,1
1,35,Female,AL,Group - Single Specialty,Family,984,0.16,1,1,27,4,0.730639,1
2,41,Female,MO,Group - Single Specialty,General Surgery,1269,0.16,4,3,42,1,0.281868,2
3,34,Male,MO,Group - Multi-specialty,Oncology,2538,0.16,3,1,27,5,0.253925,3
4,55,Male,KY,Group - Single Specialty,Cardiology,1254,0.15,2,3,42,4,0.345231,2


In [None]:
# order in which to select variables:
# location
# practice type
# practice specialty
# practice size
# Medicare coverage

In [23]:
# Spot check: draw a single location and print it.
location = DrLocationSelector()
print(location)

MN


In [26]:
# Spot check: draw a single practice type and print it.
practiceType = PracticeTypeSelector()
print(practiceType)

Group - Multi-specialty


In [28]:
# Spot check: draw a specialty for the current value of practiceType and print it.
practiceSpecialty = PracticeSpecialtySelector(practiceType)
print(practiceSpecialty)

Ambulatory


In [29]:
# Spot check: compute the practice size for the current location and practiceType and print it.
practiceSize = PracticeSizeSelector(location,practiceType)
print(practiceSize)

2556


In [36]:
# Spot check: look up the coverage figure for the current location and print it.
coverage = CoverageSelector(location)
print(coverage)

0.14
