<h1 align=center>Accelerated Data Science Workflows with RAPIDS</h1>
<h2 align=center>No Show Predictive Model Data Generator</h2>

Generates a synthetic dataset for a model that predicts whether a patient is likely to either NO SHOW or be significantly late for a doctor's appointment.

## Imports

In [1]:
import sys
import numpy as np
import scipy.stats as stats
import pandas as pd

import cudf



import dask
import numba

from timeit import default_timer

class Timer(object):
    """Wall-clock timer that can be driven manually or used as a context manager.

    After ``stop()`` (or on leaving a ``with`` block) the elapsed time in
    seconds is available as ``interval`` and the stop timestamp as ``end``.
    The same instance can be started and stopped repeatedly.

    Examples:
        >>> big_num = 100000
        >>> t = Timer()
        >>> t.start()
        >>> for i in range(big_num):
        ...     r = 1
        >>> t.stop()
        >>> print(t.interval)
        0.0946876304844
        >>> with Timer() as t:
        ...     for i in range(big_num):
        ...         r = 1
        >>> print(t.interval)
        0.0766928562442
        >>> try:
        ...     with Timer() as t:
        ...         for i in range(big_num):
        ...             r = 1
        ...             raise(Exception("Get out!"))
        ... finally:
        ...     print(t.interval)
        0.0757778924471

    """
    def __init__(self):
        self._timer = default_timer
        # Timestamp of the most recent start(); None until the timer is started.
        self._start = None

    def __enter__(self):
        self.start()
        return self

    def __exit__(self, *args):
        # Always record the interval, even when the block raised.
        self.stop()

    def start(self):
        """Start (or restart) the timer."""
        # Stored under a private name: assigning to ``self.start`` would
        # replace this method on the instance and break any later start().
        self._start = self._timer()

    def stop(self):
        """Stop the timer. Calculate the interval in seconds.

        Raises:
            RuntimeError: if the timer was never started.
        """
        if self._start is None:
            raise RuntimeError("Timer.stop() called before Timer.start()")
        self.end = self._timer()
        self.interval = self.end - self._start

        

# Label Encoder Class
        
import cudf
import nvcategory

from librmm_cffi import librmm
import numpy as np


def _enforce_str(y: cudf.Series) -> cudf.Series:
    """Return ``y`` unchanged when its dtype is "object", otherwise cast it to "str"."""
    is_object_dtype = y.dtype == "object"
    return y if is_object_dtype else y.astype("str")


class Base(object):
    """Minimal base class that records whether ``fit`` has been run."""

    def __init__(self, *args, **kwargs):
        # Extra positional/keyword arguments are accepted and ignored here.
        self._fitted = False

    def check_is_fitted(self):
        """Raise ``TypeError`` unless the ``_fitted`` flag is set."""
        if self._fitted:
            return
        raise TypeError("Model must first be .fit()")


import cudf
import nvcategory

from librmm_cffi import librmm
import numpy as np


def _enforce_str(y: cudf.Series) -> cudf.Series:
    """Cast ``y`` to "str" unless its dtype is already "object"."""
    # NOTE(review): a function with this name is also defined earlier in this
    # cell; this later definition is the one left bound after the cell runs.
    if y.dtype == "object":
        return y
    return y.astype("str")


class Base(object):
    """Tracks a ``_fitted`` flag for estimator-style subclasses."""
    # NOTE(review): a class with this name is also defined earlier in this
    # cell; this later definition is the one left bound after the cell runs.

    def __init__(self, *args, **kwargs):
        # Arguments are accepted for subclass convenience but not used here.
        self._fitted = False

    def check_is_fitted(self):
        """Raise ``TypeError`` when the instance has not been marked as fitted."""
        not_fitted = not self._fitted
        if not_fitted:
            raise TypeError("Model must first be .fit()")


class LabelEncoder(Base):
    """Encode a column as integer category codes using nvcategory.

    ``fit`` builds an nvcategory from the (string-cast) input and keeps it in
    ``_cats``; ``transform`` re-encodes new data against the fitted keys.
    """

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # nvcategory built by fit()/fit_transform(); None until then.
        self._cats: nvcategory.nvcategory = None
        # dtype of the column passed to fit()/fit_transform(), before string casting.
        self._dtype = None

    def fit(self, y: cudf.Series) -> "LabelEncoder":
        """Learn the categories of ``y`` and return ``self``."""
        self._dtype = y.dtype
        y = _enforce_str(y)

        self._cats = nvcategory.from_strings(y.data)
        self._fitted = True
        return self

    def transform(self, y: cudf.Series) -> cudf.Series:
        """Encode ``y`` using the keys learned by ``fit``.

        Raises:
            TypeError: if the encoder has not been fitted.
            KeyError: if ``y`` contains a value that was not seen during fit.
        """
        self.check_is_fitted()
        y = _enforce_str(y)
        encoded = cudf.Series(
            nvcategory.from_strings(y.data)
            .set_keys(self._cats.keys())
            .values()
        )
        # The code -1 is what this method treats as "unseen key".
        # NOTE(review): with pandas-style Series semantics, `-1 in encoded`
        # tests the index labels rather than the values, so it would never
        # fire; confirm for the cudf version in use. Checking the minimum is
        # value-based either way: assuming -1 is the only negative code, the
        # minimum is -1 exactly when at least one value was unseen.
        if len(encoded) > 0 and encoded.min() == -1:
            raise KeyError("Attempted to encode unseen key")
        return encoded

    def fit_transform(self, y: cudf.Series) -> cudf.Series:
        """Learn the categories of ``y`` and return its encoded values."""
        self._dtype = y.dtype
        y = _enforce_str(y)
        self._cats = nvcategory.from_strings(y.data)
        self._fitted = True
        # Pre-allocate an int32 device array with one slot per input string and
        # have nvcategory write the codes straight into it via its device pointer.
        arr: librmm.device_array = librmm.device_array(
            y.data.size(), dtype=np.int32
        )
        self._cats.values(devptr=arr.device_ctypes_pointer.value)
        return cudf.Series(arr)

    def inverse_transform(self, y: cudf.Series):
        """Not implemented; always raises ``NotImplementedError``."""
        raise NotImplementedError
        
# Given a cudf string column, builds an nvcategory from its string data
def get_unique_strings(ds):
    """Return ``nvcategory.from_strings(ds.data)``.

    NOTE(review): despite the name, the category object itself is returned,
    not a list of strings; presumably callers read the unique values from it
    (e.g. via ``keys()``) - confirm against callers.
    """
    return nvcategory.from_strings(ds.data)
        

## Function and Variable Definitions

In [2]:
# Parameters that control the size of this synthetic dataset
num_departments = 12
num_providers_per_department = 10
num_appts_per_day = 20
num_working_days_year = 50 * 5
num_lookback_years = 3

# One sample per appointment slot over the whole look-back window.
num_samples = (
    num_departments
    * num_providers_per_department
    * num_appts_per_day
    * num_working_days_year
    * num_lookback_years
)
print(num_samples / 1E6, 'million doctor visits')

1.8 million doctor visits


In [3]:
# Base feature names; later cells append the appointment-reason and
# health-issue indicator names to this same list.
features = [
    # patient and visit attributes
    'AGE',
    'GENDER',
    'INSURANCE',
    'VISIT_TYPE',
    'DEPT_SPECIALTY',
    'DEPT_ID',
    # historical rates
    'NO_SHOW_RATE',
    'LATE_START_RATE',
    # appointment timing
    'APPT_WEEKDAY',
    'APPT_TIME',
    'DAY',
    'MONTH',
    # location and weather
    'ZIPCODE',
    'DISTANCE_FROM_CLINIC',
    'PREDICTED_ADVERSE_WEATHER',
    'ACTUAL_ADVERSE_WEATHER',
]

In [4]:
# Gender labels keyed by 1-based integer code.
gender_dict = dict(enumerate(('MALE', 'FEMALE', 'OTHER', 'UNKNOWN'), start=1))
genders = [*gender_dict.values()]

In [5]:
# Insurance categories keyed by 1-based integer code.
insurance_dict = dict(enumerate(
    ('MEDICARE', 'MEDICAID', 'EMPLOYER', 'PRIVATE', 'OTHER'),
    start=1,
))

insurances = [*insurance_dict.values()]

In [6]:
# Visit types keyed by 1-based integer code.
visit_type_dict = dict(enumerate(
    (
        'Office_Visit',
        'MRI',
        'CT',
        'Physical_Exam',
        'Flu_Clinic',
        'OP_Procedure',
        'PT',
        'OTHER',
    ),
    start=1,
))
visit_types = [*visit_type_dict.values()]

In [7]:
# Department specialties keyed by 1-based integer code.
# NOTE(review): 'PYSCHIATRY' looks like a misspelling of PSYCHIATRY; it is kept
# as-is because the string ends up in the generated data.
dept_specialty_dict = dict(enumerate(
    (
        'PRIMARY_CARE',
        'RADIOLOGY',
        'ONCOLOGY',
        'PEDIATRICS',
        'CARDIOLOGY',
        'NEUROLOGY',
        'URGENT_CARE',
        'GI',
        'UROLOGY',
        'DERMATOLOGY',
        'PULMONOLOGY',
        'ENDOCRINOLOGY',
        'PYSCHIATRY',
        'OBGYN',
        'ORTHO',
        'INTERNAL_MEDICINE',
        'PT',
        'OTHER',
    ),
    start=1,
))
dept_specialties = [*dept_specialty_dict.values()]

In [8]:
# Department ids run from 1 through num_departments inclusive.
departments = list(range(1, num_departments + 1))

In [9]:
# Working-day labels keyed by 1-based integer code.
weekday_dict = dict(enumerate(('MON', 'TUE', 'WED', 'THU', 'FRI'), start=1))
weekdays = [*weekday_dict.values()]

In [10]:
# Appointment slots from 8.0 to 18.0 in steps of 0.5.
appt_times = [slot / 100 for slot in range(800, 1850, 50)]
# Days are capped at 28 so every (day, month) pair is a valid date.
days = list(range(1, 29))
months = list(range(1, 13))
zipcodes = list(range(90001, 96162))  # Roughly maps to CA

In [11]:
# Top Appointment Reasons
# Appointment reasons keyed by 1-based integer code.
appt_reason_dict = dict(enumerate(
    (
        'Skin',
        'Joint',
        'Back',
        'Cholesterol',
        'Respiratory',
        'Mental_Health',
        'Neurologic',
        'BP',
        'Headache',
        'Diabetes',
        'Other',
    ),
    start=1,
))
appt_reasons = [*appt_reason_dict.values()]
# One indicator feature per reason, e.g. APPT_REASON_Skin.
appt_reason_features = [f'APPT_REASON_{reason}' for reason in appt_reasons]
# Appends in place; re-running this cell appends the names again.
features.extend(appt_reason_features)

In [12]:
# Top Health Issues
# Health issues keyed by 1-based integer code.
health_issue_dict = dict(enumerate(
    (
        'Heart_Disease',
        'Cancer',
        'Stroke',
        'Respiratory_Disease',
        'Injuries',
        'Diabetes',
        'Alzheimers',
        'Pneumonia',
        'Kidney_Disease',
        'Mental_Health',
        'Pregnancy',
        'Other',
    ),
    start=1,
))
health_issues = [*health_issue_dict.values()]
# One indicator feature per issue, e.g. HEALTH_ISSUE_Cancer.
health_issue_features = [f'HEALTH_ISSUE_{issue}' for issue in health_issues]

# Appends in place; re-running this cell appends the names again.
features.extend(health_issue_features)

In [13]:
def generate_features(seed=None):
    """Build the synthetic appointment features and the per-zipcode table.

    Reads the module-level settings defined in the cells above
    (``num_samples``, ``gender_dict``, ``insurances``, ``visit_types``,
    ``dept_specialties``, ``departments``, ``weekdays``, ``appt_times``,
    ``days``, ``months``, ``zipcodes``).

    Parameters
    ----------
    seed : int, optional
        When given, NumPy's global random state is seeded first so repeated
        calls produce the same draws. The default (None) leaves the random
        state untouched, as before.

    Returns
    -------
    (gdf, zipcode_gdf)
        ``gdf`` is a cudf DataFrame with one row per appointment, including
        the helper columns GENDER_CODE and NOISE. ``zipcode_gdf`` is a cudf
        DataFrame with one row per zipcode and an ACCESS_TO_TRANSPORTATION flag.
    """
    if seed is not None:
        np.random.seed(seed)

    gdf = cudf.DataFrame()
    binary_choice = [1, 0]
    gdf['AGE'] = np.random.randint(1,100, size=(num_samples,))

    # Draw one position per row and read both the label and the integer code
    # from gender_dict, so GENDER_CODE always agrees with gender_dict
    # (1=MALE, 2=FEMALE, ...). The kernels compare against those codes; codes
    # from a label encoder are not guaranteed to follow that mapping.
    gender_codes = np.array(list(gender_dict.keys()))
    gender_names = np.array(list(gender_dict.values()))
    gender_idx = np.random.choice(len(gender_codes), size=num_samples, p=[0.45, 0.45, 0.05, 0.05])
    gdf['GENDER'] = gender_names[gender_idx]

    gdf['INSURANCE'] = np.random.choice(insurances, size=num_samples, p=[0.15, 0.15, 0.50, 0.15, 0.05])
    gdf['VISIT_TYPE'] = np.random.choice(visit_types, size=num_samples, p=[0.45, 0.05, 0.05, 0.05, 0.2, 0.1, 0.05, 0.05])
    gdf['DEPT_SPECIALTY'] = np.random.choice(dept_specialties, size=num_samples)
    gdf['DEPT_ID'] = np.random.choice(departments, size=num_samples)
    gdf['APPT_WEEKDAY'] = np.random.choice(weekdays, size=num_samples)
    gdf['APPT_TIME'] = np.random.choice(appt_times, size=num_samples)
    gdf['DAY'] = np.random.choice(days, size=num_samples)
    gdf['MONTH'] = np.random.choice(months, size=num_samples)

    # Truncated normal distribution on [0, 20] for distance from clinic.
    # truncnorm takes its bounds in standard-deviation units relative to loc.
    lower, upper = 0, 20
    mu, sigma = 3, 3
    X = stats.truncnorm((lower - mu) / sigma, (upper - mu) / sigma, loc=mu, scale=sigma)
    gdf['DISTANCE_FROM_CLINIC'] = X.rvs(num_samples)
    gdf['DISTANCE_FROM_CLINIC'] = gdf['DISTANCE_FROM_CLINIC'].astype('int8')

    gdf['PREDICTED_ADVERSE_WEATHER'] = np.random.choice(binary_choice, size=num_samples, p=[0.1, 0.9])
    gdf['ACTUAL_ADVERSE_WEATHER'] = np.random.choice(binary_choice, size=num_samples, p=[0.1, 0.9])

    # Generate some socio-economic features in a separate zipcode dataframe
    gdf['ZIPCODE'] = np.random.choice(zipcodes, size=num_samples)
    zipcode_gdf = cudf.DataFrame()
    zipcode_gdf['ZIPCODE'] = zipcodes
    zipcode_gdf['ACCESS_TO_TRANSPORTATION'] = np.random.choice(binary_choice, size=len(zipcodes), p=[0.9, 0.1])

    # Multiple-response categoricals: each reason/issue is an independent 0/1
    # column. Each entry is (column name, [P(1), P(0)]); the list order is the
    # column order in the frame.
    multi_response_probs = [
        ('APPT_REASON_Skin', [0.43, 0.57]),
        ('APPT_REASON_Joint', [0.34, 0.66]),
        ('APPT_REASON_Back', [0.24, 0.76]),
        ('APPT_REASON_Cholesterol', [0.22, 0.78]),
        ('APPT_REASON_Respiratory', [0.22, 0.78]),
        ('APPT_REASON_Mental_Health', [0.1, 0.9]),
        ('APPT_REASON_Neurologic', [0.1, 0.9]),
        ('APPT_REASON_BP', [0.1, 0.9]),
        ('APPT_REASON_Headache', [0.1, 0.9]),
        ('APPT_REASON_Diabetes', [0.1, 0.9]),
        ('APPT_REASON_Other', [0.3, 0.7]),
        ('HEALTH_ISSUE_Heart_Disease', [0.2, 0.8]),
        ('HEALTH_ISSUE_Cancer', [0.1, 0.9]),
        ('HEALTH_ISSUE_Stroke', [0.05, 0.95]),
        ('HEALTH_ISSUE_Respiratory_Disease', [0.1, 0.9]),
        ('HEALTH_ISSUE_Injuries', [0.2, 0.8]),
        ('HEALTH_ISSUE_Diabetes', [0.2, 0.8]),
        ('HEALTH_ISSUE_Alzheimers', [0.1, 0.9]),
        ('HEALTH_ISSUE_Pneumonia', [0.1, 0.9]),
        ('HEALTH_ISSUE_Kidney_Disease', [0.05, 0.95]),
        ('HEALTH_ISSUE_Mental_Health', [0.2, 0.8]),
        ('HEALTH_ISSUE_Other', [0.3, 0.7]),
    ]
    for column, probs in multi_response_probs:
        gdf[column] = np.random.choice(binary_choice, size=num_samples, p=probs)

    # Create a Pregnancy feature using Gender and a pregnancy probability.
    # The row kernel cannot work on strings, so it gets the integer gender code.
    gdf['GENDER_CODE'] = gender_codes[gender_idx].astype(np.int32)
    female_code = int({name: code for code, name in gender_dict.items()}['FEMALE'])

    # Double the probability actually wanted, since only about half the rows
    # (the female ones) can receive it.
    gdf['PREG_PROBABILITY'] = np.random.choice([1, 0], size=num_samples, p=[0.2, 0.8])

    def preg_kernel(GENDER_CODE, PREG_PROBABILITY, HEALTH_ISSUE_Pregnancy, female_code):
        # apply_rows kernel: copy the pregnancy draw for female rows, 0 otherwise.
        for i, (gender_code, preg_probability) in enumerate(zip(GENDER_CODE, PREG_PROBABILITY)):
            preg_val = 0
            if (gender_code == female_code):
                preg_val = preg_probability
            HEALTH_ISSUE_Pregnancy[i] = preg_val

    # np.int64 rather than the np.int alias, which newer NumPy releases removed.
    gdf = gdf.apply_rows(preg_kernel,
                         incols=['GENDER_CODE', 'PREG_PROBABILITY'],
                         outcols=dict(HEALTH_ISSUE_Pregnancy=np.int64),
                         kwargs=dict(female_code=female_code)
                         )
    gdf.drop_column('PREG_PROBABILITY')

    gdf['NO_SHOW_RATE'] = np.random.choice([0, 0.2, 0.4, 0.6, 0.8, 1.0 ], size=num_samples, p=[0.5, 0.2, 0.15, 0.1, 0.05, 0])
    gdf['LATE_START_RATE'] = np.random.choice([0, 0.2, 0.4, 0.6, 0.8, 1.0 ], size=num_samples, p=[0.2, 0.4, 0.2, 0.1, 0.05, 0.05])

    # Create a column for noise
    gdf['NOISE'] = np.random.ranf(size=num_samples)

    return gdf, zipcode_gdf

In [14]:
# Generate labels using cuDF
def generate_labels(AGE, GENDER_CODE, NO_SHOW_RATE, LATE_START_RATE,
                    DISTANCE_FROM_CLINIC, ACCESS_TO_TRANSPORTATION, DAY, MONTH,
                    HEALTH_ISSUE_Mental_Health, DEPT_ID, NOISE,
                    LABEL, kwarg1):
    """Row kernel that writes a 0/1 label into ``LABEL`` for every input row.

    Each rule that matches adds a fixed amount to a score; ``0.33 * noise`` is
    added last and the label is 1 when the total exceeds 0.5, else 0.
    ``kwarg1`` is accepted but not used.
    """
    for i, (age, gender_code, no_show_rate, late_start_rate,
            distance_from_clinic, access_to_transportation, day, month,
            health_issue_mental_health, dept_id, noise) in enumerate(
                zip(AGE, GENDER_CODE, NO_SHOW_RATE, LATE_START_RATE,
                    DISTANCE_FROM_CLINIC, ACCESS_TO_TRANSPORTATION, DAY, MONTH,
                    HEALTH_ISSUE_Mental_Health, DEPT_ID, NOISE)):

        score = 0

        # Patient and history rules: 0.1 each.
        if (gender_code == 1) and (age >= 18) and (age < 30):
            score += 0.1
        if (age >= 30) and (age < 40) and (distance_from_clinic >= 3) and (distance_from_clinic < 6):
            score += 0.1
        if access_to_transportation == 0:
            score += 0.1
        if late_start_rate > 0.6 or no_show_rate > 0.2:
            score += 0.1
        if health_issue_mental_health == 1:
            score += 0.1
        if (dept_id == 2) or (dept_id == 3) or (dept_id == 5) or (dept_id == 7):
            score += 0.1

        # Holiday weeks: 0.2.
        holiday_week = (
            (month == 5 and day > 24)
            or (month == 7 and day < 8)
            or (month == 9 and day < 8)
            or (month == 12 and day > 21)
            or (month == 1 and day < 3)
        )
        if holiday_week:
            score += 0.2

        # Add some noise
        score = score + (0.33 * noise)

        LABEL[i] = 1 if score > 0.5 else 0

## Generate Features and Labels

In [15]:
# Build the appointment-level frame and the separate per-zipcode frame.
gdf, zipcode_gdf = generate_features()

In [18]:
# Left-join the per-zipcode columns onto every appointment row by ZIPCODE.
# NOTE(review): gdf is overwritten, so re-running this cell merges against the
# already-merged frame - run it once per generate_features() call.
gdf = gdf.merge(zipcode_gdf, how="left", on=['ZIPCODE'])

In [19]:
# Run the label kernel over every row; the result carries the new LABEL column.
# np.int64 is used rather than the np.int alias, which newer NumPy releases removed.
gdf = gdf.apply_rows(generate_labels,
                     incols=['AGE', 'GENDER_CODE', 'NO_SHOW_RATE', 'LATE_START_RATE',
                             'DISTANCE_FROM_CLINIC', 'ACCESS_TO_TRANSPORTATION', 'DAY', 'MONTH',
                             'HEALTH_ISSUE_Mental_Health', 'DEPT_ID', 'NOISE'],
                     outcols=dict(LABEL=np.int64),
                     kwargs=dict(kwarg1=1)
                     )

# Remove column so that you can save it with just zipcode info
gdf.drop_column('ACCESS_TO_TRANSPORTATION')

# Take out the encoded column and just leave the string one
gdf.drop_column('GENDER_CODE')

# Remove noise column
gdf.drop_column('NOISE')

In [20]:
# Summarize the generated frame: row count, column count and reported object size.
print(f"Samples: {len(gdf) / 1E6:.1f} million")
print("Features + Label:", len(gdf.columns))
print(f"Dataset size: {sys.getsizeof(gdf) / 1E9:.1f} GB")

Samples: 1.8 million
Features + Label: 40
Dataset size: 0.5 GB


In [21]:
# Preview the first rows of the generated frame as plain text.
print(gdf.head())

   AGE  GENDER INSURANCE     VISIT_TYPE DEPT_SPECIALTY  DEPT_ID APPT_WEEKDAY  \
0   71    MALE  MEDICAID          OTHER    DERMATOLOGY       11          FRI   
1   10  FEMALE  EMPLOYER   OP_Procedure          OTHER        2          WED   
2   65  FEMALE   PRIVATE   OP_Procedure             PT        6          MON   
3   41  FEMALE  MEDICAID  Physical_Exam      RADIOLOGY        6          THU   
4   75  FEMALE   PRIVATE     Flu_Clinic          OBGYN        7          WED   

   APPT_TIME  DAY  MONTH  ...  HEALTH_ISSUE_Diabetes  HEALTH_ISSUE_Alzheimers  \
0       12.5    1      2  ...                      0                        0   
1       16.5    7     12  ...                      0                        1   
2        9.5    1     12  ...                      0                        0   
3       16.5   19      3  ...                      0                        1   
4       11.0   13      4  ...                      0                        0   

   HEALTH_ISSUE_Pneumonia  HEALT

## Save Dataframes to CSV

In [22]:
# Convert to pandas and save it off for reuse
pdf = gdf.to_pandas()
zipcode_pdf = zipcode_gdf.to_pandas()

In [23]:
# Add a few nulls
pdf['INSURANCE'] = pdf['INSURANCE'].replace('OTHER', np.nan)
pdf['INSURANCE'].head(10)

0    MEDICAID
1    EMPLOYER
2     PRIVATE
3    MEDICAID
4     PRIVATE
5    EMPLOYER
6    EMPLOYER
7    EMPLOYER
8    EMPLOYER
9     PRIVATE
Name: INSURANCE, dtype: object

In [24]:
# Write the appointment-level data to the working directory, without the index column.
path='patient_data.csv'
pdf.to_csv(path, index=False)

In [25]:
# Write the per-zipcode data to the working directory, without the index column.
# Note: this reuses (overwrites) the `path` variable from the previous cell.
path='zipcode_data.csv'
zipcode_pdf.to_csv(path, index=False)