# Setting up the environment

We'll load the needed libraries:


In [6]:
options(repr.matrix.max.rows=100, repr.matrix.max.cols=300)
options(repr.plot.width = 20, repr.plot.height = 15)
options(width=300)

numcores=8

library(tidyverse)
library(data.table)
library(fst)
library(comorbidity)
library(reshape)
library(dtplyr)
library(haven)
library(vroom)
library(dplyr)
`%!in%` = Negate(`%in%`)

setDTthreads(numcores)

── [1mAttaching packages[22m ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.1 ──

[32m✔[39m [34mggplot2[39m 3.3.5     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.6     [32m✔[39m [34mdplyr  [39m 1.0.8
[32m✔[39m [34mtidyr  [39m 1.2.0     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 2.1.2     [32m✔[39m [34mforcats[39m 0.5.1

── [1mConflicts[22m ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()


Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


The following object is masked from ‘package:purrr’:

    transpose



Attachi

# Codes

First, we will add codes from ICD and Medicare:primary_care_specialty_codes

In [7]:
#diagnosis codes

office_visit_codes=c("99201","99202","99203","99204","99205","99211","99212","99213","99214"
                     ,"99215")

IHD_icd_9_codes=c(410, 411, 412,413,414)
IHD_icd_10_codes=c("I20", "I21", "I22", "I23", "I24", "I25")

non_us_state_codes=c(40,54,56,57,58,59,60,61,62,63,64,65,66,97,98,99)

primary_care_specialty_codes=c("01", "08", "11", "38")

#http://www.icd9data.com/2015/Volume1/390-459/401-405/default.htm
#https://www.icd10data.com/ICD10CM/Codes/I00-I99/I10-I16
hypertension_icd_9_codes=c("401","402","403","404","405")
hypertension_icd_10_codes=c("I10","I11","I12","I13","I15","I16")

#http://www.icd9data.com/2014/Volume1/290-319/295-299/296/default.htm
#https://www.icd10data.com/ICD10CM/Codes/F01-F99/F30-F39
depression_icd_9_codes=c("2962","2963")
depression_icd_10_codes=c("F32","F33")

#http://www.icd9data.com/2015/Volume1/240-279/249-259/default.htm
#https://www.icd10data.com/ICD10CM/Codes/E00-E89/E08-E13
diabetes_icd_9_codes=c("250")
diabetes_icd_10_codes=c("E08","E09","E10","E11","E13")

#http://www.icd9data.com/2014/Volume1/710-739/710-719/714/default.htm
#https://www.icd10data.com/ICD10CM/Codes/M00-M99/M05-M14
arthritis_icd_9_codes=c("714")
arthritis_icd_10_codes=c("M05","M06","M07","M08","M09","M10","M11","M12","M13","M14")





# Patient level calculations

## Yearly Calculators

These are the main functions that calculate yearly expenditures for patients and their corresponding physicians.\

### Read data from choose columns



In [3]:
carrier_data_all_years = read_fst(
    "carrier_data_all_years.fst", as.data.table = T,to = 1000000)

In [4]:
outpatient_data_all_years = read_fst(
    "outpatient_data_all_years.fst", as.data.table = T,to = 1000000)
inpatient_data_all_years = read_fst(
    "inpatient_data_all_years.fst", as.data.table = T,to = 1000000)

In [5]:
mbsf_data = read_fst(
  "/work/postresearch/Shared/Projects/Data_fst/mbsf_data", as.data.table = T,to = 1000000)
revenue_center_outpatient_all_years = read_fst(
  "/work/postresearch/Shared/Projects/Data_fst/revenue_center_outpatient_all_years.fst", as.data.table = T,to = 1000000)
outpatient_and_revenue_center_data = read_fst(
  "/work/postresearch/Shared/Projects/Data_fst/outpatient_and_revenue_center_data.fst", as.data.table = T,to = 1000000)

In [9]:
#carrier_sample = tail(carrier_data_all_years,1000000)
#outpatient_sample = tail(outpatient_data_all_years,1000000)
#inpatient_sample = tail(inpatient_data_all_years,1000000)
#mbsf_sample = tail(mbsf_data,1000000)
#revenue_center_outpatient_sample=tail(revenue_center_outpatient_all_years,1000000)
#outpatient_and_revenue_center_data_sample=tail(outpatient_and_revenue_center_data,1000000)
#head(carrier_sample)
#head(outpatient_sample)
#head(inpatient_sample)
#head(mbsf_sample)
#head(revenue_center_outpatient_sample)
#head(outpatient_and_revenue_center_data_sample)

### Loading sample data (for pc)

In [2]:
sample_data=readRDS(file = "sample_data.RDS")

In [9]:
carrier_data_all_years=sample_data[[1]]
outpatient_data_all_years=sample_data[[2]]
inpatient_data_all_years=sample_data[[3]]
mbsf_data=read_fst("mbsf_data_long.fst")
revenue_center_outpatient_all_years=sample_data[[5]]
outpatient_and_revenue_center_data=sample_data[[6]]

### Patient yearly expenditures and use of services carrier

I will first create a function that adds conditions of interest to the data.


#### Finding conditions for each claim line

In [7]:
yearly_calculator_patient_conditions = function(data) {
  
  #requirements
  require(data.table)
  require(dtplyr)
  require(tidyverse)
  require(lubridate)
  
  data %>%
    mutate(
      is_office_visit = HCPCS_CD %in% office_visit_codes,
      
      is_by_primary_care_physician= PRVDR_SPCLTY %in% primary_care_specialty_codes,

      is_hypertension= if_else(
        LINE_ICD_DGNS_VRSN_CD == 0,
        substr(LINE_ICD_DGNS_CD, 0, 3) %in% hypertension_icd_10_codes,
        if_else(
          LINE_ICD_DGNS_VRSN_CD == 9,
          substr(LINE_ICD_DGNS_CD, 0, 3) %in% hypertension_icd_9_codes,NA)),
      
      is_arthritis= if_else(
        LINE_ICD_DGNS_VRSN_CD == 0,
        substr(LINE_ICD_DGNS_CD, 0, 3) %in% arthritis_icd_10_codes,
        if_else(
          LINE_ICD_DGNS_VRSN_CD == 9,
          substr(LINE_ICD_DGNS_CD, 0, 3) %in% arthritis_icd_9_codes,NA)),
      
      is_IHD = if_else(
        LINE_ICD_DGNS_VRSN_CD == 0,
        substr(LINE_ICD_DGNS_CD, 0, 3) %in% IHD_icd_10_codes,
        if_else(
          LINE_ICD_DGNS_VRSN_CD == 9,
          substr(LINE_ICD_DGNS_CD, 0, 3) %in% IHD_icd_9_codes,NA)),
      
      is_diabetes= if_else(
        LINE_ICD_DGNS_VRSN_CD == 0,
        substr(LINE_ICD_DGNS_CD, 0, 3) %in% diabetes_icd_10_codes,
        if_else(
          LINE_ICD_DGNS_VRSN_CD == 9,
          substr(LINE_ICD_DGNS_CD, 0, 3) %in% diabetes_icd_9_codes,NA)),
      
      is_depression= if_else(
        LINE_ICD_DGNS_VRSN_CD == 0,
        substr(LINE_ICD_DGNS_CD, 0, 3) %in% depression_icd_10_codes,
        if_else(
          LINE_ICD_DGNS_VRSN_CD == 9,
          substr(LINE_ICD_DGNS_CD, 0, 4) %in% depression_icd_9_codes,NA))

      
    ) %>%
    as.data.table()
}

yearly_patient_conditions_carrier=yearly_calculator_patient_conditions(carrier_data_all_years)
head(yearly_patient_conditions_carrier)


Loading required package: lubridate


Attaching package: ‘lubridate’


The following object is masked from ‘package:reshape’:

    stamp


The following objects are masked from ‘package:data.table’:

    hour, isoweek, mday, minute, month, quarter, second, wday, week, yday, year


The following objects are masked from ‘package:base’:

    date, intersect, setdiff, union




DESY_SORT_KEY,CLAIM_NO,LINE_NUM,CLM_THRU_DT,LINE_PLACE_OF_SRVC_CD,HCPCS_CD,LINE_ICD_DGNS_VRSN_CD,LINE_ICD_DGNS_CD,LINE_ALOWD_CHRG_AMT,PRF_PHYSN_NPI,PRVDR_SPCLTY,PRVDR_STATE_CD,date,year,month_year,is_office_visit,is_by_primary_care_physician,is_hypertension,is_arthritis,is_IHD,is_diabetes,is_depression
<int>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<chr>,<chr>,<int>,<date>,<dbl>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
100000015,2,1,20130425,22,94375,9,496,15.26,1073503884,29,22,2013-04-25,2013,2013-04,False,False,False,False,False,False,False
100000015,2,2,20130425,22,94726,9,496,13.54,1073503884,29,22,2013-04-25,2013,2013-04,False,False,False,False,False,False,False
100000015,2,3,20130425,22,94729,9,496,9.95,1073503884,29,22,2013-04-25,2013,2013-04,False,False,False,False,False,False,False
100000015,3,1,20130528,11,99214,9,41400,114.64,1285600932,11,22,2013-05-28,2013,2013-05,True,True,False,False,True,False,False
100000015,3,2,20130528,11,93000,9,41400,20.08,1285600932,11,22,2013-05-28,2013,2013-05,False,True,False,False,True,False,False
100000015,4,1,20130719,22,99213,9,496,51.76,1659344091,29,22,2013-07-19,2013,2013-07,True,False,False,False,False,False,False


#### Summarizing patient data
I will now summarise the data for each patient.


In [12]:
summarise_carrier = function(data, time_frame = 365){
  
  data%>%
    group_by(DESY_SORT_KEY,year) %>%
    summarise(
      #tot_allowed_carrier = sum(na.rm = T, LINE_ALOWD_CHRG_AMT),
      
      #office_visit_count = sum(na.rm = T, is_office_visit),
      
      #office_visit_cost_carrier = sum(na.rm = T, LINE_ALOWD_CHRG_AMT * is_office_visit),
      
      distinct_clinicians = length(unique(PRF_PHYSN_NPI)),
      
      distinct_primary_care_physicians = length(.[is_by_primary_care_physician, unique(PRF_PHYSN_NPI)]),

      hypertension = sum(is_hypertension, na.rm = T) > 0,
      
      arthritis = sum(is_arthritis, na.rm = T) > 0,
      
      IHD = sum(is_IHD, na.rm = T) > 0,
      
      diabetes = sum(is_diabetes, na.rm = T) > 0,
  
      depression = sum(is_depression, na.rm = T) > 0,
      
      icd_9_pure = ifelse(prod(LINE_ICD_DGNS_VRSN_CD, na.rm = T) == 0, F, T),
      
      icd_10_pure = ifelse(sum(LINE_ICD_DGNS_VRSN_CD, na.rm = T) == 0, T, F),
      
    ) %>%
    as.data.table()
}


summary_patient_by_year = summarise_carrier(yearly_patient_conditions_carrier)
head(summary)


`summarise()` has grouped output by 'DESY_SORT_KEY'. You can override using the `.groups` argument.


DESY_SORT_KEY,year,tot_allowed_carrier,office_visit_count,office_visit_cost_carrier,distinct_clinicians,distinct_primary_care_physicians,hypertension,arthritis,IHD,diabetes,depression,icd_9_pure,icd_10_pure
<int>,<dbl>,<dbl>,<int>,<dbl>,<int>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
100000015,2013,1021.05,4,453.65,7,1,False,False,True,False,False,True,False
100000019,2013,32873.53,46,5307.06,27,2,True,False,False,False,False,True,False
100000053,2013,1880.87,9,866.48,6,0,False,False,False,False,False,True,False
100000099,2013,10677.32,25,1598.26,29,5,True,False,False,False,False,True,False
100000129,2013,0.0,0,0.0,1,0,False,False,False,False,False,True,False
100000203,2013,11762.77,27,2802.06,21,4,True,False,False,False,False,True,False


In [None]:
add_patient_characteristics = function(mbsf_data,summary_data){
  require(dtplyr)
  require(lubridate)
  require(tidyverse)
  data = left_join(summary_data,mbsf_data,by="DESY_SORT_KEY,year") %>% as.data.frame()
  
  data %>%
  mutate(
    died_this_year=
  )
}

summary_with_patient_characteristics=add_patient_characteristics(mbsf_data,summary_with_outpatient)
head(summary_with_patient_characteristics)

In [10]:
head(mbsf_data)

Unnamed: 0_level_0,DESY_SORT_KEY,REFERENCE_YEAR,STATE_CODE,COUNTY_CODE,SEX_CODE,RACE_CODE,AGE,ORIG_REASON_FOR_ENTITLEMENT,CURR_REASON_FOR_ENTITLEMENT,ENTITLEMENT_BUY_IN_IND01,ENTITLEMENT_BUY_IN_IND02,ENTITLEMENT_BUY_IN_IND03,ENTITLEMENT_BUY_IN_IND04,ENTITLEMENT_BUY_IN_IND05,ENTITLEMENT_BUY_IN_IND06,ENTITLEMENT_BUY_IN_IND07,ENTITLEMENT_BUY_IN_IND08,ENTITLEMENT_BUY_IN_IND09,ENTITLEMENT_BUY_IN_IND10,ENTITLEMENT_BUY_IN_IND11,ENTITLEMENT_BUY_IN_IND12,HMO_INDICATOR01,HMO_INDICATOR02,HMO_INDICATOR03,HMO_INDICATOR04,HMO_INDICATOR05,HMO_INDICATOR06,HMO_INDICATOR07,HMO_INDICATOR08,HMO_INDICATOR09,HMO_INDICATOR10,HMO_INDICATOR11,HMO_INDICATOR12,VALID_DATE_OF_DEATH_SWITCH,DATE_OF_DEATH,year
Unnamed: 0_level_1,<int>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
1,0,13,45,910,2,1,75,0,0,C,C,C,C,C,C,C,C,C,C,C,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2013
2,0,13,22,170,2,1,71,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2013
3,0,13,33,420,2,1,93,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2013
4,0,13,49,801,2,1,71,0,0,1,1,1,1,1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,,,2013
5,0,13,33,400,2,1,75,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2013
6,0,13,10,510,1,1,70,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2013
