In [2]:
#install.packages("dplyr")
#install.packages("icd")
#installed caret using r-essentials=1.5.2
#install.packages("caret")

also installing the dependencies ‘rlang’, ‘assertthat’, ‘Rcpp’, ‘tibble’, ‘lazyeval’, ‘DBI’, ‘BH’

Updating HTML index of packages in '.Library'
Making 'packages.html' ... done


In [1]:
library("dplyr")
library("icd")


Attaching package: ‘dplyr’

The following objects are masked from ‘package:stats’:

    filter, lag

The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union

Welcome to the "icd" package for finding comorbidities and interpretation of ICD-9 and ICD-10 codes. Suggestions and contributions are welcome at https://github.com/jackwasey/icd .

See the vignettes and help for examples.

Please cite this package if you find it useful in your published work.
citation(package = "icd")



In [25]:
options(repr.matrix.max.cols=50, repr.matrix.max.rows=100)

In [26]:
#import CY2014 Medical Conditions file 
mc14 <- read.csv("Data/2014 - conditions.csv")
pop14 <- read.csv("Data/2014 - popchar.csv")
out14 <- read.csv("Data/2014 - outpatient.csv")
office14 <- read.csv("Data/2014 - office.csv")

In [27]:
#Create flag for person IDs (i.e DUPERSID) with Crohns Disease
crohns_marker <- mc14 %>% #pipe operator
  mutate(crohnsICD = ifelse(ICD9CODX == "555", 1, 0)) %>% #555 is the diagnosis code for Crohns
  group_by(DUPERSID) %>% #group_by precedes and informs the summarise function
  summarise(crohns = max(crohnsICD)) %>% #DUPERSID can have more than one row (i.e. diagnosis). 
                                        ##We only care about if they have crohns or not, so a maximum (1) will suffice
  arrange(crohns) 

In [28]:
#keep only subsect of variable of interest to be used for this analysis
base <- select(pop14, SEX, AGE14X, DUPERSID, RACEV1X, MARRY14X, 
               EDRECODE,REGION14, INSCOV14)

In [29]:
#In SEX variable, Female originally coded as 2. Female now coded as 0
base$SEX <- ifelse(base$SEX !=1, 0, 1) 

#Want to merge our base file with crohns_marker to flag DUPERSID with Crohns
base <- left_join(base, crohns_marker, by = "DUPERSID")
#However, some DUPERSID show "NA" because they were not in medical conditions file.
##Therefore, these are DUPERSIDs without any diagnoses. Thus, crohns marker equals 0
base$crohns[is.na(base$crohns)] <- 0

In [30]:
#Previous count of chronic diseases. Counts crohns as well, so not technically cobmorbidity of crohns, but total conditions
##I want to verify the co_count somepoint because I think the results are way too high. Make sure we aren't double counting
comorbid <- mc14 %>%
  select(DUPERSID, ICD9CODX) %>%
  group_by(DUPERSID) %>%
  summarise(co_count = n())

base <- left_join(base, comorbid, by = "DUPERSID") #merge to base
base$co_count[is.na(base$co_count)] <- 0 #again, some may be "NA" if not in mc14. These cases are converted to 0

In [31]:
glimpse(base)

Observations: 34,875
Variables: 10
$ SEX      <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1...
$ AGE14X   <int> 36, 36, 15, 8, 85, 34, 32, 15, 11, 9, 4, 8, -1, 79, 47, 35...
$ DUPERSID <int> 40001101, 40001102, 40001103, 40001104, 40002101, 40004101...
$ RACEV1X  <int> 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 6, 1, 1, 1, 1, 2...
$ MARRY14X <int> 1, 1, 6, 6, 2, 1, 1, 6, 6, 6, 6, 6, 5, 2, 5, 1, 1, 6, 6, 5...
$ EDRECODE <int> 15, 14, 1, 1, 13, 13, 2, 1, 1, 1, -1, 1, 2, 15, 2, 15, 16,...
$ REGION14 <int> 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, -1, 3, 4, 2, 2, 2, 2, ...
$ INSCOV14 <int> 1, 1, 1, 1, 1, 3, 3, 3, 2, 2, 2, 2, 3, 1, 1, 1, 1, 1, 1, 3...
$ crohns   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ co_count <dbl> 5, 2, 0, 1, 3, 0, 5, 1, 12, 6, 12, 12, 0, 10, 1, 2, 2, 0, ...


In [32]:
#New Charlson Comorbidity Index scoring from 'icd' package
mc14.charlson <- mc14 %>%
  rename(icd9cm = ICD9CODX, id = DUPERSID) %>%
  select(id, icd9cm)

In [33]:
charlson <- icd_charlson(mc14.charlson, short_code = TRUE, return_df = TRUE)
charlson <- rename(charlson, DUPERSID = id) #convert back to DUPERSID so we can merge with our base file
charlson$DUPERSID <- as.integer(levels(charlson$DUPERSID))

In [34]:
glimpse(charlson)

Observations: 25,870
Variables: 2
$ DUPERSID <int> 40001101, 40001102, 40001104, 40002101, 40004102, 40004103...
$ Charlson <dbl> 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0...


In [35]:
base <- left_join(base, charlson, by = "DUPERSID")
base$Charlson[is.na(base$Charlson)] <- 0 #again, some may be "NA" if not in mc14. These cases are converted to 0

In [36]:
#import CY2014 office-based file
#office14 <- read_dta("2014_Office.dta")
#Sum total office-based expenditures by DUPERSID
officeXPdf <- office14 %>%
  group_by(DUPERSID) %>%
  summarise(officeXP = sum(OBXP14X)) 
#merge total  office-based expenditures with base file
base <- left_join(base, officeXPdf, by = "DUPERSID")
base$officeXP[is.na(base$officeXP)] <- 0

In [37]:
#import CY2014 outpatient file
#op14 <- read_dta("2014_Outpatient.dta")
#Sum total outpatient expenditures by DUPERSID
opXPdf <- out14 %>%
  group_by(DUPERSID) %>%
  summarise(opXP = sum(OPXP14X))
#merge total  outpatient expenditures with base file
base <- left_join(base, opXPdf, by = "DUPERSID")
base$opXP[is.na(base$opXP)] <- 0

#create new variable in base file (i.e. totalXP) to sum total office + outpatient expenditures
base <- mutate(base, totalXP = officeXP + opXP)

###READY TO GO GARDENING for CARETS!!!!

In [41]:
glimpse(base)

Observations: 34,875
Variables: 14
$ SEX      <dbl> 1, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1...
$ AGE14X   <int> 36, 36, 15, 8, 85, 34, 32, 15, 11, 9, 4, 8, -1, 79, 47, 35...
$ DUPERSID <int> 40001101, 40001102, 40001103, 40001104, 40002101, 40004101...
$ RACEV1X  <int> 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 6, 1, 1, 1, 1, 2...
$ MARRY14X <int> 1, 1, 6, 6, 2, 1, 1, 6, 6, 6, 6, 6, 5, 2, 5, 1, 1, 6, 6, 5...
$ EDRECODE <int> 15, 14, 1, 1, 13, 13, 2, 1, 1, 1, -1, 1, 2, 15, 2, 15, 16,...
$ REGION14 <int> 3, 3, 3, 3, 1, 3, 3, 3, 3, 3, 3, 3, -1, 3, 4, 2, 2, 2, 2, ...
$ INSCOV14 <int> 1, 1, 1, 1, 1, 3, 3, 3, 2, 2, 2, 2, 3, 1, 1, 1, 1, 1, 1, 3...
$ crohns   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0...
$ co_count <dbl> 5, 2, 0, 1, 3, 0, 5, 1, 12, 6, 12, 12, 0, 10, 1, 2, 2, 0, ...
$ Charlson <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0...
$ officeXP <dbl> 762.06, 275.06, 115.91, 92.00, 82.97, 0.00, 31.07, 120.51,...
$ opXP     <dbl> 

In [44]:
write.csv(base, file = "Crohns_DF.csv")

In [40]:
# ensure results are repeatable
set.seed(7)
# load the library
library("caret")
# load the dataset
#data(iris)
# prepare training scheme
#control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the model
#model <- train(Species~., data=iris, method="lvq", trControl=control, tuneLength=5)
# summarize the model
#print(model)
# ensure results are repeatable
#set.seed(7)
# load the library
#library(caret)
# load the dataset
#data(iris)
# prepare training scheme
#control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the model
#model <- train(Species~., data=iris, method="lvq", trControl=control, tuneLength=5)
# summarize the model
#print(model)

ERROR: Error: package or namespace load failed for ‘caret’
