# Setup

In [2]:
library(keras)
library(tensorflow)
tf$test$gpu_device_name()

In [8]:
options(repr.matrix.max.rows=100, repr.matrix.max.cols=300)
options(
    repr.plot.res = 120,
    repr.plot.width = 10,
    repr.plot.height = 7, 
    repr.plot.pointsize = 12, 
    repr.plot.quality = 100
)
options(width=300)
    
numcores = 10

library(tidyverse)
library(parallel)
library(data.table)
library(fst)
library(comorbidity)
library(zeallot)
library(dtplyr)
library(vroom)
library(dplyr)
library(FactoMineR)
library(factoextra)
library(plyr)

#library(icd)
`%!in%` = Negate(`%in%`)

setDTthreads(numcores)

# Extracting the needed data from the datasets
Here, we will create a data.table with the following columns:
* id: the unique identifier of each Medicare beneficiary
* year: the year of the Medicare claim
* diagnosis: the diagnosis code of the claim
* procedure: the procedure code of the claim
* provider: the provider of the claim
* provider_specialty: the specialty of the provider
* cost: the cost of the claim

We will only use 2016 to 2019 data for this analysis.
We exclude 2020 due to the COVID-19 pandemic. We exclude 2013-2015 due to use of the ICD-9 coding system.

## Carrier files

In [4]:
carrier_data_2016 = read_fst("/work/postresearch/Shared/Projects/Data_fst/carrier_data_2016.fst",
    as.data.table = T,
    columns = c("DESY_SORT_KEY", "LINE_ICD_DGNS_CD", "PRF_PHYSN_NPI", "PRVDR_SPCLTY", "HCPCS_CD", "CLM_THRU_DT", "LINE_ALOWD_CHRG_AMT")
)
carrier_data_2017 = read_fst("/work/postresearch/Shared/Projects/Data_fst/carrier_data_2017.fst",
    as.data.table = T,
    columns = c("DESY_SORT_KEY", "LINE_ICD_DGNS_CD", "PRF_PHYSN_NPI", "PRVDR_SPCLTY", "HCPCS_CD", "CLM_THRU_DT", "LINE_ALOWD_CHRG_AMT")
)
carrier_data_2018 = read_fst("/work/postresearch/Shared/Projects/Data_fst/carrier_data_2018.fst",
    as.data.table = T,
    columns = c("DESY_SORT_KEY", "LINE_ICD_DGNS_CD", "PRF_PHYSN_NPI", "PRVDR_SPCLTY", "HCPCS_CD", "CLM_THRU_DT", "LINE_ALOWD_CHRG_AMT")
)
carrier_data_2019 = read_fst("/work/postresearch/Shared/Projects/Data_fst/carrier_data_2019.fst",
    as.data.table = T,
    columns = c("DESY_SORT_KEY", "LINE_ICD_DGNS_CD", "PRF_PHYSN_NPI", "PRVDR_SPCLTY", "HCPCS_CD", "CLM_THRU_DT", "LINE_ALOWD_CHRG_AMT")
)

In [5]:
# change the column names
setnames(
    carrier_data_2016,
    c("id", "diagnosis", "provider", "provider_specialty", "hcpcs", "date", "cost")
)

setnames(
    carrier_data_2017,
    c("id", "diagnosis", "provider", "provider_specialty", "hcpcs", "date", "cost")
)

setnames(
    carrier_data_2018,
    c("id", "diagnosis", "provider", "provider_specialty", "hcpcs", "date", "cost")
)

setnames(
    carrier_data_2019,
    c("id", "diagnosis", "provider", "provider_specialty", "hcpcs", "date", "cost")
)

In [8]:
# add the year to the data
carrier_data_2016[, year := 2016]
carrier_data_2017[, year := 2017]
carrier_data_2018[, year := 2018]
carrier_data_2019[, year := 2019]

# combine the data

carrier_data = rbindlist(
    list(
        carrier_data_2016,
        carrier_data_2017,
        carrier_data_2018,
        carrier_data_2019
    ),
    use.names = T
)

In [11]:
head(carrier_data)

id,diagnosis,provider,provider_specialty,hcpcs,date,cost,year
<int>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>
100000015,H3532,1265609275,18,67028,20160129,110.66,2016
100000015,H3532,1265609275,18,J0178,20160129,1961.0,2016
100000015,I10,1730170630,6,99214,20160212,113.64,2016
100000015,I10,1730170630,6,93000,20160212,18.34,2016
100000015,I479,1730170630,6,93457,20160218,371.86,2016
100000015,I252,1730170630,6,99217,20160219,75.37,2016


In [10]:
# Save the combined data as fst to folder with no compression
write_fst(carrier_data, "/work/postresearch/Shared/Projects/Farbod/Clustering/carrier_data.fst", compress = 0)

## Inpatient files

In [12]:
inpatient_data_2016 = read_fst("/work/postresearch/Shared/Projects/Data_fst/inpatient_data_2016.fst", as.data.table = T)
inpatient_data_2017 = read_fst("/work/postresearch/Shared/Projects/Data_fst/inpatient_data_2017.fst",as.data.table = T)
inpatient_data_2018 = read_fst("/work/postresearch/Shared/Projects/Data_fst/inpatient_data_2018.fst", as.data.table = T)
inpatient_data_2019 = read_fst("/work/postresearch/Shared/Projects/Data_fst/inpatient_data_2019.fst",as.data.table = T)

In [13]:
# add the year to the data

inpatient_data_2016[, year := 2016]
inpatient_data_2017[, year := 2017]
inpatient_data_2018[, year := 2018]
inpatient_data_2019[, year := 2019]


#combine the data
inpatient_data = rbindlist(
    list(
        inpatient_data_2016,
        inpatient_data_2017,
        inpatient_data_2018,
        inpatient_data_2019
    ),
    use.names = T
)


In [15]:
# Save the combined data as fst to folder with no compression
write_fst(inpatient_data, "/work/postresearch/Shared/Projects/Farbod/Clustering/inpatient_data.fst", compress = 0)

## Outpatient files

In [16]:
outpatient_data_2016 = read_fst("/work/postresearch/Shared/Projects/Data_fst/outpatient_data_2016.fst",as.data.table = T)
outpatient_data_2017 = read_fst("/work/postresearch/Shared/Projects/Data_fst/outpatient_data_2017.fst",as.data.table = T)
outpatient_data_2018 = read_fst("/work/postresearch/Shared/Projects/Data_fst/outpatient_data_2018.fst",as.data.table = T)
outpatient_data_2019 = read_fst("/work/postresearch/Shared/Projects/Data_fst/outpatient_data_2019.fst",as.data.table = T)

In [18]:
# add the year to the data

outpatient_data_2016[, year := 2016]
outpatient_data_2017[, year := 2017]
outpatient_data_2018[, year := 2018]
outpatient_data_2019[, year := 2019]


# combine the data
outpatient_data = rbindlist(
    list(
        outpatient_data_2016,
        outpatient_data_2017,
        outpatient_data_2018,
        outpatient_data_2019
    ),
    use.names = T
)

In [19]:
# Save the combined data as fst to folder with no compression
write_fst(outpatient_data, "/work/postresearch/Shared/Projects/Farbod/Clustering/outpatient_data.fst", compress = 0)

## Create a small sample for local analyses

In [8]:
# create a sample of 10000 patients based on id
sample_id = sample(carrier_data$id, 10000)

# sample the data based on the ids sampled
carrier_data_sample = carrier_data[id %in% sample_id]
inpatient_data_sample = inpatient_data[DESY_SORT_KEY %in% sample_id]
outpatient_data_sample = outpatient_data[DESY_SORT_KEY %in% sample_id]


In [9]:
head(carrier_data_sample)
head(inpatient_data_sample)
head(outpatient_data_sample)


id,diagnosis,provider,provider_specialty,hcpcs,date,cost,year
<int>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>
100009375,H2703,1245214063,18,92014,20160118,117.32,2016
100009375,K635,1215931563,10,99215,20160307,138.05,2016
100009375,K635,1215931563,10,1036F,20160307,0.0,2016
100009375,K635,1215931563,10,G8420,20160307,0.0,2016
100009375,K635,1215931563,10,G8752,20160307,0.0,2016
100009375,K635,1215931563,10,G8754,20160307,0.0,2016


DESY_SORT_KEY,CLAIM_NO,PRVDR_NUM,CLM_THRU_DT,NCH_NEAR_LINE_REC_IDENT_CD,NCH_CLM_TYPE_CD,CLAIM_QUERY_CODE,CLM_FAC_TYPE_CD,CLM_SRVC_CLSFCTN_TYPE_CD,CLM_FREQ_CD,FI_NUM,CLM_MDCR_NON_PMT_RSN_CD,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,NCH_PRMRY_PYR_CD,FI_CLM_ACTN_CD,PRVDR_STATE_CD,ORG_NPI_NUM,AT_PHYSN_UPIN,AT_PHYSN_NPI,AT_PHYSN_SPCLTY_CD,OP_PHYSN_UPIN,OP_PHYSN_NPI,OP_PHYSN_SPCLTY_CD,OT_PHYSN_UPIN,OT_PHYSN_NPI,OT_PHYSN_SPCLTY_CD,RNDRNG_PHYSN_NPI,RNDRNG_PHYSN_SPCLTY_CD,CLM_MCO_PD_SW,PTNT_DSCHRG_STUS_CD,CLM_PPS_IND_CD,CLM_TOT_CHRG_AMT,CLM_ADMSN_DT,CLM_IP_ADMSN_TYPE_CD,CLM_SRC_IP_ADMSN_CD,NCH_PTNT_STATUS_IND_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,NCH_PROFNL_CMPNT_CHRG_AMT,NCH_IP_NCVRD_CHRG_AMT,CLM_TOT_PPS_CPTL_AMT,CLM_PPS_CPTL_FSP_AMT,CLM_PPS_CPTL_OUTLIER_AMT,CLM_PPS_CPTL_DSPRPRTNT_SHR_AMT,CLM_PPS_CPTL_IME_AMT,CLM_PPS_CPTL_EXCPTN_AMT,CLM_PPS_OLD_CPTL_HLD_HRMLS_AMT,CLM_PPS_CPTL_DRG_WT_NUM,CLM_UTLZTN_DAY_CNT,BENE_TOT_COINSRNC_DAYS_CNT,BENE_LRD_USED_CNT,CLM_NON_UTLZTN_DAYS_CNT,NCH_BLOOD_PNTS_FRNSHD_QTY,NCH_VRFD_NCVRD_STAY_FROM_DT,NCH_VRFD_NCVRD_STAY_THRU_DT,NCH_BENE_MDCR_BNFTS_EXHTD_DT_I,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,CLM_DRG_OUTLIER_STAY_CD,NCH_DRG_OUTLIER_APRVD_PMT_AMT,ADMTG_DGNS_CD,PRNCPAL_DGNS_CD,ICD_DGNS_CD1,CLM_POA_IND_SW1,ICD_DGNS_CD2,CLM_POA_IND_SW2,ICD_DGNS_CD3,CLM_POA_IND_SW3,ICD_DGNS_CD4,CLM_POA_IND_SW4,ICD_DGNS_CD5,CLM_POA_IND_SW5,ICD_DGNS_CD6,CLM_POA_IND_SW6,ICD_DGNS_CD7,CLM_POA_IND_SW7,ICD_DGNS_CD8,CLM_POA_IND_SW8,ICD_DGNS_CD9,CLM_POA_IND_SW9,ICD_DGNS_CD10,CLM_POA_IND_SW10,ICD_DGNS_CD11,CLM_POA_IND_SW11,ICD_DGNS_CD12,CLM_POA_IND_SW12,ICD_DGNS_CD13,CLM_POA_IND_SW13,ICD_DGNS_CD14,CLM_POA_IND_SW14,ICD_DGNS_CD15,CLM_POA_IND_SW15,ICD_DGNS_CD16,CLM_POA_IND_SW16,ICD_DGNS_CD17,CLM_POA_IND_SW17,ICD_DGNS_CD18,CLM_POA_IND_SW18,ICD_DGNS_CD19,CLM_POA_IND_SW19,ICD_DGNS_CD20,CLM_POA_IND_SW20,ICD_DGNS_CD21,CLM_POA_IND_SW21,ICD_DGNS_CD22,CLM_POA_IND_SW22,ICD_DGNS_CD23,CLM_POA_IND_SW23,ICD_DGNS_CD24,CLM_POA_IND_SW24,ICD_DGNS_CD25,CLM_POA_IND_SW25,FST_DGNS_E_CD,ICD_DGNS_E_CD1,CLM_E_POA_IND_SW1,ICD_DGNS_E_CD2,CLM_E_POA_IND_SW2,ICD_DGNS_E_CD3,CLM_E_POA_IND_SW3,ICD_DGNS_E_CD4,CLM_E_POA_IND_SW4,ICD_DGNS_E_CD5,CLM_E_POA_IND_SW5,ICD_DGNS_E_CD6,CLM_E_POA_IND_SW6,ICD_DGNS_E_CD7,CLM_E_POA_IND_SW7,ICD_DGNS_E_CD8,CLM_E_POA_IND_SW8,ICD_DGNS_E_CD9,CLM_E_POA_IND_SW9,ICD_DGNS_E_CD10,CLM_E_POA_IND_SW10,ICD_DGNS_E_CD11,CLM_E_POA_IND_SW11,ICD_DGNS_E_CD12,CLM_E_POA_IND_SW12,ICD_PRCDR_CD1,PRCDR_DT1,ICD_PRCDR_CD2,PRCDR_DT2,ICD_PRCDR_CD3,PRCDR_DT3,ICD_PRCDR_CD4,PRCDR_DT4,ICD_PRCDR_CD5,PRCDR_DT5,ICD_PRCDR_CD6,PRCDR_DT6,ICD_PRCDR_CD7,PRCDR_DT7,ICD_PRCDR_CD8,PRCDR_DT8,ICD_PRCDR_CD9,PRCDR_DT9,ICD_PRCDR_CD10,PRCDR_DT10,ICD_PRCDR_CD11,PRCDR_DT11,ICD_PRCDR_CD12,PRCDR_DT12,ICD_PRCDR_CD13,PRCDR_DT13,ICD_PRCDR_CD14,PRCDR_DT14,ICD_PRCDR_CD15,PRCDR_DT15,ICD_PRCDR_CD16,PRCDR_DT16,ICD_PRCDR_CD17,PRCDR_DT17,ICD_PRCDR_CD18,PRCDR_DT18,ICD_PRCDR_CD19,PRCDR_DT19,ICD_PRCDR_CD20,PRCDR_DT20,ICD_PRCDR_CD21,PRCDR_DT21,ICD_PRCDR_CD22,PRCDR_DT22,ICD_PRCDR_CD23,PRCDR_DT23,ICD_PRCDR_CD24,PRCDR_DT24,ICD_PRCDR_CD25,PRCDR_DT25,DOB_DT,GNDR_CD,BENE_RACE_CD,BENE_CNTY_CD,BENE_STATE_CD,CWF_BENE_MDCR_STUS_CD,CLM_TRTMT_AUTHRZTN_NUM,CLM_PRCR_RTRN_CD,CLM_IP_LOW_VOL_PMT_AMT,CLM_CARE_IMPRVMT_MODEL_CD1,CLM_CARE_IMPRVMT_MODEL_CD2,CLM_CARE_IMPRVMT_MODEL_CD3,CLM_CARE_IMPRVMT_MODEL_CD4,CLM_BNDLD_MODEL_1_DSCNT_PCT,CLM_BASE_OPRTG_DRG_AMT,CLM_VBP_PRTCPNT_IND_CD,CLM_VBP_ADJSTMT_PCT,CLM_HRR_PRTCPNT_IND_CD,CLM_HRR_ADJSTMT_PCT,CLM_MODEL_4_READMSN_IND_CD,CLM_UNCOMPD_CARE_PMT_AMT,CLM_BNDLD_ADJSTMT_PMT_AMT,CLM_VBP_ADJSTMT_PMT_AMT,CLM_HRR_ADJSTMT_PMT_AMT,EHR_PYMT_ADJSTMT_AMT,PPS_STD_VAL_PYMT_AMT,FINL_STD_AMT,HAC_PGM_RDCTN_IND_SW,EHR_PGM_RDCTN_IND_SW,CLM_SITE_NTRL_PYMT_CST_AMT,CLM_SITE_NTRL_PYMT_IPPS_AMT,CLM_FULL_STD_PYMT_AMT,CLM_SS_OUTLIER_STD_PYMT_AMT,CLM_NEXT_GNRTN_ACO_IND_CD1,CLM_NEXT_GNRTN_ACO_IND_CD2,CLM_NEXT_GNRTN_ACO_IND_CD3,CLM_NEXT_GNRTN_ACO_IND_CD4,CLM_NEXT_GNRTN_ACO_IND_CD5,ACO_ID_NUM,year
<int>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<dbl>,<chr>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<int>,<lgl>,<int>,<lgl>,<lgl>,<int>,<int>,<dbl>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<int>,<int>,<lgl>,<int>,<dbl>,<dbl>,<chr>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<dbl>
100103783,748,500064,20160331,V,60,3,1,1,1,2401,,4762.23,9807,L,1,50,1053359729,,1316155575,,,1760568414.0,,,,,,,,1,2,41295.14,20160328,1,1,A,127.33,1288,0,0,0,0.0,923.39,721.87,0,79.98,121.54,0,0,1.4723,1,0,0,2,0,,,,20160331,183,0,0,S2222XA,S2222XA,S2222XA,Y,N186,Y,S32029A,Y,I120,Y,S32049A,Y,S2243XA,Y,E1121,Y,E1122,Y,D649,Y,I4510,Y,R310,Y,E041,Y,E785,Y,Z992,0,Z794,0,S80812A,Y,E8339,Y,K219,Y,M1990,Y,Z85828,0.0,Z86010,0.0,Z87442,0.0,S20219A,Y,,,,,V4352XA,V4352XA,0.0,,,,,,,,,,,,,,,,,,,,,,,3E1M39Z,20160328.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,1,1,160,50,11,,0,0.0,,,,,0,9037.04,Y,0.9943031,1.0,0.9999,,1727.95,0,-51.48,-0.9,0,8695.82,8547.66,,,0,0,0,0,,,,,,,2016
100163419,1227,140083,20160403,V,60,3,1,1,1,6101,,4545.43,0,,1,14,1447280284,,1346525235,,,,,,,,,,,1,2,10056.07,20160401,1,1,A,7.23,1288,0,0,0,0.0,326.59,269.38,0,53.96,3.25,0,0,0.6007,2,0,0,0,0,,,,20160403,639,0,0,E162,E11649,E11649,Y,E1165,Y,Z794,0,I10,Y,E8342,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,2,2,141,14,10,,14,0.0,,,,,0,3358.41,Y,1.0072178,1.0,0.9982,,1660.13,0,24.24,-6.05,0,3547.9,3502.7,,,0,0,0,0,,,,,,,2016
100193467,1439,360179,20161122,V,60,3,1,1,1,15201,,7842.32,0,,1,36,1396714663,,1518269018,,,,,,,,,,,3,2,28561.05,20161117,1,1,A,64.57,1288,0,0,0,0.0,674.76,625.71,0,25.65,23.4,0,0,1.4796,5,0,0,0,0,,,,20161122,291,0,0,I130,I130,I130,Y,I5033,Y,E1122,Y,N184,Y,N179,N,E875,N,Z6841,0,I480,Y,M069,Y,M109,Y,R040,Y,D631,Y,E6601,Y,I2510,Y,Z951,0,Z8673,0,Z794,0,Z950,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6,2,4,840,36,10,,14,0.0,,62.0,,,0,7771.03,Y,0.9941801,1.0,0.9991,,369.33,0,-45.23,-6.99,0,8822.75,8672.05,,,0,0,0,0,,,,,,,2016
100193467,1440,360354,20161227,V,60,3,1,1,1,15201,,5310.33,0,,1,36,1851549273,,1104913938,,,1447227111.0,,,1104913938.0,,,,,6,2,26356.5,20161222,1,1,A,6.79,0,0,0,0,0.0,413.72,400.43,0,13.29,0.0,0,0,0.9469,5,0,0,0,0,,,,20161227,194,0,0,J189,J189,J189,Y,N179,Y,I959,Y,E875,N,I509,Y,N184,Y,D631,Y,Z6841,0,M069,Y,I440,Y,M1710,Y,E669,Y,R197,Y,I129,Y,K219,Y,M4800,Y,R0902,Y,M19019,Y,M109,Y,Z950,0.0,Z9049,0.0,Z7982,0.0,Z7952,0,,,,,Y95,Y95,0.0,,,,,,,,,,,,,,,,,,,,,,,B246ZZZ,20161226.0,4A033R1,20161222.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,6,2,4,840,36,10,,14,0.0,,,,,0,4973.23,Y,1.0062101,1.0,0.9921,,0.0,0,30.88,-39.29,0,5646.3,5533.37,,,0,0,0,0,,,,,,,2016
100220367,1627,440151,20160516,V,60,3,1,1,1,10301,,10874.4,0,,1,44,1518346469,,1851341291,,,1669689949.0,,,,,,,,62,2,93324.92,20160511,1,2,A,0.0,1288,0,0,0,0.05,821.88,821.88,0,0.0,0.0,0,0,2.0816,5,0,0,0,0,,,,20160516,470,0,0,M25552,S72012A,S72012A,Y,D62,N,Z87891,0,M8580,Y,E119,Y,I10,Y,K219,Y,Z794,0,Z7982,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,W1830XA,W1830XA,0.0,,,,,,,,,,,,,,,,,,,,,,,0SRB0J9,20160511.0,30233N1,20160514.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,2,1,880,44,10,,44,72.42,,,,,0,10374.27,Y,1.0031251,1.0,0.999,,423.88,0,32.42,-10.37,0,12294.51,12074.38,,,0,0,0,0,,,,,,,2016
100220367,1628,44T151,20160521,V,60,3,1,1,1,10301,,13012.53,0,,1,44,1710368220,,1043367931,,,,,,,,,,,1,2,21563.04,20160516,3,D,A,0.0,0,0,0,0,0.05,0.0,0.0,0,0.0,0.0,0,0,0.9025,5,0,0,0,0,,,,20160521,560,0,0,Z471,Z471,Z471,Y,D62,Y,Z96642,Y,I10,Y,E785,Y,E119,Y,K219,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,4,2,1,880,44,10,,0,0.0,,,,,0,0.0,,0.0,,0.0,,0.0,0,0.0,0.0,0,0.0,0.0,,,0,0,0,0,,,,,,,2016


DESY_SORT_KEY,CLAIM_NO,PRVDR_NUM,CLM_THRU_DT,NCH_NEAR_LINE_REC_IDENT_CD,NCH_CLM_TYPE_CD,CLAIM_QUERY_CODE,CLM_FAC_TYPE_CD,CLM_SRVC_CLSFCTN_TYPE_CD,CLM_FREQ_CD,FI_NUM,CLM_MDCR_NON_PMT_RSN_CD,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,NCH_PRMRY_PYR_CD,PRVDR_STATE_CD,ORG_NPI_NUM,SRVC_LOC_NPI_NUM,AT_PHYSN_UPIN,AT_PHYSN_NPI,AT_PHYSN_SPCLTY_CD,OP_PHYSN_UPIN,OP_PHYSN_NPI,OP_PHYSN_SPCLTY_CD,OT_PHYSN_UPIN,OT_PHYSN_NPI,OT_PHYSN_SPCLTY_CD,RNDRNG_PHYSN_NPI,RNDRNG_PHYSN_SPCLTY_CD,RFR_PHYSN_NPI,RFR_PHYSN_SPCLTY_CD,CLM_MCO_PD_SW,PTNT_DSCHRG_STUS_CD,CLM_TOT_CHRG_AMT,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,NCH_PROFNL_CMPNT_CHRG_AMT,PRNCPAL_DGNS_CD,ICD_DGNS_CD1,ICD_DGNS_CD2,ICD_DGNS_CD3,ICD_DGNS_CD4,ICD_DGNS_CD5,ICD_DGNS_CD6,ICD_DGNS_CD7,ICD_DGNS_CD8,ICD_DGNS_CD9,ICD_DGNS_CD10,ICD_DGNS_CD11,ICD_DGNS_CD12,ICD_DGNS_CD13,ICD_DGNS_CD14,ICD_DGNS_CD15,ICD_DGNS_CD16,ICD_DGNS_CD17,ICD_DGNS_CD18,ICD_DGNS_CD19,ICD_DGNS_CD20,ICD_DGNS_CD21,ICD_DGNS_CD22,ICD_DGNS_CD23,ICD_DGNS_CD24,ICD_DGNS_CD25,FST_DGNS_E_CD,ICD_DGNS_E_CD1,ICD_DGNS_E_CD2,ICD_DGNS_E_CD3,ICD_DGNS_E_CD4,ICD_DGNS_E_CD5,ICD_DGNS_E_CD6,ICD_DGNS_E_CD7,ICD_DGNS_E_CD8,ICD_DGNS_E_CD9,ICD_DGNS_E_CD10,ICD_DGNS_E_CD11,ICD_DGNS_E_CD12,ICD_PRCDR_CD1,PRCDR_DT1,ICD_PRCDR_CD2,PRCDR_DT2,ICD_PRCDR_CD3,PRCDR_DT3,ICD_PRCDR_CD4,PRCDR_DT4,ICD_PRCDR_CD5,PRCDR_DT5,ICD_PRCDR_CD6,PRCDR_DT6,ICD_PRCDR_CD7,PRCDR_DT7,ICD_PRCDR_CD8,PRCDR_DT8,ICD_PRCDR_CD9,PRCDR_DT9,ICD_PRCDR_CD10,PRCDR_DT10,ICD_PRCDR_CD11,PRCDR_DT11,ICD_PRCDR_CD12,PRCDR_DT12,ICD_PRCDR_CD13,PRCDR_DT13,ICD_PRCDR_CD14,PRCDR_DT14,ICD_PRCDR_CD15,PRCDR_DT15,ICD_PRCDR_CD16,PRCDR_DT16,ICD_PRCDR_CD17,PRCDR_DT17,ICD_PRCDR_CD18,PRCDR_DT18,ICD_PRCDR_CD19,PRCDR_DT19,ICD_PRCDR_CD20,PRCDR_DT20,ICD_PRCDR_CD21,PRCDR_DT21,ICD_PRCDR_CD22,PRCDR_DT22,ICD_PRCDR_CD23,PRCDR_DT23,ICD_PRCDR_CD24,PRCDR_DT24,ICD_PRCDR_CD25,PRCDR_DT25,RSN_VISIT_CD1,RSN_VISIT_CD2,RSN_VISIT_CD3,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,CLM_OP_PRVDR_PMT_AMT,CLM_OP_BENE_PMT_AMT,DOB_DT,GNDR_CD,BENE_RACE_CD,BENE_CNTY_CD,BENE_STATE_CD,CWF_BENE_MDCR_STUS_CD,FI_CLM_ACTN_CD,NCH_BLOOD_PNTS_FRNSHD_QTY,CLM_TRTMT_AUTHRZTN_NUM,CLM_PRCR_RTRN_CD,CLM_OP_TRANS_TYPE_CD,CLM_OP_ESRD_MTHD_CD,CLM_NEXT_GNRTN_ACO_IND_CD1,CLM_NEXT_GNRTN_ACO_IND_CD2,CLM_NEXT_GNRTN_ACO_IND_CD3,CLM_NEXT_GNRTN_ACO_IND_CD4,CLM_NEXT_GNRTN_ACO_IND_CD5,ACO_ID_NUM,year
<int>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<int>,<chr>,<int>,<chr>,<lgl>,<int>,<chr>,<lgl>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<lgl>,<lgl>,<lgl>,<chr>,<dbl>
100009375,934,330222,20160311,W,40,3,1,4,1,13201,,33.22,0,,33,1073569331,,,1215931563,10,,,,,,,,,,,,1,678,0,,Z1211,Z1211,Z86010,K648,K635,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,Z1211,Z86010,K648,0.0,8.48,33.22,0,4,1,1,640,33,10,1,0,,1,J,0,,,,,,,2016
100009375,935,330222,20160620,W,40,5,1,3,I,13201,,27.5,0,,33,1073569331,,,1154300762,92,,,,,,,,,,,,1,277,0,,C61,C61,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C61,,,0.0,0.0,27.5,0,5,1,1,640,33,10,5,0,,1,C,0,,,,,,,2016
100009375,936,330222,20160624,W,40,3,1,3,1,13201,,71.05,0,,33,1073569331,,,1154300762,92,,,,,,,,,,,,1,227,0,,C61,C61,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C61,,,0.0,18.13,71.05,0,5,1,1,640,33,10,1,0,,1,C,0,,,,,,,2016
100009375,937,330222,20161003,W,40,3,1,3,1,13201,,27.5,0,,33,1073569331,,,1154300762,92,,,,,,,,,,,,1,277,0,,C61,C61,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C61,,,0.0,0.0,27.5,0,5,1,1,640,33,10,1,0,,1,C,0,,,,,,,2016
100009375,938,330222,20161005,W,40,3,1,3,1,13201,,71.05,0,,33,1073569331,,,1154300762,92,,,,,,,,,,,,1,227,0,,C61,C61,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,C61,,,0.0,18.13,71.05,0,5,1,1,640,33,10,1,0,,1,C,0,,,,,,,2016
100015633,1728,330194,20160111,W,40,3,1,3,1,13201,,80.29,0,,33,1093777492,,,1629262076,26,,1629262076.0,26.0,,,,,,,,,1,670,0,,F331,F331,F419,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,106.81,20.49,80.29,0,4,2,1,331,33,10,1,0,,1,C,0,,,,,,,2016


In [11]:
length(unique(carrier_data_sample$id))
length(unique(inpatient_data_sample$DESY_SORT_KEY))
length(unique(outpatient_data_sample$DESY_SORT_KEY))

In [12]:
# save the sample to fst
write_fst(carrier_data_sample, "/work/postresearch/Shared/Projects/Farbod/Clustering/carrier_data_sample.fst")
write_fst(inpatient_data_sample, "/work/postresearch/Shared/Projects/Farbod/Clustering/inpatient_data_sample.fst")
write_fst(outpatient_data_sample, "/work/postresearch/Shared/Projects/Farbod/Clustering/outpatient_data_sample.fst")

## Extract the diagnoses and procedures from the files

We will also turn all the data into wide format.

In [3]:
# read the data
carrier_data <- read_fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/carrier_data.fst",
    as.data.table = TRUE
)
inpatient_data <- read_fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/inpatient_data.fst",
    as.data.table = TRUE
)
outpatient_data <- read_fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/outpatient_data.fst",
    as.data.table = TRUE
)

In [37]:
nrow(carrier_data[unique(id)])

In [4]:
# subset the data for analysis
# we will only use 100000 patients from 2016 for the analyses
carrier_data <- carrier_data[year == 2016, ]
inpatient_data <- inpatient_data[year == 2016, ]
outpatient_data <- outpatient_data[year == 2016, ]

#sampel ids
#create a sample of 50000 patients based on id
#sample_id = sample(unique(carrier_data$id), 200000)
#
## sample the data based on the ids sampled
#carrier_data = carrier_data[id %in% sample_id]
#inpatient_data = inpatient_data[DESY_SORT_KEY %in% sample_id]
#outpatient_data = outpatient_data[DESY_SORT_KEY %in% sample_id]



In [5]:
head(carrier_data)
head(inpatient_data)
head(outpatient_data)

id,diagnosis,provider,provider_specialty,hcpcs,date,cost,year
<int>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>
100000015,H3532,1265609275,18,67028,20160129,110.66,2016
100000015,H3532,1265609275,18,J0178,20160129,1961.0,2016
100000015,I10,1730170630,6,99214,20160212,113.64,2016
100000015,I10,1730170630,6,93000,20160212,18.34,2016
100000015,I479,1730170630,6,93457,20160218,371.86,2016
100000015,I252,1730170630,6,99217,20160219,75.37,2016


DESY_SORT_KEY,CLAIM_NO,PRVDR_NUM,CLM_THRU_DT,NCH_NEAR_LINE_REC_IDENT_CD,NCH_CLM_TYPE_CD,CLAIM_QUERY_CODE,CLM_FAC_TYPE_CD,CLM_SRVC_CLSFCTN_TYPE_CD,CLM_FREQ_CD,FI_NUM,CLM_MDCR_NON_PMT_RSN_CD,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,NCH_PRMRY_PYR_CD,FI_CLM_ACTN_CD,PRVDR_STATE_CD,ORG_NPI_NUM,AT_PHYSN_UPIN,AT_PHYSN_NPI,AT_PHYSN_SPCLTY_CD,OP_PHYSN_UPIN,OP_PHYSN_NPI,OP_PHYSN_SPCLTY_CD,OT_PHYSN_UPIN,OT_PHYSN_NPI,OT_PHYSN_SPCLTY_CD,RNDRNG_PHYSN_NPI,RNDRNG_PHYSN_SPCLTY_CD,CLM_MCO_PD_SW,PTNT_DSCHRG_STUS_CD,CLM_PPS_IND_CD,CLM_TOT_CHRG_AMT,CLM_ADMSN_DT,CLM_IP_ADMSN_TYPE_CD,CLM_SRC_IP_ADMSN_CD,NCH_PTNT_STATUS_IND_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,NCH_PROFNL_CMPNT_CHRG_AMT,NCH_IP_NCVRD_CHRG_AMT,CLM_TOT_PPS_CPTL_AMT,CLM_PPS_CPTL_FSP_AMT,CLM_PPS_CPTL_OUTLIER_AMT,CLM_PPS_CPTL_DSPRPRTNT_SHR_AMT,CLM_PPS_CPTL_IME_AMT,CLM_PPS_CPTL_EXCPTN_AMT,CLM_PPS_OLD_CPTL_HLD_HRMLS_AMT,CLM_PPS_CPTL_DRG_WT_NUM,CLM_UTLZTN_DAY_CNT,BENE_TOT_COINSRNC_DAYS_CNT,BENE_LRD_USED_CNT,CLM_NON_UTLZTN_DAYS_CNT,NCH_BLOOD_PNTS_FRNSHD_QTY,NCH_VRFD_NCVRD_STAY_FROM_DT,NCH_VRFD_NCVRD_STAY_THRU_DT,NCH_BENE_MDCR_BNFTS_EXHTD_DT_I,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,CLM_DRG_OUTLIER_STAY_CD,NCH_DRG_OUTLIER_APRVD_PMT_AMT,ADMTG_DGNS_CD,PRNCPAL_DGNS_CD,ICD_DGNS_CD1,CLM_POA_IND_SW1,ICD_DGNS_CD2,CLM_POA_IND_SW2,ICD_DGNS_CD3,CLM_POA_IND_SW3,ICD_DGNS_CD4,CLM_POA_IND_SW4,ICD_DGNS_CD5,CLM_POA_IND_SW5,ICD_DGNS_CD6,CLM_POA_IND_SW6,ICD_DGNS_CD7,CLM_POA_IND_SW7,ICD_DGNS_CD8,CLM_POA_IND_SW8,ICD_DGNS_CD9,CLM_POA_IND_SW9,ICD_DGNS_CD10,CLM_POA_IND_SW10,ICD_DGNS_CD11,CLM_POA_IND_SW11,ICD_DGNS_CD12,CLM_POA_IND_SW12,ICD_DGNS_CD13,CLM_POA_IND_SW13,ICD_DGNS_CD14,CLM_POA_IND_SW14,ICD_DGNS_CD15,CLM_POA_IND_SW15,ICD_DGNS_CD16,CLM_POA_IND_SW16,ICD_DGNS_CD17,CLM_POA_IND_SW17,ICD_DGNS_CD18,CLM_POA_IND_SW18,ICD_DGNS_CD19,CLM_POA_IND_SW19,ICD_DGNS_CD20,CLM_POA_IND_SW20,ICD_DGNS_CD21,CLM_POA_IND_SW21,ICD_DGNS_CD22,CLM_POA_IND_SW22,ICD_DGNS_CD23,CLM_POA_IND_SW23,ICD_DGNS_CD24,CLM_POA_IND_SW24,ICD_DGNS_CD25,CLM_POA_IND_SW25,FST_DGNS_E_CD,ICD_DGNS_E_CD1,CLM_E_POA_IND_SW1,ICD_DGNS_E_CD2,CLM_E_POA_IND_SW2,ICD_DGNS_E_CD3,CLM_E_POA_IND_SW3,ICD_DGNS_E_CD4,CLM_E_POA_IND_SW4,ICD_DGNS_E_CD5,CLM_E_POA_IND_SW5,ICD_DGNS_E_CD6,CLM_E_POA_IND_SW6,ICD_DGNS_E_CD7,CLM_E_POA_IND_SW7,ICD_DGNS_E_CD8,CLM_E_POA_IND_SW8,ICD_DGNS_E_CD9,CLM_E_POA_IND_SW9,ICD_DGNS_E_CD10,CLM_E_POA_IND_SW10,ICD_DGNS_E_CD11,CLM_E_POA_IND_SW11,ICD_DGNS_E_CD12,CLM_E_POA_IND_SW12,ICD_PRCDR_CD1,PRCDR_DT1,ICD_PRCDR_CD2,PRCDR_DT2,ICD_PRCDR_CD3,PRCDR_DT3,ICD_PRCDR_CD4,PRCDR_DT4,ICD_PRCDR_CD5,PRCDR_DT5,ICD_PRCDR_CD6,PRCDR_DT6,ICD_PRCDR_CD7,PRCDR_DT7,ICD_PRCDR_CD8,PRCDR_DT8,ICD_PRCDR_CD9,PRCDR_DT9,ICD_PRCDR_CD10,PRCDR_DT10,ICD_PRCDR_CD11,PRCDR_DT11,ICD_PRCDR_CD12,PRCDR_DT12,ICD_PRCDR_CD13,PRCDR_DT13,ICD_PRCDR_CD14,PRCDR_DT14,ICD_PRCDR_CD15,PRCDR_DT15,ICD_PRCDR_CD16,PRCDR_DT16,ICD_PRCDR_CD17,PRCDR_DT17,ICD_PRCDR_CD18,PRCDR_DT18,ICD_PRCDR_CD19,PRCDR_DT19,ICD_PRCDR_CD20,PRCDR_DT20,ICD_PRCDR_CD21,PRCDR_DT21,ICD_PRCDR_CD22,PRCDR_DT22,ICD_PRCDR_CD23,PRCDR_DT23,ICD_PRCDR_CD24,PRCDR_DT24,ICD_PRCDR_CD25,PRCDR_DT25,DOB_DT,GNDR_CD,BENE_RACE_CD,BENE_CNTY_CD,BENE_STATE_CD,CWF_BENE_MDCR_STUS_CD,CLM_TRTMT_AUTHRZTN_NUM,CLM_PRCR_RTRN_CD,CLM_IP_LOW_VOL_PMT_AMT,CLM_CARE_IMPRVMT_MODEL_CD1,CLM_CARE_IMPRVMT_MODEL_CD2,CLM_CARE_IMPRVMT_MODEL_CD3,CLM_CARE_IMPRVMT_MODEL_CD4,CLM_BNDLD_MODEL_1_DSCNT_PCT,CLM_BASE_OPRTG_DRG_AMT,CLM_VBP_PRTCPNT_IND_CD,CLM_VBP_ADJSTMT_PCT,CLM_HRR_PRTCPNT_IND_CD,CLM_HRR_ADJSTMT_PCT,CLM_MODEL_4_READMSN_IND_CD,CLM_UNCOMPD_CARE_PMT_AMT,CLM_BNDLD_ADJSTMT_PMT_AMT,CLM_VBP_ADJSTMT_PMT_AMT,CLM_HRR_ADJSTMT_PMT_AMT,EHR_PYMT_ADJSTMT_AMT,PPS_STD_VAL_PYMT_AMT,FINL_STD_AMT,HAC_PGM_RDCTN_IND_SW,EHR_PGM_RDCTN_IND_SW,CLM_SITE_NTRL_PYMT_CST_AMT,CLM_SITE_NTRL_PYMT_IPPS_AMT,CLM_FULL_STD_PYMT_AMT,CLM_SS_OUTLIER_STD_PYMT_AMT,CLM_NEXT_GNRTN_ACO_IND_CD1,CLM_NEXT_GNRTN_ACO_IND_CD2,CLM_NEXT_GNRTN_ACO_IND_CD3,CLM_NEXT_GNRTN_ACO_IND_CD4,CLM_NEXT_GNRTN_ACO_IND_CD5,ACO_ID_NUM,year
<int>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<dbl>,<chr>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<int>,<lgl>,<int>,<lgl>,<lgl>,<int>,<int>,<dbl>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<int>,<int>,<lgl>,<int>,<dbl>,<dbl>,<chr>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<dbl>
100000203,2,100075,20160403,V,60,5,1,1,I,9101,,6110.17,0,,5,10,1881632818,,1134394356,,,1700010923.0,,,,,,,,1,2,39874.14,20160331,1,1,A,3.64,1288,0,0,0,0,435.76,393.21,0,42.55,0.0,0,0,0.9695,3,0,0,0,0,,,,20160403,194,0,0,E860,J189,J189,Y,E871,Y,I509,Y,F329,Y,F419,Y,E785,Y,K219,Y,I10,Y,A084,Y,K449,Y,E780,Y,Z880,0,Z882,0,E860,Y,Z7951,0,Z9049,Y,Z902,Y,Z85118,0,Z9181,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,BB241ZZ,20160402.0,BW211ZZ,20160331.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,2,1,510,33,10,,14,0,,62.0,,,0,4944.52,Y,1.0035584,1,0.9923,,1778.88,0,17.59,-38.07,0.0,5726.14,5637.38,,,0,0,0,0,,,,,,,2016
100000421,3,330393,20160617,V,60,3,1,1,1,13201,,26010.55,0,,1,33,1972523348,,1043251655,,,1043251655.0,,,,,,,,3,2,55892.75,20160614,3,1,A,260.35,1288,0,0,0,0,1972.96,1492.61,0,93.89,386.46,0,0,2.7513,3,0,0,0,0,,,,20160617,468,0,0,T84020A,T84020A,T84020A,Y,T84090A,Y,F952,Y,E559,Y,I071,Y,M419,Y,Z96642,Y,E780,Y,J45909,Y,M1990,Y,D649,Y,F419,Y,I371,Y,F328,Y,G43909,Y,R6250,Y,G4733,Y,I351,Y,Q6589,0.0,Z8774,0.0,Z87730,0.0,,,,,,,,,Y838,Y838,0.0,,,,,,,,,,,,,,,,,,,,,,,0SR901Z,20160614.0,0SP90JZ,20160614.0,0SP909Z,20160614.0,0SUA09Z,20160614.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,2,1,700,33,20,,14,0,,62.0,,,0,18847.54,Y,0.9977695,1,0.9837,,929.15,0,-42.04,-307.21,0.0,16249.95,15950.71,,,0,0,0,0,,,,,,,2016
100000905,4,220033,20160920,V,60,3,1,1,1,14211,,4354.34,0,,1,22,1316917941,,1700174109,,,1720159627.0,,,1720159627.0,,,,,1,2,10105.28,20160918,1,1,A,0.0,1288,0,0,0,0,404.56,382.02,0,22.54,0.0,0,0,0.7294,2,0,0,0,0,,,,20160920,694,0,0,N132,N132,N132,Y,K760,Y,E119,Y,N3000,Y,B9620,Y,E039,Y,N8320,Y,R600,Y,R0789,Y,R9431,Y,J45909,Y,Z87442,0,Z87440,0,Z9049,Y,Z888,0,Z880,0,Z91013,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0T768DZ,20160918.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,2,1,40,22,10,,14,0,,,,,0,4750.12,Y,1.0024607,1,0.9915,,455.7,0,11.69,-40.38,58.16,4308.04,4247.64,,,0,0,0,0,,,,,,,2016
100000905,5,220171,20161015,V,60,5,1,1,I,14211,,7344.59,0,,5,22,1558392563,,1912193285,,,,,,,,,,,1,2,9593.91,20161012,1,1,A,138.28,0,0,0,0,0,547.26,487.59,0,0.0,59.67,0,0,0.9469,3,0,0,0,0,,,,20161015,194,0,0,R509,J189,J189,Y,J90,Y,E119,Y,R630,Y,Z936,0,I10,Y,Z87440,0,E039,Y,Z87442,0,J45909,Y,Z9049,0,Z801,0,Z833,0,H9193,Y,Z86011,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,2,1,40,22,10,,14,0,,62.0,,,0,6060.46,Y,1.000835,1,0.9804,,0.0,0,5.06,-118.79,0.0,5646.3,5533.37,,,0,0,0,0,,,,,,,2016
100000945,6,220077,20160226,V,60,5,1,1,I,5901,,25869.46,0,,5,22,1487655064,,1730147679,,,1194832774.0,,,,,,,,1,2,15273.76,20160224,1,1,A,0.0,1288,0,0,0,0,1994.21,1585.2,0,145.2,263.81,0,0,3.0267,2,0,0,0,0,,,,20160226,654,0,0,N200,N132,N132,Y,G912,Y,N179,Y,I739,Y,N3281,Y,N189,Y,N529,Y,E785,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0TC78ZZ,20160225.0,0T7B8DZ,20160225.0,BT1FZZZ,20160225.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,1,1,70,22,10,,0,0,,,,,0,19946.33,Y,1.0000747,1,1.0,,956.45,0,1.49,0.0,0.0,17876.54,17544.77,,,0,0,0,0,,,,,,,2016
100001101,7,500072,20160314,V,60,3,1,1,1,2401,,8514.7,0,,1,50,1306845557,,1245325851,,,,,,,,,,,1,2,13765.01,20160310,1,1,A,0.0,1288,0,0,0,0,679.08,679.08,0,0.0,0.0,0,0,1.4261,4,0,0,0,0,,,,20160314,193,0,0,J189,J189,J189,Y,I5031,Y,I272,Y,I4891,Y,I071,Y,I10,Y,G4733,Y,Z96641,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,2,1,40,50,10,,14,0,,,,,0,8486.46,Y,1.0056729,1,1.0,,238.14,0,48.14,0.0,0.0,8422.95,8280.25,,,0,0,0,0,,,,,,,2016


DESY_SORT_KEY,CLAIM_NO,PRVDR_NUM,CLM_THRU_DT,NCH_NEAR_LINE_REC_IDENT_CD,NCH_CLM_TYPE_CD,CLAIM_QUERY_CODE,CLM_FAC_TYPE_CD,CLM_SRVC_CLSFCTN_TYPE_CD,CLM_FREQ_CD,FI_NUM,CLM_MDCR_NON_PMT_RSN_CD,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,NCH_PRMRY_PYR_CD,PRVDR_STATE_CD,ORG_NPI_NUM,SRVC_LOC_NPI_NUM,AT_PHYSN_UPIN,AT_PHYSN_NPI,AT_PHYSN_SPCLTY_CD,OP_PHYSN_UPIN,OP_PHYSN_NPI,OP_PHYSN_SPCLTY_CD,OT_PHYSN_UPIN,OT_PHYSN_NPI,OT_PHYSN_SPCLTY_CD,RNDRNG_PHYSN_NPI,RNDRNG_PHYSN_SPCLTY_CD,RFR_PHYSN_NPI,RFR_PHYSN_SPCLTY_CD,CLM_MCO_PD_SW,PTNT_DSCHRG_STUS_CD,CLM_TOT_CHRG_AMT,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,NCH_PROFNL_CMPNT_CHRG_AMT,PRNCPAL_DGNS_CD,ICD_DGNS_CD1,ICD_DGNS_CD2,ICD_DGNS_CD3,ICD_DGNS_CD4,ICD_DGNS_CD5,ICD_DGNS_CD6,ICD_DGNS_CD7,ICD_DGNS_CD8,ICD_DGNS_CD9,ICD_DGNS_CD10,ICD_DGNS_CD11,ICD_DGNS_CD12,ICD_DGNS_CD13,ICD_DGNS_CD14,ICD_DGNS_CD15,ICD_DGNS_CD16,ICD_DGNS_CD17,ICD_DGNS_CD18,ICD_DGNS_CD19,ICD_DGNS_CD20,ICD_DGNS_CD21,ICD_DGNS_CD22,ICD_DGNS_CD23,ICD_DGNS_CD24,ICD_DGNS_CD25,FST_DGNS_E_CD,ICD_DGNS_E_CD1,ICD_DGNS_E_CD2,ICD_DGNS_E_CD3,ICD_DGNS_E_CD4,ICD_DGNS_E_CD5,ICD_DGNS_E_CD6,ICD_DGNS_E_CD7,ICD_DGNS_E_CD8,ICD_DGNS_E_CD9,ICD_DGNS_E_CD10,ICD_DGNS_E_CD11,ICD_DGNS_E_CD12,ICD_PRCDR_CD1,PRCDR_DT1,ICD_PRCDR_CD2,PRCDR_DT2,ICD_PRCDR_CD3,PRCDR_DT3,ICD_PRCDR_CD4,PRCDR_DT4,ICD_PRCDR_CD5,PRCDR_DT5,ICD_PRCDR_CD6,PRCDR_DT6,ICD_PRCDR_CD7,PRCDR_DT7,ICD_PRCDR_CD8,PRCDR_DT8,ICD_PRCDR_CD9,PRCDR_DT9,ICD_PRCDR_CD10,PRCDR_DT10,ICD_PRCDR_CD11,PRCDR_DT11,ICD_PRCDR_CD12,PRCDR_DT12,ICD_PRCDR_CD13,PRCDR_DT13,ICD_PRCDR_CD14,PRCDR_DT14,ICD_PRCDR_CD15,PRCDR_DT15,ICD_PRCDR_CD16,PRCDR_DT16,ICD_PRCDR_CD17,PRCDR_DT17,ICD_PRCDR_CD18,PRCDR_DT18,ICD_PRCDR_CD19,PRCDR_DT19,ICD_PRCDR_CD20,PRCDR_DT20,ICD_PRCDR_CD21,PRCDR_DT21,ICD_PRCDR_CD22,PRCDR_DT22,ICD_PRCDR_CD23,PRCDR_DT23,ICD_PRCDR_CD24,PRCDR_DT24,ICD_PRCDR_CD25,PRCDR_DT25,RSN_VISIT_CD1,RSN_VISIT_CD2,RSN_VISIT_CD3,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,CLM_OP_PRVDR_PMT_AMT,CLM_OP_BENE_PMT_AMT,DOB_DT,GNDR_CD,BENE_RACE_CD,BENE_CNTY_CD,BENE_STATE_CD,CWF_BENE_MDCR_STUS_CD,FI_CLM_ACTN_CD,NCH_BLOOD_PNTS_FRNSHD_QTY,CLM_TRTMT_AUTHRZTN_NUM,CLM_PRCR_RTRN_CD,CLM_OP_TRANS_TYPE_CD,CLM_OP_ESRD_MTHD_CD,CLM_NEXT_GNRTN_ACO_IND_CD1,CLM_NEXT_GNRTN_ACO_IND_CD2,CLM_NEXT_GNRTN_ACO_IND_CD3,CLM_NEXT_GNRTN_ACO_IND_CD4,CLM_NEXT_GNRTN_ACO_IND_CD5,ACO_ID_NUM,year
<int>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<int>,<chr>,<int>,<chr>,<lgl>,<int>,<chr>,<lgl>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<lgl>,<lgl>,<lgl>,<chr>,<dbl>
100000015,2,220071,20160119,W,40,3,1,3,1,14211,,104.26,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,846.02,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,26.6,104.26,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,3,220071,20160122,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.0,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,4,220071,20160126,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.0,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,5,220071,20160128,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.0,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,6,220071,20160202,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.0,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,7,220071,20160209,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.02,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016


In [39]:
nrow(carrier_data[unique(id)])

### Inpatient and outpatient files

In [6]:
# we will only use diagnosis codes for this project
find_all_diagnosis <- function(data, chunks=5) {
    data <- data[, .(
        id = DESY_SORT_KEY,
        year = year,
        # diagnosis_prncpl = PRNCPAL_DGNS_CD,
        diagnosis1 = substr(ICD_DGNS_CD1,1,3),
        diagnosis2 = substr(ICD_DGNS_CD2,1,3),
        diagnosis3 = substr(ICD_DGNS_CD3,1,3),
        diagnosis4 = substr(ICD_DGNS_CD4,1,3),
        diagnosis5 = substr(ICD_DGNS_CD5,1,3),
        diagnosis6 = substr(ICD_DGNS_CD6,1,3),
        diagnosis7 = substr(ICD_DGNS_CD7,1,3),
        diagnosis8 = substr(ICD_DGNS_CD8,1,3),
        diagnosis9 = substr(ICD_DGNS_CD9,1,3),
        diagnosis10 = substr(ICD_DGNS_CD10,1,3),
        diagnosis11 = substr(ICD_DGNS_CD11,1,3),
        diagnosis12 = substr(ICD_DGNS_CD12,1,3),
        diagnosis13 = substr(ICD_DGNS_CD13,1,3),
        diagnosis14 = substr(ICD_DGNS_CD14,1,3),
        diagnosis15 = substr(ICD_DGNS_CD15,1,3),
        diagnosis16 = substr(ICD_DGNS_CD16,1,3),
        diagnosis17 = substr(ICD_DGNS_CD17,1,3),
        diagnosis18 = substr(ICD_DGNS_CD18,1,3),
        diagnosis19 = substr(ICD_DGNS_CD19,1,3),
        diagnosis20 = substr(ICD_DGNS_CD20,1,3),
        diagnosis21 = substr(ICD_DGNS_CD21,1,3),
        diagnosis22 = substr(ICD_DGNS_CD22,1,3),
        diagnosis23 = substr(ICD_DGNS_CD23,1,3),
        diagnosis24 = substr(ICD_DGNS_CD24,1,3),
        diagnosis25 = substr(ICD_DGNS_CD25,1,3)
        #icd_procedure1 = ICD_PRCDR_CD1,
        #icd_procedure2 = ICD_PRCDR_CD2,
        #icd_procedure3 = ICD_PRCDR_CD3,
        #icd_procedure4 = ICD_PRCDR_CD4,
        #icd_procedure5 = ICD_PRCDR_CD5,
        #icd_procedure6 = ICD_PRCDR_CD6,
        #icd_procedure7 = ICD_PRCDR_CD7,
        #icd_procedure8 = ICD_PRCDR_CD8,
        #icd_procedure9 = ICD_PRCDR_CD9,
        #icd_procedure10 = ICD_PRCDR_CD10,
        #icd_procedure11 = ICD_PRCDR_CD11,
        #icd_procedure12 = ICD_PRCDR_CD12,
        #icd_procedure13 = ICD_PRCDR_CD13,
        #icd_procedure14 = ICD_PRCDR_CD14,
        #icd_procedure15 = ICD_PRCDR_CD15,
        #icd_procedure16 = ICD_PRCDR_CD16,
        #icd_procedure17 = ICD_PRCDR_CD17,
        #icd_procedure18 = ICD_PRCDR_CD18,
        #icd_procedure19 = ICD_PRCDR_CD19,
        #icd_procedure20 = ICD_PRCDR_CD20,
        #icd_procedure21 = ICD_PRCDR_CD21,
        #icd_procedure22 = ICD_PRCDR_CD22,
        #icd_procedure23 = ICD_PRCDR_CD23,
        #icd_procedure24 = ICD_PRCDR_CD24,
        #icd_procedure25 = ICD_PRCDR_CD25
    )]
    # melt the data to include the procedure and diagnosis codes in one column
    data <- melt(data, id.vars = c("id", "year")) %>% as.data.table()

    # change the data to wide format dcast the data

    #data<- dcast(data, id + year ~ value,
    #  value.var = "value", fun.aggregate = length
    #)

    # the data will be more than the machine row limit so we will split the data based on id and then dcast then combine the data

    #split the data to chunks based on id
    data_split <- split(data, list(data$id %% chunks))

    # dcast the data
    data_split <- lapply(
        data_split,
        function(x) {
            x <- dcast(x, id + year ~ value, value.var = "value", fun.aggregate = length)
            return(x)
        }
    )

    # combine the data back using rbindlist
    data <- rbindlist(data_split, fill = TRUE)

    # replace NA with 0
    data[is.na(data)] <- 0
    
    return(data)
}

In [7]:
# let's test on a subset of data first
find_all_diagnosis(outpatient_data[1:5])
find_all_diagnosis(inpatient_data[1:5])

id,year,V1,J44
<int>,<dbl>,<int>,<int>
100000015,2016,120,5


id,year,V1,B96,E03,E11,E78,G91,H91,I10,I73,J18,J45,J90,K76,N13,N17,N18,N30,N32,N52,N83,R07,R60,R63,R94,Z80,Z83,Z86,Z87,Z88,Z90,Z91,Z93,D64,E55,F32,F41,F95,G43,G47,I07,I35,I37,M19,M41,Q65,R62,T84,Z96,A08,E86,E87,I50,K21,K44,Z79,Z85
<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000905,2016,18,1,2,2,0,0,1,1,0,1,2,1,1,1,0,0,1,0,0,1,1,1,1,1,1,1,1,4,2,2,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000945,2016,17,0,0,0,1,1,0,0,1,0,0,0,0,1,1,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000421,2016,4,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,0,0,0,0,1,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,0,0,0,0,0,0,0,0
100000203,2016,6,0,0,0,2,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,2,1,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1


In [8]:
# Works fine, let's run it on the whole data
inpatient_data_wide = find_all_diagnosis(inpatient_data, chunks=5)

In [9]:
head(inpatient_data_wide)

id,year,V1,A02,A03,A04,A05,A06,A07,A08,A09,A15,A18,A23,A25,A28,A30,A31,A32,A36,A37,A38,A39,A40,A41,A42,A43,A46,A48,A49,A50,A51,A52,A53,A54,A56,A59,A60,A63,A68,A69,A74,A75,A77,A79,A80,A81,A83,A86,A87,A88,A92,A93,A95,B00,B01,B02,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B44,B45,B46,B47,B48,B49,B50,B57,B58,B59,B60,B69,B72,B78,B81,B83,B85,B86,B87,B88,B89,B90,B91,B94,B95,B96,B97,B99,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C30,C31,C32,C33,C34,C37,C38,C40,C41,C43,C44,C45,C46,C47,C48,C49,⋯,M63,N01,N77,O12,O22,O25,O46,O73,O85,O88,P19,P27,Q13,Q26,Q31,Q50,Q56,Q80,Q81,R85,R86,S07,S28,S78,T26,T30,T34,T55,T61,V13,V23,V44,V87,V93,W23,W25,W50,X11,X30,X31,X95,Y28,Y37,Y75,Y81,Z34,Z36,Z39,Z70,A01,A66,A71,A78,B05,B55,E40,E76,F24,H28,H94,I02,I79,M49,N33,N46,N53,O03,O15,O82,P59,P74,P80,P96,Q10,Q16,Q36,Q37,Q70,Q97,S57,T15,V29,W13,W34,X50,Y08,Y33,Y69,A20,A21,A24,A84,A94,B65,B76,B77,B80,C39,D77,G02,G59,L41,N06,N86,O08,O63,O65,O68,O91,P08,P35,P92,Q14,Q30,Q41,S48,V57,V77,W16,W30,W64,X14,X16,X19,Y63,Y70,A26,E42,H32,L62,N07,O92,P11,Q11,Q17,Q84,Q91,R37,S95,V17,V59,V80,W92,X03,X82,X99,Y74,Y77,Z31,i85
<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000905,2016,18,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000945,2016,17,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100001395,2016,9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100001975,2016,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100003415,2016,36,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100005435,2016,5,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [10]:
outpatient_data_wide <- find_all_diagnosis(outpatient_data, chunks = 3)

In [11]:
head(outpatient_data_wide)

id,year,V1,A00,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,A17,A18,A19,A20,A21,A23,A24,A25,A26,A27,A28,A30,A31,A32,A35,A36,A37,A38,A39,A40,A41,A42,A43,A44,A46,A48,A49,A50,A51,A52,A53,A54,A56,A57,A58,A59,A60,A63,A64,A66,A67,A68,A69,A74,A75,A77,A78,A79,A80,A81,A83,A84,A85,A86,A87,A88,A90,A91,A92,A93,A94,A95,A96,A98,B00,B01,B02,B03,B05,B06,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B41,B42,B43,B44,B45,B46,B47,B48,B49,B50,B51,B53,B54,B55,B56,B57,B58,B59,B60,B64,B65,B66,B67,B69,B70,B71,B73,B74,B75,B76,B77,B78,B80,B81,B82,B83,B85,B86,B87,B88,B89,B90,B91,B94,⋯,Z32,Z33,Z34,Z36,Z37,Z39,Z3A,Z40,Z41,Z42,Z43,Z44,Z45,Z46,Z47,Z48,Z49,Z51,Z52,Z53,Z55,Z56,Z57,Z59,Z60,Z62,Z63,Z64,Z65,Z66,Z67,Z68,Z69,Z70,Z71,Z72,Z73,Z74,Z75,Z76,Z77,Z78,Z79,Z80,Z81,Z82,Z83,Z84,Z85,Z86,Z87,Z88,Z89,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99,A82,B52,B68,C58,E68,G07,G41,H94,MM6,N24,O01,O08,O15,O31,O66,O71,P08,P10,P12,P23,P51,P54,P55,P84,Q00,Q97,V23,V46,V74,V84,V90,V91,V95,V97,V98,V99,W03,W16,W24,W30,W35,W37,W40,W56,W86,W94,X04,X17,X35,X39,X72,X94,Y23,Y32,Y62,Y75,A89,O04,O07,O64,O73,O88,P03,P37,P39,P58,P60,P80,P81,V05,V16,V62,V69,V70,V81,V85,W09,W15,W99,X03,X06,X37,X93,X95,Y38,Y77,Y80
<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000053,2016,474,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000221,2016,64,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000611,2016,115,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000905,2016,177,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100001871,2016,96,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100001913,2016,312,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


### Carrier

In [12]:
# create a similar function for carrier data
#we only use the diagnosis codes
find_all_diagnosis_carrier <- function(data, chunks=10) {
    data <- data[, .(
        id = id,
        year = year,
        diagnosis = substr(diagnosis,1,3)
        # hcpcs = hcpcs
    )]
    
    # melt the data to include the procedure and diagnosis codes in one column
    data <- melt(data, id.vars = c("id", "year")) %>% as.data.table()

    # change the data to wide format so that each diagnosis code is a column and the value is the number of times the diagnosis code was used
    # and also each procedure code is a column and the value is the number of times the procedure code was used
    # the data will be more than the machine row limit so we will split the data based on id and then dcast then combine the data

    #split the data to chunks based on id
    data_split <- split(data, list(data$id %% chunks))

    # dcast the data
    #data<- dcast(data, id + year ~ value,
    #   value.var = "value", fun.aggregate = length
    #)

    # dcast the data
    data_split <- lapply(
        data_split,
        function(x) {
            x <- dcast(x, id + year ~ value, value.var = "value", fun.aggregate = length)
            return(x)
        }
    )

    # combine the data back using rbindlist
    data <- rbindlist(data_split, fill = TRUE)

    # replace NA with 0
    data[is.na(data)] <- 0
    
    return(data)
}

In [13]:
# Works fine, let's run it on the whole data
carrier_data_wide = find_all_diagnosis_carrier(carrier_data, chunks=3)

In [31]:
head(carrier_data_wide)

id,year,009,110,125,177,202,23,251,272,296,300,366,390,426,461,462,491,525,542,599,719,723,724,735,739,780,846,856,959,999,A00,A01,A02,A03,A04,A05,A06,A07,A08,A09,A15,A17,A18,A19,A20,A21,A22,A23,A24,A25,A26,A27,A28,A30,A31,A32,A34,A35,A36,A37,A38,A39,A40,A41,A42,A43,A44,A46,A48,A49,A50,A51,A52,A53,A54,A55,A56,A57,A58,A59,A60,A63,A64,A65,A66,A67,A68,A69,A70,A71,A74,A75,A77,A78,A79,A80,A81,A82,A83,A84,A85,A86,A87,A88,A90,A91,A92,A93,A94,A96,A98,A99,B00,B01,B02,B03,B04,B05,B06,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B42,B43,B44,B45,B46,B47,B48,B49,⋯,Z42,Z43,Z44,Z45,Z46,Z47,Z48,Z49,Z51,Z52,Z53,Z55,Z56,Z57,Z59,Z60,Z62,Z63,Z64,Z65,Z66,Z67,Z68,Z69,Z70,Z71,Z72,Z73,Z74,Z75,Z76,Z77,Z78,Z79,Z80,Z81,Z82,Z83,Z84,Z85,Z86,Z87,Z88,Z89,Z90,Z91,Z92,Z93,Z94,Z95,Z96,Z97,Z98,Z99,0,088,170,182,402,405,477,681,709,A89,B41,B92,C58,E33,E68,H48,K67,M78,O31,O65,O67,O74,P03,P08,P13,P24,P50,P55,P93,Q36,V03,V04,V09,V20,V29,V48,V81,V98,W04,W13,W34,W51,W61,W64,W89,X00,X30,X95,Y24,Y63,Y71,Y95,Z19,150,201,235,255,266,271,309,465,5,564,722,727,844,847,A95,B12,B56,B72,H22,I14,J29,M52,MM0,P70,P71,P90,P94,P95,RO7,V00,V32,V42,V44,V76,V82,V96,W23,W29,W60,X11,X15,X50,Y62
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000053,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000203,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000389,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000611,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000905,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100001871,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [32]:
# Now, let's put things together

# list data
data_list = list(inpatient_data_wide, outpatient_data_wide, carrier_data_wide)
# rbind
wide_data = rbindlist(data_list, fill = TRUE)
# sum based on year and id
wide_data <-
    wide_data %>%
    group_by(id, year) %>%
    summarise_all(sum, na.rm = TRUE) %>%
    as.data.table()

# change NA to 0

wide_data[is.na(wide_data)] <- 0
wide_data <- wide_data[, V1 := NULL]
head(wide_data)

id,year,A02,A03,A04,A05,A06,A07,A08,A09,A15,A18,A23,A25,A28,A30,A31,A32,A36,A37,A38,A39,A40,A41,A42,A43,A46,A48,A49,A50,A51,A52,A53,A54,A56,A59,A60,A63,A68,A69,A74,A75,A77,A79,A80,A81,A83,A86,A87,A88,A92,A93,A95,B00,B01,B02,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B44,B45,B46,B47,B48,B49,B50,B57,B58,B59,B60,B69,B72,B78,B81,B83,B85,B86,B87,B88,B89,B90,B91,B94,B95,B96,B97,B99,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C30,C31,C32,C33,C34,C37,C38,C40,C41,C43,C44,C45,C46,C47,C48,C49,C4A,⋯,V84,V90,V91,V95,V97,V98,V99,W03,W24,W35,W37,W40,W56,W86,W94,X04,X35,X39,X72,X94,Y23,Y32,Y62,A89,O07,P03,P37,P39,P58,P60,P81,V05,V16,V62,V69,V70,V81,V85,W09,W15,W99,X06,X37,X93,Y38,Y80,009,110,125,177,202,23,251,272,296,300,366,390,426,461,462,491,525,542,599,719,723,724,735,739,780,846,856,959,999,A22,A34,A65,A99,B04,B79,D 9,F35,F62,H45,MZG,O29,O89,P00,P01,P02,P04,P22,P26,P36,P52,P57,P77,S17,SB2,T64,V06,W85,X40,XXX,Y64,Z05,Z38,0,088,170,182,402,405,477,681,709,B92,E33,H48,K67,M78,P93,V20,150,201,235,255,266,271,309,465,5,564,722,727,844,847,B12,H22,I14,J29,M52,MM0,P95,RO7,V32,V76,V82,V96
<int>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000015,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000019,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000053,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000099,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000203,2016,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000221,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [35]:
# get rid of the wrong codes, only keep codes fromm A00 to Z99
valid_names <- expand.grid(
    c("A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P", "Q", "R", "S", "T", "U", "V", "W", "X", "Y", "Z"),
    c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9"),
    c("0", "1", "2", "3", "4", "5", "6", "7", "8", "9")
)
valid_names <- paste(valid_names[, 1], valid_names[, 2], valid_names[, 3], sep = "")

colnames_in_valid_names <- colnames(wide_data) %in% valid_names

wide_data_clean <- wide_data[, colnames_in_valid_names, with = FALSE]
wide_data_clean <- cbind(wide_data[, c("id", "year")], wide_data_clean)
head(wide_data_clean)

id,year,A02,A03,A04,A05,A06,A07,A08,A09,A15,A18,A23,A25,A28,A30,A31,A32,A36,A37,A38,A39,A40,A41,A42,A43,A46,A48,A49,A50,A51,A52,A53,A54,A56,A59,A60,A63,A68,A69,A74,A75,A77,A79,A80,A81,A83,A86,A87,A88,A92,A93,A95,B00,B01,B02,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B44,B45,B46,B47,B48,B49,B50,B57,B58,B59,B60,B69,B72,B78,B81,B83,B85,B86,B87,B88,B89,B90,B91,B94,B95,B96,B97,B99,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C30,C31,C32,C33,C34,C37,C38,C40,C41,C43,C44,C45,C46,C47,C48,C49,C50,⋯,V58,V63,V73,V78,V79,V88,V94,W04,W21,W27,W28,W29,W32,W33,W39,W46,W52,W53,W60,W61,W89,W90,X00,X01,X02,X10,X12,X13,X15,X18,X32,X52,Y00,Y03,Y09,Y24,Y27,Y29,Y30,A82,B52,B68,C58,G07,G41,N24,O01,O31,P10,P12,P23,P51,P54,P55,P84,Q00,V46,V74,V84,V90,V91,V95,V97,V98,V99,W03,W24,W35,W37,W40,W56,W86,W94,X04,X35,X39,X72,X94,Y23,Y32,Y62,A89,O07,P03,P37,P39,P58,P60,P81,V05,V16,V62,V69,V70,V81,V85,W09,W15,W99,X06,X37,X93,Y38,Y80,A22,A34,A65,A99,B04,B79,F35,F62,H45,O29,O89,P00,P01,P02,P04,P22,P26,P36,P52,P57,P77,S17,T64,V06,W85,X40,Y64,Z05,Z38,B92,E33,H48,K67,M78,P93,V20,B12,H22,I14,J29,M52,P95,V32,V76,V82,V96
<int>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<int>,<int>,<int>,<int>,<int>,<dbl>,<dbl>,<int>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000015,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000019,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000053,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000099,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,7,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000203,2016,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000221,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [38]:
nrow(carrier_data[unique(id)])

In [41]:
# save the data to fst
write.fst(wide_data_clean, "/work/postresearch/Shared/Projects/Farbod/Clustering/wide_data_2016.fst")

In [42]:
# create a dataset that is only based on the presence of procedures and diagnoses and not numbers
wide_data_presence <- wide_data_clean[, (.SD > 0) * 1, .SDcols = colnames(wide_data_clean)[-c(1, 2)]]

# add id and year
wide_data_presence <- cbind(wide_data_clean[, c("id", "year")], wide_data_presence)
head(wide_data_presence)

id,year,A02,A03,A04,A05,A06,A07,A08,A09,A15,A18,A23,A25,A28,A30,A31,A32,A36,A37,A38,A39,A40,A41,A42,A43,A46,A48,A49,A50,A51,A52,A53,A54,A56,A59,A60,A63,A68,A69,A74,A75,A77,A79,A80,A81,A83,A86,A87,A88,A92,A93,A95,B00,B01,B02,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B44,B45,B46,B47,B48,B49,B50,B57,B58,B59,B60,B69,B72,B78,B81,B83,B85,B86,B87,B88,B89,B90,B91,B94,B95,B96,B97,B99,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C30,C31,C32,C33,C34,C37,C38,C40,C41,C43,C44,C45,C46,C47,C48,C49,C50,⋯,V58,V63,V73,V78,V79,V88,V94,W04,W21,W27,W28,W29,W32,W33,W39,W46,W52,W53,W60,W61,W89,W90,X00,X01,X02,X10,X12,X13,X15,X18,X32,X52,Y00,Y03,Y09,Y24,Y27,Y29,Y30,A82,B52,B68,C58,G07,G41,N24,O01,O31,P10,P12,P23,P51,P54,P55,P84,Q00,V46,V74,V84,V90,V91,V95,V97,V98,V99,W03,W24,W35,W37,W40,W56,W86,W94,X04,X35,X39,X72,X94,Y23,Y32,Y62,A89,O07,P03,P37,P39,P58,P60,P81,V05,V16,V62,V69,V70,V81,V85,W09,W15,W99,X06,X37,X93,Y38,Y80,A22,A34,A65,A99,B04,B79,F35,F62,H45,O29,O89,P00,P01,P02,P04,P22,P26,P36,P52,P57,P77,S17,T64,V06,W85,X40,Y64,Z05,Z38,B92,E33,H48,K67,M78,P93,V20,B12,H22,I14,J29,M52,P95,V32,V76,V82,V96
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000015,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000019,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000053,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000099,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000203,2016,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000221,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [47]:
# save the data to fst
write.fst(wide_data_presence, "/work/postresearch/Shared/Projects/Farbod/Clustering/wide_data_presence_2016.fst")

In [44]:
dim(wide_data_presence)

# Dimensionality reduction
Based on the high number of diagnosis and procedure codes, we will use dimensionality reduction to reduce the number of features before clustering.

We will use two methods:
* PCA: Principal Component Analysis
* Autoencoder: a neural network that learns to reconstruct the input

In [45]:
wide_data <- wide_data_clean

In [2]:
wide_data <- read.fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/wide_data_2016.fst",
    as.data.table = TRUE,
    to=500000
)

wide_data_presence <- read.fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/wide_data_presence_2016.fst",
    as.data.table = TRUE,
    to=500000
)


In [3]:
dim(wide_data)
dim(wide_data_presence)

# get rid of all zero columns
# find the columns with zero sum
zero_cols <- colSums(wide_data) == 0
# get rid of the zero columns
wide_data <- wide_data[, !zero_cols, with = FALSE]
wide_data_presence <- wide_data_presence[, !zero_cols, with = FALSE]
dim(wide_data)
dim(wide_data_presence)


## PCA

In [12]:
# perform kernel pca
library(kernlab)

# scale the data
wide_data_scaled <- scale(wide_data[, -c(1, 2)])

In [7]:
head(wide_data_scaled)

A02,A03,A04,A05,A06,A07,A08,A09,A15,A18,A23,A25,A28,A30,A31,A32,A36,A37,A38,A39,A40,A41,A42,A43,A46,A48,A49,A50,A51,A52,A53,A54,A56,A59,A60,A63,A68,A69,A74,A75,A77,A79,A80,A81,A83,A86,A87,A88,A92,A93,A95,B00,B01,B02,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B44,B45,B46,B47,B48,B49,B50,B57,B58,B59,B60,B69,B72,B78,B81,B83,B85,B86,B87,B88,B89,B90,B91,B94,B95,B96,B97,B99,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C30,C31,C32,C33,C34,C37,C38,C40,C41,C43,C44,C45,C46,C47,C48,C49,C50,C51,C52,⋯,V58,V63,V73,V78,V79,V88,V94,W04,W21,W27,W28,W29,W32,W33,W39,W46,W52,W53,W60,W61,W89,W90,X00,X01,X02,X10,X12,X13,X15,X18,X32,X52,Y00,Y03,Y09,Y24,Y27,Y29,Y30,A82,B52,B68,C58,G07,G41,N24,O01,O31,P10,P12,P23,P51,P54,P55,P84,Q00,V46,V74,V84,V90,V91,V95,V97,V98,V99,W03,W24,W35,W37,W40,W56,W86,W94,X04,X35,X39,X72,X94,Y23,Y32,Y62,A89,O07,P03,P37,P39,P58,P60,P81,V05,V16,V62,V69,V70,V81,V85,W09,W15,W99,X06,X37,X93,Y38,Y80,A22,A34,A65,A99,B04,B79,F35,F62,H45,O29,O89,P00,P01,P02,P04,P22,P26,P36,P52,P57,P77,S17,T64,V06,W85,X40,Y64,Z05,Z38,B92,E33,H48,K67,M78,P93,V20,B12,H22,I14,J29,M52,P95,V32,V76,V82,V96
-0.007562532,-0.006210679,-0.04757756,-0.01291986,-0.00547728,-0.006928335,-0.04525058,-0.03535494,-0.0142023,-0.007563966,-0.003162278,-0.003162278,-0.009970991,-0.003909361,-0.02378741,-0.003162278,-0.005078374,-0.005185494,-0.004242658,-0.009129044,-0.01691168,-0.1073133,-0.004403876,-0.005197096,-0.01265382,-0.01995426,-0.03828038,-0.003721049,-0.006984438,-0.01158169,-0.0159182,-0.01056815,-0.008014094,-0.01173766,-0.01892081,-0.01619145,-0.004899014,-0.02238044,-0.006674353,,-0.01014646,-0.005164021,-0.007980453,-0.006285965,-0.00547728,-0.005059964,-0.008281029,-0.006766587,-0.005496007,-0.006455074,,-0.04131055,-0.009311696,-0.06607922,-0.0937618,-0.01337802,-0.005797577,-0.005996531,-0.01469845,-0.009186176,-0.01591138,-0.04699335,-0.04551555,-0.0424701,-0.00749627,-0.004472158,-0.006907415,-0.0220617,-0.01104046,-0.05358533,-0.2604895,-0.03239995,-0.09569928,-0.01070896,-0.01383741,-0.003162278,-0.01334755,-0.006855105,-0.003162278,-0.00632465,-0.01504071,-0.01678888,-0.004472158,-0.00547728,-0.007893729,-0.005776935,-0.008198057,-0.006529393,,-0.005111698,-0.003162278,-0.003162278,-0.01445939,-0.02706786,-0.006139491,-0.01291096,-0.004969233,-0.003162278,-0.005888891,-0.008682715,-0.06186187,-0.1060624,-0.04214093,-0.03772148,-0.01277175,-0.007108406,-0.01090045,-0.004251866,-0.007845593,-0.009109103,-0.00909452,-0.008186694,-0.005895194,-0.008216686,-0.009042157,-0.004986445,-0.006038676,-0.008910771,-0.00996513,-0.01764608,-0.01590501,-0.005700758,-0.03699083,-0.01600826,-0.02056009,-0.01234298,-0.01903139,-0.00780089,-0.009505375,-0.02348118,-0.005755001,-0.007974807,-0.006364055,-0.012025,-0.006498991,-0.05742392,-0.0040663,-0.01665162,-0.007533179,-0.01689576,-0.03124136,-0.160591,-0.009919841,-0.005715425,-0.00632465,-0.009028569,-0.01685512,-0.06881626,-0.01160421,-0.007683892,⋯,-0.003162278,,,,,,,,-0.003162278,-0.006708321,,-0.004472158,,,,,,,,,,,-0.003162278,,,,-0.00632465,,-0.004472158,,-0.01553977,,,,-0.003162278,,,,,-0.003162278,-0.003162278,-0.004472158,,,,,-0.003577714,,,-0.006000078,,,,,-0.003162278,,,,,,,,,,-0.003162278,,,,,,,,-0.003162278,,,,,,,,,-0.003162278,,-0.003162278,,,,,,,,,,,,,,,,,,,,,-0.003162278,,,,,,,,,,,,,,,-0.003162278,,,,,,-0.004472158,,,,,,,,,,,,,,,,-0.003162278,-0.003162278,,,,,-0.003162278,,
-0.007562532,-0.006210679,-0.04757756,-0.01291986,-0.00547728,-0.006928335,-0.04525058,-0.03535494,-0.0142023,-0.007563966,-0.003162278,-0.003162278,-0.009970991,-0.003909361,-0.02378741,-0.003162278,-0.005078374,-0.005185494,-0.004242658,-0.009129044,-0.01691168,-0.1073133,-0.004403876,-0.005197096,-0.01265382,-0.01995426,-0.03828038,-0.003721049,-0.006984438,-0.01158169,-0.0159182,-0.01056815,-0.008014094,-0.01173766,-0.01892081,-0.01619145,-0.004899014,-0.02238044,-0.006674353,,-0.01014646,-0.005164021,-0.007980453,-0.006285965,-0.00547728,-0.005059964,-0.008281029,-0.006766587,-0.005496007,-0.006455074,,-0.04131055,-0.009311696,-0.06607922,-0.0937618,-0.01337802,-0.005797577,-0.005996531,-0.01469845,-0.009186176,-0.01591138,-0.04699335,-0.04551555,-0.0424701,-0.00749627,-0.004472158,-0.006907415,-0.0220617,-0.01104046,-0.05358533,-0.2604895,-0.03239995,-0.09569928,-0.01070896,-0.01383741,-0.003162278,-0.01334755,-0.006855105,-0.003162278,-0.00632465,-0.01504071,-0.01678888,-0.004472158,-0.00547728,-0.007893729,-0.005776935,-0.008198057,-0.006529393,,-0.005111698,-0.003162278,-0.003162278,-0.01445939,-0.02706786,-0.006139491,-0.01291096,-0.004969233,-0.003162278,-0.005888891,-0.008682715,-0.06186187,-0.1060624,-0.04214093,-0.03772148,-0.01277175,-0.007108406,-0.01090045,-0.004251866,-0.007845593,-0.009109103,-0.00909452,-0.008186694,-0.005895194,-0.008216686,-0.009042157,-0.004986445,-0.006038676,-0.008910771,-0.00996513,-0.01764608,-0.01590501,-0.005700758,0.61521312,-0.01600826,-0.02056009,-0.01234298,-0.01903139,-0.00780089,-0.009505375,-0.02348118,-0.005755001,-0.007974807,-0.006364055,-0.012025,-0.006498991,-0.05742392,-0.0040663,-0.01665162,-0.007533179,-0.01689576,-0.03124136,-0.160591,-0.009919841,-0.005715425,-0.00632465,-0.009028569,-0.01685512,-0.06881626,-0.01160421,-0.007683892,⋯,-0.003162278,,,,,,,,-0.003162278,-0.006708321,,-0.004472158,,,,,,,,,,,-0.003162278,,,,-0.00632465,,-0.004472158,,-0.01553977,,,,-0.003162278,,,,,-0.003162278,-0.003162278,-0.004472158,,,,,-0.003577714,,,-0.006000078,,,,,-0.003162278,,,,,,,,,,-0.003162278,,,,,,,,-0.003162278,,,,,,,,,-0.003162278,,-0.003162278,,,,,,,,,,,,,,,,,,,,,-0.003162278,,,,,,,,,,,,,,,-0.003162278,,,,,,-0.004472158,,,,,,,,,,,,,,,,-0.003162278,-0.003162278,,,,,-0.003162278,,
-0.007562532,-0.006210679,-0.04757756,-0.01291986,-0.00547728,-0.006928335,-0.04525058,-0.03535494,-0.0142023,-0.007563966,-0.003162278,-0.003162278,-0.009970991,-0.003909361,-0.02378741,-0.003162278,-0.005078374,-0.005185494,-0.004242658,-0.009129044,-0.01691168,-0.1073133,-0.004403876,-0.005197096,-0.01265382,-0.01995426,-0.03828038,-0.003721049,-0.006984438,-0.01158169,-0.0159182,-0.01056815,-0.008014094,-0.01173766,-0.01892081,-0.01619145,-0.004899014,-0.02238044,-0.006674353,,-0.01014646,-0.005164021,-0.007980453,-0.006285965,-0.00547728,-0.005059964,-0.008281029,-0.006766587,-0.005496007,-0.006455074,,-0.04131055,-0.009311696,-0.06607922,-0.0937618,-0.01337802,-0.005797577,-0.005996531,-0.01469845,-0.009186176,-0.01591138,-0.04699335,-0.04551555,-0.0424701,-0.00749627,-0.004472158,-0.006907415,-0.0220617,-0.01104046,-0.05358533,0.3633776,-0.03239995,-0.09569928,-0.01070896,-0.01383741,-0.003162278,-0.01334755,-0.006855105,-0.003162278,-0.00632465,-0.01504071,-0.01678888,-0.004472158,-0.00547728,-0.007893729,-0.005776935,-0.008198057,-0.006529393,,-0.005111698,-0.003162278,-0.003162278,-0.01445939,-0.02706786,-0.006139491,-0.01291096,-0.004969233,-0.003162278,-0.005888891,-0.008682715,-0.06186187,-0.1060624,-0.04214093,-0.03772148,-0.01277175,-0.007108406,-0.01090045,-0.004251866,-0.007845593,-0.009109103,-0.00909452,-0.008186694,-0.005895194,-0.008216686,-0.009042157,-0.004986445,-0.006038676,-0.008910771,-0.00996513,-0.01764608,-0.01590501,-0.005700758,-0.03699083,-0.01600826,-0.02056009,-0.01234298,-0.01903139,-0.00780089,-0.009505375,-0.02348118,-0.005755001,-0.007974807,-0.006364055,-0.012025,-0.006498991,-0.05742392,-0.0040663,-0.01665162,-0.007533179,-0.01689576,-0.03124136,-0.160591,-0.009919841,-0.005715425,-0.00632465,-0.009028569,-0.01685512,-0.06881626,-0.01160421,-0.007683892,⋯,-0.003162278,,,,,,,,-0.003162278,-0.006708321,,-0.004472158,,,,,,,,,,,-0.003162278,,,,-0.00632465,,-0.004472158,,-0.01553977,,,,-0.003162278,,,,,-0.003162278,-0.003162278,-0.004472158,,,,,-0.003577714,,,-0.006000078,,,,,-0.003162278,,,,,,,,,,-0.003162278,,,,,,,,-0.003162278,,,,,,,,,-0.003162278,,-0.003162278,,,,,,,,,,,,,,,,,,,,,-0.003162278,,,,,,,,,,,,,,,-0.003162278,,,,,,-0.004472158,,,,,,,,,,,,,,,,-0.003162278,-0.003162278,,,,,-0.003162278,,
-0.007562532,-0.006210679,-0.04757756,-0.01291986,-0.00547728,-0.006928335,-0.04525058,-0.03535494,-0.0142023,-0.007563966,-0.003162278,-0.003162278,-0.009970991,-0.003909361,-0.02378741,-0.003162278,-0.005078374,-0.005185494,-0.004242658,-0.009129044,-0.01691168,-0.1073133,-0.004403876,-0.005197096,-0.01265382,-0.01995426,-0.03828038,-0.003721049,-0.006984438,-0.01158169,-0.0159182,-0.01056815,-0.008014094,-0.01173766,-0.01892081,-0.01619145,-0.004899014,-0.02238044,-0.006674353,,-0.01014646,-0.005164021,-0.007980453,-0.006285965,-0.00547728,-0.005059964,-0.008281029,-0.006766587,-0.005496007,-0.006455074,,-0.04131055,-0.009311696,-0.06607922,-0.0937618,-0.01337802,-0.005797577,-0.005996531,-0.01469845,-0.009186176,-0.01591138,-0.04699335,-0.04551555,-0.0424701,-0.00749627,-0.004472158,-0.006907415,-0.0220617,-0.01104046,-0.05358533,4.1065802,-0.03239995,-0.09569928,-0.01070896,-0.01383741,-0.003162278,-0.01334755,-0.006855105,-0.003162278,-0.00632465,-0.01504071,-0.01678888,-0.004472158,-0.00547728,-0.007893729,-0.005776935,-0.008198057,-0.006529393,,-0.005111698,-0.003162278,-0.003162278,-0.01445939,-0.02706786,-0.006139491,-0.01291096,-0.004969233,-0.003162278,-0.005888891,-0.008682715,-0.06186187,-0.1060624,-0.04214093,-0.03772148,-0.01277175,-0.007108406,-0.01090045,-0.004251866,-0.007845593,-0.009109103,-0.00909452,-0.008186694,-0.005895194,-0.008216686,-0.009042157,-0.004986445,-0.006038676,-0.008910771,-0.00996513,-0.01764608,-0.01590501,-0.005700758,-0.03699083,-0.01600826,-0.02056009,-0.01234298,-0.01903139,-0.00780089,-0.009505375,-0.02348118,-0.005755001,-0.007974807,-0.006364055,-0.012025,-0.006498991,-0.05742392,-0.0040663,-0.01665162,-0.007533179,-0.01689576,-0.03124136,-0.160591,-0.009919841,-0.005715425,-0.00632465,-0.009028569,-0.01685512,-0.06881626,-0.01160421,-0.007683892,⋯,-0.003162278,,,,,,,,-0.003162278,-0.006708321,,-0.004472158,,,,,,,,,,,-0.003162278,,,,-0.00632465,,-0.004472158,,-0.01553977,,,,-0.003162278,,,,,-0.003162278,-0.003162278,-0.004472158,,,,,-0.003577714,,,-0.006000078,,,,,-0.003162278,,,,,,,,,,-0.003162278,,,,,,,,-0.003162278,,,,,,,,,-0.003162278,,-0.003162278,,,,,,,,,,,,,,,,,,,,,-0.003162278,,,,,,,,,,,,,,,-0.003162278,,,,,,-0.004472158,,,,,,,,,,,,,,,,-0.003162278,-0.003162278,,,,,-0.003162278,,
-0.007562532,-0.006210679,-0.04757756,-0.01291986,-0.00547728,-0.006928335,7.52173642,-0.03535494,-0.0142023,-0.007563966,-0.003162278,-0.003162278,-0.009970991,-0.003909361,-0.02378741,-0.003162278,-0.005078374,-0.005185494,-0.004242658,-0.009129044,-0.01691168,-0.1073133,-0.004403876,-0.005197096,-0.01265382,-0.01995426,-0.03828038,-0.003721049,-0.006984438,-0.01158169,-0.0159182,-0.01056815,-0.008014094,-0.01173766,-0.01892081,-0.01619145,-0.004899014,-0.02238044,-0.006674353,,-0.01014646,-0.005164021,-0.007980453,-0.006285965,-0.00547728,-0.005059964,-0.008281029,-0.006766587,-0.005496007,-0.006455074,,-0.04131055,-0.009311696,-0.06607922,-0.0937618,-0.01337802,-0.005797577,-0.005996531,-0.01469845,-0.009186176,-0.01591138,-0.04699335,-0.04551555,-0.0424701,-0.00749627,-0.004472158,-0.006907415,-0.0220617,-0.01104046,-0.05358533,0.3633776,-0.03239995,-0.09569928,-0.01070896,-0.01383741,-0.003162278,-0.01334755,-0.006855105,-0.003162278,-0.00632465,-0.01504071,-0.01678888,-0.004472158,-0.00547728,-0.007893729,-0.005776935,-0.008198057,-0.006529393,,-0.005111698,-0.003162278,-0.003162278,-0.01445939,-0.02706786,-0.006139491,-0.01291096,-0.004969233,-0.003162278,-0.005888891,-0.008682715,-0.06186187,-0.1060624,-0.04214093,-0.03772148,-0.01277175,-0.007108406,-0.01090045,-0.004251866,-0.007845593,-0.009109103,-0.00909452,-0.008186694,-0.005895194,-0.008216686,-0.009042157,-0.004986445,-0.006038676,-0.008910771,-0.00996513,-0.01764608,-0.01590501,-0.005700758,-0.03699083,-0.01600826,-0.02056009,-0.01234298,-0.01903139,-0.00780089,-0.009505375,-0.02348118,-0.005755001,-0.007974807,-0.006364055,-0.012025,-0.006498991,0.10864675,-0.0040663,-0.01665162,-0.007533179,-0.01689576,-0.03124136,-0.160591,-0.009919841,-0.005715425,-0.00632465,-0.009028569,-0.01685512,-0.06881626,-0.01160421,-0.007683892,⋯,-0.003162278,,,,,,,,-0.003162278,-0.006708321,,-0.004472158,,,,,,,,,,,-0.003162278,,,,-0.00632465,,-0.004472158,,-0.01553977,,,,-0.003162278,,,,,-0.003162278,-0.003162278,-0.004472158,,,,,-0.003577714,,,-0.006000078,,,,,-0.003162278,,,,,,,,,,-0.003162278,,,,,,,,-0.003162278,,,,,,,,,-0.003162278,,-0.003162278,,,,,,,,,,,,,,,,,,,,,-0.003162278,,,,,,,,,,,,,,,-0.003162278,,,,,,-0.004472158,,,,,,,,,,,,,,,,-0.003162278,-0.003162278,,,,,-0.003162278,,
-0.007562532,-0.006210679,-0.04757756,-0.01291986,-0.00547728,-0.006928335,-0.04525058,-0.03535494,-0.0142023,-0.007563966,-0.003162278,-0.003162278,-0.009970991,-0.003909361,-0.02378741,-0.003162278,-0.005078374,-0.005185494,-0.004242658,-0.009129044,-0.01691168,-0.1073133,-0.004403876,-0.005197096,-0.01265382,-0.01995426,-0.03828038,-0.003721049,-0.006984438,-0.01158169,-0.0159182,-0.01056815,-0.008014094,-0.01173766,-0.01892081,-0.01619145,-0.004899014,-0.02238044,-0.006674353,,-0.01014646,-0.005164021,-0.007980453,-0.006285965,-0.00547728,-0.005059964,-0.008281029,-0.006766587,-0.005496007,-0.006455074,,-0.04131055,-0.009311696,-0.06607922,-0.0937618,-0.01337802,-0.005797577,-0.005996531,-0.01469845,-0.009186176,-0.01591138,-0.04699335,-0.04551555,-0.0424701,-0.00749627,-0.004472158,-0.006907415,-0.0220617,-0.01104046,-0.05358533,-0.2604895,-0.03239995,-0.09569928,-0.01070896,-0.01383741,-0.003162278,-0.01334755,-0.006855105,-0.003162278,-0.00632465,-0.01504071,-0.01678888,-0.004472158,-0.00547728,-0.007893729,-0.005776935,-0.008198057,-0.006529393,,-0.005111698,-0.003162278,-0.003162278,-0.01445939,-0.02706786,-0.006139491,-0.01291096,-0.004969233,-0.003162278,-0.005888891,-0.008682715,-0.06186187,-0.1060624,-0.04214093,-0.03772148,-0.01277175,-0.007108406,-0.01090045,-0.004251866,-0.007845593,-0.009109103,-0.00909452,-0.008186694,-0.005895194,-0.008216686,-0.009042157,-0.004986445,-0.006038676,-0.008910771,-0.00996513,-0.01764608,-0.01590501,-0.005700758,-0.03699083,-0.01600826,-0.02056009,-0.01234298,-0.01903139,-0.00780089,-0.009505375,-0.02348118,-0.005755001,-0.007974807,-0.006364055,-0.012025,-0.006498991,-0.05742392,-0.0040663,-0.01665162,-0.007533179,-0.01689576,-0.03124136,-0.160591,-0.009919841,-0.005715425,-0.00632465,-0.009028569,-0.01685512,-0.06881626,-0.01160421,-0.007683892,⋯,-0.003162278,,,,,,,,-0.003162278,-0.006708321,,-0.004472158,,,,,,,,,,,-0.003162278,,,,-0.00632465,,-0.004472158,,-0.01553977,,,,-0.003162278,,,,,-0.003162278,-0.003162278,-0.004472158,,,,,-0.003577714,,,-0.006000078,,,,,-0.003162278,,,,,,,,,,-0.003162278,,,,,,,,-0.003162278,,,,,,,,,-0.003162278,,-0.003162278,,,,,,,,,,,,,,,,,,,,,-0.003162278,,,,,,,,,,,,,,,-0.003162278,,,,,,-0.004472158,,,,,,,,,,,,,,,,-0.003162278,-0.003162278,,,,,-0.003162278,,


In [14]:
# perform kernel pca
wide_data_kpca <- kpca(~., data=as.data.frame(wide_data_scaled),
    kernel = "rbfdot"#, kpar = list(sigma = 1)
)

summary(wide_data_kpca)

ERROR: Error: cannot allocate vector of size 74.5 Gb


In [4]:
# do PCA on the wide_data
# perform the PCA
pca_results <- PCA(wide_data_presence[, -c("id", "year")],
    graph = FALSE,
    ncp = 100
)

In [5]:
pca_results

**Results for the Principal Component Analysis (PCA)**
The analysis was performed on 500000 individuals, described by 1729 variables
*The results are available in the following objects:

   name               description                          
1  "$eig"             "eigenvalues"                        
2  "$var"             "results for the variables"          
3  "$var$coord"       "coord. for the variables"           
4  "$var$cor"         "correlations variables - dimensions"
5  "$var$cos2"        "cos2 for the variables"             
6  "$var$contrib"     "contributions of the variables"     
7  "$ind"             "results for the individuals"        
8  "$ind$coord"       "coord. for the individuals"         
9  "$ind$cos2"        "cos2 for the individuals"           
10 "$ind$contrib"     "contributions of the individuals"   
11 "$call"            "summary statistics"                 
12 "$call$centre"     "mean of the variables"              
13 "$call$ecart.type" "standard e

In [6]:
head(pca_results$ind$coord)

Unnamed: 0,Dim.1,Dim.2,Dim.3,Dim.4,Dim.5,Dim.6,Dim.7,Dim.8,Dim.9,Dim.10,Dim.11,Dim.12,Dim.13,Dim.14,Dim.15,Dim.16,Dim.17,Dim.18,Dim.19,Dim.20,Dim.21,Dim.22,Dim.23,Dim.24,Dim.25,Dim.26,Dim.27,Dim.28,Dim.29,Dim.30,Dim.31,Dim.32,Dim.33,Dim.34,Dim.35,Dim.36,Dim.37,Dim.38,Dim.39,Dim.40,Dim.41,Dim.42,Dim.43,Dim.44,Dim.45,Dim.46,Dim.47,Dim.48,Dim.49,Dim.50,Dim.51,Dim.52,Dim.53,Dim.54,Dim.55,Dim.56,Dim.57,Dim.58,Dim.59,Dim.60,Dim.61,Dim.62,Dim.63,Dim.64,Dim.65,Dim.66,Dim.67,Dim.68,Dim.69,Dim.70,Dim.71,Dim.72,Dim.73,Dim.74,Dim.75,Dim.76,Dim.77,Dim.78,Dim.79,Dim.80,Dim.81,Dim.82,Dim.83,Dim.84,Dim.85,Dim.86,Dim.87,Dim.88,Dim.89,Dim.90,Dim.91,Dim.92,Dim.93,Dim.94,Dim.95,Dim.96,Dim.97,Dim.98,Dim.99,Dim.100
1,0.8688914,-0.04518187,-0.3457613,-2.5136645,-0.8694933,-0.56499414,-2.41442184,-2.6878193,0.5155052,-0.624619,-0.07866497,-0.353180607,-0.11909206,-1.00904826,0.5940065,0.4509753,-0.30970806,1.62948422,-1.83703271,0.01837236,0.07834666,-0.14074149,-0.4624748,-0.335447,0.0505631,-0.4823393,-0.58232675,0.4856234,0.15086407,-0.04650155,0.1266357,-0.08900848,0.31615114,0.36149164,0.08718276,0.6097465,-0.8993532,-0.78858751,0.74930261,-0.64137397,-1.37585829,-0.69624498,-0.45061489,-1.4912425,-0.6530947,-0.3030309,-0.5076353,-0.3495098,1.2652101,1.08764497,0.3454046,0.7090622,0.75146087,0.02611199,0.47689952,0.70476636,-0.4232344,0.05769731,0.4534731,1.1020406,-0.7322334,-1.1499846,0.76879654,-1.6276602,-0.02805171,-0.10550337,0.62227977,-0.38727972,-0.09199857,0.04904052,-0.05826927,-0.6651205,-0.81191006,0.3640468,-1.52781653,1.300678097,-0.08821917,0.32014861,0.70675196,1.1942149,0.4292695,0.2999739,-0.65942883,-0.6337749,0.7580024,0.29276102,0.8162031,1.84994854,0.1400378,0.22583102,0.38385016,-0.777263518,0.3332814,-0.57113091,-0.05028273,1.06591742,-0.693739,-0.06478676,-1.43795998,-0.58249543
2,-2.1033502,-0.03892521,-0.5512945,-1.1236564,1.3047567,0.34872758,1.43954758,-0.5520582,-0.8573216,-0.6105449,-0.34953056,1.184783322,0.08797358,0.96728628,-0.737597,-0.9031536,-0.69986257,-0.01786079,-0.22996946,-0.27090313,0.26069358,0.04143913,0.119715,0.3140453,0.54365023,-0.1296118,-0.29548935,-0.2424958,0.39458507,0.08210694,0.32672221,0.13186342,0.29745368,-0.03948061,0.74506046,-0.4884979,-0.6863385,-0.05039118,-1.4465351,-0.58313023,-1.07657802,-0.16903682,-0.16187553,-0.307972,0.5983264,1.30360451,-1.0924512,0.8743596,-0.7413409,-0.36613709,-0.3953477,0.2428828,-0.02299781,0.46101924,-0.40322538,0.98317238,0.1408206,-0.5192364,-0.2518628,-0.3513833,-0.281256,-0.4288975,-0.51779035,0.9863608,-0.43351876,0.77304207,-0.22733749,-1.54903137,0.34565915,0.91408058,0.8976615,1.4320745,0.80682681,1.0424536,0.37842038,0.583448222,0.86536614,0.03934572,-0.30404124,0.8376956,-0.2228675,0.4934546,0.51826584,0.9008804,-0.6103868,-1.66875707,0.2382326,-0.82248358,-0.1329926,-0.50499825,-1.21502728,0.004318909,0.8439197,0.0954882,0.54223341,0.35159629,0.5172211,0.05964592,0.91542952,-0.07547496
3,-2.1551037,-0.04650516,-0.6963318,-1.1582182,-1.5988312,-0.38363563,-1.55654153,-0.3443874,0.7687808,-0.1172583,-0.01085756,-0.008552755,-0.01328443,-0.01866305,0.3823713,-0.1685987,-0.07604074,0.53667347,-0.66932639,-0.83629607,0.4591808,-0.22229818,-0.2353112,0.9008813,0.34432468,1.0687833,-0.17149717,-0.2371396,-0.08194182,0.12559413,0.35644928,0.22476811,0.01873297,-0.27190018,1.16428101,-0.3644768,0.3800782,-0.34340544,-0.40125029,-0.59551288,-0.01419248,0.81244952,-0.34677463,-0.8703317,-0.3133113,-0.51583373,0.2988019,-0.4444449,0.2233523,-0.30923224,-0.1866808,0.5320791,0.80114974,0.06257386,0.35249961,0.4012567,0.9965941,1.24482997,-0.4988296,-0.5150833,0.448769,0.1025532,-0.50823036,-0.3592943,-0.1225661,0.09637148,0.09119716,0.11243722,0.27358747,0.42942356,-0.46679265,-0.9906748,-0.40803179,0.3228846,-0.2022071,0.005852807,-0.66156549,-0.19735262,-0.01652211,-0.0668057,-0.5007962,0.4703878,-0.37350504,-0.2787739,0.2064068,0.14913713,-0.2794167,1.12224369,0.6925119,-0.43802773,0.05531381,-0.307451672,-1.1113545,0.89820383,-0.52793804,-0.17142495,0.2950517,0.02844692,0.376851,-0.51640847
4,5.6468025,-0.09145044,2.675406,-1.7661599,-4.063856,-0.49580292,-2.40299644,2.0456582,-1.5711452,3.3433768,-0.38905242,5.673954548,-0.34535483,-0.181538,3.2610466,-0.2647837,-1.05829301,-2.40948917,1.01398569,-3.3452069,3.45073678,-1.2359579,-1.372941,1.8917199,2.26388313,4.2318291,-0.46378621,0.9779559,0.11337884,0.02400435,0.3101662,-0.19136646,-4.63173387,-0.66130915,-0.11663843,-0.979014,2.6099596,3.62728175,-1.04031236,-1.00726354,1.43590688,0.4505709,-0.08458858,3.1643649,1.1312608,1.57750869,1.5785617,3.1697772,-0.3736382,2.44180931,-0.3169283,1.4888976,-3.74526865,-1.79892696,1.4499587,0.0499806,1.1198762,1.24420232,-1.5558889,-6.3353624,-0.8473509,1.5395106,1.15555261,-1.3836195,2.41408944,1.88631852,0.08408153,-1.98220569,0.04263592,5.17799,-4.27064038,-0.1616964,-0.05679301,-1.679407,4.10054415,2.706528248,2.62441903,0.2705086,-2.89519943,-3.1315032,2.1322871,-0.8867976,-3.92043864,-0.7500106,-0.2406854,-0.92219461,2.1693453,-1.54557328,3.3116558,0.89877398,-2.96468385,-0.730946792,0.4429444,-0.50024609,5.04735855,-2.10164662,2.35598,-1.36588306,-2.78302314,0.63305739
5,4.9180536,-0.0410536,1.81393,-0.1147206,1.396423,0.13768728,0.1513036,-3.7387522,-2.0271984,-0.4263293,0.43511829,-2.718353202,-0.10712289,-0.25257834,0.6365892,-0.6966163,-1.52710738,-0.20728413,-0.09586038,0.60013559,-0.9429152,0.23064218,-0.1292433,-1.6052893,0.04991143,-2.1436335,-0.02308936,-2.085114,-1.34424698,-0.09643352,-0.15298541,0.22911268,-0.18495998,-0.28021723,-0.74374708,0.6143842,-1.4307135,-0.59680838,-1.24412418,-0.07375145,-0.50285932,1.28802996,1.12574516,0.1974938,0.1785073,-2.58020267,-1.1967123,-1.6469209,-0.673441,-0.12033312,-0.2531354,0.1910344,-1.62849493,-0.99771282,0.05252227,0.06277914,-1.6826563,1.60122444,-1.4175546,-0.2630065,-0.8329825,1.2381635,-0.42024721,-2.2518246,-0.48025797,0.59829409,0.31000035,-1.45594724,-0.66119315,-0.87578626,-0.2403169,-0.6042383,-1.00844903,1.0551142,-2.11788076,-0.011398681,1.69091183,0.47036132,-0.2183571,-2.0984461,-0.9932056,-1.3512374,-0.02439278,0.2404308,1.4244502,0.46168922,-1.0242291,-0.92823053,1.2915781,0.40414099,-0.21939571,0.005969181,1.2080164,1.62867128,-1.05860078,-0.58197116,2.0676326,-0.53539787,-0.01229949,0.31257487
6,-2.3965704,-0.01407106,-1.8965567,0.5621202,0.9870985,0.05048896,-0.02370499,-0.1295621,-0.4412714,-0.6537619,-0.04674423,-0.298309845,0.0202747,0.23767901,-0.1900571,-0.5414607,-0.37146629,0.05659062,-0.05199264,0.31772052,-0.0805696,0.10155916,0.1117925,-0.3161904,-0.19376527,-0.1911078,0.33146204,-0.225313,-0.25015133,-0.07583304,-0.04398782,-0.01608607,-0.13256341,0.07991487,-0.72169044,-0.2793009,-0.374927,0.45291524,0.02704762,0.35502885,-0.15259591,-0.04553593,0.40066908,-0.2944976,-0.2237477,0.07070524,-0.5238451,-0.4764811,-0.2336344,-0.03339904,0.4715388,0.1801811,-0.45928794,0.30763175,0.1271055,-0.4052243,0.1608198,-0.55020219,-0.2950291,-0.4289361,-0.1830296,0.4028417,-0.06737875,-0.401122,-0.30865402,-0.31694115,0.11701496,-0.06910913,0.34498033,0.13625169,0.24887295,-0.1490716,-0.20785551,0.6724553,-0.05250054,-0.266790712,-0.25693504,0.14681739,0.18956843,0.1257499,0.3894726,0.3003542,0.12198627,0.4438595,0.2211251,0.01227487,-0.2172422,-0.01075633,-0.1300546,0.04838672,-0.08127641,-0.312067539,0.4623073,-0.01856135,0.43599247,-0.08323532,0.239896,0.20169994,0.09100348,-0.01644335


In [8]:
eig.val <- pca_results$eig %>% as.data.table()
colnames(eig.val) <- c("eigenvalue", "var_pct", "cum_sum_var")
eig.val$component_no <- seq(1, nrow(eig.val))

eig.val[
    cum_sum_var > 99 &
    cum_sum_var < 99.5
]

eig.val[100, ]

eigenvalue,var_pct,cum_sum_var,component_no
<dbl>,<dbl>,<dbl>,<int>
0.5350732,0.03094697,99.0081,1679
0.5339178,0.03088015,99.03898,1680
0.5333435,0.03084693,99.06983,1681
0.5318015,0.03075775,99.10059,1682
0.5296726,0.03063462,99.13122,1683
0.5292208,0.03060849,99.16183,1684
0.524786,0.03035199,99.19218,1685
0.5207788,0.03012023,99.2223,1686
0.5196485,0.03005486,99.25236,1687
0.5172142,0.02991407,99.28227,1688


eigenvalue,var_pct,cum_sum_var,component_no
<dbl>,<dbl>,<dbl>,<int>
1.340568,0.07753428,12.65381,100


## Autoencoder
We will also try an autoencoder to see if it works better than the PCA. The possible reason why it might do better is the fact that our data will inherently have lots of interactions and non-linear relationships between the features. Also, we will be able to use all the rows of the data without running into stack overflow issues.

In [7]:
# create test, cross validation and training sets
set.seed(123)
wide_data_presence <- read.fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/wide_data_presence_2016.fst",
    as.data.table = TRUE
)
head(wide_data_presence)
dim(wide_data_presence)

id,year,A02,A03,A04,A05,A06,A07,A08,A09,A15,A18,A23,A25,A28,A30,A31,A32,A36,A37,A38,A39,A40,A41,A42,A43,A46,A48,A49,A50,A51,A52,A53,A54,A56,A59,A60,A63,A68,A69,A74,A75,A77,A79,A80,A81,A83,A86,A87,A88,A92,A93,A95,B00,B01,B02,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B44,B45,B46,B47,B48,B49,B50,B57,B58,B59,B60,B69,B72,B78,B81,B83,B85,B86,B87,B88,B89,B90,B91,B94,B95,B96,B97,B99,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C30,C31,C32,C33,C34,C37,C38,C40,C41,C43,C44,C45,C46,C47,C48,C49,C50,⋯,V58,V63,V73,V78,V79,V88,V94,W04,W21,W27,W28,W29,W32,W33,W39,W46,W52,W53,W60,W61,W89,W90,X00,X01,X02,X10,X12,X13,X15,X18,X32,X52,Y00,Y03,Y09,Y24,Y27,Y29,Y30,A82,B52,B68,C58,G07,G41,N24,O01,O31,P10,P12,P23,P51,P54,P55,P84,Q00,V46,V74,V84,V90,V91,V95,V97,V98,V99,W03,W24,W35,W37,W40,W56,W86,W94,X04,X35,X39,X72,X94,Y23,Y32,Y62,A89,O07,P03,P37,P39,P58,P60,P81,V05,V16,V62,V69,V70,V81,V85,W09,W15,W99,X06,X37,X93,Y38,Y80,A22,A34,A65,A99,B04,B79,F35,F62,H45,O29,O89,P00,P01,P02,P04,P22,P26,P36,P52,P57,P77,S17,T64,V06,W85,X40,Y64,Z05,Z38,B92,E33,H48,K67,M78,P93,V20,B12,H22,I14,J29,M52,P95,V32,V76,V82,V96
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000015,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000019,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000053,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000099,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000203,2016,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000221,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [9]:
# create train, test, and cross validation sets. 60% train, 20% test, 20% cross validation
train <- wide_data_presence[sample(.N, .N * 0.6), ]
test <- wide_data_presence[!id %in% train$id]
cv <- test[sample(.N, .N * 0.5), ]
test <- test[!id %in% cv$id]

head(train)
dim(train)
dim(test)
dim(cv)

id,year,A02,A03,A04,A05,A06,A07,A08,A09,A15,A18,A23,A25,A28,A30,A31,A32,A36,A37,A38,A39,A40,A41,A42,A43,A46,A48,A49,A50,A51,A52,A53,A54,A56,A59,A60,A63,A68,A69,A74,A75,A77,A79,A80,A81,A83,A86,A87,A88,A92,A93,A95,B00,B01,B02,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B44,B45,B46,B47,B48,B49,B50,B57,B58,B59,B60,B69,B72,B78,B81,B83,B85,B86,B87,B88,B89,B90,B91,B94,B95,B96,B97,B99,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C30,C31,C32,C33,C34,C37,C38,C40,C41,C43,C44,C45,C46,C47,C48,C49,C50,⋯,V58,V63,V73,V78,V79,V88,V94,W04,W21,W27,W28,W29,W32,W33,W39,W46,W52,W53,W60,W61,W89,W90,X00,X01,X02,X10,X12,X13,X15,X18,X32,X52,Y00,Y03,Y09,Y24,Y27,Y29,Y30,A82,B52,B68,C58,G07,G41,N24,O01,O31,P10,P12,P23,P51,P54,P55,P84,Q00,V46,V74,V84,V90,V91,V95,V97,V98,V99,W03,W24,W35,W37,W40,W56,W86,W94,X04,X35,X39,X72,X94,Y23,Y32,Y62,A89,O07,P03,P37,P39,P58,P60,P81,V05,V16,V62,V69,V70,V81,V85,W09,W15,W99,X06,X37,X93,Y38,Y80,A22,A34,A65,A99,B04,B79,F35,F62,H45,O29,O89,P00,P01,P02,P04,P22,P26,P36,P52,P57,P77,S17,T64,V06,W85,X40,Y64,Z05,Z38,B92,E33,H48,K67,M78,P93,V20,B12,H22,I14,J29,M52,P95,V32,V76,V82,V96
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
476363231,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
109095229,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
474033737,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
453517035,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
477842881,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
483029067,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [52]:
# create the encoder and decoder

# create the encoder
l1_regularization_factor <- 0.0001
input_shape <- ncol(train) - 2
input_layer <- layer_input(shape = input_shape)


encoder <- input_layer %>%
    layer_dense(units = 250, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor)) %>%
    #layer_batch_normalization() %>%
    layer_dense(units = 100, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor)) %>%
    layer_dense(units = 50, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor)) %>%
    layer_dense(units = 25, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor)) %>%
    layer_dense(units = 10, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor)) #10 dimension for the output

# create the decoder

decoder <- encoder %>%
    layer_dense(units = 25, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor)) %>%
    layer_dense(units = 50, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor)) %>%
    layer_dense(units = 100, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor)) %>%
    layer_dense(units = 250, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor)) %>%
    layer_dense(units = input_shape, activation = "relu", kernel_regularizer = regularizer_l1(l = l1_regularization_factor))

In [53]:
# compile the model, note that since we are working on the presence dataset, this will be classification task, we will use AUC and accuracy as metrics
autoencoder <- keras_model(input_layer, decoder)
autoencoder %>% compile(
    optimizer = optimizer_adam(learning_rate = 0.001),
    loss = "binary_crossentropy",
    metrics = c("accuracy", "AUC")
)

summary(autoencoder)

Model: "model_10"
____________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________
 Layer (type)                                                                                                                          Output Shape                                                                                                            Param #                                      
 input_8 (InputLayer)                                                                                                                  [(None, 1837)]                                                                                                          0                                            
 dense_60 (Dense)                                                              

In [54]:
# now, train the model
history_encoder <- autoencoder %>% fit(
    train[,-c("id", "year")],
    train[,-c("id", "year")],
    validation_data = list(cv[,-c("id", "year")], cv[,-c("id", "year")]),
    epochs = 1,
    batch_size = 100,
    verbose = 1
)

In [None]:
# plot the training and validation loss
plot(history_encoder)

# plot the training and validation accuracy
plot(history_encoder, metric = "accuracy")

# plot the training and validation AUC
plot(history_encoder, metric = "AUC")

ERROR: Error in .External2(C_savehistory, file): no history available to save


# Clustering

In [9]:
# winsorise all dims of coords_pred
coords_pred <- pca_results$ind$coord
#add year and id
coords_pred <- cbind(wide_data[, c("id", "year")], coords_pred)
library(DescTools)
coords_pred <-
    coords_pred %>%
    mutate_at(vars(-c("id", "year")),
    Winsorize, probs = c(0.05, 0.95)) %>%
    as.data.table()


Attaching package: ‘DescTools’


The following object is masked from ‘package:data.table’:

    %like%




In [11]:
library(ClusterR)
kmeans_results <-
    KMeans_rcpp(coords_pred[, -c(1, 2)], 10, initializer = "kmeans++")

Loading required package: gtools



In [14]:
summary(kmeans_results)

                        Length Class  Mode   
call                         4 -none- call   
clusters                500000 -none- numeric
centroids                 1000 -none- numeric
total_SSE                    1 -none- numeric
best_initialization          1 -none- numeric
WCSS_per_cluster            10 -none- numeric
obs_per_cluster             10 -none- numeric
between.SS_DIV_total.SS      1 -none- numeric

In [15]:
wide_data_presence$cluster = kmeans_results$cluster

In [34]:
# remove outliers

# run clustering on predictions
library(cluster)
library(pracma)
library(factoextra)

# elbow method
wss <- (nrow(coords_pred) - 1) * sum(apply(coords_pred[, -c("id", "year")], 2, var))
for (i in 2:15) wss[i] <- sum(kmeanspp(coords_pred[, -c("id", "year")], k = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")



Attaching package: ‘pracma’


The following objects are masked from ‘package:DescTools’:

    Mode, Rank


The following objects are masked from ‘package:kernlab’:

    cross, eig, size


The following object is masked from ‘package:purrr’:

    cross




ERROR: Error in kmeanspp(coords_pred[, -c("id", "year")], k = i): could not find function "kmeanspp"


In [64]:
# lets cluster, we will use k-means clustering
install.packages("cluster")
library(cluster)

# we will use the elbow method to find the optimal number of clusters
# we will use the silhouette method to find the optimal number of clusters

# elbow method
wss <- (nrow(wide_data) - 1) * sum(apply(wide_data[, -c("id", "year")], 2, var))
for (i in 2:15) wss[i] <- sum(kmeans(wide_data[, -c("id", "year")], centers = i)$withinss)
plot(1:15, wss, type = "b", xlab = "Number of Clusters", ylab = "Within groups sum of squares")

# silhouette method
library(factoextra)
fviz_nbclust(wide_data[, -c("id", "year")], FUNcluster = kmeans, method = "silhouette")

# we will use 5 clusters

# run k-means clustering
kmeans_model = kmeans(wide_data[, -c("id", "year")], centers = 5)

# add the cluster number to the data
wide_data$cluster = kmeans_model$cluster

# save the data to fst
write.fst(wide_data, "/work/postresearch/Shared/Projects/Farbod/Clustering/wide_data_clustered.fst")

“installation of package ‘cluster’ had non-zero exit status”
Updating HTML index of packages in '.Library'

Making 'packages.html' ...
 done



## Exploratory data analysis

In [None]:
# number of people in each cluster
kmeans_whole_wide_data[, .N, by = cluster][order(-N)]

cluster,N
<dbl>,<int>
9,144326
6,109042
2,37851
3,36526
5,36222
7,35841
10,34535
1,28010
4,23988
8,13659


In [16]:
# presence of digestive tract cancer in each of the clusters
cancer_codes <-
    colnames(wide_data_presence) %in%
    c("C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26")

#colnames(wide_data_presence)[cancer_codes]

wide_data_presence[, c(colnames(wide_data_presence)[cancer_codes], "cluster"), with = F][, lapply(.SD, mean), by = cluster][, .(mean = rowMeans(.SD)), by = cluster][order(-mean)]

# what percent of cancer patients are in each cluster
total_cancer <- wide_data_presence[, c(colnames(wide_data_presence)[cancer_codes]), with = F][, (rowSums(.SD) > 0) * 1] %>% sum()
total_cancer
wide_data_presence[, c(colnames(wide_data_presence)[cancer_codes], "cluster"), with = F][, sum(rowSums(.SD) > 0) / total_cancer, by = cluster][order(-V1)]


cluster,mean
<dbl>,<dbl>
4,0.0104322995
10,0.0062183292
1,0.0025496846
8,0.0013605193
2,0.0010699849
3,0.0010426363
6,0.0007833373
5,0.0006119669
7,0.0006114971
9,0.0006022246


cluster,V1
<dbl>,<dbl>
4,0.27381652
10,0.22588982
6,0.11464819
9,0.11441325
1,0.08316692
2,0.05415247
3,0.05015858
5,0.02995419
7,0.02936685
8,0.02443322


In [17]:
# presence of MI in each of the clusters
mi_codes <-
    colnames(wide_data_presence) %in%
    c("I21")

# colnames(wide_data_presence)[mi_codes]

wide_data_presence[, c(colnames(wide_data_presence)[mi_codes], "cluster"), with = F][, lapply(.SD, mean), by = cluster][, .(mean = rowMeans(.SD)), by = cluster][order(-mean)]

# what percent of MI patients are in each cluster
total_mi <- wide_data_presence[, c(colnames(wide_data_presence)[mi_codes]), with = F][, (rowSums(.SD) > 0) * 1] %>% sum()
total_mi
wide_data_presence[, c(colnames(wide_data_presence)[mi_codes], "cluster"), with = F][, sum(rowSums(.SD) > 0) / total_mi, by = cluster][order(-V1)]

cluster,mean
<dbl>,<dbl>
10,0.149153033
2,0.084013632
1,0.050981792
5,0.020595218
4,0.012798066
7,0.006389331
3,0.005201774
8,0.004026649
9,0.003415878
6,0.002778746


cluster,V1
<dbl>,<dbl>
10,0.426336699
2,0.263201457
1,0.118192352
5,0.061744744
9,0.040804503
4,0.0254097
6,0.025078629
7,0.018953816
3,0.015725873
8,0.004552226


In [18]:
# presence of schizophrenia in each of the clusters

mental_codes <-
    colnames(wide_data_presence) %in%
    c("F20", "F21", "F22", "F23", "F24", "F25", "F26", "F27", "F28", "F29")
# colnames(wide_data_presence)[mental_codes]

wide_data_presence[, c(colnames(wide_data_presence)[mental_codes], "cluster"), with = F][, lapply(.SD, mean), by = cluster][, .(mean = rowMeans(.SD)), by = cluster][order(-mean)]

# what percent of mental patients are in each cluster
total_mental <- wide_data_presence[, c(colnames(wide_data_presence)[mental_codes]), with = F][, (rowSums(.SD) > 0) * 1] %>% sum()
total_mental
wide_data_presence[, c(colnames(wide_data_presence)[mental_codes], "cluster"), with = F][, sum(rowSums(.SD) > 0) / total_mental, by = cluster][order(-V1)]

cluster,mean
<dbl>,<dbl>
5,0.020854039
1,0.0168466619
10,0.0117489503
9,0.0050614581
8,0.0046123435
6,0.0031272354
7,0.0029296058
4,0.0027044772
2,0.0013605981
3,0.0006091551


cluster,V1
<dbl>,<dbl>
9,0.274203552
5,0.235466591
1,0.126924161
10,0.126755004
6,0.125458134
7,0.037383705
4,0.023681985
8,0.021934029
2,0.019622216
3,0.008570623


## Predict clusters for the rest of the wide data

In [21]:
wide_data_presence <- read.fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/wide_data_presence_2016.fst",
    as.data.table = TRUE
)

In [24]:
# predict PCAs
wide_data_after_pca <-
    predict(pca_results, wide_data_presence[, -c("id", "year")])

ERROR: Error in UseMethod("tbl_vars"): no applicable method for 'tbl_vars' applied to an object of class "list"


In [27]:
str(wide_data_after_pca)

List of 3
 $ coord: num [1:1845299, 1:100] 0.869 -2.103 -2.155 5.647 4.918 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : NULL
  .. ..$ : chr [1:100] "Dim.1" "Dim.2" "Dim.3" "Dim.4" ...
 $ cos2 : num [1:1845299, 1:100] 0.000825 0.016165 0.027139 0.002909 0.027123 ...
  ..- attr(*, "dimnames")=List of 2
  .. ..$ : NULL
  .. ..$ : chr [1:100] "Dim.1" "Dim.2" "Dim.3" "Dim.4" ...
 $ dist : num [1:1845299] 30.2 16.5 13.1 104.7 29.9 ...


In [31]:
# winsorize
library(DescTools)
wide_data_after_pca <-
    wide_data_after_pca$coord %>%
    as.data.table() %>%
    mutate_all(
        Winsorize, probs = c(0.05, 0.95)
    ) %>%
    as.data.table()

In [34]:
# predict clusters
kmeans_whole_wide_data <- predict(kmeans_results, wide_data_after_pca,
    threads = numcores
)
# create a data table with only id, year, and cluster
results_clustering_whole_data <- data.table(
    id = wide_data_presence$id,
    year = wide_data_presence$year,
    cluster = kmeans_whole_wide_data
)

wide_data$cluster <- kmeans_whole_wide_data
# save the data to fst
write.fst(results_clustering_whole_data, "/work/postresearch/Shared/Projects/Farbod/Clustering/results_clustering_whole_data.fst")

In [35]:
head(results_clustering_whole_data)

id,year,cluster
<int>,<dbl>,<dbl>
100000015,2016,2
100000019,2016,6
100000053,2016,6
100000099,2016,1
100000203,2016,1
100000221,2016,9


In [37]:
wide_data_presence$cluster <- kmeans_whole_wide_data

### Exploratory analyses on the whole data

In [38]:
# number of people in each cluster
wide_data_presence[, .N, by = cluster][order(-N)]

cluster,N
<dbl>,<int>
9,615044
6,449941
7,148717
3,127958
4,106149
2,101748
10,83332
8,78781
1,70226
5,63403


In [39]:
# presence of digestive tract cancer in each of the clusters
cancer_codes <-
    colnames(wide_data_presence) %in%
    c("C15", "C16", "C17", "C18", "C19", "C20", "C21", "C22", "C23", "C24", "C25", "C26")

#colnames(wide_data_presence)[cancer_codes]

wide_data_presence[, c(colnames(wide_data_presence)[cancer_codes], "cluster"), with = F][, lapply(.SD, mean), by = cluster][, .(mean = rowMeans(.SD)), by = cluster][order(-mean)]

# what percent of cancer patients are in each cluster
total_cancer <- wide_data_presence[, c(colnames(wide_data_presence)[cancer_codes]), with = F][, (rowSums(.SD) > 0) * 1] %>% sum()
total_cancer
wide_data_presence[, c(colnames(wide_data_presence)[cancer_codes], "cluster"), with = F][, sum(rowSums(.SD) > 0) / total_cancer, by = cluster][order(-V1)]


cluster,mean
<dbl>,<dbl>
4,0.0093249426
10,0.0080181283
1,0.0025928194
8,0.0009763352
2,0.0009246701
3,0.0007893215
5,0.0006164272
6,0.00061008
9,0.000561071
7,0.0004443563


cluster,V1
<dbl>,<dbl>
4,0.32669207
10,0.20711305
9,0.1363092
6,0.11232081
1,0.0628924
3,0.04144438
2,0.03908547
8,0.03048449
7,0.02739975
5,0.01625839


In [40]:
# presence of MI in each of the clusters
mi_codes <-
    colnames(wide_data_presence) %in%
    c("I21")

# colnames(wide_data_presence)[mi_codes]

wide_data_presence[, c(colnames(wide_data_presence)[mi_codes], "cluster"), with = F][, lapply(.SD, mean), by = cluster][, .(mean = rowMeans(.SD)), by = cluster][order(-mean)]

# what percent of MI patients are in each cluster
total_mi <- wide_data_presence[, c(colnames(wide_data_presence)[mi_codes]), with = F][, (rowSums(.SD) > 0) * 1] %>% sum()
total_mi
wide_data_presence[, c(colnames(wide_data_presence)[mi_codes], "cluster"), with = F][, sum(rowSums(.SD) > 0) / total_mi, by = cluster][order(-V1)]

cluster,mean
<dbl>,<dbl>
10,0.149858398
2,0.09013445
1,0.049554296
5,0.019147359
4,0.011465016
7,0.005426414
3,0.00425921
8,0.003630317
9,0.003095713
6,0.002813702


cluster,V1
<dbl>,<dbl>
10,0.38569399
2,0.283247884
1,0.107480388
9,0.058805362
6,0.039100624
4,0.037587251
5,0.037494595
7,0.024924331
3,0.016832417
8,0.008833158


In [41]:
# presence of schizophrenia in each of the clusters

mental_codes <-
    colnames(wide_data_presence) %in%
    c("F20", "F21", "F22", "F23", "F24", "F25", "F26", "F27", "F28", "F29")
# colnames(wide_data_presence)[mental_codes]

wide_data_presence[, c(colnames(wide_data_presence)[mental_codes], "cluster"), with = F][, lapply(.SD, mean), by = cluster][, .(mean = rowMeans(.SD)), by = cluster][order(-mean)]

# what percent of mental patients are in each cluster
total_mental <- wide_data_presence[, c(colnames(wide_data_presence)[mental_codes]), with = F][, (rowSums(.SD) > 0) * 1] %>% sum()
total_mental
wide_data_presence[, c(colnames(wide_data_presence)[mental_codes], "cluster"), with = F][, sum(rowSums(.SD) > 0) / total_mental, by = cluster][order(-V1)]

cluster,mean
<dbl>,<dbl>
5,0.0217359589
1,0.0185561615
10,0.0111436783
9,0.0035084888
8,0.0027338445
7,0.0023551443
6,0.0021024979
4,0.00202781
2,0.0014742305
3,0.0003546085


cluster,V1
<dbl>,<dbl>
9,0.312077835
5,0.168642362
1,0.135898009
6,0.131894431
10,0.114582867
7,0.050055916
4,0.030149855
8,0.029545963
2,0.020241557
3,0.006911206


# Going back to the main data for the rest of the analysis

In [19]:
# read the data
carrier_data <- read_fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/carrier_data.fst",
    as.data.table = TRUE
)
inpatient_data <- read_fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/inpatient_data.fst",
    as.data.table = TRUE
)
outpatient_data <- read_fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/outpatient_data.fst",
    as.data.table = TRUE
)

clustering_results <- read_fst("/work/postresearch/Shared/Projects/Farbod/Clustering/results_clustering_whole_data.fst",
    as.data.table = TRUE
)

In [20]:
#choose only 2016
carrier_data <- carrier_data[year == 2016, ]
inpatient_data <- inpatient_data[year == 2016, ]
outpatient_data <- outpatient_data[year == 2016, ]

In [21]:
head(carrier_data)
head(inpatient_data)
head(outpatient_data)
head(clustering_results)

id,diagnosis,provider,provider_specialty,hcpcs,date,cost,year
<int>,<chr>,<chr>,<chr>,<chr>,<int>,<dbl>,<dbl>
100000015,H3532,1265609275,18,67028,20160129,110.66,2016
100000015,H3532,1265609275,18,J0178,20160129,1961.0,2016
100000015,I10,1730170630,6,99214,20160212,113.64,2016
100000015,I10,1730170630,6,93000,20160212,18.34,2016
100000015,I479,1730170630,6,93457,20160218,371.86,2016
100000015,I252,1730170630,6,99217,20160219,75.37,2016


DESY_SORT_KEY,CLAIM_NO,PRVDR_NUM,CLM_THRU_DT,NCH_NEAR_LINE_REC_IDENT_CD,NCH_CLM_TYPE_CD,CLAIM_QUERY_CODE,CLM_FAC_TYPE_CD,CLM_SRVC_CLSFCTN_TYPE_CD,CLM_FREQ_CD,FI_NUM,CLM_MDCR_NON_PMT_RSN_CD,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,NCH_PRMRY_PYR_CD,FI_CLM_ACTN_CD,PRVDR_STATE_CD,ORG_NPI_NUM,AT_PHYSN_UPIN,AT_PHYSN_NPI,AT_PHYSN_SPCLTY_CD,OP_PHYSN_UPIN,OP_PHYSN_NPI,OP_PHYSN_SPCLTY_CD,OT_PHYSN_UPIN,OT_PHYSN_NPI,OT_PHYSN_SPCLTY_CD,RNDRNG_PHYSN_NPI,RNDRNG_PHYSN_SPCLTY_CD,CLM_MCO_PD_SW,PTNT_DSCHRG_STUS_CD,CLM_PPS_IND_CD,CLM_TOT_CHRG_AMT,CLM_ADMSN_DT,CLM_IP_ADMSN_TYPE_CD,CLM_SRC_IP_ADMSN_CD,NCH_PTNT_STATUS_IND_CD,CLM_PASS_THRU_PER_DIEM_AMT,NCH_BENE_IP_DDCTBL_AMT,NCH_BENE_PTA_COINSRNC_LBLTY_AM,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,NCH_PROFNL_CMPNT_CHRG_AMT,NCH_IP_NCVRD_CHRG_AMT,CLM_TOT_PPS_CPTL_AMT,CLM_PPS_CPTL_FSP_AMT,CLM_PPS_CPTL_OUTLIER_AMT,CLM_PPS_CPTL_DSPRPRTNT_SHR_AMT,CLM_PPS_CPTL_IME_AMT,CLM_PPS_CPTL_EXCPTN_AMT,CLM_PPS_OLD_CPTL_HLD_HRMLS_AMT,CLM_PPS_CPTL_DRG_WT_NUM,CLM_UTLZTN_DAY_CNT,BENE_TOT_COINSRNC_DAYS_CNT,BENE_LRD_USED_CNT,CLM_NON_UTLZTN_DAYS_CNT,NCH_BLOOD_PNTS_FRNSHD_QTY,NCH_VRFD_NCVRD_STAY_FROM_DT,NCH_VRFD_NCVRD_STAY_THRU_DT,NCH_BENE_MDCR_BNFTS_EXHTD_DT_I,NCH_BENE_DSCHRG_DT,CLM_DRG_CD,CLM_DRG_OUTLIER_STAY_CD,NCH_DRG_OUTLIER_APRVD_PMT_AMT,ADMTG_DGNS_CD,PRNCPAL_DGNS_CD,ICD_DGNS_CD1,CLM_POA_IND_SW1,ICD_DGNS_CD2,CLM_POA_IND_SW2,ICD_DGNS_CD3,CLM_POA_IND_SW3,ICD_DGNS_CD4,CLM_POA_IND_SW4,ICD_DGNS_CD5,CLM_POA_IND_SW5,ICD_DGNS_CD6,CLM_POA_IND_SW6,ICD_DGNS_CD7,CLM_POA_IND_SW7,ICD_DGNS_CD8,CLM_POA_IND_SW8,ICD_DGNS_CD9,CLM_POA_IND_SW9,ICD_DGNS_CD10,CLM_POA_IND_SW10,ICD_DGNS_CD11,CLM_POA_IND_SW11,ICD_DGNS_CD12,CLM_POA_IND_SW12,ICD_DGNS_CD13,CLM_POA_IND_SW13,ICD_DGNS_CD14,CLM_POA_IND_SW14,ICD_DGNS_CD15,CLM_POA_IND_SW15,ICD_DGNS_CD16,CLM_POA_IND_SW16,ICD_DGNS_CD17,CLM_POA_IND_SW17,ICD_DGNS_CD18,CLM_POA_IND_SW18,ICD_DGNS_CD19,CLM_POA_IND_SW19,ICD_DGNS_CD20,CLM_POA_IND_SW20,ICD_DGNS_CD21,CLM_POA_IND_SW21,ICD_DGNS_CD22,CLM_POA_IND_SW22,ICD_DGNS_CD23,CLM_POA_IND_SW23,ICD_DGNS_CD24,CLM_POA_IND_SW24,ICD_DGNS_CD25,CLM_POA_IND_SW25,FST_DGNS_E_CD,ICD_DGNS_E_CD1,CLM_E_POA_IND_SW1,ICD_DGNS_E_CD2,CLM_E_POA_IND_SW2,ICD_DGNS_E_CD3,CLM_E_POA_IND_SW3,ICD_DGNS_E_CD4,CLM_E_POA_IND_SW4,ICD_DGNS_E_CD5,CLM_E_POA_IND_SW5,ICD_DGNS_E_CD6,CLM_E_POA_IND_SW6,ICD_DGNS_E_CD7,CLM_E_POA_IND_SW7,ICD_DGNS_E_CD8,CLM_E_POA_IND_SW8,ICD_DGNS_E_CD9,CLM_E_POA_IND_SW9,ICD_DGNS_E_CD10,CLM_E_POA_IND_SW10,ICD_DGNS_E_CD11,CLM_E_POA_IND_SW11,ICD_DGNS_E_CD12,CLM_E_POA_IND_SW12,ICD_PRCDR_CD1,PRCDR_DT1,ICD_PRCDR_CD2,PRCDR_DT2,ICD_PRCDR_CD3,PRCDR_DT3,ICD_PRCDR_CD4,PRCDR_DT4,ICD_PRCDR_CD5,PRCDR_DT5,ICD_PRCDR_CD6,PRCDR_DT6,ICD_PRCDR_CD7,PRCDR_DT7,ICD_PRCDR_CD8,PRCDR_DT8,ICD_PRCDR_CD9,PRCDR_DT9,ICD_PRCDR_CD10,PRCDR_DT10,ICD_PRCDR_CD11,PRCDR_DT11,ICD_PRCDR_CD12,PRCDR_DT12,ICD_PRCDR_CD13,PRCDR_DT13,ICD_PRCDR_CD14,PRCDR_DT14,ICD_PRCDR_CD15,PRCDR_DT15,ICD_PRCDR_CD16,PRCDR_DT16,ICD_PRCDR_CD17,PRCDR_DT17,ICD_PRCDR_CD18,PRCDR_DT18,ICD_PRCDR_CD19,PRCDR_DT19,ICD_PRCDR_CD20,PRCDR_DT20,ICD_PRCDR_CD21,PRCDR_DT21,ICD_PRCDR_CD22,PRCDR_DT22,ICD_PRCDR_CD23,PRCDR_DT23,ICD_PRCDR_CD24,PRCDR_DT24,ICD_PRCDR_CD25,PRCDR_DT25,DOB_DT,GNDR_CD,BENE_RACE_CD,BENE_CNTY_CD,BENE_STATE_CD,CWF_BENE_MDCR_STUS_CD,CLM_TRTMT_AUTHRZTN_NUM,CLM_PRCR_RTRN_CD,CLM_IP_LOW_VOL_PMT_AMT,CLM_CARE_IMPRVMT_MODEL_CD1,CLM_CARE_IMPRVMT_MODEL_CD2,CLM_CARE_IMPRVMT_MODEL_CD3,CLM_CARE_IMPRVMT_MODEL_CD4,CLM_BNDLD_MODEL_1_DSCNT_PCT,CLM_BASE_OPRTG_DRG_AMT,CLM_VBP_PRTCPNT_IND_CD,CLM_VBP_ADJSTMT_PCT,CLM_HRR_PRTCPNT_IND_CD,CLM_HRR_ADJSTMT_PCT,CLM_MODEL_4_READMSN_IND_CD,CLM_UNCOMPD_CARE_PMT_AMT,CLM_BNDLD_ADJSTMT_PMT_AMT,CLM_VBP_ADJSTMT_PMT_AMT,CLM_HRR_ADJSTMT_PMT_AMT,EHR_PYMT_ADJSTMT_AMT,PPS_STD_VAL_PYMT_AMT,FINL_STD_AMT,HAC_PGM_RDCTN_IND_SW,EHR_PGM_RDCTN_IND_SW,CLM_SITE_NTRL_PYMT_CST_AMT,CLM_SITE_NTRL_PYMT_IPPS_AMT,CLM_FULL_STD_PYMT_AMT,CLM_SS_OUTLIER_STD_PYMT_AMT,CLM_NEXT_GNRTN_ACO_IND_CD1,CLM_NEXT_GNRTN_ACO_IND_CD2,CLM_NEXT_GNRTN_ACO_IND_CD3,CLM_NEXT_GNRTN_ACO_IND_CD4,CLM_NEXT_GNRTN_ACO_IND_CD5,ACO_ID_NUM,year
<int>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<dbl>,<chr>,<int>,<int>,<dbl>,<chr>,<dbl>,<lgl>,<lgl>,<int>,<lgl>,<lgl>,<int>,<lgl>,<int>,<lgl>,<lgl>,<int>,<int>,<dbl>,<int>,<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<chr>,<dbl>,<int>,<int>,<lgl>,<int>,<dbl>,<dbl>,<chr>,<dbl>,<int>,<dbl>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<dbl>
100000203,2,100075,20160403,V,60,5,1,1,I,9101,,6110.17,0,,5,10,1881632818,,1134394356,,,1700010923.0,,,,,,,,1,2,39874.14,20160331,1,1,A,3.64,1288,0,0,0,0,435.76,393.21,0,42.55,0.0,0,0,0.9695,3,0,0,0,0,,,,20160403,194,0,0,E860,J189,J189,Y,E871,Y,I509,Y,F329,Y,F419,Y,E785,Y,K219,Y,I10,Y,A084,Y,K449,Y,E780,Y,Z880,0,Z882,0,E860,Y,Z7951,0,Z9049,Y,Z902,Y,Z85118,0,Z9181,0.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,BB241ZZ,20160402.0,BW211ZZ,20160331.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,2,2,1,510,33,10,,14,0,,62.0,,,0,4944.52,Y,1.0035584,1,0.9923,,1778.88,0,17.59,-38.07,0.0,5726.14,5637.38,,,0,0,0,0,,,,,,,2016
100000421,3,330393,20160617,V,60,3,1,1,1,13201,,26010.55,0,,1,33,1972523348,,1043251655,,,1043251655.0,,,,,,,,3,2,55892.75,20160614,3,1,A,260.35,1288,0,0,0,0,1972.96,1492.61,0,93.89,386.46,0,0,2.7513,3,0,0,0,0,,,,20160617,468,0,0,T84020A,T84020A,T84020A,Y,T84090A,Y,F952,Y,E559,Y,I071,Y,M419,Y,Z96642,Y,E780,Y,J45909,Y,M1990,Y,D649,Y,F419,Y,I371,Y,F328,Y,G43909,Y,R6250,Y,G4733,Y,I351,Y,Q6589,0.0,Z8774,0.0,Z87730,0.0,,,,,,,,,Y838,Y838,0.0,,,,,,,,,,,,,,,,,,,,,,,0SR901Z,20160614.0,0SP90JZ,20160614.0,0SP909Z,20160614.0,0SUA09Z,20160614.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1,2,1,700,33,20,,14,0,,62.0,,,0,18847.54,Y,0.9977695,1,0.9837,,929.15,0,-42.04,-307.21,0.0,16249.95,15950.71,,,0,0,0,0,,,,,,,2016
100000905,4,220033,20160920,V,60,3,1,1,1,14211,,4354.34,0,,1,22,1316917941,,1700174109,,,1720159627.0,,,1720159627.0,,,,,1,2,10105.28,20160918,1,1,A,0.0,1288,0,0,0,0,404.56,382.02,0,22.54,0.0,0,0,0.7294,2,0,0,0,0,,,,20160920,694,0,0,N132,N132,N132,Y,K760,Y,E119,Y,N3000,Y,B9620,Y,E039,Y,N8320,Y,R600,Y,R0789,Y,R9431,Y,J45909,Y,Z87442,0,Z87440,0,Z9049,Y,Z888,0,Z880,0,Z91013,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0T768DZ,20160918.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,2,1,40,22,10,,14,0,,,,,0,4750.12,Y,1.0024607,1,0.9915,,455.7,0,11.69,-40.38,58.16,4308.04,4247.64,,,0,0,0,0,,,,,,,2016
100000905,5,220171,20161015,V,60,5,1,1,I,14211,,7344.59,0,,5,22,1558392563,,1912193285,,,,,,,,,,,1,2,9593.91,20161012,1,1,A,138.28,0,0,0,0,0,547.26,487.59,0,0.0,59.67,0,0,0.9469,3,0,0,0,0,,,,20161015,194,0,0,R509,J189,J189,Y,J90,Y,E119,Y,R630,Y,Z936,0,I10,Y,Z87440,0,E039,Y,Z87442,0,J45909,Y,Z9049,0,Z801,0,Z833,0,H9193,Y,Z86011,0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,2,1,40,22,10,,14,0,,62.0,,,0,6060.46,Y,1.000835,1,0.9804,,0.0,0,5.06,-118.79,0.0,5646.3,5533.37,,,0,0,0,0,,,,,,,2016
100000945,6,220077,20160226,V,60,5,1,1,I,5901,,25869.46,0,,5,22,1487655064,,1730147679,,,1194832774.0,,,,,,,,1,2,15273.76,20160224,1,1,A,0.0,1288,0,0,0,0,1994.21,1585.2,0,145.2,263.81,0,0,3.0267,2,0,0,0,0,,,,20160226,654,0,0,N200,N132,N132,Y,G912,Y,N179,Y,I739,Y,N3281,Y,N189,Y,N529,Y,E785,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0TC78ZZ,20160225.0,0T7B8DZ,20160225.0,BT1FZZZ,20160225.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,1,1,70,22,10,,0,0,,,,,0,19946.33,Y,1.0000747,1,1.0,,956.45,0,1.49,0.0,0.0,17876.54,17544.77,,,0,0,0,0,,,,,,,2016
100001101,7,500072,20160314,V,60,3,1,1,1,2401,,8514.7,0,,1,50,1306845557,,1245325851,,,,,,,,,,,1,2,13765.01,20160310,1,1,A,0.0,1288,0,0,0,0,679.08,679.08,0,0.0,0.0,0,0,1.4261,4,0,0,0,0,,,,20160314,193,0,0,J189,J189,J189,Y,I5031,Y,I272,Y,I4891,Y,I071,Y,I10,Y,G4733,Y,Z96641,Y,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,5,2,1,40,50,10,,14,0,,,,,0,8486.46,Y,1.0056729,1,1.0,,238.14,0,48.14,0.0,0.0,8422.95,8280.25,,,0,0,0,0,,,,,,,2016


DESY_SORT_KEY,CLAIM_NO,PRVDR_NUM,CLM_THRU_DT,NCH_NEAR_LINE_REC_IDENT_CD,NCH_CLM_TYPE_CD,CLAIM_QUERY_CODE,CLM_FAC_TYPE_CD,CLM_SRVC_CLSFCTN_TYPE_CD,CLM_FREQ_CD,FI_NUM,CLM_MDCR_NON_PMT_RSN_CD,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,NCH_PRMRY_PYR_CD,PRVDR_STATE_CD,ORG_NPI_NUM,SRVC_LOC_NPI_NUM,AT_PHYSN_UPIN,AT_PHYSN_NPI,AT_PHYSN_SPCLTY_CD,OP_PHYSN_UPIN,OP_PHYSN_NPI,OP_PHYSN_SPCLTY_CD,OT_PHYSN_UPIN,OT_PHYSN_NPI,OT_PHYSN_SPCLTY_CD,RNDRNG_PHYSN_NPI,RNDRNG_PHYSN_SPCLTY_CD,RFR_PHYSN_NPI,RFR_PHYSN_SPCLTY_CD,CLM_MCO_PD_SW,PTNT_DSCHRG_STUS_CD,CLM_TOT_CHRG_AMT,NCH_BENE_BLOOD_DDCTBL_LBLTY_AM,NCH_PROFNL_CMPNT_CHRG_AMT,PRNCPAL_DGNS_CD,ICD_DGNS_CD1,ICD_DGNS_CD2,ICD_DGNS_CD3,ICD_DGNS_CD4,ICD_DGNS_CD5,ICD_DGNS_CD6,ICD_DGNS_CD7,ICD_DGNS_CD8,ICD_DGNS_CD9,ICD_DGNS_CD10,ICD_DGNS_CD11,ICD_DGNS_CD12,ICD_DGNS_CD13,ICD_DGNS_CD14,ICD_DGNS_CD15,ICD_DGNS_CD16,ICD_DGNS_CD17,ICD_DGNS_CD18,ICD_DGNS_CD19,ICD_DGNS_CD20,ICD_DGNS_CD21,ICD_DGNS_CD22,ICD_DGNS_CD23,ICD_DGNS_CD24,ICD_DGNS_CD25,FST_DGNS_E_CD,ICD_DGNS_E_CD1,ICD_DGNS_E_CD2,ICD_DGNS_E_CD3,ICD_DGNS_E_CD4,ICD_DGNS_E_CD5,ICD_DGNS_E_CD6,ICD_DGNS_E_CD7,ICD_DGNS_E_CD8,ICD_DGNS_E_CD9,ICD_DGNS_E_CD10,ICD_DGNS_E_CD11,ICD_DGNS_E_CD12,ICD_PRCDR_CD1,PRCDR_DT1,ICD_PRCDR_CD2,PRCDR_DT2,ICD_PRCDR_CD3,PRCDR_DT3,ICD_PRCDR_CD4,PRCDR_DT4,ICD_PRCDR_CD5,PRCDR_DT5,ICD_PRCDR_CD6,PRCDR_DT6,ICD_PRCDR_CD7,PRCDR_DT7,ICD_PRCDR_CD8,PRCDR_DT8,ICD_PRCDR_CD9,PRCDR_DT9,ICD_PRCDR_CD10,PRCDR_DT10,ICD_PRCDR_CD11,PRCDR_DT11,ICD_PRCDR_CD12,PRCDR_DT12,ICD_PRCDR_CD13,PRCDR_DT13,ICD_PRCDR_CD14,PRCDR_DT14,ICD_PRCDR_CD15,PRCDR_DT15,ICD_PRCDR_CD16,PRCDR_DT16,ICD_PRCDR_CD17,PRCDR_DT17,ICD_PRCDR_CD18,PRCDR_DT18,ICD_PRCDR_CD19,PRCDR_DT19,ICD_PRCDR_CD20,PRCDR_DT20,ICD_PRCDR_CD21,PRCDR_DT21,ICD_PRCDR_CD22,PRCDR_DT22,ICD_PRCDR_CD23,PRCDR_DT23,ICD_PRCDR_CD24,PRCDR_DT24,ICD_PRCDR_CD25,PRCDR_DT25,RSN_VISIT_CD1,RSN_VISIT_CD2,RSN_VISIT_CD3,NCH_BENE_PTB_DDCTBL_AMT,NCH_BENE_PTB_COINSRNC_AMT,CLM_OP_PRVDR_PMT_AMT,CLM_OP_BENE_PMT_AMT,DOB_DT,GNDR_CD,BENE_RACE_CD,BENE_CNTY_CD,BENE_STATE_CD,CWF_BENE_MDCR_STUS_CD,FI_CLM_ACTN_CD,NCH_BLOOD_PNTS_FRNSHD_QTY,CLM_TRTMT_AUTHRZTN_NUM,CLM_PRCR_RTRN_CD,CLM_OP_TRANS_TYPE_CD,CLM_OP_ESRD_MTHD_CD,CLM_NEXT_GNRTN_ACO_IND_CD1,CLM_NEXT_GNRTN_ACO_IND_CD2,CLM_NEXT_GNRTN_ACO_IND_CD3,CLM_NEXT_GNRTN_ACO_IND_CD4,CLM_NEXT_GNRTN_ACO_IND_CD5,ACO_ID_NUM,year
<int>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<dbl>,<dbl>,<chr>,<int>,<dbl>,<int>,<chr>,<int>,<chr>,<lgl>,<int>,<chr>,<lgl>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<int>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<chr>,<int>,<chr>,<int>,<int>,<int>,<lgl>,<lgl>,<lgl>,<chr>,<dbl>
100000015,2,220071,20160119,W,40,3,1,3,1,14211,,104.26,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,846.02,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,26.6,104.26,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,3,220071,20160122,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.0,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,4,220071,20160126,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.0,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,5,220071,20160128,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.0,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,6,220071,20160202,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.0,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016
100000015,7,220071,20160209,W,40,3,1,3,1,14211,,52.13,0,,22,1023049236,,,1174560288,11,,,,,,,,,,,,1,423.02,0,,J449,J449,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0,13.3,52.13,0,4,1,1,160,22,10,1,0,,1,C,0,,,,,,,2016


id,year,cluster
<int>,<dbl>,<dbl>
100000015,2016,2
100000019,2016,6
100000053,2016,6
100000099,2016,1
100000203,2016,1
100000221,2016,9


In [22]:
# select only cost, provider, id, year, and cluster
carrier_data <- carrier_data[, c("id", "year", "cost", "provider", "provider_specialty")]
inpatient_data <- inpatient_data[, .(
    id = DESY_SORT_KEY,
    year = year,
    cost = CLM_TOT_CHRG_AMT,
    provider = AT_PHYSN_NPI,
    provider_specialty = AT_PHYSN_SPCLTY_CD
)]

outpatient_data <- outpatient_data[, .(
    id = DESY_SORT_KEY,
    year = year,
    cost = CLM_TOT_CHRG_AMT,
    provider = AT_PHYSN_NPI,
    provider_specialty = AT_PHYSN_SPCLTY_CD
)]

# merge the data
full_data <- rbindlist(list(carrier_data, inpatient_data, outpatient_data), fill = TRUE)

# add clusters
full_data <- full_data[clustering_results, on = c("id", "year")]

# save the data
write.fst(full_data, "/work/postresearch/Shared/Projects/Farbod/Clustering/full_data.fst")

## MBSF data

In [23]:
# Finding outcomes from the mbsf data
mbsf_data <- read_fst("/work/postresearch/Shared/Projects/Data_fst/mbsf_data",
    as.data.table = TRUE
)


In [24]:
tail(mbsf_data)

DESY_SORT_KEY,REFERENCE_YEAR_2013,STATE_CODE_2013,COUNTY_CODE_2013,SEX_CODE_2013,RACE_CODE_2013,AGE_2013,ORIG_REASON_FOR_ENTITLEMENT_2013,CURR_REASON_FOR_ENTITLEMENT_2013,ENTITLEMENT_BUY_IN_IND01_2013,ENTITLEMENT_BUY_IN_IND02_2013,ENTITLEMENT_BUY_IN_IND03_2013,ENTITLEMENT_BUY_IN_IND04_2013,ENTITLEMENT_BUY_IN_IND05_2013,ENTITLEMENT_BUY_IN_IND06_2013,ENTITLEMENT_BUY_IN_IND07_2013,ENTITLEMENT_BUY_IN_IND08_2013,ENTITLEMENT_BUY_IN_IND09_2013,ENTITLEMENT_BUY_IN_IND10_2013,ENTITLEMENT_BUY_IN_IND11_2013,ENTITLEMENT_BUY_IN_IND12_2013,HMO_INDICATOR01_2013,HMO_INDICATOR02_2013,HMO_INDICATOR03_2013,HMO_INDICATOR04_2013,HMO_INDICATOR05_2013,HMO_INDICATOR06_2013,HMO_INDICATOR07_2013,HMO_INDICATOR08_2013,HMO_INDICATOR09_2013,HMO_INDICATOR10_2013,HMO_INDICATOR11_2013,HMO_INDICATOR12_2013,VALID_DATE_OF_DEATH_SWITCH_2013,DATE_OF_DEATH_2013,REFERENCE_YEAR_2014,STATE_CODE_2014,COUNTY_CODE_2014,SEX_CODE_2014,RACE_CODE_2014,AGE_2014,ORIG_REASON_FOR_ENTITLEMENT_2014,CURR_REASON_FOR_ENTITLEMENT_2014,ENTITLEMENT_BUY_IN_IND01_2014,ENTITLEMENT_BUY_IN_IND02_2014,ENTITLEMENT_BUY_IN_IND03_2014,ENTITLEMENT_BUY_IN_IND04_2014,ENTITLEMENT_BUY_IN_IND05_2014,ENTITLEMENT_BUY_IN_IND06_2014,ENTITLEMENT_BUY_IN_IND07_2014,ENTITLEMENT_BUY_IN_IND08_2014,ENTITLEMENT_BUY_IN_IND09_2014,ENTITLEMENT_BUY_IN_IND10_2014,ENTITLEMENT_BUY_IN_IND11_2014,ENTITLEMENT_BUY_IN_IND12_2014,HMO_INDICATOR01_2014,HMO_INDICATOR02_2014,HMO_INDICATOR03_2014,HMO_INDICATOR04_2014,HMO_INDICATOR05_2014,HMO_INDICATOR06_2014,HMO_INDICATOR07_2014,HMO_INDICATOR08_2014,HMO_INDICATOR09_2014,HMO_INDICATOR10_2014,HMO_INDICATOR11_2014,HMO_INDICATOR12_2014,VALID_DATE_OF_DEATH_SWITCH_2014,DATE_OF_DEATH_2014,REFERENCE_YEAR_2015,STATE_CODE_2015,COUNTY_CODE_2015,SEX_CODE_2015,RACE_CODE_2015,AGE_2015,ORIG_REASON_FOR_ENTITLEMENT_2015,CURR_REASON_FOR_ENTITLEMENT_2015,ENTITLEMENT_BUY_IN_IND01_2015,ENTITLEMENT_BUY_IN_IND02_2015,ENTITLEMENT_BUY_IN_IND03_2015,ENTITLEMENT_BUY_IN_IND04_2015,ENTITLEMENT_BUY_IN_IND05_2015,ENTITLEMENT_BUY_IN_IND06_2015,ENTITLEMENT_BUY_IN_IND07_2015,ENTITLEMENT_BUY_IN_IND08_2015,ENTITLEMENT_BUY_IN_IND09_2015,ENTITLEMENT_BUY_IN_IND10_2015,ENTITLEMENT_BUY_IN_IND11_2015,ENTITLEMENT_BUY_IN_IND12_2015,HMO_INDICATOR01_2015,HMO_INDICATOR02_2015,HMO_INDICATOR03_2015,HMO_INDICATOR04_2015,HMO_INDICATOR05_2015,HMO_INDICATOR06_2015,HMO_INDICATOR07_2015,HMO_INDICATOR08_2015,HMO_INDICATOR09_2015,HMO_INDICATOR10_2015,HMO_INDICATOR11_2015,HMO_INDICATOR12_2015,VALID_DATE_OF_DEATH_SWITCH_2015,DATE_OF_DEATH_2015,REFERENCE_YEAR_2016,STATE_CODE_2016,COUNTY_CODE_2016,SEX_CODE_2016,RACE_CODE_2016,AGE_2016,ORIG_REASON_FOR_ENTITLEMENT_2016,CURR_REASON_FOR_ENTITLEMENT_2016,ENTITLEMENT_BUY_IN_IND01_2016,ENTITLEMENT_BUY_IN_IND02_2016,ENTITLEMENT_BUY_IN_IND03_2016,ENTITLEMENT_BUY_IN_IND04_2016,ENTITLEMENT_BUY_IN_IND05_2016,ENTITLEMENT_BUY_IN_IND06_2016,ENTITLEMENT_BUY_IN_IND07_2016,ENTITLEMENT_BUY_IN_IND08_2016,ENTITLEMENT_BUY_IN_IND09_2016,ENTITLEMENT_BUY_IN_IND10_2016,ENTITLEMENT_BUY_IN_IND11_2016,ENTITLEMENT_BUY_IN_IND12_2016,HMO_INDICATOR01_2016,HMO_INDICATOR02_2016,HMO_INDICATOR03_2016,HMO_INDICATOR04_2016,HMO_INDICATOR05_2016,HMO_INDICATOR06_2016,HMO_INDICATOR07_2016,HMO_INDICATOR08_2016,HMO_INDICATOR09_2016,HMO_INDICATOR10_2016,HMO_INDICATOR11_2016,HMO_INDICATOR12_2016,VALID_DATE_OF_DEATH_SWITCH_2016,DATE_OF_DEATH_2016,REFERENCE_YEAR_2017,STATE_CODE_2017,COUNTY_CODE_2017,SEX_CODE_2017,RACE_CODE_2017,AGE_2017,ORIG_REASON_FOR_ENTITLEMENT_2017,CURR_REASON_FOR_ENTITLEMENT_2017,ENTITLEMENT_BUY_IN_IND01_2017,ENTITLEMENT_BUY_IN_IND02_2017,ENTITLEMENT_BUY_IN_IND03_2017,ENTITLEMENT_BUY_IN_IND04_2017,ENTITLEMENT_BUY_IN_IND05_2017,ENTITLEMENT_BUY_IN_IND06_2017,ENTITLEMENT_BUY_IN_IND07_2017,ENTITLEMENT_BUY_IN_IND08_2017,ENTITLEMENT_BUY_IN_IND09_2017,ENTITLEMENT_BUY_IN_IND10_2017,ENTITLEMENT_BUY_IN_IND11_2017,ENTITLEMENT_BUY_IN_IND12_2017,HMO_INDICATOR01_2017,HMO_INDICATOR02_2017,HMO_INDICATOR03_2017,HMO_INDICATOR04_2017,HMO_INDICATOR05_2017,HMO_INDICATOR06_2017,HMO_INDICATOR07_2017,HMO_INDICATOR08_2017,HMO_INDICATOR09_2017,HMO_INDICATOR10_2017,HMO_INDICATOR11_2017,HMO_INDICATOR12_2017,VALID_DATE_OF_DEATH_SWITCH_2017,DATE_OF_DEATH_2017,REFERENCE_YEAR_2018,STATE_CODE_2018,COUNTY_CODE_2018,SEX_CODE_2018,RACE_CODE_2018,AGE_2018,ORIG_REASON_FOR_ENTITLEMENT_2018,CURR_REASON_FOR_ENTITLEMENT_2018,ENTITLEMENT_BUY_IN_IND01_2018,ENTITLEMENT_BUY_IN_IND02_2018,ENTITLEMENT_BUY_IN_IND03_2018,ENTITLEMENT_BUY_IN_IND04_2018,ENTITLEMENT_BUY_IN_IND05_2018,ENTITLEMENT_BUY_IN_IND06_2018,ENTITLEMENT_BUY_IN_IND07_2018,ENTITLEMENT_BUY_IN_IND08_2018,ENTITLEMENT_BUY_IN_IND09_2018,ENTITLEMENT_BUY_IN_IND10_2018,ENTITLEMENT_BUY_IN_IND11_2018,ENTITLEMENT_BUY_IN_IND12_2018,HMO_INDICATOR01_2018,HMO_INDICATOR02_2018,HMO_INDICATOR03_2018,HMO_INDICATOR04_2018,HMO_INDICATOR05_2018,HMO_INDICATOR06_2018,HMO_INDICATOR07_2018,HMO_INDICATOR08_2018,HMO_INDICATOR09_2018,HMO_INDICATOR10_2018,HMO_INDICATOR11_2018,HMO_INDICATOR12_2018,VALID_DATE_OF_DEATH_SWITCH_2018,DATE_OF_DEATH_2018,REFERENCE_YEAR_2019,STATE_CODE_2019,COUNTY_CODE_2019,SEX_CODE_2019,RACE_CODE_2019,AGE_2019,ORIG_REASON_FOR_ENTITLEMENT_2019,CURR_REASON_FOR_ENTITLEMENT_2019,ENTITLEMENT_BUY_IN_IND01_2019,ENTITLEMENT_BUY_IN_IND02_2019,ENTITLEMENT_BUY_IN_IND03_2019,ENTITLEMENT_BUY_IN_IND04_2019,ENTITLEMENT_BUY_IN_IND05_2019,ENTITLEMENT_BUY_IN_IND06_2019,ENTITLEMENT_BUY_IN_IND07_2019,ENTITLEMENT_BUY_IN_IND08_2019,ENTITLEMENT_BUY_IN_IND09_2019,ENTITLEMENT_BUY_IN_IND10_2019,ENTITLEMENT_BUY_IN_IND11_2019,ENTITLEMENT_BUY_IN_IND12_2019,HMO_INDICATOR01_2019,HMO_INDICATOR02_2019,HMO_INDICATOR03_2019,HMO_INDICATOR04_2019,HMO_INDICATOR05_2019,HMO_INDICATOR06_2019,HMO_INDICATOR07_2019,HMO_INDICATOR08_2019,HMO_INDICATOR09_2019,HMO_INDICATOR10_2019,HMO_INDICATOR11_2019,HMO_INDICATOR12_2019,VALID_DATE_OF_DEATH_SWITCH_2019,DATE_OF_DEATH_2019,REFERENCE_YEAR_2020,STATE_CODE_2020,COUNTY_CODE_2020,SEX_CODE_2020,RACE_CODE_2020,AGE_2020,ORIG_REASON_FOR_ENTITLEMENT_2020,CURR_REASON_FOR_ENTITLEMENT_2020,ENTITLEMENT_BUY_IN_IND01_2020,ENTITLEMENT_BUY_IN_IND02_2020,ENTITLEMENT_BUY_IN_IND03_2020,ENTITLEMENT_BUY_IN_IND04_2020,ENTITLEMENT_BUY_IN_IND05_2020,ENTITLEMENT_BUY_IN_IND06_2020,ENTITLEMENT_BUY_IN_IND07_2020,ENTITLEMENT_BUY_IN_IND08_2020,ENTITLEMENT_BUY_IN_IND09_2020,ENTITLEMENT_BUY_IN_IND10_2020,ENTITLEMENT_BUY_IN_IND11_2020,ENTITLEMENT_BUY_IN_IND12_2020,HMO_INDICATOR01_2020,HMO_INDICATOR02_2020,HMO_INDICATOR03_2020,HMO_INDICATOR04_2020,HMO_INDICATOR05_2020,HMO_INDICATOR06_2020,HMO_INDICATOR07_2020,HMO_INDICATOR08_2020,HMO_INDICATOR09_2020,HMO_INDICATOR10_2020,HMO_INDICATOR11_2020,HMO_INDICATOR12_2020,VALID_DATE_OF_DEATH_SWITCH_2020,DATE_OF_DEATH_2020,deate_of_death_collapsed,death_validity_collapsed,date_of_death_collapsed
<int>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<IDate>,<chr>,<IDate>
499999741,13,19,510,1,1,52,1,1,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,,,14,19,510,1,1,53,1,1,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,,,15,19,510,1,1,54,1,1,C,C,C,C,C,C,C,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2016,19,510,1,1,55,1,1,3,3,3,3,3,3,3,3,3,3,3,3,C,C,0,C,C,C,C,C,C,C,C,C,,,2017,19,510,1,1,56,1,1,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2018,19,510,1,1,57,1,1,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2019,19,510,1,1,58,1,1,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2020,19,510,1,1,59,1,1,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,,False,
499999841,13,45,610,1,1,73,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,14,45,610,1,1,74,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,15,45,610,1,1,75,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2016,45,610,1,1,76,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2017,45,610,1,1,77,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2018,45,610,1,1,78,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2019,45,610,1,1,79,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2020,45,610,1,1,80,0,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,,False,
499999951,13,50,120,2,1,53,1,1,C,C,C,C,C,C,C,C,C,C,C,C,0,0,0,0,0,0,0,0,0,0,0,0,,,14,50,120,2,1,54,1,1,C,C,C,C,C,C,C,C,C,C,C,C,0,0,0,0,0,0,0,0,0,0,0,0,,,15,50,120,2,1,55,1,1,C,C,C,C,C,C,C,C,C,C,C,C,0,0,0,0,0,0,0,0,0,0,0,0,,,2016,50,120,2,1,56,1,1,C,C,C,C,C,C,C,C,C,C,C,C,0,0,0,0,0,0,0,0,0,0,0,0,,,2017,50,120,2,1,57,1,1,C,C,C,C,C,C,C,C,C,C,C,C,0,0,0,0,0,0,0,0,0,0,0,0,,,2018,50,120,2,1,58,1,1,C,C,C,C,C,C,C,C,C,C,C,C,0,0,0,0,0,0,0,0,0,0,0,0,,,2019,50,120,2,1,59,1,1,C,C,C,C,C,C,C,C,C,C,C,C,0,0,0,0,0,0,0,0,0,0,0,0,,,2020,50,120,2,1,60,1,1,C,C,C,C,C,C,C,C,C,C,C,C,0,0,0,0,0,0,0,0,0,0,0,0,,,,False,
499999961,13,38,250,1,2,75,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,14,38,250,1,2,76,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,15,38,250,1,2,77,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2016,38,250,1,2,78,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2017,38,250,1,2,79,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2018,38,250,1,2,80,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2019,38,250,1,2,81,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2020,38,250,1,2,82,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,,False,
499999971,13,38,190,2,1,73,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,14,50,70,2,1,74,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,15,50,70,2,1,75,0,0,3,3,3,3,3,3,3,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2016,50,70,2,1,76,0,0,3,3,3,3,3,3,C,C,C,C,C,3,C,C,C,C,C,C,C,C,C,C,C,C,,,2017,50,70,2,1,77,0,0,3,3,3,3,3,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,,,2018,50,70,2,1,78,0,0,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,,,2019,50,70,2,1,79,0,0,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,,,2020,50,70,2,1,80,0,0,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,C,,,,False,
499999981,13,50,160,1,1,61,1,1,3,C,C,C,C,C,C,C,C,C,C,C,0,0,0,0,0,0,0,0,0,0,0,0,,,14,50,160,1,1,62,1,1,C,C,C,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,15,50,160,1,1,63,1,1,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2016,50,160,1,1,64,1,0,3,C,C,C,C,C,C,C,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2017,50,160,1,1,65,1,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2018,50,160,1,1,66,1,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2019,50,160,1,1,67,1,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,2020,50,160,1,1,68,1,0,3,3,3,3,3,3,3,3,3,3,3,3,0,0,0,0,0,0,0,0,0,0,0,0,,,,False,


In [25]:
# Extract data from the mbsf data
mbsf_data <- mbsf_data[, .(
    id = DESY_SORT_KEY,
    state = STATE_CODE_2016,
    county = COUNTY_CODE_2016,
    sex = SEX_CODE_2016,
    race = RACE_CODE_2016,
    age = AGE_2016,
    died_2016 = !is.na(DATE_OF_DEATH_2016),
    died_2017 = !is.na(DATE_OF_DEATH_2017),
    died_2018 = !is.na(DATE_OF_DEATH_2018),
    died_2019 = !is.na(DATE_OF_DEATH_2019),
    died_2020 = !is.na(DATE_OF_DEATH_2020)
)]

In [26]:
head(mbsf_data[died_2018 == T])

id,state,county,sex,race,age,died_2016,died_2017,died_2018,died_2019,died_2020
<int>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>
100002065,22,40,2,1,79,False,False,True,False,False
100003247,32,150,1,6,83,False,False,True,False,False
100003583,46,210,1,1,79,False,False,True,False,False
100006751,10,550,2,1,80,False,False,True,False,False
100007013,33,240,1,1,79,False,False,True,False,False
100008815,5,480,1,3,79,False,False,True,False,False


## Comorbidities

In [3]:
wide_data_presence <- read.fst(
    "/work/postresearch/Shared/Projects/Farbod/Clustering/wide_data_presence_2016.fst",
    as.data.table = TRUE
)
head(wide_data_presence)

id,year,A02,A03,A04,A05,A06,A07,A08,A09,A15,A18,A23,A25,A28,A30,A31,A32,A36,A37,A38,A39,A40,A41,A42,A43,A46,A48,A49,A50,A51,A52,A53,A54,A56,A59,A60,A63,A68,A69,A74,A75,A77,A79,A80,A81,A83,A86,A87,A88,A92,A93,A95,B00,B01,B02,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B44,B45,B46,B47,B48,B49,B50,B57,B58,B59,B60,B69,B72,B78,B81,B83,B85,B86,B87,B88,B89,B90,B91,B94,B95,B96,B97,B99,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C30,C31,C32,C33,C34,C37,C38,C40,C41,C43,C44,C45,C46,C47,C48,C49,C50,⋯,V58,V63,V73,V78,V79,V88,V94,W04,W21,W27,W28,W29,W32,W33,W39,W46,W52,W53,W60,W61,W89,W90,X00,X01,X02,X10,X12,X13,X15,X18,X32,X52,Y00,Y03,Y09,Y24,Y27,Y29,Y30,A82,B52,B68,C58,G07,G41,N24,O01,O31,P10,P12,P23,P51,P54,P55,P84,Q00,V46,V74,V84,V90,V91,V95,V97,V98,V99,W03,W24,W35,W37,W40,W56,W86,W94,X04,X35,X39,X72,X94,Y23,Y32,Y62,A89,O07,P03,P37,P39,P58,P60,P81,V05,V16,V62,V69,V70,V81,V85,W09,W15,W99,X06,X37,X93,Y38,Y80,A22,A34,A65,A99,B04,B79,F35,F62,H45,O29,O89,P00,P01,P02,P04,P22,P26,P36,P52,P57,P77,S17,T64,V06,W85,X40,Y64,Z05,Z38,B92,E33,H48,K67,M78,P93,V20,B12,H22,I14,J29,M52,P95,V32,V76,V82,V96
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000015,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000019,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000053,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000099,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000203,2016,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
100000221,2016,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# find charlson comorbidity index
# turn wide data into long data (melt)
# do this chuwide_data_presenceks to avoid memory issues
wide_data_presence[, year := NULL]
wide_data_presence[wide_data_presence == 0] <- NA
head(wide_data_presence)

“NAs produced by integer overflow”


id,A02,A03,A04,A05,A06,A07,A08,A09,A15,A18,A23,A25,A28,A30,A31,A32,A36,A37,A38,A39,A40,A41,A42,A43,A46,A48,A49,A50,A51,A52,A53,A54,A56,A59,A60,A63,A68,A69,A74,A75,A77,A79,A80,A81,A83,A86,A87,A88,A92,A93,A95,B00,B01,B02,B07,B08,B09,B10,B15,B16,B17,B18,B19,B20,B25,B26,B27,B30,B33,B34,B35,B36,B37,B38,B39,B40,B44,B45,B46,B47,B48,B49,B50,B57,B58,B59,B60,B69,B72,B78,B81,B83,B85,B86,B87,B88,B89,B90,B91,B94,B95,B96,B97,B99,C00,C01,C02,C03,C04,C05,C06,C07,C08,C09,C10,C11,C12,C13,C14,C15,C16,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26,C30,C31,C32,C33,C34,C37,C38,C40,C41,C43,C44,C45,C46,C47,C48,C49,C50,C51,⋯,V58,V63,V73,V78,V79,V88,V94,W04,W21,W27,W28,W29,W32,W33,W39,W46,W52,W53,W60,W61,W89,W90,X00,X01,X02,X10,X12,X13,X15,X18,X32,X52,Y00,Y03,Y09,Y24,Y27,Y29,Y30,A82,B52,B68,C58,G07,G41,N24,O01,O31,P10,P12,P23,P51,P54,P55,P84,Q00,V46,V74,V84,V90,V91,V95,V97,V98,V99,W03,W24,W35,W37,W40,W56,W86,W94,X04,X35,X39,X72,X94,Y23,Y32,Y62,A89,O07,P03,P37,P39,P58,P60,P81,V05,V16,V62,V69,V70,V81,V85,W09,W15,W99,X06,X37,X93,Y38,Y80,A22,A34,A65,A99,B04,B79,F35,F62,H45,O29,O89,P00,P01,P02,P04,P22,P26,P36,P52,P57,P77,S17,T64,V06,W85,X40,Y64,Z05,Z38,B92,E33,H48,K67,M78,P93,V20,B12,H22,I14,J29,M52,P95,V32,V76,V82,V96
<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
100000015,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,⋯,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100000019,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,⋯,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100000053,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,⋯,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100000099,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,⋯,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100000203,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,1.0,,,,,,,,,,,,,,⋯,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
100000221,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,⋯,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [12]:
# turn column names into values in a new column called "code",
long_data_presence <- melt(wide_data_presence, id.vars = c("id"), na.rm = T)
head(long_data_presence)

id,variable,value
<int>,<fct>,<dbl>
100079759,A02,1
100083267,A02,1
100367273,A02,1
102316111,A02,1
102758339,A02,1
103000143,A02,1


In [15]:
charlson_result <- comorbidity(
    x = long_data_presence,
    id = "id",
    code = "variable",
    map = "charlson_icd10_quan",
    assign0 = T
)
charlson_result$score <- score(charlson_result, weights = "quan", assign0 = T)
head(charlson_result)

Unnamed: 0_level_0,id,mi,chf,pvd,cevd,dementia,cpd,rheumd,pud,mld,diab,diabwc,hp,rend,canc,msld,metacanc,aids
Unnamed: 0_level_1,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
1,100000015,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,100000019,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0
3,100000053,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4,100000099,0,1,1,0,0,0,0,0,1,0,0,0,1,0,0,0,0
5,100000203,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0
6,100000221,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Merging all the data

In [29]:
full_data <- full_data[as.data.table(charlson_result), on = c("id")]
full_data <- full_data[mbsf_data, on = c("id")]
head(full_data)

# save the data
write.fst(full_data, "/work/postresearch/Shared/Projects/Farbod/Clustering/full_data_with_charlson.fst")

id,year,cost,provider,provider_specialty,cluster,mi,chf,pvd,cevd,dementia,cpd,rheumd,pud,mld,diab,diabwc,hp,rend,canc,msld,metacanc,aids,score,state,county,sex,race,age,died_2016,died_2017,died_2018,died_2019,died_2020,i.cluster
<int>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>
100000015,2016,110.66,1265609275,18,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,1961.0,1265609275,18,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,113.64,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,18.34,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,371.86,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,75.37,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2


# Calculations and analyses

In [17]:
full_data <- read.fst("/work/postresearch/Shared/Projects/Farbod/Clustering/full_data_with_charlson.fst",
    as.data.table = TRUE
)

In [18]:
head(full_data)

id,year,cost,provider,provider_specialty,cluster,mi,chf,pvd,cevd,dementia,cpd,rheumd,pud,mld,diab,diabwc,hp,rend,canc,msld,metacanc,aids,score,state,county,sex,race,age,died_2016,died_2017,died_2018,died_2019,died_2020,i.cluster
<int>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>
100000015,2016,110.66,1265609275,18,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,1961.0,1265609275,18,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,113.64,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,18.34,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,371.86,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2
100000015,2016,75.37,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2


In [27]:
full_data_unique_patient <- full_data[, .(
    score = unique(score),
    cluster = unique(cluster),
    state = unique(state),
    county = unique(county),
    age = unique(age),
    sex = unique(sex),
    total_cost = sum(cost),
    died_2016 = unique(died_2016),
    died_2017 = unique(died_2017),
    died_2018 = unique(died_2018),
    died_2019 = unique(died_2019),
    died_2020 = unique(died_2020),
    mi = unique(mi),
    cancer = unique(canc),
    metastatic_cancer = unique(metacanc),
    aids = unique(aids),
    diabetes = unique(diab),
    diabetes_w_complications = unique(diabwc),
    chf = unique(chf),
    cerebrovascular = unique(cevd),
    rheumatoid_disease = unique(rheumd),
    renal_disease = unique(rend),
    moderate_or_sever_liver_disease = unique(msld)
), by = c("id", "year")]

head(full_data_unique_patient)


id,year,score,cluster,state,county,age,sex,total_cost,died_2016,died_2017,died_2018,died_2019,died_2020,mi,cancer,metastatic_cancer,aids,diabetes,diabetes_w_complications,chf,cerebrovascular,rheumatoid_disease,renal_disease,moderate_or_sever_liver_disease
<int>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>
100000015,2016,1,2,22,160,79,1,61177.19,False,False,False,False,False,0,0,0,0,0,0,0,0,0,0,0
100000019,2016,2,6,7,50,79,2,3047.27,False,False,False,False,False,0,1,0,0,0,0,0,0,0,0,0
100000053,2016,3,6,33,331,80,1,9518.4,False,False,False,False,False,0,0,0,0,0,0,1,0,0,0,0
100000099,2016,5,1,10,350,85,1,14722.77,False,False,False,False,False,0,0,0,0,0,0,1,0,0,1,0
100000203,2016,5,1,33,510,68,2,47242.07,False,False,False,False,False,0,1,0,0,0,0,1,0,0,0,0
100000221,2016,0,9,33,90,80,1,1094.0,True,False,False,False,False,0,0,0,0,0,0,0,0,0,0,0


In [30]:
# discriptives by cluster with 95% confidence intervals
full_data_unique_patient[, .(
    n = .N,
    died = sum(died_2017) + sum(died_2018) + sum(died_2019) + sum(died_2020),
    died_rate = (sum(died_2017) + sum(died_2018) + sum(died_2019) + sum(died_2020)) / .N,
    mean_age = mean(age),
    mean_score = mean(score),
    mean_cost = mean(total_cost),
    percent_female = mean(sex == 2),
    percent_mi = mean(mi == 1),
    percent_cancer = mean(cancer == 1),
    percent_metastatic_cancer = mean(metastatic_cancer == 1),
    percent_aids = mean(aids == 1),
    #percent_diabetes = mean(diabetes == 1),
    #percent_diabetes_w_complications = mean(diabetes_w_complications == 1),
    percent_chf = mean(chf == 1),
    percent_cerebrovascular = mean(cerebrovascular == 1),
    percent_rheumatoid_disease = mean(rheumatoid_disease == 1),
    percent_renal_disease = mean(renal_disease == 1)
    #percent_moderate_or_sever_liver_disease = mean(moderate_or_sever_liver_disease == 1)
), by = c("cluster")][order(-died_rate)]

cluster,n,died,died_rate,mean_age,mean_score,mean_cost,percent_female,percent_mi,percent_cancer,percent_metastatic_cancer,percent_aids,percent_chf,percent_cerebrovascular,percent_rheumatoid_disease,percent_renal_disease
<dbl>,<int>,<int>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
5.0,63403,31534.0,0.49735817,78.80704,2.3360567,38664.833,0.6360109,0.019352397,0.05785215,0.010661956,0.00274435,0.16306799,0.35417882,0.021544722,0.16426668
10.0,83332,32582.0,0.39099026,73.54228,4.1460063,293630.218,0.4880118,0.150470408,0.15726252,0.124141986,0.006960111,0.56127298,0.32460519,0.052128834,0.54500072
1.0,70226,25486.0,0.36291402,71.27471,2.6442628,142076.644,0.6605104,0.050024208,0.14658389,0.044584627,0.007433144,0.30659585,0.36026543,0.098239968,0.29614388
2.0,101748,32659.0,0.32097928,73.99095,2.1538605,93182.043,0.4321264,0.090616032,0.11817431,0.014182097,0.002525848,0.43882926,0.22248103,0.034624759,0.35777607
4.0,106149,17922.0,0.16883814,70.37733,1.623793,54348.584,0.5373673,0.011578065,0.19765612,0.076477404,0.003853074,0.07301058,0.10587947,0.044220859,0.12727393
9.0,615010,89054.0,0.1448009,67.98925,0.3432627,5450.668,0.5085803,0.003126778,0.04050341,0.005211297,0.002978813,0.01960293,0.02557519,0.009303914,0.03044829
3.0,127958,16298.0,0.12736992,74.64402,0.7413839,14689.494,0.4286719,0.004321731,0.16532769,0.010104878,0.001211335,0.03473796,0.08727864,0.022554276,0.07951046
6.0,449941,53136.0,0.11809548,71.6283,0.5649407,10989.099,0.5871325,0.002851485,0.07999493,0.004300564,0.004331679,0.02973501,0.07666338,0.024814365,0.08340871
7.0,148717,16066.0,0.10803069,68.25178,0.7495041,34818.077,0.6530861,0.005466759,0.06929941,0.005883658,0.004209337,0.04223458,0.09711734,0.075687379,0.07849809
8.0,78781,6175.0,0.07838184,67.7547,1.1821505,27719.831,0.9834224,0.003706477,0.28265699,0.035211536,0.001751691,0.03274901,0.07164164,0.044160394,0.05965905


## Find costs of each physician per patient

In [4]:
# find mean cost per physician
physician_cost <- full_data[, .(
    sum_cost = sum(cost),
    n = length(unique(id))
), by = c("provider", "provider_specialty", "cluster")]
head(physician_cost)

provider,provider_specialty,cluster,sum_cost,n
<chr>,<chr>,<dbl>,<dbl>,<int>
1265609275,18,2,27529.78,2
1730170630,6,2,36914.47,2
1811988454,6,2,5399.44,8
1962492033,30,2,105.0,4
1043294770,93,2,231.6,1
1770514119,11,2,1801.37,5


In [5]:
physician_cost[, mean_cost := sum_cost / n]
head(physician_cost)

provider,provider_specialty,cluster,sum_cost,n,mean_cost
<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>
1265609275,18,2,27529.78,2,13764.89
1730170630,6,2,36914.47,2,18457.235
1811988454,6,2,5399.44,8,674.93
1962492033,30,2,105.0,4,26.25
1043294770,93,2,231.6,1,231.6
1770514119,11,2,1801.37,5,360.274


In [34]:
# find the percentile of the mean cost of each physician in each cluster
physician_cost[, percentile := rank(mean_cost) / .N, by = c("cluster", "provider_specialty")]
# add decile
physician_cost[, decile := cut(percentile, breaks = seq(0, 1, 0.1), include.lowest = T)]
# add quartile
physician_cost[, quartile := cut(percentile, breaks = seq(0, 1, 0.25), include.lowest = T)]
head(physician_cost)

provider,provider_specialty,cluster,sum_cost,n,mean_cost,percentile,decile,quartile
<chr>,<chr>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<fct>,<fct>
1265609275,18,2,27529.78,2,13764.89,0.9819476,"(0.9,1]","(0.75,1]"
1730170630,6,2,36914.47,2,18457.235,0.9211876,"(0.9,1]","(0.75,1]"
1811988454,6,2,5399.44,8,674.93,0.4264507,"(0.4,0.5]","(0.25,0.5]"
1962492033,30,2,105.0,4,26.25,0.1734439,"(0.1,0.2]","[0,0.25]"
1043294770,93,2,231.6,1,231.6,0.3422514,"(0.3,0.4]","(0.25,0.5]"
1770514119,11,2,1801.37,5,360.274,0.4002623,"(0.4,0.5]","(0.25,0.5]"


In [40]:
# create a  dataset with unique id and other things
modelling_data <- unique(full_data[, -("cost")])
# merge with physician cost
modelling_data <- modelling_data[physician_cost, on = c("provider", "provider_specialty", "cluster")]

In [4]:
# add score quartile
modelling_data[, score_quartile := cut(score, breaks = seq(0,20,5), include.lowest = T)]

In [5]:
# add cluster score mean and cluster age mean
modelling_data[, cluster_score_mean := mean(score), by = c("cluster")]
modelling_data[, cluster_age_mean := mean(age), by = c("cluster")]

# add cluster centered age and score
modelling_data[, cluster_centered_score := score - cluster_score_mean]
modelling_data[, cluster_centered_age := age - cluster_age_mean]


In [6]:
modelling_data[,cluster := as.factor(cluster)]

In [7]:
head(modelling_data)

id,year,provider,provider_specialty,cluster,mi,chf,pvd,cevd,dementia,cpd,rheumd,pud,mld,diab,diabwc,hp,rend,canc,msld,metacanc,aids,score,state,county,sex,race,age,died_2016,died_2017,died_2018,died_2019,died_2020,i.cluster,sum_cost,n,mean_cost,percentile,decile,quartile,score_quartile,cluster_score_mean,cluster_age_mean,cluster_centered_score,cluster_centered_age
<int>,<dbl>,<chr>,<chr>,<fct>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>
100000015,2016,1265609275,18,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2,27529.78,2,13764.89,0.9819476,"(0.9,1]","(0.75,1]","[0,5]",2.249246,74.01825,-1.2492465,4.981753
194457811,2016,1265609275,18,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,160,2,1,83,False,False,False,False,False,2,27529.78,2,13764.89,0.9819476,"(0.9,1]","(0.75,1]","[0,5]",2.249246,74.01825,-2.2492465,8.981753
100000015,2016,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2,36914.47,2,18457.24,0.9211876,"(0.9,1]","(0.75,1]","[0,5]",2.249246,74.01825,-1.2492465,4.981753
163502455,2016,1730170630,6,2,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,10,530,1,1,85,False,False,False,False,True,2,36914.47,2,18457.24,0.9211876,"(0.9,1]","(0.75,1]","[0,5]",2.249246,74.01825,0.7507535,10.981753
100000015,2016,1811988454,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2,5399.44,8,674.93,0.4264507,"(0.4,0.5]","(0.25,0.5]","[0,5]",2.249246,74.01825,-1.2492465,4.981753
108934919,2016,1811988454,6,2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,22,90,2,3,78,False,False,False,False,False,2,5399.44,8,674.93,0.4264507,"(0.4,0.5]","(0.25,0.5]","[0,5]",2.249246,74.01825,0.7507535,3.981753


In [8]:
#save the data
write.fst(modelling_data, "/work/postresearch/Shared/Projects/Farbod/Clustering/modelling_data.fst")

# Descriptives

In [96]:
# death in 2017 to 2018 (uear after the data) divided by cluster and physician quartile with 95% confidence intervals
modelling_data[provider_specialty == "06" 
    , .(
    n = .N,
    died = sum(died_2017) + sum(died_2018),
    died_rate = paste(round((sum(died_2017) + sum(died_2018)) / .N, 3), " (", round(1.96 * sqrt((sum(died_2017) + sum(died_2018) ) / .N * (1 - (sum(died_2017) + sum(died_2018) ) / .N) / .N), 3), ")"),
    mean_spending = round(mean(mean_cost), 3),
    pct_mi =  paste(round(mean(mi == 1), 3), " (", round(1.96 * sqrt(mean(mi == 1) * (1 - mean(mi == 1)) / .N), 3), ")")
), keyby = c("cluster", "quartile")]

cluster,quartile,n,died,died_rate,mean_spending,pct_mi
<fct>,<fct>,<int>,<int>,<chr>,<dbl>,<chr>
1,"[0,0.25]",43501,9017,0.207 ( 0.004 ),64.358,0.09 ( 0.003 )
1,"(0.25,0.5]",40046,8606,0.215 ( 0.004 ),220.329,0.092 ( 0.003 )
1,"(0.5,0.75]",37592,7818,0.208 ( 0.004 ),715.264,0.095 ( 0.003 )
1,"(0.75,1]",30581,6266,0.205 ( 0.005 ),7504.4,0.095 ( 0.003 )
2,"[0,0.25]",47210,7286,0.154 ( 0.003 ),146.756,0.144 ( 0.003 )
2,"(0.25,0.5]",56762,8678,0.153 ( 0.003 ),569.7,0.137 ( 0.003 )
2,"(0.5,0.75]",59296,9023,0.152 ( 0.003 ),1973.391,0.13 ( 0.003 )
2,"(0.75,1]",48877,7323,0.15 ( 0.003 ),15626.806,0.111 ( 0.003 )
3,"[0,0.25]",12855,918,0.071 ( 0.004 ),76.88,0.019 ( 0.002 )
3,"(0.25,0.5]",19792,1512,0.076 ( 0.004 ),269.174,0.014 ( 0.002 )


# Modeling

In [2]:
modelling_data <- read.fst("/work/postresearch/Shared/Projects/Farbod/Clustering/modelling_data.fst",
    as.data.table = TRUE
)

In [12]:
head(modelling_data)

id,year,provider,provider_specialty,cluster,mi,chf,pvd,cevd,dementia,cpd,rheumd,pud,mld,diab,diabwc,hp,rend,canc,msld,metacanc,aids,score,state,county,sex,race,age,died_2016,died_2017,died_2018,died_2019,died_2020,i.cluster,sum_cost,n,mean_cost,percentile,decile,quartile,score_quartile,cluster_score_mean,cluster_age_mean,cluster_centered_score,cluster_centered_age,cluster_score_mean_cut
<int>,<dbl>,<chr>,<chr>,<fct>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<int>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<lgl>,<lgl>,<lgl>,<lgl>,<lgl>,<dbl>,<dbl>,<int>,<dbl>,<dbl>,<fct>,<fct>,<fct>,<dbl>,<dbl>,<dbl>,<dbl>,<fct>
100000015,2016,1265609275,18,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2,27529.78,2,13764.89,0.9819476,"(0.9,1]","(0.75,1]","[0,5]",2.249246,74.01825,-1.2492465,4.981753,"(2,3]"
194457811,2016,1265609275,18,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,22,160,2,1,83,False,False,False,False,False,2,27529.78,2,13764.89,0.9819476,"(0.9,1]","(0.75,1]","[0,5]",2.249246,74.01825,-2.2492465,8.981753,"(2,3]"
100000015,2016,1730170630,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2,36914.47,2,18457.24,0.9211876,"(0.9,1]","(0.75,1]","[0,5]",2.249246,74.01825,-1.2492465,4.981753,"(2,3]"
163502455,2016,1730170630,6,2,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,3,10,530,1,1,85,False,False,False,False,True,2,36914.47,2,18457.24,0.9211876,"(0.9,1]","(0.75,1]","[0,5]",2.249246,74.01825,0.7507535,10.981753,"(2,3]"
100000015,2016,1811988454,6,2,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,22,160,1,1,79,False,False,False,False,False,2,5399.44,8,674.93,0.4264507,"(0.4,0.5]","(0.25,0.5]","[0,5]",2.249246,74.01825,-1.2492465,4.981753,"(2,3]"
108934919,2016,1811988454,6,2,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,3,22,90,2,3,78,False,False,False,False,False,2,5399.44,8,674.93,0.4264507,"(0.4,0.5]","(0.25,0.5]","[0,5]",2.249246,74.01825,0.7507535,3.981753,"(2,3]"


In [11]:
# add clsuter score mean cuts by 1
modelling_data[, cluster_score_mean_cut := cut(cluster_score_mean, breaks = seq(0,20,1), include.lowest = T)]

In [17]:
# run the multilevel model (physicians nested in clsuters)
#these will be physicians nested in specialties nested in clusters
library(lme4)

model_death_next_year <- glmer(
    died_2017 ~ cluster + provider_specialty + percentile + age + score*percentile + provider_specialty*percentile + (1 | provider_specialty) + (1 | provider_specialty:cluster),
    data = modelling_data[1:10000],
    family = binomial(link = "logit"),
    control = glmerControl(optimizer = "bobyqa"))

fixed-effect model matrix is rank deficient so dropping 13 columns / coefficients

“Some predictor variables are on very different scales: consider rescaling”
“maxfun < 10 * length(par)^2 is not recommended.”


In [None]:
# scale 

In [13]:
# run the multilevel model (physicians nested in clsuters)
# these will be physicians nested in specialties nested in clusters
library(lme4)

model_death_next_year <- glmer(
    died_2017 ~ cluster_age_mean + cluster_centered_score + cluster_centered_age + cluster_score_mean_cut * quartile + (1|cluster),
    data = modelling_data[provider_specialty=="06"],
    family = binomial(link = "logit"),
    control = glmerControl(optimizer = "bobyqa")
)

: 

In [None]:
summary(model_death_next_year)

Generalized linear mixed model fit by maximum likelihood (Laplace Approximation) ['glmerMod']
 Family: binomial  ( logit )
Formula: died_2017 ~ cluster_age_mean + cluster_centered_score + cluster_centered_age +      cluster_score_mean * quartile + (1 | cluster)
   Data: modelling_data[provider_specialty == "06"]
Control: glmerControl(optimizer = "bobyqa")

      AIC       BIC    logLik  deviance  df.resid 
 608938.2  609082.3 -304457.1  608914.2   1208531 

Scaled residuals: 
    Min      1Q  Median      3Q     Max 
-1.5500 -0.3294 -0.2124 -0.1457 15.1756 

Random effects:
 Groups  Name        Variance Std.Dev.
 cluster (Intercept) 0.1271   0.3565  
Number of obs: 1208543, groups:  cluster, 10

Fixed effects:
                                       Estimate Std. Error z value Pr(>|z|)    
(Intercept)                           -9.484716   0.253593 -37.401  < 2e-16 ***
cluster_age_mean                       0.077187   0.004629  16.676  < 2e-16 ***
cluster_centered_score                 0.