Skip to content

Dynamic Time Warping model with hierarchical clustering

Notifications You must be signed in to change notification settings

AVJdataminer/DTW

Repository files navigation

Dynamic Time Warping with hiererchical clustering

Create directories with date and versions

## Create directories first
t=format(Sys.time(), "%d_%b_%Y")
dir.create(paste("C:/MinuteClinic/Dynamic_time_warping/modeling_work_",t,sep=""))
setwd(paste("C:/MinuteClinic/Dynamic_time_warping/modeling_work_",t,sep=""))
subDir=paste(getwd())
version_path=paste(subDir,"/version_name",sep="")
ifelse(file.exists(version_path),,dir.create(file.path(version_path)))
indata_path=paste(subDir,"/dtw_indata",sep="")
ifelse(file.exists(indata_path),,dir.create(file.path(indata_path)))

modelpath=version_path
outpath=paste(subDir,"/version_name/output",sep="")
ifelse(file.exists(outpath),,dir.create(file.path(outpath)))
#create figures folder
#figures=paste(subDir,"/version_name/figures",sep="")
#ifelse(file.exists(figures),,dir.create(file.path(figures)))

list.files()
setwd(modelpath)
list.files()

TRUE

TRUE

TRUE

  1. 'dtw_indata'
  2. 'version_name'

'output'

Create a time series data set from hourly data

Starting with visit level data. Create an hourly aggregated data set for analysis.

require(dplyr)
require(lubridate)
# Develop best version hourly data ------
options(warn=0)
visit_level=read.csv('C:/MinuteClinic/Data/MC_data_01JAN2016_to_02JUL2018.csv')
visit_level$Employee=ifelse(visit_level$VisitOrigin=='Employee',1,0)
visit_level$Hmpil=ifelse(visit_level$VisitOrigin=='Hmpil',1,0)
visit_level$Scheduled=ifelse(visit_level$VisitOrigin=='Scheduled',ifelse(visit_level$ScheduleVisitType=="PSS",1,0),0)
visit_level$WalkIn=ifelse(visit_level$VisitOrigin=='Walk-In',1,0)
visit_level$RevenueGenerating=ifelse(visit_level$VisitStatus=='Revenue Generating',1,0)
visit_level$CompleteNonGenerating=ifelse(visit_level$VisitStatus=='Complete Non Revenue',1,0)
visit_level$ReferOutNoCharge=ifelse(visit_level$VisitStatus=='Refer Out No Charge',1,0)
visit_level$Canceled=ifelse(visit_level$VisitStatus=='Canceled',1,0)
visit_level$year=substr(visit_level$AppointmentDate,1,4)
visit_level$day=substr(visit_level$AppointmentDate,5,6)
visit_level$week=week(ymd(visit_level$AppointmentDate))
visit_level$Wait.Time=ifelse(visit_level$Canceled==1,0.01,visit_level$Wait.Time)
#subset to after Q4 of 2017
visit_level=filter(visit_level, AppointmentDate>=20171001)
visit_level$Hour=visit_level$AppointmentTimeOfDay/10000
visit_level=filter(visit_level, Hour>8 & Hour<=20)
visit_level$Hour=round(visit_level$Hour)
head(visit_level)
Loading required package: dplyr

Attaching package: 'dplyr'

The following objects are masked from 'package:stats':

    filter, lag

The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union

Loading required package: lubridate

Attaching package: 'lubridate'

The following object is masked from 'package:base':

    date
AppointmentDateClinicNumberProviderEpicidVisitStatusAppointmentTimeOfDayAppointmentMadeTimeOfDayWait.TimeScheduleVisitTypeVisitOriginScheduleTimeOfDay...ScheduledWalkInRevenueGeneratingCompleteNonGeneratingReferOutNoChargeCanceledyeardayweekHour
20171001 02497 3323 Revenue Generating104000 103714 0.033333 Walk-In 1040 ... 0 1 1 0 0 0 2017 10 40 10
20171001 02497 3323 Revenue Generating121000 120056 0.066666 Walk-In 1210 ... 0 1 1 0 0 0 2017 10 40 12
20171003 07017 1079 Revenue Generating122500 121610 20.150000 Walk-In NA ... 0 1 1 0 0 0 2017 10 40 12
20171005 22063 1398 Revenue Generating113000 112358 0.066666 Walk-In 1130 ... 0 1 1 0 0 0 2017 10 40 11
20171005 22063 1398 Revenue Generating131000 130313 0.066666 Walk-In 1310 ... 0 1 1 0 0 0 2017 10 40 13
20171005 22063 1398 Revenue Generating142000 141007 0.050000 Walk-In 1420 ... 0 1 1 0 0 0 2017 10 40 14
#convert visit level to daily to calc percent visits per hour
hourly=data.frame(visit_level%>%group_by(ClinicNumber,AppointmentDate,Hour)%>%summarise(provider.count=n_distinct(ProviderEpicid),
        Employee_Sum=sum(Employee),Hmpil_Sum=sum(Hmpil),Scheduled_Sum=sum(Scheduled),WalkIn_sum=sum(WalkIn),
        ReferOutNoCharge_Sum=sum(ReferOutNoCharge),Canceled_Sum=sum(Canceled),wait_time_avg=mean(Wait.Time,na.rm=T)))
head(hourly)
ClinicNumberAppointmentDateHourprovider.countEmployee_SumHmpil_SumScheduled_SumWalkIn_sumReferOutNoCharge_SumCanceled_Sumwait_time_avg
00012 20171001 10 1 0 0 0 7 0 0 7.128571
00012 20171001 11 2 0 1 0 2 0 1 5.231111
00012 20171001 12 1 0 0 0 2 0 0 1.424999
00012 20171001 13 1 0 0 0 2 0 0 23.191666
00012 20171001 14 1 0 0 0 5 0 0 40.020000
00012 20171001 15 1 0 0 0 2 0 0 3.666666
hourly$AllVisits=hourly$Employee_Sum+hourly$Hmpil_Sum+hourly$Scheduled_Sum+hourly$WalkIn_sum
head(hourly)
daily=hourly%>%group_by(ClinicNumber, AppointmentDate)%>%summarise(daily_sum=sum(AllVisits))
head(daily)
ClinicNumberAppointmentDateHourprovider.countEmployee_SumHmpil_SumScheduled_SumWalkIn_sumReferOutNoCharge_SumCanceled_Sumwait_time_avgAllVisits
00012 20171001 10 1 0 0 0 7 0 0 7.1285717
00012 20171001 11 2 0 1 0 2 0 1 5.2311113
00012 20171001 12 1 0 0 0 2 0 0 1.4249992
00012 20171001 13 1 0 0 0 2 0 0 23.1916662
00012 20171001 14 1 0 0 0 5 0 0 40.0200005
00012 20171001 15 1 0 0 0 2 0 0 3.6666662
ClinicNumberAppointmentDatedaily_sum
00012 2017100125
00012 2017100240
00012 2017100323
00012 2017100435
00012 2017100534
00012 2017100638
hrly=left_join(select(hourly,ClinicNumber, AppointmentDate,Hour,AllVisits), daily, by=c("ClinicNumber","AppointmentDate"))
hrly=unique(hrly)
hrly$PerVisits=hrly$AllVisits/hrly$daily_sum
head(hrly)
ClinicNumberAppointmentDateHourAllVisitsdaily_sumPerVisits
00012 2017100110 7 25 0.28
00012 2017100111 3 25 0.12
00012 2017100112 2 25 0.08
00012 2017100113 2 25 0.08
00012 2017100114 5 25 0.20
00012 2017100115 2 25 0.08
#check the data
range(hrly$AppointmentDate)
hrly=filter(hrly, AppointmentDate<20180101)
hrly_out=data.frame(hrly%>%group_by(ClinicNumber,Hour)%>%summarise(per_visits=mean(PerVisits, na.rm=T)))
summary(hrly_out)
write.csv(hrly_out, 'C:/MinuteClinic/Dynamic_time_warping/DTW_Input_data/hourly_percent_visits_2018.csv', row.names=F)
  1. 20171001
  2. 20171231
  ClinicNumber        Hour         per_visits     
 00012  :   13   Min.   : 8.00   Min.   :0.00000  
 00015  :   13   1st Qu.:11.00   1st Qu.:0.08849  
 00017  :   13   Median :14.00   Median :0.11101  
 00019  :   13   Mean   :13.74   Mean   :0.10735  
 00033  :   13   3rd Qu.:17.00   3rd Qu.:0.12753  
 00069  :   13   Max.   :20.00   Max.   :1.00000  
 (Other):13567                   NA's   :7        
memory.limit()
rm(list=ls())
gc()

7603

used(Mb)gc trigger(Mb)max used(Mb)
Ncells 634472 33.9 3763080 201.0 4703850 251.3
Vcells3063194 23.4 2833342552161.7 3541490132702.0
require(lubridate)
hrly=read.csv('C:/MinuteClinic/Dynamic_time_warping/DTW_Input_data/hourly_percent_visits_2018.csv')
head(hrly)
hrly=filter(hrly, Hour>=9 & Hour <19)
#install.packages('Hmisc')
Hmisc::describe(hrly)
ClinicNumberHourper_visits
00012 8 0.04038461
00012 9 0.07950237
00012 10 0.10117204
00012 11 0.11112025
00012 12 0.12879019
00012 13 0.06822577
hrly 

 3  Variables      11221  Observations
--------------------------------------------------------------------------------
ClinicNumber 
       n  missing distinct 
   11221        0     1133 

lowest : 00012 00015 00017 00019 00022, highest: 22063 22067 MC26  MC960 MC98 
--------------------------------------------------------------------------------
Hour 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
   11221        0       10     0.99    13.49    3.298        9        9 
     .25      .50      .75      .90      .95 
      11       13       16       17       18 
                                                                      
Value          9    10    11    12    13    14    15    16    17    18
Frequency   1127  1125  1125  1123  1122  1123  1125  1122  1119  1110
Proportion 0.100 0.100 0.100 0.100 0.100 0.100 0.100 0.100 0.100 0.099
--------------------------------------------------------------------------------
per_visits 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
   11214        7    11207        1   0.1179  0.02845  0.07580  0.08813 
     .25      .50      .75      .90      .95 
 0.10193  0.11667  0.13057  0.14605  0.15838 

lowest : 0.03225806 0.03311966 0.03333333 0.03367003 0.03846154
highest: 0.43598821 0.50000000 0.66666667 0.75000000 1.00000000
--------------------------------------------------------------------------------
#check for missing hour
counts=data.frame(table(hrly$ClinicNumber))
#counts
#create base size dataframe
clinic_list=as.character(unique(hrly$ClinicNumber))
df1=data.frame(rep(clinic_list,each=10),rep(9:18,length(clinic_list)))
head(df1)
names(df1)<-c("ClinicNumber","Hour")
df1$ClinicNumber=as.character(df1$ClinicNumber)
hrly$ClinicNumber=as.character(hrly$ClinicNumber)
df2=left_join(df1,hrly, by=c("ClinicNumber", "Hour"))
head(df2)
rep.clinic_list..each...10.rep.9.18..length.clinic_list..
00012 9
0001210
0001211
0001212
0001213
0001214
ClinicNumberHourper_visits
00012 9 0.07950237
00012 10 0.10117204
00012 11 0.11112025
00012 12 0.12879019
00012 13 0.06822577
00012 14 0.08772773
df2[is.na(df2)] <- 0
Hmisc::describe(df2)
df2 

 3  Variables      11330  Observations
--------------------------------------------------------------------------------
ClinicNumber 
       n  missing distinct 
   11330        0     1133 

lowest : 00012 00015 00017 00019 00022, highest: 22063 22067 MC26  MC960 MC98 
--------------------------------------------------------------------------------
Hour 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
   11330        0       10     0.99     13.5      3.3      9.0      9.9 
     .25      .50      .75      .90      .95 
    11.0     13.5     16.0     17.1     18.0 
                                                            
Value         9   10   11   12   13   14   15   16   17   18
Frequency  1133 1133 1133 1133 1133 1133 1133 1133 1133 1133
Proportion  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1  0.1
--------------------------------------------------------------------------------
per_visits 
       n  missing distinct     Info     Mean      Gmd      .05      .10 
   11330        0    11208        1   0.1167  0.03026  0.07315  0.08674 
     .25      .50      .75      .90      .95 
 0.10135  0.11640  0.13044  0.14590  0.15818 

lowest : 0.00000000 0.03225806 0.03311966 0.03333333 0.03367003
highest: 0.43598821 0.50000000 0.66666667 0.75000000 1.00000000
--------------------------------------------------------------------------------
#full clean dataset
write.csv(df2,'C:/MinuteClinic/Dynamic_time_warping/DTW_Input_data/hourly_percent_visits_Q42017_clean.csv', row.names=F)
head(df2)
ClinicNumberHourper_visits
00012 9 0.07950237
00012 10 0.10117204
00012 11 0.11112025
00012 12 0.12879019
00012 13 0.06822577
00012 14 0.08772773
indata=data.frame(matrix(df2$per_visits,nrow = (nrow(df2)/1133),ncol = 1133, byrow = F))
head(indata)
dim(indata)
names(indata)<-clinic_list
head(indata)
write.csv(indata,'C:/MinuteClinic/Dynamic_time_warping/DTW_Input_data/hourly_percent_visits_Q42017_column_wise.csv', row.names=F)
X1X2X3X4X5X6X7X8X9X10...X1124X1125X1126X1127X1128X1129X1130X1131X1132X1133
0.079502370.086609120.091827220.092556090.2100011 0.083244420.097083430.100656260.087149020.10045744... 0.091867850.1050256 0.084689870.4166667 0.2000000 0.1153846 0.062500 0.1910700 0.1944730 0.2063808
0.101172040.119388050.127629000.125888590.2179389 0.114004720.127182150.123865410.123930980.12556233... 0.104108900.1351720 0.107082490.3928571 0.0000000 0.2307692 0.156250 0.2075086 0.1831905 0.2195078
0.111120250.100787310.105671660.109584680.1934753 0.106775150.109409300.120063740.119269790.10829557... 0.108500450.1458302 0.146679710.0000000 0.0000000 0.2692308 0.125000 0.1708053 0.1517759 0.1650625
0.128790190.118065130.127345860.123433450.1695541 0.130911890.120492990.120762180.116674450.10636356... 0.156948420.1386013 0.182250430.0000000 0.6666667 0.0000000 0.171875 0.2303108 0.1668078 0.2484773
0.068225770.057507290.076476900.066558610.2203184 0.067584710.075308500.083955490.073421300.08027248... 0.123467820.1014109 0.096039150.0000000 0.0000000 0.1538462 0.062500 0.2072596 0.1178391 0.1468982
0.087727730.095353180.103852270.105518180.1745796 0.095380820.139933170.150608710.110854500.12323391... 0.153281750.1472172 0.134350420.0000000 0.0000000 0.2307692 0.171875 0.2210729 0.1590159 0.1734248
  1. 10
  2. 1133
00012000150001700019000220003300038000410006900073...17661176631766522052220532206322067MC26MC960MC98
0.079502370.086609120.091827220.092556090.2100011 0.083244420.097083430.100656260.087149020.10045744... 0.091867850.1050256 0.084689870.4166667 0.2000000 0.1153846 0.062500 0.1910700 0.1944730 0.2063808
0.101172040.119388050.127629000.125888590.2179389 0.114004720.127182150.123865410.123930980.12556233... 0.104108900.1351720 0.107082490.3928571 0.0000000 0.2307692 0.156250 0.2075086 0.1831905 0.2195078
0.111120250.100787310.105671660.109584680.1934753 0.106775150.109409300.120063740.119269790.10829557... 0.108500450.1458302 0.146679710.0000000 0.0000000 0.2692308 0.125000 0.1708053 0.1517759 0.1650625
0.128790190.118065130.127345860.123433450.1695541 0.130911890.120492990.120762180.116674450.10636356... 0.156948420.1386013 0.182250430.0000000 0.6666667 0.0000000 0.171875 0.2303108 0.1668078 0.2484773
0.068225770.057507290.076476900.066558610.2203184 0.067584710.075308500.083955490.073421300.08027248... 0.123467820.1014109 0.096039150.0000000 0.0000000 0.1538462 0.062500 0.2072596 0.1178391 0.1468982
0.087727730.095353180.103852270.105518180.1745796 0.095380820.139933170.150608710.110854500.12323391... 0.153281750.1472172 0.134350420.0000000 0.0000000 0.2307692 0.171875 0.2210729 0.1590159 0.1734248
#let's create the dataset as rowwise with hour across the top
rdata=data.frame(matrix(df2$per_visits,nrow =1133,ncol = 10, byrow = F))
head(rdata)
dim(rdata)
X1X2X3X4X5X6X7X8X9X10
0.079502370.139539030.1294059 0.1049971 0.103289180.1154648 0.111438280.128940460.083954570.12655779
0.101172040.065024190.1250171 0.1055075 0.119539880.1106251 0.113874250.129459340.114905770.10256979
0.111120250.105603130.1034505 0.1369112 0.062650980.1230172 0.090433760.126642280.114487910.10633368
0.128790190.117105770.1137967 0.1256704 0.098141310.0944434 0.119075180.074210620.127839140.09441497
0.068225770.118976370.1270601 0.1263911 0.105745670.1090297 0.116213210.097583110.106653600.12550554
0.087727730.106180400.1511323 0.1076241 0.124651330.1031845 0.116865100.111849030.121595270.12267545
  1. 1133
  2. 10
#install.packages('dtw')
require(dtw)
dmatrix=(dist(rdata, method="DTW"))
clusters<-hclust(dmatrix,method="average")
str(clusters)
List of 7
 $ merge      : int [1:1132, 1:2] -286 -116 -138 -373 -93 -141 -703 -951 -186 -421 ...
 $ height     : num [1:1132] 0.0557 0.057 0.0572 0.0608 0.0614 ...
 $ order      : int [1:1133] 1077 270 271 1073 1080 1064 1065 971 520 235 ...
 $ labels     : NULL
 $ method     : chr "average"
 $ call       : language hclust(d = dmatrix, method = "average")
 $ dist.method: chr "DTW"
 - attr(*, "class")= chr "hclust"
dim(dmatrix)
plot(clusters)
  1. 1133
  2. 1133

png

# Cut tree into 10 groups
sub_grp <- cutree(clusters, k = 10)
sub_grp2<- cutree(clusters, k = 2)
sub_grp3 <- cutree(clusters, k = 3)
sub_grp4 <- cutree(clusters, k = 4)
sub_grp5 <- cutree(clusters, k = 5)
sub_grp6<- cutree(clusters, k = 6)
sub_grp7 <- cutree(clusters, k = 7)
sub_grp8 <- cutree(clusters, k = 8)
sub_grp9 <- cutree(clusters, k = 9)
# Number of members in each cluster
table(sub_grp)
table(sub_grp3)
table(sub_grp5)
sub_grp
   1    2    3    4    5    6    7    8    9   10 
1118    3    1    2    1    1    1    1    4    1 



sub_grp3
   1    2    3 
1131    1    1 



sub_grp5
   1    2    3    4    5 
1124    6    1    1    1 
clustdf=rdata %>%
  mutate(cluster10 = sub_grp, cluster2=sub_grp2,cluster3 = sub_grp3, cluster4=sub_grp4,cluster5 = sub_grp5, cluster6=sub_grp6,
        cluster7= sub_grp7, cluster8=sub_grp8,cluster9 = sub_grp9)
clustdf$ClinicNumber<-clinic_list
head(clustdf)
write.csv(clustdf,'C:/MinuteClinic/Dynamic_time_warping/hourly_percent_visits_Q42017_clusters.csv')
#table(clustdf$cluster,clustdf$ClinicNumber)
X1X2X3X4X5X6X7X8X9X10cluster10cluster2cluster3cluster4cluster5cluster6cluster7cluster8cluster9ClinicNumber
0.079502370.139539030.1294059 0.1049971 0.103289180.1154648 0.111438280.128940460.083954570.126557791 1 1 1 1 1 1 1 1 00012
0.101172040.065024190.1250171 0.1055075 0.119539880.1106251 0.113874250.129459340.114905770.102569791 1 1 1 1 1 1 1 1 00015
0.111120250.105603130.1034505 0.1369112 0.062650980.1230172 0.090433760.126642280.114487910.106333681 1 1 1 1 1 1 1 1 00017
0.128790190.117105770.1137967 0.1256704 0.098141310.0944434 0.119075180.074210620.127839140.094414971 1 1 1 1 1 1 1 1 00019
0.068225770.118976370.1270601 0.1263911 0.105745670.1090297 0.116213210.097583110.106653600.125505541 1 1 1 1 1 1 1 1 00022
0.087727730.106180400.1511323 0.1076241 0.124651330.1031845 0.116865100.111849030.121595270.122675451 1 1 1 1 1 1 1 1 00033
#create long data
dflong=left_join(df2,select(clustdf,cluster10,cluster2,cluster3,cluster4,cluster5,cluster6,cluster7,cluster8,
                            cluster9,ClinicNumber), by="ClinicNumber")
head(dflong)
#plot the cluster groups with a few number of clusters by clinic by hour
write.csv(dflong,'C:/MinuteClinic/Dynamic_time_warping/hourly_percent_visits_Q42017_clusters_longformat.csv')
ClinicNumberHourper_visitscluster10cluster2cluster3cluster4cluster5cluster6cluster7cluster8cluster9
00012 9 0.079502371 1 1 1 1 1 1 1 1
00012 10 0.101172041 1 1 1 1 1 1 1 1
00012 11 0.111120251 1 1 1 1 1 1 1 1
00012 12 0.128790191 1 1 1 1 1 1 1 1
00012 13 0.068225771 1 1 1 1 1 1 1 1
00012 14 0.087727731 1 1 1 1 1 1 1 1
ggplot(dflong, aes(x=Hour, y=per_visits)) + 
    geom_line(aes(colour=as.factor(cluster10), group=as.factor(cluster10))) + # colour, group both depend on cond2
    geom_point(aes(colour=as.factor(cluster10)),               # colour depends on cond2
               size=3)+facet_wrap(~cluster10)+theme_bw()

png

#install.packages("factoextra")
#library(factoextra)
fviz_nbclust(rdata, FUN = hcut, method = "wss")
Warning message in stats::dist(x):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"Warning message in stats::dist(x, method = method, ...):
"NAs introduced by coercion"

png

#install.packages('gplots')
#library(gplots) 
y <- matrix(rnorm(500), 100, 5, dimnames=list(paste("g", 1:100, sep=""), paste("t", 1:5, sep=""))) 
#head(y)
#heatmap.2(y)
heatmap.2(as.matrix(rdata))

png

plot(as.dendrogram(clusters), edgePar=list(col=3, lwd=4), horiz=T) 

png

install.packages('')
library(pheatmap); library("RColorBrewer")
pheatmap(clusterdf, color=brewer.pal(9,"Blues"))
Error in library(pheatmap): there is no package called 'pheatmap'
Traceback:


1. library(pheatmap)

2. stop(txt, domain = NA)

About

Dynamic Time Warping model with hierarchical clustering

Topics

Resources

Stars

Watchers

Forks

Releases

No releases published

Packages

No packages published