## Care home identification notebook


This notebook is the latest version (as of 17/08/2024) of the script for identifying care home residents in Connected Bradford, written after some issues with previous scripts were identified. It is the version whose numbers will be used in the final analysis. Having it in a Jupyter notebook format also enables it to be easily uploaded to GitHub and made freely available.

In [None]:
library(bigrquery)
library(tidyverse) 

In [None]:
# BigQuery project passed to bq_project_query() in the cells below.
ProjectId <- "yhcr-prd-bradfor-bia-core"

# Fully qualified dataset name spliced into the SQL text; any spaces are
# stripped so the identifier stays contiguous.
targetdb1 <- gsub(" ", "", "yhcr-prd-bradfor-bia-core.CB_2172", fixed = TRUE)

In [None]:
#first query selects all individuals admitted to a care home at any point no age or date restrictions

In [None]:
# Cohort query. The CTEs do the following:
#   a, b : earliest Nursing/Residential service start per person (care_seq = 1)
#   c, d : latest Nursing/Residential service end date per person
#   e, f : combine both and compute admission_length in days
#   h    : add birth/death dates and gender from `person`, plus
#          observation_period_end_date from `observation_period`
#   g    : add age at admission and keep first admissions before 2019-12-31
#
# Fix: DATE_DIFF(..., YEAR) counts calendar-year boundaries, not completed
# years, so someone admitted before their birthday in the admission year was
# reported one year too old. One year is now subtracted in that case, which
# matters for the age >= 65 split applied later.
#
# NOTE(review): the left join to observation_period presumably returns several
# rows for a person with more than one observation period -- confirm, since
# later steps count rows as well as person_id.
sql1 <- paste0(
  'with a as (
select
  person_id,
  tbl_bmbc_AdultSocialCare_Services_start_date,
  servicetype,
  FinalisedPSR,
  row_number() over (
    partition by person_id
    order by tbl_bmbc_AdultSocialCare_Services_start_date
  ) as care_seq
from ', targetdb1, '.tbl_bmbc_AdultSocialCare_Services
where ServiceType in ("Nursing", "Residential")
),

b as (
select
  *
from a
where care_seq = 1
order by person_id, tbl_bmbc_AdultSocialCare_Services_start_date
),

c as (
select
  person_id,
  tbl_bmbc_AdultSocialCare_Services_end_date,
  servicetype,
  row_number() over (
    partition by person_id
    order by tbl_bmbc_AdultSocialCare_Services_end_date desc
  ) as care_end_seq
from ', targetdb1, '.tbl_bmbc_AdultSocialCare_Services
where ServiceType in ("Nursing", "Residential")
),

d as (
select
  *
from c
where care_end_seq = 1
),

e as (
select
  b.person_id,
  b.tbl_bmbc_AdultSocialCare_Services_start_date as first_episodestartdate,
  d.tbl_bmbc_AdultSocialCare_Services_end_date as last_episodestopdate,
  b.ServiceType as admission_service,
  b.finalisedPSR
from b
left join d on
  b.person_id = d.person_id
order by person_id
),

f as (
select
  e.*,
  date_diff(e.last_episodestopdate, e.first_episodestartdate, day) as admission_length
from e
),

h as (
select
  f.person_id,
  f.admission_length,
  f.admission_service,
  first_episodestartdate,
  f.last_episodestopdate,
  #ethnicity.Ethnic_Group
  EXTRACT(DATE FROM (g.birth_datetime)) as birth_datetime,
  f.first_episodestartdate as date_of_admission,
  EXTRACT(DATE FROM (g.death_datetime)) as death_datetime,
  g.gender_source_value as gender,
  o.observation_period_end_date
from f
left join ', targetdb1, '.person g on
  f.person_id = g.person_id
#LEFT JOIN ', targetdb1, '.care_home_cohort_v1_ethnicity ethnicity on
#f.person_id = ethnicity.person_id
left join ', targetdb1, '.observation_period o on
  f.person_id = o.person_id
),

g as (
select
  # completed years of age: year-boundary count minus one when the birthday
  # has not yet been reached in the admission year
  DATE_DIFF(first_episodestartdate, birth_datetime, YEAR)
    - IF(
        EXTRACT(MONTH FROM first_episodestartdate) * 100
          + EXTRACT(DAY FROM first_episodestartdate)
        < EXTRACT(MONTH FROM birth_datetime) * 100
          + EXTRACT(DAY FROM birth_datetime),
        1,
        0
      ) as age_admission,
  h.*,
from h
where first_episodestartdate < "2019-12-31"
order by age_admission desc
)

select
  *
from g
;'
)

# Run the query in BigQuery and pull the result into the R session.
sql_tbl <- bq_project_query(ProjectId, sql1)
sql_data <- bq_table_download(sql_tbl)


In [None]:
#convert vectors to date

In [None]:
# Store the death date as a Date column called death_date and drop the
# original death_datetime column.
sql_data <- sql_data %>%
  mutate(death_date = as_date(death_datetime)) %>%
  select(-death_datetime)

In [None]:
# Convert the first start / last stop dates to Date columns under shorter
# names, then drop the source columns.
sql_data <- sql_data %>%
  mutate(
    episodestartdate = as_date(first_episodestartdate),
    episodestopdate = as_date(last_episodestopdate)
  ) %>%
  select(-first_episodestartdate, -last_episodestopdate)

In [None]:
# Same treatment for the birth date and the observation period end date:
# add Date versions under new names and drop the originals.
sql_data <- sql_data %>%
  mutate(
    birth_date = as_date(birth_datetime),
    ob_period_end_date = as_date(observation_period_end_date)
  ) %>%
  select(-birth_datetime, -observation_period_end_date)

In [None]:
# Remove those individuals admitted on dates that were identified as being erroneous
# (i.e. these individuals were actually admitted into a care home prior to 2016 but had no start date, so the date of upload was recorded instead, creating some skew).
# This has been confirmed with the individual who built the social care FDM in Connected Bradford.

In [None]:
# Drop every row whose admission date is one of the four known bulk-upload
# dates. Rows with a missing episodestartdate are dropped too, because the
# comparisons evaluate to NA.
sql_data_1 <- sql_data %>%
  filter(
    episodestartdate != "2016-04-04",
    episodestartdate != "2016-07-25",
    episodestartdate != "2016-08-22",
    episodestartdate != "2015-11-02"
  )

In [None]:
# Number of unique residents remaining.
# Fix: n_distinct(df, 'person_id') treats the string as an extra constant
# vector, so it counted distinct rows rather than distinct person_id values.
n_distinct(sql_data_1$person_id)

In [None]:
# Display the cohort after the erroneous admission dates have been removed.
sql_data_1

In [None]:
#how many aged less than 65 

In [None]:
# Residents younger than 65 at admission, and how many unique people that is.
sql_data_below_65 <- filter(sql_data_1, age_admission < 65)
sql_data_below_65 %>%
  pull(person_id) %>%
  n_distinct()

In [None]:
# Residents aged 65 or over at admission.
sql_data_above_sixtyfive <- sql_data_1 %>%
  filter(age_admission >= 65)

# Fix: count unique people, not distinct rows, so this figure is comparable
# with the under-65 count, which is computed on person_id.
n_distinct(sql_data_above_sixtyfive$person_id)


In [None]:
# Here we bring in the mortality data to reduce the number of missing death dates.

In [None]:
# Death-registration lookup for people whose `person` record has no
# death_datetime. Within each person, rows are ranked by dod then reg_date
# (both descending); row_number() then keeps a single row per person if the
# top rank is tied.
sql_death <- paste0('
with a as (SELECT cast(person_id as BIGNUMERIC) person_id,
dod,
tbl_NEC_Deaths,
reg_date,
rank() over(partition by person_id order by dod desc, reg_date desc) as rank_seq
from ', targetdb1, '.tbl_NEC_Deaths
where cast(person_id as BIGNUMERIC) in (SELECT person_id from ', targetdb1, '.person where death_datetime is null)),

b as (
select
*,
row_number() over(partition by person_id order by person_id) as rank_seq_2
from a 
where rank_seq = 1)

select 
person_id, dod,reg_date from b 
where rank_seq_2 = 1 
order by person_id 
;')

# Run the query and download the result.
tbl_death <- bq_project_query(ProjectId, sql_death)
ch_death <- bq_table_download(tbl_death)



In [None]:
# Use the date of death where present, otherwise fall back to the
# registration date.
# Fix: ifelse() strips the Date class and returns bare numbers, which only
# worked because of a later as_date() call and would be silently wrong for
# date-times. coalesce() on values already converted with as_date() keeps
# the result a proper Date.
ch_death <- ch_death %>%
  mutate(date_death = coalesce(as_date(dod), as_date(reg_date)))

In [None]:
# Make sure date_death is a Date and person_id is numeric before joining.
ch_death <- ch_death %>%
  mutate(
    date_death = as_date(date_death),
    person_id = as.numeric(person_id)
  )

In [None]:
## join death certificate tibble to the deathdate in the master cohort table 

In [None]:
# Keep only the join key and the derived death date, then attach them to the
# 65+ cohort (all cohort rows are kept).
ch_death_filter <- select(ch_death, person_id, date_death)
sql_data_above_sixtyfive <- sql_data_above_sixtyfive %>%
  left_join(ch_death_filter, by = "person_id")

In [None]:
# Single death date per row: prefer death_date from the person table and
# fall back to date_death from the death registrations; drop both sources.
sql_data_above_sixtyfive <- sql_data_above_sixtyfive %>%
  mutate(dod = coalesce(death_date, date_death)) %>%
  select(-death_date, -date_death)


In [None]:
# Time from care home admission to death (NA when no death date is recorded).
sql_data_above_sixtyfive <- mutate(
  sql_data_above_sixtyfive,
  mortality = dod - episodestartdate
)

In [None]:
# Keep residents with no recorded death, or whose death came more than
# 42 days after admission.
alive_morethan_6weeks <- sql_data_above_sixtyfive %>%
  filter(is.na(dod) | mortality > 42)

In [None]:
# Fix: count unique residents (person_id) rather than distinct rows, in line
# with how the under-65 group is counted.
n_distinct(alive_morethan_6weeks$person_id)

In [None]:
# Deaths within 42 days of admission were already removed when
# alive_morethan_6weeks was built. This step keeps only residents whose last
# recorded stop date is more than 42 days after admission.
# NOTE(review): rows with a missing episodestopdate are also dropped here,
# because the comparison evaluates to NA -- confirm that is intended.
long_stay_resident <- filter(
  alive_morethan_6weeks,
  episodestopdate > episodestartdate + ddays(42)
)


In [None]:
# Second, of those remaining, we calculate and remove those who had a recorded length of stay of 42 days or less.
# The two figures combined (number who died within 42 days and number discharged alive) give the total number filtered at this stage.

In [None]:
# Fix: count unique residents (person_id) rather than distinct rows, in line
# with how the under-65 group is counted.
n_distinct(long_stay_resident$person_id)

## ghostbusting

In [None]:
# We identified anyone with missing primary care data and removed them;
# this is most likely because, whilst they are receiving social care funding from the Bradford local authority, they are not registered to a GP contributing to Connected Bradford.

First, get rid of anyone who doesn't appear in the primary care dataset.

I then calculate the follow-up period for the care home cohort hypertension analysis. Follow-up is based on the observation period recorded in Connected Bradford for primary care. The observation period is defined as spans of time during which two conditions are expected to hold: (i) Clinical Events that happened to the person are recorded in the Event tables, and (ii) absence of records indicates such Events did not occur during this span of time. A person's follow-up period is defined as the time up until the observation period end date in primary care. This project pulls data from multiple different datasets, i.e. primary, secondary and social care datasets: the follow-up times in these datasets may vary for any given individual; however, I chose primary care on the basis that if someone is registered to a GP in Bradford, any clinical event requiring secondary care would be picked up by the secondary care datasets.

First remove those whose primary care observation period ended before their care home admission date; this was only 19 people.

In [None]:
# Disabled earlier step that dropped residents listed in tbl_missing:
# care_home_cohort_clean_1 <- long_stay_resident %>%
#   filter(!(long_stay_resident$person_id %in% tbl_missing$primary_care_person))

# Follow-up duration: time from care home admission to the end of the
# observation period.
care_home_cohort_clean_1 <- mutate(
  long_stay_resident,
  fu_period = ob_period_end_date - episodestartdate
)

the 'ghosts' are those who had a missing death date 

In [None]:
#send the final care home cohort back to bigquery as a table so it can be ran through the other queries
care_home_cohort_clean_1$mortality <- as.numeric(care_home_cohort_clean_1$mortality)

Add a new variable that distinguishes whether the person died during follow-up, was alive at the end of follow-up (defined as the end of the observation period), or was lost to follow-up, based on follow-up until 1st January 2024.

In [None]:
# Classify each row by status at end of follow-up and store it as a factor.
#   died              : a death date is recorded
#   survived          : no death date, observation period end date present
#   lost to follow up : no death date and no observation period end date
#
# NOTE(review): the `ob_period_end_date > as_date(19724)` part of the last
# rule can never be reached, because every row with a non-missing
# ob_period_end_date and no death date is already labelled "survived".
# Also, as_date(19724) is 2024-01-02, not 1 January 2024. Confirm the
# intended definition of "lost to follow up".
care_home_cohort_clean_2 <- care_home_cohort_clean_1 %>%
  mutate(
    end_of_fu_status = case_when(
      !is.na(dod) ~ "died",
      is.na(dod) & !is.na(ob_period_end_date) ~ "survived",
      is.na(dod) &
        (is.na(ob_period_end_date) | ob_period_end_date > as_date(19724)) ~
        "lost to follow up"
    ),
    end_of_fu_status = as.factor(end_of_fu_status)
  )

In [None]:
# BigQuery doesn't like difftime vectors, so I need to convert them into numeric units
# so I can send the table back to BigQuery.
# I then also create a new vector, follow-up duration: the care home admission date subtracted from either the date of death or the last observation date (if there is no death date),
# to calculate the median follow-up. I also drop some defunct vectors to keep things tidy.

In [None]:
# Replace the difftime columns with plain numbers.
care_home_cohort_clean_2 <- care_home_cohort_clean_2 %>%
  mutate(
    mortality = as.numeric(mortality),
    fu_period = as.numeric(fu_period)
  )

# End of follow-up: the death date if there is one, otherwise the end of the
# observation period.
care_home_cohort_clean_3 <- care_home_cohort_clean_2 %>%
  mutate(fu_end_date = coalesce(dod, ob_period_end_date))

# Drop columns no longer needed and add follow-up duration as a number
# (end of follow-up minus care home admission date).
care_home_cohort_clean_4 <- care_home_cohort_clean_3 %>%
  select(-fu_period, -ob_period_end_date, -admission_length) %>%
  mutate(fu_duration = as.numeric(fu_end_date - episodestartdate))


In [None]:
# For the next section I had to re-run some code after Connected Bradford had moved over to trust servers, but I did not have all the data I wanted available. This meant I had to subset what I did have by the hypertension cohort table
# to do the final analysis.

In [None]:
# person_ids flagged with hypertension = TRUE in the combined hypertension table.
sql_htn <- paste0('
select
person_id
from ', targetdb1, '.cb_2172_hypertension_combined
where hypertension = TRUE
                     ;')

# Run the query and keep the downloaded result in tbl_htn.
tbl_htn <- bq_project_query(ProjectId, sql_htn) %>%
  bq_table_download()

In [None]:
# Restrict the cohort to residents who appear in the hypertension list.
care_home_cohort_htn <- filter(
  care_home_cohort_clean_4,
  person_id %in% tbl_htn$person_id
)

In [None]:
# Tabulate end-of-follow-up status for the hypertension subset.
care_home_cohort_htn$end_of_fu_status %>% table()

In [None]:
# Rows whose follow-up ended before the care home admission date; these
# residents are removed in the next step.
care_home_cohort_htn_missing <- care_home_cohort_htn %>%
  filter(
    !is.na(fu_end_date),
    episodestartdate > fu_end_date
  )

In [None]:
# Remove every resident flagged in care_home_cohort_htn_missing.
care_home_cohort_htn_clean <- filter(
  care_home_cohort_htn,
  !person_id %in% care_home_cohort_htn_missing$person_id
)

In [None]:
#next is to calculate the median follow-up time

In [None]:
# Write the cleaned hypertension cohort to BigQuery: the table is created if
# it does not exist and its contents are replaced if it does.
mybq <- bq_table(
  project = "yhcr-prd-bradfor-bia-core",
  dataset = "CB_2172",
  table = "care_home_cohort_v1"
)
bq_table_upload(
  x = mybq,
  values = care_home_cohort_htn_clean,
  create_disposition = "CREATE_IF_NEEDED",
  write_disposition = "WRITE_TRUNCATE"
)

in this next section we will calculate the follow-up period for the care home residents based on the primary care end observation date 