This script was created to better identify ethnicities in the care home cohort population, using code from colleagues at ASR (K. Best and L. Gong). It makes use of multiple data sources to identify ethnicity in individuals, resulting in a lower missing rate.

In [2]:
library(ggplot2)
library(bigrquery)
library(tidyverse)
library(lubridate)
library(stats)
library(finalfit)
# BigQuery project id; passed to bq_project_query() when the SQL is submitted.
ProjectId <- "yhcr-prd-bradfor-bia-core"

# Dataset prefixes in "<project>.<dataset>" form, spliced into the SQL text
# in front of table names. They are derived from ProjectId so the project id
# is written in one place only.
# NOTE(review): targetdb2 is not referenced in the visible part of the
# script; kept because later cells may use it.
targetdb2 <- paste0(ProjectId, ".CB_FDM_DeathCertificates")
targetdb3 <- paste0(ProjectId, ".CB_2172")
targetdb4 <- paste0(ProjectId, ".CB_FDM_PrimaryCare")



── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ lubridate 1.9.3     ✔ tibble    3.2.1
✔ purrr     1.0.2     ✔ tidyr     1.3.1
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


Get the ethnicities used in the primary care data person table.

In [32]:
# Build the BigQuery SQL used to derive an ethnicity group for the care home
# cohort. The query text is assembled from the dataset prefixes targetdb3
# (cohort) and targetdb4 (primary care) and does the following:
#   1. ethnicity_prep: srpatient ethnicity values for cohort members, with
#      the listed non-informative labels excluded.
#   2. sr_ethnicity: one ethnicity value per person, chosen with
#      ORDER BY RAND(), so the pick can differ between runs.
#   3. ethnicity_join: cohort left-joined to the person table, the
#      sr_ethnicity pick and the 5-level ethnic group lookup.
#   4. Final SELECT: Ethnic_Group is "White" when the person-table group is
#      "Missing" and the srpatient value is one of the listed White labels,
#      "Missing" when the person-table group is NULL, and otherwise the
#      person-table group unchanged.
# The SQL is wrapped in single quotes, so the SQL text itself (including its
# comments) must not contain a single-quote character.
# Every SQL comment line carries its own "#": BigQuery "#" comments end at
# the line break, so a wrapped comment line without a marker is parsed as SQL.
sql1 <- paste0(
  '
# Remove the missing ethnicities from the srpatient table
WITH ethnicity_prep AS (
  SELECT
    person_id,
    ethnicity
  FROM ', targetdb4, '.tbl_srpatient
  WHERE ethnicity NOT IN (
      "Ethnic category - 2011 census England and Wales",
      "O/E - ethnic group NOS",
      "O/E - ethnic group",
      "Ethnic groups (census) NOS",
      "Not specified - JDB"
    )
    AND person_id IN (SELECT person_id FROM ', targetdb3, '.care_home_cohort_v1)
),

# Select randomly from the non-missing ethnicity values for each person
sr_ethnicity AS (
  SELECT
    person_id,
    ethnicity
  FROM (
    SELECT
      ROW_NUMBER() OVER (PARTITION BY person_id ORDER BY RAND()) AS row_num,
      person_id,
      ethnicity
    FROM ethnicity_prep
  ) AS T
  WHERE row_num = 1
),

# Join together the different sources of recorded ethnicity: the concept_id
# in the person table, the ethnicity from the srpatient table, and the
# 5-level ethnic group from the Lin_clean_ethnic_1220 table (taken from the
# census groups)
ethnicity_join AS (
  SELECT
    a.person_id,
    a.gender_source_value AS gender,
    a.age_admission,
    b.ethnicity_concept_id,
    b.ethnicity_source_value,
    d.Ethnicity AS Ethnicity_SR_Table,
    f.ethnicity_variable_label_5levels AS _Ethnic_Group_Person_Table
  FROM ', targetdb3, '.care_home_cohort_v1 AS a
  LEFT JOIN ', targetdb4, '.person AS b
    ON a.person_id = b.person_id
  LEFT JOIN sr_ethnicity AS d
    ON a.person_id = d.person_id
  # NOTE(review): no column of e is selected; join kept so row counts are
  # unchanged - confirm whether it can be dropped
  LEFT JOIN `yhcr-prd-bradfor-bia-core.CB_CDM_VOCAB.concept` AS e
    ON b.ethnicity_concept_id = e.concept_id
  LEFT JOIN `yhcr-prd-bradfor-bia-core.CB_1322.Lin_clean_ethnic_1220` AS f
    ON b.ethnicity_concept_id = f.concept_id
),

# Identify all the srpatient ethnicity codes that are filled in when the
# person table group is missing.
# NOTE(review): this CTE is not referenced by the final SELECT; it documents
# how the label list in the CASE expression below was found
ethnicity_missing_codes AS (
  SELECT DISTINCT
    _Ethnic_Group_Person_Table,
    Ethnicity_SR_Table
  FROM ethnicity_join
  WHERE _Ethnic_Group_Person_Table = "Missing"
)

# Recode these ethnicity codes into the five high level ethnicity groups
SELECT
  person_id,
  Gender,
  age_admission,
  ethnicity_concept_id,
  _Ethnic_Group_Person_Table,
  Ethnicity_SR_Table,
  CASE
    WHEN _Ethnic_Group_Person_Table = "Missing"
      AND Ethnicity_SR_Table IN (
        "White British",
        "British or mixed British - ethnic category 2001 census",
        "White: Irish - England and Wales ethnic category 2011 census",
        "Other white ethnic group",
        "Other White or White unspecified ethnic category 2001 census",
        "White British - ethnic category 2001 census",
        "Italian - ethnic category 2001 census",
        "White Irish"
      ) THEN "White"
    WHEN _Ethnic_Group_Person_Table IS NULL THEN "Missing"
    ELSE _Ethnic_Group_Person_Table
  END AS Ethnic_Group
FROM ethnicity_join

;'
)



# Submit the SQL held in sql1 under project ProjectId; the returned object
# refers to the table holding the query result.
ethnicity_tbl <- bq_project_query(x = ProjectId, query = sql1)

# Download that result table into the R session.
ethnicity_data <- bq_table_download(x = ethnicity_tbl)

In [33]:
# Reference to the destination table for the derived ethnicity data.
mybq <- bq_table(
  project = "yhcr-prd-bradfor-bia-core",
  dataset = "CB_2172",
  table = "care_home_cohort_v1_ethnicity"
)

# Upload ethnicity_data to that table. The dispositions ask BigQuery to
# create the table if it does not exist (CREATE_IF_NEEDED) and to replace
# any existing contents (WRITE_TRUNCATE).
bq_table_upload(
  x = mybq,
  values = ethnicity_data,
  create_disposition = "CREATE_IF_NEEDED",
  write_disposition = "WRITE_TRUNCATE"
)