# CiC Ethnicity Validation 
This script obtains ethnicity information from the master dataset and compares it with the ethnicity information in the CiC dataset. Where ethnicity is available in the CiC dataset but not in the master dataset, the CiC value is used as the indicator of ethnic origin.

In [None]:
# Load libraries
# dplyr: joins, filter/mutate verbs and the %>% pipe used in the cells below
library(dplyr)
# NOTE(review): `here` is not referenced in the visible cells - confirm it is still needed
library(here)
# ggplot2: used for the mismatch heatmap
library(ggplot2)
# bigrquery: BigQuery client (bq_project_query, bq_table_download, bq_table_upload)
library(bigrquery)
# Authorise this session with BigQuery before any query is submitted
bq_auth()

In [None]:
# BigQuery project identifier
project_id <- "yhcr-prd-phm-bia-core"

# Fully qualified datasets of interest. Any spaces are stripped from the
# identifier, and the cleaned value is echoed so it can be checked by eye.
targetdb1 <- gsub(
  " ", "",
  "yhcr-prd-phm-bia-core.CB_FDM_ChildrensSocialCare",
  fixed = TRUE
)
print(targetdb1)

targetdb2 <- gsub(
  " ", "",
  "yhcr-prd-phm-bia-core.CB_FDM_MASTER",
  fixed = TRUE
)
print(targetdb2)


In [None]:
# Build the SQL text.
# The CTE reduces the CiC table to distinct (person_id, EthnicOrigin) pairs;
# these are inner-joined to the master person table on person_id to pick up
# ethnicity_source_value. The result is capped at 4000 rows.
sql1 <- paste(
  c(
    "",
    "WITH distinct_pi AS (",
    "  SELECT DISTINCT person_id, EthnicOrigin",
    paste0("  FROM ", targetdb1, ".tbl_CiC"),
    ")",
    "SELECT distinct a.person_id, a.ethnicity_source_value, b.EthnicOrigin",
    "FROM distinct_pi b",
    paste0("JOIN ", targetdb2, ".person a ON b.person_id = a.person_id"),
    "LIMIT 4000;",
    ""
  ),
  collapse = "\n"
)

# Submit the query under the project stored in project_id
tb3 <- bq_project_query(project_id, sql1)

# Pull the query result into an R data frame.
# The object name `table` is the same as base R's table() function; it is
# kept because the later cells refer to it by this name.
table <- bq_table_download(tb3)

# Display the downloaded rows
table


#### Create mapping for Broad and specific Ethnic Categories

In [None]:
# Distinct ethnicity labels found in the master data; these are the values
# the `mapping` lookup is keyed on
ethnicity_source_value <- unique(table[["ethnicity_source_value"]])
ethnicity_source_value

In [None]:
# Lookup from the master dataset's ethnicity_source_value to a broad and a
# specific ethnic category. Each row below reads:
#   source label, broad category, specific category
mapping <- local({
  lookup_rows <- rbind(
    c("Asian or Asian British: any other Asian background - England and Wales ethnic category 2011 census",
      "Asian/Asian British", "Other Asian background"),
    c("Other ethnic group: any other ethnic group - England and Wales ethnic category 2011 census",
      "Other ethnic group", "Other ethnic group"),
    c("White: English or Welsh or Scottish or Northern Irish or British - England and Wales ethnic category 2011 census",
      "White", "English/Welsh/Scottish/Northern Irish/British"),
    c("Asian or Asian British: Pakistani - England and Wales ethnic category 2011 census",
      "Asian/Asian British", "Pakistani"),
    c("Unknown/Refuse to say",
      "Unknown/Refuse to say", "Unknown/Refuse to say"),
    c("Mixed multiple ethnic groups: White and Asian - England and Wales ethnic category 2011 census",
      "Mixed ethnic group", "White and Asian"),
    c("Black or African or Caribbean or Black British: African - England and Wales ethnic category 2011 census",
      "Black/African/Caribbean/Black British", "African"),
    c("Mixed multiple ethnic groups: any other Mixed or multiple ethnic background - England and Wales ethnic category 2011 census",
      "Mixed ethnic group", "Other mixed ethnic group"),
    c("Black or African or Caribbean or Black British: other Black or African or Caribbean background - England and Wales ethnic category 2011 census",
      "Black/African/Caribbean/Black British", "other Black/African/Caribbean"),
    c("White:Any other White background",
      "White", "Other White"),
    c("Other ethnic group: Arab - England and Wales ethnic category 2011 census",
      "Other ethnic group", "Arab"),
    c("Asian or Asian British: Indian - England and Wales ethnic category 2011 census",
      "Asian/Asian British", "Indian"),
    c("Mixed multiple ethnic groups: White and Black African - England and Wales ethnic category 2011 census",
      "Mixed ethnic group", "White and Black African"),
    c("Mixed multiple ethnic groups: White and Black Caribbean - England and Wales ethnic category 2011 census",
      "Mixed ethnic group", "White and Black Caribbean"),
    c("White: Irish - England and Wales ethnic category 2011 census",
      "White", "Irish"),
    c("Black or African or Caribbean or Black British: Caribbean - England and Wales ethnic category 2011 census",
      "Black/African/Caribbean/Black British", "Caribbean"),
    c("White: Gypsy or Irish Traveller - England and Wales ethnic category 2011 census",
      "White", "Gypsy/Irish Traveller"),
    c("Asian or Asian British: Bangladeshi - England and Wales ethnic category 2011 census",
      "Asian/Asian British", "Bangladeshi"),
    c("Asian or Asian British: Chinese - England and Wales ethnic category 2011 census",
      "Asian/Asian British", "Chinese")
  )

  data.frame(
    ethnicity_source_value = lookup_rows[, 1],
    BroadEthnicCategory = lookup_rows[, 2],
    SpecificEthnicCategory = lookup_rows[, 3]
  )
})

In [None]:
# Attach the broad/specific categories to every downloaded row, matching on
# the master label. Rows whose label is not in `mapping` are kept, with NA
# in the two category columns.
mapped_data <- left_join(table, mapping, by = "ethnicity_source_value")
head(mapped_data)

In [None]:
# Distinct EthnicOrigin labels found in the CiC data; these are the values
# the `mapping2` lookup is keyed on
ethnicorigin <- unique(table[["EthnicOrigin"]])
ethnicorigin

In [None]:
# Lookup from the CiC dataset's EthnicOrigin label to the same broad and
# specific categories used for the master data. Each row below reads:
#   CiC label, broad category, specific category
#
# Every EthnicOrigin key must appear exactly once: the lookup is applied with
# a left join, so a repeated key duplicates each matching person row.
# The blank label "" previously appeared twice (paired with "Arab" and with
# "Irish"); a blank carries no ethnicity information, so it is now a single
# row mapped to "Unknown/Refuse to say".
mapping2 <- local({
  lookup_rows <- rbind(
    c("Asian/British Asian - Other",
      "Asian/Asian British", "Other Asian background"),
    c("Other Ethnic Group",
      "Other ethnic group", "Other ethnic group"),
    # NOTE(review): Gypsy/Roma is placed under "Other ethnic group" while
    # "Traveller of Irish Heritage" is placed under "White" - confirm this
    # split is intended.
    c("Gypsy/Roma",
      "Other ethnic group", "Other ethnic group"),
    c("White - British",
      "White", "English/Welsh/Scottish/Northern Irish/British"),
    c("Asian/British Asian - Pakistani",
      "Asian/Asian British", "Pakistani"),
    c("Information Not Yet Obtained",
      "Unknown/Refuse to say", "Unknown/Refuse to say"),
    c("",
      "Unknown/Refuse to say", "Unknown/Refuse to say"),
    c("Mixed - White/Asian",
      "Mixed ethnic group", "White and Asian"),
    c("Black/Black British - African",
      "Black/African/Caribbean/Black British", "African"),
    c("Mixed - Other",
      "Mixed ethnic group", "Other mixed ethnic group"),
    c("Black - Other",
      "Black/African/Caribbean/Black British", "other Black/African/Caribbean"),
    c("White - Other",
      "White", "Other White"),
    c("White - Eastern European",
      "White", "Other White"),
    c("Asian/British Asian - Indian",
      "Asian/Asian British", "Indian"),
    c("Mixed - White/Black African",
      "Mixed ethnic group", "White and Black African"),
    c("Mixed - White/Black Caribbean",
      "Mixed ethnic group", "White and Black Caribbean"),
    c("Black/Black British - Caribbean",
      "Black/African/Caribbean/Black British", "Caribbean"),
    c("Traveller of Irish Heritage",
      "White", "Gypsy/Irish Traveller"),
    c("Asian/British Asian - Bangladeshi",
      "Asian/Asian British", "Bangladeshi"),
    c("Asian/British Asian - Chinese",
      "Asian/Asian British", "Chinese")
  )

  # Guard against a repeated key being reintroduced when the table is edited
  stopifnot(anyDuplicated(lookup_rows[, 1]) == 0L)

  data.frame(
    EthnicOrigin = lookup_rows[, 1],
    BroadEthnicCategory_CiC = lookup_rows[, 2],
    SpecificEthnicCategory_CiC = lookup_rows[, 3]
  )
})

In [None]:
# Display the CiC lookup so each label's category pairing can be checked by eye
mapping2

In [None]:
# Attach the CiC-derived categories (the *_CiC columns), matching on the CiC
# label. Rows whose EthnicOrigin is not in `mapping2` are kept, with NA in
# the two *_CiC columns.
mapped_data <- left_join(mapped_data, mapping2, by = "EthnicOrigin")
head(mapped_data)

In [None]:
# Rows where the master and CiC broad categories agree. Rows with NA on
# either side are dropped by filter(), so they never count as a match.
matched <- filter(mapped_data, BroadEthnicCategory == BroadEthnicCategory_CiC)
nrow(matched)

617 out of 817 person_ids matched on ethnicity across both datasets (75.52%). However, this includes cases where ethnicity may not be available in either dataset.

In [None]:
# Keep only rows where both sources hold a usable broad category: neither
# side is "Unknown/Refuse to say", and (because filter() drops NA results)
# neither side is NA.
matched_no_na <- filter(
  mapped_data,
  BroadEthnicCategory != "Unknown/Refuse to say",
  BroadEthnicCategory_CiC != "Unknown/Refuse to say"
)
nrow(matched_no_na)

# Of those rows, the ones where the two sources agree
matched2 <- filter(matched_no_na, BroadEthnicCategory == BroadEthnicCategory_CiC)
nrow(matched2)

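After removing rows where ethnicity is unknown or missing in either dataset, 613 out of the 738 remaining person_ids matched (78.28%). (Note: 613/738 works out at 83.06%, so either the counts or the percentage should be re-checked against the cell output.)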

In [None]:
# Rows with a usable category in both sources where the two disagree
not_matched <- filter(matched_no_na, BroadEthnicCategory != BroadEthnicCategory_CiC)
nrow(not_matched)

In [None]:
# Tally the disagreements: one row per (master category, CiC category) pair,
# with the number of rows in that pair stored in `count`
mismatch_summary <- count(
  not_matched,
  BroadEthnicCategory,
  BroadEthnicCategory_CiC,
  name = "count"
)

print(mismatch_summary)

In [None]:
# Heatmap of the mismatch tally: master category on the x axis, CiC category
# on the y axis, tile fill running from white (low count) to red (high count)
ggplot(
  data = mismatch_summary,
  mapping = aes(
    x = BroadEthnicCategory,
    y = BroadEthnicCategory_CiC,
    fill = count
  )
) +
  geom_tile(colour = "white") +
  scale_fill_gradient(low = "white", high = "red") +
  theme_minimal() +
  # Slant the x axis labels so the long category names do not overlap
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(
    title = "Heatmap of Ethnic Category Mismatches",
    x = "Broad Ethnic Category",
    y = "Broad Ethnic Category (CiC)",
    fill = "Count"
  )

Common discrepancies occur where the master dataset says White but CiC states Mixed or Other ethnic group. There are also some discrepancies where the master dataset says Asian but CiC states Mixed or Other ethnic group.

In [None]:
# Rows where the master category is "Unknown/Refuse to say" but CiC holds a
# category other than "Unknown/Refuse to say"
check <- filter(
  mapped_data,
  BroadEthnicCategory == "Unknown/Refuse to say",
  BroadEthnicCategory_CiC != "Unknown/Refuse to say"
)
nrow(check)


In [None]:
# Merge CiC where master data is missing.
#
# The master category counts as missing when it is "Unknown/Refuse to say"
# OR when it is NA (no master label, or a label absent from the lookup).
# Testing only `== "Unknown/Refuse to say"` yields NA for NA rows, so the CiC
# value would never be used for them.
#
# When the master value is missing the CiC value is used; if CiC is NA as
# well, the master value is kept, so a known "Unknown/Refuse to say" is not
# turned into NA.
unknown_label <- "Unknown/Refuse to say"

mapped_data <- mapped_data %>%
  mutate(
    BroadEthnicCategory_merge = if_else(
      is.na(BroadEthnicCategory) | BroadEthnicCategory == unknown_label,
      coalesce(BroadEthnicCategory_CiC, BroadEthnicCategory),
      BroadEthnicCategory
    ),
    SpecificEthnicCategory_merge = if_else(
      is.na(SpecificEthnicCategory) | SpecificEthnicCategory == unknown_label,
      coalesce(SpecificEthnicCategory_CiC, SpecificEthnicCategory),
      SpecificEthnicCategory
    )
  )
head(mapped_data)

#### Upload table to Project workspace

In [None]:
# Column specification for the uploaded table: each entry is a
# (column name, BigQuery type) pair. person_id is INTEGER; every other
# column is STRING.
schema <- local({
  string_columns <- c(
    "ethnicity_source_value",
    "EthnicOrigin",
    "BroadEthnicCategory",
    "SpecificEthnicCategory",
    "BroadEthnicCategory_CiC",
    "SpecificEthnicCategory_CiC",
    "BroadEthnicCategory_merge",
    "SpecificEthnicCategory_merge"
  )
  c(
    list(list("person_id", "INTEGER")),
    lapply(string_columns, function(column_name) list(column_name, "STRING"))
  )
})

# Reverse the column order before uploading.
# NOTE(review): `schema` lists the columns in their original order while the
# uploaded data frame is reversed - confirm the two are meant to differ.
reverse <- mapped_data[, rev(names(mapped_data))]

# Write the result to the project workspace table.
# NOTE(review): `schema` is not a named argument of bq_table_upload(), so it
# is passed through `...`; bigrquery documents `fields` as the argument for a
# column specification - confirm that `schema` is actually honoured here.
bq_table_upload("yhcr-prd-phm-bia-core.CB_2353.CiC_ethnicity", reverse, schema = schema)