# 0. Setup

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import Window

from functools import reduce

import databricks.koalas as ks
import pandas as pd
import numpy as np

import re
import io
import datetime

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import seaborn as sns

# Report the plotting-library versions and the run timestamp.
print("Matplotlib version: ", matplotlib.__version__)
print("Seaborn version: ", sns.__version__)
# Kept as a datetime object; the commented-out strftime would turn it into a YYYYMMDD string.
_datetimenow = datetime.datetime.now() # .strftime("%Y%m%d")
print(f"_datetimenow:  {_datetimenow}")

In [0]:
%run "../SHDS/common/functions"

In [0]:
%run "./CCU056-01-parameters"

In [0]:
# Database names. `dsa` is the one every table in this notebook is read from and saved to.
db = "dars_nic_391419_j3w9t"
dbc = db + "_collab"
dsa = "dsa_391419_j3w9t_collab"

# 1. Age, Sex, LSOA

## 1.1 Mid 2020

In [0]:
# Load the mid-2020 population estimates (age / LSOA / sex, per the table name) from the `dsa` database.
population_estimates_lsoa_2020 = spark.table(f'{dsa}.hds_population_estimates_mid_2020_age_lsoa_sex')

In [0]:
display(population_estimates_lsoa_2020)

In [0]:
# Filter for LSOAs in England only - remove Wales
# Keeps rows whose lsoa_code starts with "E"; any other prefix, and null codes, are dropped.
population_estimates_lsoa_2020 = population_estimates_lsoa_2020.filter(f.col("lsoa_code").startswith("E"))

In [0]:
# Derive 5- and 10-year age bands from the single-year `Age` column.
age_numeric_col = f.col("age_numeric")

# Closed bands are chained in ascending order. Anything left unmatched - the
# open-ended top group (coded 101 below) and null ages - falls through to "90+".
age_5_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 4), f"{lo}-{lo + 4}"),
    range(5, 90, 5),
    f.when(age_numeric_col <= 4, "0-4"),
).otherwise("90+")

# The youngest 10-year band must stop at 9 so that age 10 lands in "10-19".
age_10_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 9), f"{lo}-{lo + 9}"),
    range(10, 90, 10),
    f.when(age_numeric_col <= 9, "0-9"),
).otherwise("90+")

population_estimates_lsoa_2020 = (
    population_estimates_lsoa_2020
    # '90+' is not numeric, so map it to a sentinel above every closed band.
    .withColumn("age_numeric", f.expr("CASE WHEN Age = '90+' THEN 101 ELSE CAST(Age AS INT) END"))
    .withColumn("age_5_band", age_5_band_expr)
    .withColumn("age_10_band", age_10_band_expr)
    # The numeric helper column is only needed to build the bands.
    .drop("age_numeric")
)

In [0]:
display(population_estimates_lsoa_2020)

In [0]:
# Write the table to the `dsa` database under the project-prefixed name.
# NOTE(review): `save_table` and `proj` are not defined in this notebook - presumably they come
# from the notebooks pulled in with %run above; confirm what save_previous=False implies.
save_table(df=population_estimates_lsoa_2020, out_name=f'{proj}_population_estimates_mid_2020_age_lsoa_sex', save_previous=False, data_base=dsa)

## 1.2 Mid 2017

**Ages are provided in inconsistent age groups rather than in single years, so we wrangle them into 5- and 10-year age bands.**

Note LSOA names are not included here

In [0]:
# lsoa_name_lookup = spark.table(f'{dsa}.hds_cur_lsoa_region_lookup')
# won't contain older LSOA names

In [0]:
# Load the mid-2017 population estimates (age / LSOA / sex, per the table name) from the `dsa` database.
population_estimates_lsoa_2017 = spark.table(f'{dsa}.hds_population_estimates_mid_2017_age_lsoa_sex')

In [0]:
display(population_estimates_lsoa_2017)

In [0]:
# Filter for LSOAs in England only - remove Wales
# Keeps rows whose lsoa_code starts with "E"; any other prefix, and null codes, are dropped.
population_estimates_lsoa_2017 = population_estimates_lsoa_2017.filter(f.col("lsoa_code").startswith("E"))

In [0]:
# Derive 5- and 10-year age bands from the `age` column.
age_numeric_col = f.col("age_numeric")

# Closed bands are chained in ascending order. Anything left unmatched - the
# open-ended top group (coded 101 below) and null ages - falls through to "90+".
age_5_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 4), f"{lo}-{lo + 4}"),
    range(5, 90, 5),
    f.when(age_numeric_col <= 4, "0-4"),
).otherwise("90+")

# The youngest 10-year band must stop at 9 so that age 10 lands in "10-19".
age_10_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 9), f"{lo}-{lo + 9}"),
    range(10, 90, 10),
    f.when(age_numeric_col <= 9, "0-9"),
).otherwise("90+")

population_estimates_lsoa_2017 = (
    population_estimates_lsoa_2017
    # '90+' is not numeric, so map it to a sentinel above every closed band.
    .withColumn("age_numeric", f.expr("CASE WHEN age = '90+' THEN 101 ELSE CAST(age AS INT) END"))
    .withColumn("age_5_band", age_5_band_expr)
    .withColumn("age_10_band", age_10_band_expr)
    # The numeric helper column is only needed to build the bands.
    .drop("age_numeric")
)

In [0]:
display(population_estimates_lsoa_2017)

In [0]:
# Write the table to the `dsa` database under the project-prefixed name.
# NOTE(review): `save_table` and `proj` are not defined in this notebook - presumably they come
# from the notebooks pulled in with %run above; confirm what save_previous=False implies.
save_table(df=population_estimates_lsoa_2017, out_name=f'{proj}_population_estimates_mid_2017_age_lsoa_sex', save_previous=False, data_base=dsa)

## 1.3 Mid 2011

In [0]:
# Load the mid-2011 population estimates (age / LSOA / sex, per the table name) from the `dsa` database.
population_estimates_lsoa_2011 = spark.table(f'{dsa}.hds_population_estimates_mid_2011_age_lsoa_sex')

In [0]:
display(population_estimates_lsoa_2011)

In [0]:
# Filter for LSOAs in England only - remove Wales
# Keeps rows whose lsoa_code starts with "E"; any other prefix, and null codes, are dropped.
population_estimates_lsoa_2011 = (population_estimates_lsoa_2011.filter(f.col("lsoa_code").startswith("E")))

In [0]:
# Derive 5- and 10-year age bands from the `age` column.
age_numeric_col = f.col("age_numeric")

# Closed bands are chained in ascending order. Anything left unmatched - the
# open-ended top group (coded 101 below) and null ages - falls through to "90+".
age_5_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 4), f"{lo}-{lo + 4}"),
    range(5, 90, 5),
    f.when(age_numeric_col <= 4, "0-4"),
).otherwise("90+")

# The youngest 10-year band must stop at 9 so that age 10 lands in "10-19".
age_10_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 9), f"{lo}-{lo + 9}"),
    range(10, 90, 10),
    f.when(age_numeric_col <= 9, "0-9"),
).otherwise("90+")

population_estimates_lsoa_2011 = (
    population_estimates_lsoa_2011
    # '90+' is not numeric, so map it to a sentinel above every closed band.
    .withColumn("age_numeric", f.expr("CASE WHEN age = '90+' THEN 101 ELSE CAST(age AS INT) END"))
    .withColumn("age_5_band", age_5_band_expr)
    .withColumn("age_10_band", age_10_band_expr)
    # The numeric helper column is only needed to build the bands.
    .drop("age_numeric")
)

In [0]:
# Write the table to the `dsa` database under the project-prefixed name.
# NOTE(review): `save_table` and `proj` are not defined in this notebook - presumably they come
# from the notebooks pulled in with %run above; confirm what save_previous=False implies.
save_table(df=population_estimates_lsoa_2011, out_name=f'{proj}_population_estimates_mid_2011_age_lsoa_sex', save_previous=False, data_base=dsa)

## 1.4 Mid 2002

**Ages are provided in inconsistent age groups rather than in single years, so we wrangle them into 5- and 10-year age bands.**

Note LSOA names are not included here

In [0]:
# Load the mid-2002 population estimates (age / LSOA / sex, per the table name) from the `dsa` database.
population_estimates_lsoa_2002 = spark.table(f'{dsa}.hds_population_estimates_mid_2002_age_lsoa_sex')

In [0]:
display(population_estimates_lsoa_2002)

In [0]:
# Filter for LSOAs in England only - remove Wales
# Keeps rows whose lsoa_code starts with "E"; any other prefix, and null codes, are dropped.
population_estimates_lsoa_2002 = population_estimates_lsoa_2002.filter(f.col("lsoa_code").startswith("E"))

In [0]:
# Derive 5- and 10-year age bands from the `age` column.
age_numeric_col = f.col("age_numeric")

# Closed bands are chained in ascending order. Anything left unmatched - the
# open-ended top group (coded 101 below) and null ages - falls through to "90+".
age_5_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 4), f"{lo}-{lo + 4}"),
    range(5, 90, 5),
    f.when(age_numeric_col <= 4, "0-4"),
).otherwise("90+")

# The youngest 10-year band must stop at 9 so that age 10 lands in "10-19".
age_10_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 9), f"{lo}-{lo + 9}"),
    range(10, 90, 10),
    f.when(age_numeric_col <= 9, "0-9"),
).otherwise("90+")

population_estimates_lsoa_2002 = (
    population_estimates_lsoa_2002
    # '90+' is not numeric, so map it to a sentinel above every closed band.
    .withColumn("age_numeric", f.expr("CASE WHEN age = '90+' THEN 101 ELSE CAST(age AS INT) END"))
    .withColumn("age_5_band", age_5_band_expr)
    .withColumn("age_10_band", age_10_band_expr)
    # The numeric helper column is only needed to build the bands.
    .drop("age_numeric")
)

In [0]:
display(population_estimates_lsoa_2002)

In [0]:
# Write the table to the `dsa` database under the project-prefixed name.
# NOTE(review): `save_table` and `proj` are not defined in this notebook - presumably they come
# from the notebooks pulled in with %run above; confirm what save_previous=False implies.
save_table(df=population_estimates_lsoa_2002, out_name=f'{proj}_population_estimates_mid_2002_age_lsoa_sex', save_previous=False, data_base=dsa)

# 2. Age, Sex, Ethnicity

## 2.1 2021

In [0]:
# Load the 2021 population estimates (ethnicity / age / sex, per the table name) from the `dsa` database.
population_estimates_eth_2021 = spark.table(f'{dsa}.hds_population_estimates_2021_ethnicity_age_sex')

In [0]:
display(population_estimates_eth_2021)

In [0]:
# Split the "<high level>: <detail>" ethnicity label into its two parts and
# replace the "c" marker in `n` with 0.
# NOTE(review): "c" presumably flags a suppressed count - confirm with the data source.
ethnicity_parts = f.split(f.col("ethnicity"), ":")

population_estimates_eth_2021 = (
    population_estimates_eth_2021
    .withColumn("high_level_ethnicity", ethnicity_parts.getItem(0))
    # Left-trim the detail part to remove any whitespace that followed the colon.
    .withColumn("detail_ethnicity", f.ltrim(ethnicity_parts.getItem(1)))
    .withColumn("n", f.when(f.col("n") == "c", 0).otherwise(f.col("n")))
)

In [0]:
# Derive 5- and 10-year age bands from the single-year `Age` column.
age_numeric_col = f.col("age_numeric")

# Closed bands are chained in ascending order. Anything left unmatched - the
# open-ended top group (coded 101 below) and null ages - falls through to "100+".
age_5_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 4), f"{lo}-{lo + 4}"),
    range(5, 100, 5),
    f.when(age_numeric_col <= 4, "0-4"),
).otherwise("100+")

# The youngest 10-year band must stop at 9 so that age 10 lands in "10-19".
age_10_band_expr = reduce(
    lambda chain, lo: chain.when(age_numeric_col.between(lo, lo + 9), f"{lo}-{lo + 9}"),
    range(10, 100, 10),
    f.when(age_numeric_col <= 9, "0-9"),
).otherwise("100+")

# Assign back to population_estimates_eth_2021: this is the variable the
# following cells display and save, so the band columns must live on it.
population_estimates_eth_2021 = (
    population_estimates_eth_2021
    # '100 or over' is not numeric, so map it to a sentinel above every closed band.
    .withColumn("age_numeric", f.expr("CASE WHEN Age = '100 or over' THEN 101 ELSE CAST(Age AS INT) END"))
    .withColumn("age_5_band", age_5_band_expr)
    .withColumn("age_10_band", age_10_band_expr)
    # The numeric helper column is only needed to build the bands.
    .drop("age_numeric")
)

# Old result name, kept as an alias in case anything still refers to it.
population_estimates_eth = population_estimates_eth_2021

In [0]:
display(population_estimates_eth_2021)

In [0]:
# Write population_estimates_eth_2021 to the `dsa` database under the project-prefixed name.
# NOTE(review): `save_table` and `proj` are not defined in this notebook - presumably they come
# from the notebooks pulled in with %run above; confirm what save_previous=False implies.
save_table(df=population_estimates_eth_2021, out_name=f'{proj}_population_estimates_2021_ethnicity_age_sex', save_previous=False, data_base=dsa)

## 2.2 2011

In [0]:
# Load the 2011 population estimates (ethnicity / age / sex, per the table name) from the `dsa` database.
population_estimates_eth_2011 = spark.table(f'{dsa}.hds_population_estimates_2011_ethnicity_age_sex')

In [0]:
# Split the "<high level>: <detail>" ethnicity label into its two parts and
# replace the "c" marker in `n` with 0.
# NOTE(review): "c" presumably flags a suppressed count - confirm with the data source.
ethnicity_parts = f.split(f.col("ethnicity"), ":")

population_estimates_eth_2011 = (
    population_estimates_eth_2011
    .withColumn("high_level_ethnicity", ethnicity_parts.getItem(0))
    # Left-trim the detail part to remove any whitespace that followed the colon.
    .withColumn("detail_ethnicity", f.ltrim(ethnicity_parts.getItem(1)))
    .withColumn("n", f.when(f.col("n") == "c", 0).otherwise(f.col("n")))
)

In [0]:
display(population_estimates_eth_2011)

In [0]:
# Collapse the source age bands into the project's bands, re-aggregate the counts,
# and sort by geography, ethnicity, sex and ascending age band.
source_band = f.col("age_band")
project_band = f.col("age_band_proj")

# Source bands that already match a project band and are carried over unchanged.
unchanged_bands = [
    "0-4", "10-14", "25-29", "30-34", "35-39", "40-44", "45-49",
    "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84",
]

# Project bands in ascending age order; the sort rank is the 1-based position.
band_order = [
    "0-4", "5-9", "10-14", "15-17", "18-24", "25-29", "30-34", "35-39", "40-44",
    "45-49", "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85+",
]
band_rank = f.when(project_band == band_order[0], 1)
for rank, label in enumerate(band_order[1:], start=2):
    band_rank = band_rank.when(project_band == label, rank)

population_estimates_eth_2011 = (
    population_estimates_eth_2011
    # Remove the total rows (presumably so they are not double counted in the sums below).
    .filter(source_band != "All")
    .filter(f.col("detail_ethnicity") != "Total")
    .filter(f.col("ethnicity") != "All categories: Ethnic group")
    .withColumn(
        "age_band_proj",
        f.when(source_band.isin("5-7", "8-9"), "5-9")
        .when(source_band.isin("15", "16-17"), "15-17")
        .when(source_band.isin("18-19", "20-24"), "18-24")
        .when(source_band.isin(unchanged_bands), source_band)
        # Every remaining source band is treated as the open-ended top group.
        .otherwise("85+"),
    )
    .drop("age_band")
    .groupBy("geography_name", "ethnicity", "sex", "high_level_ethnicity", "detail_ethnicity", "age_band_proj")
    .agg(f.sum("n").alias("n"))
    .orderBy("geography_name", "ethnicity", "sex", band_rank)
    .withColumnRenamed("age_band_proj", "age_band")
)

In [0]:
display(population_estimates_eth_2011)

In [0]:
# Write the re-banded table to the `dsa` database under the project-prefixed name.
# NOTE(review): `save_table` and `proj` are not defined in this notebook - presumably they come
# from the notebooks pulled in with %run above; confirm what save_previous=False implies.
save_table(df=population_estimates_eth_2011, out_name=f'{proj}_population_estimates_2011_ethnicity_age_sex', save_previous=False, data_base=dsa)

## 2.3 2001

In [0]:
# Load the 2001 population estimates (ethnicity / age / sex, per the table name) from the `dsa` database.
population_estimates_eth_2001 = spark.table(f'{dsa}.hds_population_estimates_2001_ethnicity_age_sex')

In [0]:
# Split the "<high level>: <detail>" ethnicity label into its two parts and
# replace the "c" marker in `n` with 0.
# NOTE(review): "c" presumably flags a suppressed count - confirm with the data source.
ethnicity_parts = f.split(f.col("ethnicity"), ":")

population_estimates_eth_2001 = (
    population_estimates_eth_2001
    .withColumn("high_level_ethnicity", ethnicity_parts.getItem(0))
    # Left-trim the detail part to remove any whitespace that followed the colon.
    .withColumn("detail_ethnicity", f.ltrim(ethnicity_parts.getItem(1)))
    .withColumn("n", f.when(f.col("n") == "c", 0).otherwise(f.col("n")))
)

In [0]:
# Collapse the source age bands into the project's bands, re-aggregate the counts,
# and sort by geography, ethnicity, sex and ascending age band.
# Not summing by regular 5-year age bands here, as we want the 18-24 band in our project.
source_band = f.col("age_band")
project_band = f.col("age_band_proj")

# Source bands that already match a project band and are carried over unchanged.
unchanged_bands = [
    "0-4", "10-14", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54",
    "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85-89",
]

# Project bands in ascending age order; the sort rank is the 1-based position.
band_order = [
    "0-4", "5-9", "10-14", "15-17", "18-24", "25-29", "30-34", "35-39", "40-44", "45-49",
    "50-54", "55-59", "60-64", "65-69", "70-74", "75-79", "80-84", "85-89", "90+",
]
band_rank = f.when(project_band == band_order[0], 1)
for rank, label in enumerate(band_order[1:], start=2):
    band_rank = band_rank.when(project_band == label, rank)

population_estimates_eth_2001 = (
    population_estimates_eth_2001
    .withColumn(
        "age_band_proj",
        f.when(source_band.isin("5-7", "8-9"), "5-9")
        .when(source_band.isin("15", "16-17"), "15-17")
        .when(source_band.isin("18-19", "20-24"), "18-24")
        .when(source_band.isin(unchanged_bands), source_band)
        # Every remaining source band is treated as the open-ended top group.
        .otherwise("90+"),
    )
    .drop("age_band")
    .groupBy("geography_name", "ethnicity", "sex", "high_level_ethnicity", "detail_ethnicity", "age_band_proj")
    .agg(f.sum("n").alias("n"))
    .orderBy("geography_name", "ethnicity", "sex", band_rank)
    .withColumnRenamed("age_band_proj", "age_band")
)

In [0]:
display(population_estimates_eth_2001)

In [0]:
# Write the re-banded table to the `dsa` database under the project-prefixed name.
# NOTE(review): `save_table` and `proj` are not defined in this notebook - presumably they come
# from the notebooks pulled in with %run above; confirm what save_previous=False implies.
save_table(df=population_estimates_eth_2001, out_name=f'{proj}_population_estimates_2001_ethnicity_age_sex', save_previous=False, data_base=dsa)

In [0]:
display(population_estimates_eth_2001.select("ethnicity").distinct())

# Asian/Asian British: Bangladeshi
# Asian/Asian British: Indian
# Asian/Asian British: Other
# Asian/Asian British: Pakistani
# Black/Black British: Black African
# Black/Black British: Black Caribbean
# Black/Black British: Other
# Chinese/Other: Chinese
# Chinese/Other: Other
# Mixed: Other
# Mixed: White and Asian
# Mixed: White and Black African
# Mixed: White and Black Caribbean
# White: British
# White: Irish
# White: Other

In [0]:
display(population_estimates_eth_2001.select("detail_ethnicity").distinct())