# 0 Setup

In [0]:
# Clear everything currently held in the Spark SQL cache for this session.
spark.sql('CLEAR CACHE')
# Legacy Spark setting: allow a managed table to be created at a location that is not empty.
spark.conf.set('spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation', 'true')

In [0]:
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql import Window

from functools import reduce

import databricks.koalas as ks
import pandas as pd
import numpy as np

import re
import io
import datetime

import matplotlib
import matplotlib.pyplot as plt
from matplotlib import dates as mdates
import seaborn as sns

# Report the plotting-library versions in the cell output.
print("Matplotlib version: ", matplotlib.__version__)
print("Seaborn version: ", sns.__version__)
# Timestamp taken when this cell runs; kept as a datetime object (string formatting is disabled).
_datetimenow = datetime.datetime.now() # .strftime("%Y%m%d")
print(f"_datetimenow:  {_datetimenow}")

In [0]:
%run "../SHDS/common/functions"

In [0]:
%run "./CCU056-01-parameters"

In [0]:
# Load the main cohort table from the {dsa} database.
# NOTE(review): the project prefix is hard-coded as 'ccu056' here, whereas the other tables in this
# notebook are addressed through {proj} - confirm both resolve to the same prefix.
main_cohort_final2 = spark.table(f'{dsa}.ccu056_tmp_main_cohort_final2')

# 1 LSOAs outside England

In [0]:
# Load the unassembled LSOA records (presumably several candidate rows per person - TODO confirm).
all_lsoa_unassembled = spark.table(f'{dsa}.{proj}_tmp_all_cases_lsoa_unassembled')

In [0]:
# Cohort members flagged with LSOA_Outside_England == 1, left-joined to all of their
# unassembled LSOA records so the alternative LSOA values can be inspected.
_flagged_outside = main_cohort_final2.filter(f.col("LSOA_Outside_England") == 1)
outside = _flagged_outside.join(all_lsoa_unassembled, on="PERSON_ID", how="left")

display(outside)

In [0]:
# Among the flagged people, keep only the records whose LSOA code starts with "E"
# (presumably English LSOA codes - TODO confirm). Rows with a null LSOA are dropped too.
_lsoa_starts_with_e = f.col("LSOA").startswith("E")
outside_england_only = outside.filter(_lsoa_starts_with_e)

display(outside_england_only)

In [0]:
# Rank every candidate LSOA record per person so that the best one can be picked later.
#   DATE_DIFF                 : absolute number of days between RECORD_DATE and OPERATION_DATE
#   RECORD_SOURCE_group_final : numeric priority of the record source (lower value sorts first)
#
# NOTE: this rebinds `all_lsoa_unassembled`, which was loaded from the source table earlier in the
# notebook; from here on the name refers to the filtered, ranked frame.
# NOTE(review): a RECORD_SOURCE that is not listed below gets a null priority, and a null
# RECORD_DATE / OPERATION_DATE gives a null DATE_DIFF. Spark's ascending sort puts nulls first,
# so such rows would out-rank every other record - confirm this cannot occur in the data.
all_lsoa_unassembled = (
    outside_england_only
    .withColumn('OPERATION_DATE', f.date_format(f.col("OPERATION_DATE"), "yyyy-MM-dd"))
    .withColumn('DATE_DIFF', f.abs(f.datediff(f.col("RECORD_DATE"), f.col("OPERATION_DATE"))))
    .withColumn('RECORD_SOURCE_group_final',
                f.when(f.col('RECORD_SOURCE') == 'nacsa', 5)
                .when(f.col('RECORD_SOURCE') == 'tavi', 5)
                .when(f.col('RECORD_SOURCE') == 'gdppr', 3)
                .when(f.col('RECORD_SOURCE') == 'gdppr_snomed', 4)
                .when(f.col('RECORD_SOURCE') == 'hes_apc', 1)
                .when(f.col('RECORD_SOURCE') == 'hes_op', 2)
                .when(f.col('RECORD_SOURCE') == 'hes_ae', 2)
                )
)

# define windows for row numbers
_win_rownum_LSOA = (
    Window
    .partitionBy('PERSON_ID')
    # Prioritise DATE_DIFF first, then the data source. The trailing keys only break exact ties:
    # without them row_number() may pick a different row among tied records on every run.
    .orderBy(['DATE_DIFF', 'RECORD_SOURCE_group_final', 'RECORD_DATE', 'RECORD_SOURCE', 'LSOA']))

all_lsoa_unassembled = (
    all_lsoa_unassembled
    .withColumn('_rownum_LSOA', f.row_number().over(_win_rownum_LSOA))
)

display(all_lsoa_unassembled.orderBy("PERSON_ID", "DATE_DIFF"))

In [0]:
varlist = ['LSOA']
record_source = 'RECORD_SOURCE_group_final'

# window used to find each person's top-ranked records, which may be tied
_win_tie = Window.partitionBy('PERSON_ID').orderBy('DATE_DIFF', record_source)


def _tie_flag(var):
    """Return one row per PERSON_ID with `_tie_<var>` set to 1 or 0.

    The flag is 1 when the person's top-ranked records (dense rank 1 over `_win_tie`)
    hold more than one distinct value of `var`, counting null as a value; otherwise 0.
    """
    flag_col = f'_tie_{var}'
    distinct_col = f'_n_distinct_{var}'
    null_col = f'_null_{var}'

    # keep only the records sharing first place for this person
    top_ranked = (
        all_lsoa_unassembled
        .withColumn(flag_col, f.dense_rank().over(_win_tie))
        .where(f.col(flag_col) == 1)
    )

    # number of distinct non-null values, plus 1 if any null is present
    counts = top_ranked.groupBy('PERSON_ID').agg(
        f.countDistinct(f.col(f'{var}')).alias(distinct_col),
        f.countDistinct(f.when(f.col(f'{var}').isNull(), 1)).alias(null_col),
    )

    is_tied = (f.col(distinct_col) + f.col(null_col)) > 1
    return (
        counts
        .withColumn(flag_col, f.when(is_tied, 1).otherwise(0))
        .select('PERSON_ID', flag_col)
    )


# one tie-flag frame per variable, outer-joined together on PERSON_ID
_tmp_ties = reduce(
    lambda joined, nxt: joined.join(nxt, on=['PERSON_ID'], how='outer'),
    [_tie_flag(var_name) for var_name in varlist],
)



# For each variable, keep the record ranked first (_rownum_<var> == 1) and carry its
# record date and source along as _date_<var> and _source_<var>.
_tmp_selected = {}
for var in varlist:
    _tmp_selected[f'{var}'] = (
        all_lsoa_unassembled
        .where(f.col(f'_rownum_{var}') == 1)
        .select(
            'PERSON_ID',
            f'{var}',
            f.col('RECORD_DATE').alias(f'_date_{var}'),
            f.col('RECORD_SOURCE').alias(f'_source_{var}'),
        )
    )

# attach the tie flag to the selected LSOA record
_selected_columns = ['PERSON_ID', 'LSOA', '_date_LSOA', '_source_LSOA', '_tie_LSOA']
_selected = (
    _tmp_selected['LSOA']
    .join(_tmp_ties, on=['PERSON_ID'], how='outer')
    .select(*_selected_columns)
)

In [0]:
# Inspect the selected LSOA per person together with its record date, source and tie flag.
display(_selected)

In [0]:
# Count people per _tie_LSOA value (1 = more than one distinct LSOA among the top-ranked records).
display(_selected.groupBy("_tie_LSOA").count())

In [0]:
# Persist the selected LSOAs via the project's save_table helper (defined outside this notebook).
# NOTE(review): save_previous=False presumably means no backup of an existing table is kept - confirm.
save_table(df=_selected, out_name=f'{proj}_tmp_outside_england_lsoa_selected', save_previous=False, data_base=dsa)

In [0]:
# Number of distinct cohort rows flagged with LSOA_Outside_England == 1, for comparison with the
# number of people for whom an LSOA was selected.
# NOTE(review): distinct() is applied to whole rows, not to PERSON_ID - confirm one row per person.
_flagged_rows = main_cohort_final2.filter(f.col("LSOA_Outside_England") == 1).distinct()
display(_flagged_rows.count())

In [0]:
# Number of rows in the selected-LSOA frame.
display(_selected.count())

# 2 Incorrect DOBs

In [0]:
# Distinct DOB values on or before 1900-01-01 that occur in the cohort.
_dob_on_or_before_1900 = f.col("DOB") <= "1900-01-01"
display(main_cohort_final2.filter(_dob_on_or_before_1900).select("DOB").distinct())

In [0]:
# Full cohort rows for the people whose DOB is on or before 1900-01-01.
_dob_on_or_before_1900 = f.col("DOB") <= "1900-01-01"
display(main_cohort_final2.filter(_dob_on_or_before_1900))

In [0]:
# Load the unassembled records table; it is joined on PERSON_ID below to look for other DOB values.
skinny_unassembled = spark.table(f'{dsa}.{proj}_tmp_all_cases_unassembled')

In [0]:
# Cohort members with a DOB on or before 1900-01-01, with the cohort DOB column dropped and
# left-joined to their unassembled records. Only records whose DOB is after 1801-01-01 are kept;
# because that filter is on the joined DOB, people without a matching record are removed as well.
_early_dob_cohort = main_cohort_final2.filter(f.col("DOB") <= "1900-01-01").drop("DOB")
skinny_dob = (
    _early_dob_cohort
    .join(skinny_unassembled, on="PERSON_ID", how="left")
    .filter(f.col("DOB") > "1801-01-01")
)

display(skinny_dob)

In [0]:
# Same join as the previous cell but without the DOB filter: every unassembled record for the
# cohort members whose DOB is on or before 1900-01-01. This rebinds `skinny_dob`.
_early_dob_cohort = main_cohort_final2.filter(f.col("DOB") <= "1900-01-01").drop("DOB")
skinny_dob = _early_dob_cohort.join(skinny_unassembled, on="PERSON_ID", how="left")

display(skinny_dob)

**Conclusion: None of the people with a DOB of 1800-01-01 or 1801-01-01 has any other recorded DOB within the plausible range.**