In [28]:
from pathlib import Path

import pandas as pd
from collections import defaultdict

pd.set_option('display.max_colwidth', None)
pd.set_option("display.max_rows", 1000)
pd.set_option('display.width', None)

In [4]:
dir = "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/output/logs"

In [5]:
files = list(Path(dir).glob("*.log"))
len(files)

220

In [6]:
data = defaultdict(list)
for file in files:
    with open(file, "r", encoding="utf-8") as fstream:
        for line in fstream.read().splitlines():
            error = None
            warn = None
            if "ERROR" in line:
                error = line.split("\t")[-1]
                
            if "WARN" in line:
                warn = line.split("\t")[-1]
            
            if error or warn: 
                data["file"].append(file.name)
                data["error"].append(error)
                data["warning"].append(warn)

In [7]:
df = pd.DataFrame(data)

In [8]:
df.head()

Unnamed: 0,file,error,warning
0,2022_Children's Hospital 2 A4D Tracker_patient.log,,Non-matching column names found: meterissued.
1,2022_Children's Hospital 2 A4D Tracker_patient.log,,Non-matching column names found: instantmeterreceived.
2,2022_Children's Hospital 2 A4D Tracker_patient.log,,Non-matching column names found: instantmeterreceived.
3,2022_Children's Hospital 2 A4D Tracker_patient.log,,Non-matching column names found: instantmeterreceived.
4,2022_Children's Hospital 2 A4D Tracker_patient.log,,Non-matching column names found: instantmeterreceived.


In [9]:
df.to_csv(Path(dir) / "error_report.csv", index=False)

In [10]:
df.file.nunique()

145

In [11]:
df.file.unique()

array(["2022_Children's Hospital 2 A4D Tracker_patient.log",
       '2020_Penang General Hospital A4D Tracker_DC_patient_raw.log',
       '2019_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log',
       '2020_Mahosot Hospital A4D Tracker_DC_product_raw.log',
       "2020_Vietnam National Children's Hospital A4D Tracker_product.log",
       '2021_Putrajaya Hospital A4D Tracker_DC_patient.log',
       '2022_Vietnam National Children_s Hospital A4D Tracker_product_raw.log',
       '2019_Sarawak General Hospital A4D Tracker_DC_patient_raw.log',
       '2022_Siriraj Hospital A4D Tracker_product.log',
       '2022_Sultanah Bahiyah A4D Tracker_DC_patient_raw.log',
       '2022_Preah Kossamak Hospital A4D Tracker_product.log',
       '2021_Vietnam National Children_s Hospital A4D Tracker_patient_raw.log',
       '2022_Penang General Hospital A4D Tracker_DC_patient.log',
       '2022_Sultanah Bahiyah A4D Tracker_DC_product.log',
       '2018_Mahosot Hospital A4D Tracker_product.log',
       

## Error

In [12]:
df[~df.error.isna()]

Unnamed: 0,file,error,warning
996,2022_Penang General Hospital A4D Tracker_DC_patient.log,Could not process patient data. Error = Error in if (header_cols[2] == header_cols_2[2]) {: missing value where TRUE/FALSE needed .,
1794,2022_Sunprasitthiprasong Hospital A4D Tracker_product.log,Could not process product data. Error = Error in start_df_msd:end_df_msd: argument of length 0 .,
1795,2020_Sarawak General Hospital A4D Tracker_DC_product_raw.log,Could not process raw product data. Error = Error in `dplyr::mutate()`: ℹ In argument: `product_received_from = case_when(...)`. Caused by error in `case_when()`: ! Can't combine `..1 (right)` <double> and `..2 (right)` <character>. .,
10723,2021_Children's Hospital 2 A4D Tracker_patient_raw.log,"Could not process raw patient data. Error = Error in `mutate()`: ℹ In argument: `t1d_diagnosis_age = fix_t1d_diagnosis_age(t1d_diagnosis_age, id)`. ℹ In row 1. Caused by error in `case_when()` at R/helper_patient_data_fix.R:319:4: ! Can't combine `..1 (right)` <character> and `..3 (right)` <integer>. .",
11586,2021_Uni Med Center A4D Tracker_patient_raw.log,"Could not process raw patient data. Error = Error in `mutate()`: ℹ In argument: `t1d_diagnosis_age = fix_t1d_diagnosis_age(t1d_diagnosis_age, id)`. ℹ In row 1. Caused by error in `case_when()` at R/helper_patient_data_fix.R:319:4: ! Can't combine `..1 (right)` <character> and `..3 (right)` <integer>. .",
27097,2017_Vietnam National Children's Hospital A4D Tracker_product.log,"Could not process product data. Error = Error in `filter()`: ℹ In argument: `!is.na(product_entry_date) & !grepl(""^[0-9]+$"", product_entry_date)`. Caused by error: ! object 'product_entry_date' not found .",
27211,2017_Mahosot Hospital A4D Tracker_product_raw.log,Could not process raw product data. Error = Error in `[<-` at R/helper_product_data.R:483:12: ! Assigned data `value` must be compatible with existing data. ℹ Error occurred for column `product_units_released`. Caused by error in `vec_assign()`: ! Can't convert <double> to <character>. .,


In [13]:
# how many files could not be processed in %?
len(df[~df.error.isna()]) / len(files) * 100
                                                        

3.1818181818181817

### Error analysis

#### Missing or invalid patient id

In [14]:
# missing patient id
df[(~df.error.isna()) & (df.error.str.contains("id is not valid"))].drop_duplicates()

Unnamed: 0,file,error,warning


#### missing month

In [15]:
# month list is empty?
print("\n".join(l.split(".")[0] for l in df[(~df.error.isna()) & (df.error.str.contains("month_list"))].file.tolist()))




#### missing patient data

In [16]:
# cannot find patient data
for year in range(2017,2023):
    subdf = df[(~df.error.isna()) & (df.error.str.contains("readxl::cell_limits"))]
    print(subdf[subdf.file.str.startswith(str(year))].file.tolist())

[]
[]
[]
[]
[]
[]


In [19]:
# check problem with num_na_rows
subdf = df[(~df.error.isna()) & (df.error.str.contains("num_na_rows"))]
subdf

Unnamed: 0,file,error,warning


In [18]:
# check start_df_msd:end_df_msd
subdf = df[(~df.error.isna()) & (df.error.str.contains("start_df_msd"))]
subdf.file.tolist()

['2022_Sunprasitthiprasong Hospital A4D Tracker_product.log']

# Warnings

In [20]:
df[~df.warning.isna()].drop_duplicates()


Unnamed: 0,file,error,warning
0,2022_Children's Hospital 2 A4D Tracker_patient.log,,Non-matching column names found: meterissued.
1,2022_Children's Hospital 2 A4D Tracker_patient.log,,Non-matching column names found: instantmeterreceived.
12,2020_Penang General Hospital A4D Tracker_DC_patient_raw.log,,Found invalid values for column blood_pressure_mmhg that do not follow the format X/Y. Values were replaced with 999999.
13,2020_Penang General Hospital A4D Tracker_DC_patient_raw.log,,"Extra columns in patient data: admissiontohospitaldatemmmyy, observations1, admissiontohospitaldatemmyy"
14,2020_Penang General Hospital A4D Tracker_DC_patient_raw.log,,"Missing columns in patient data: clinic_code, country_code, fbg_baseline_mg, fbg_updated_mg, hospitalisation_cause, hospitalisation_date, last_remote_followup_date, lost_date, observations, observations_category, status_out, t1d_diagnosis_date, t1d_diagnosis_with_dka, updated_2022_date"
...,...,...,...
27441,2022_CDA A4D Tracker_patient_raw.log,,"Patient KH_CD015: Value Self-mixed BD,Basal-bolus (AN) for column insulin_regimen is not in the list of allowed values."
27442,2022_CDA A4D Tracker_patient_raw.log,,"Patient KH_CD015: Value Basal-bolus MDI (AN/HI),Basal-bolus (AN) for column insulin_regimen is not in the list of allowed values."
27446,2022_CDA A4D Tracker_patient_raw.log,,"Patient KH_CD015: Value Premixed 30/70 BD,Basal-bolus (AN) for column insulin_regimen is not in the list of allowed values."
27453,2022_Children's Hospital 2 A4D Tracker_product.log,,"Jul'22 number of dates in product_entry_date that don't match the month/year on the sheet is 4: 2020-07-27, 2020-07-29, 2020-07-27, 2020-07-29"


### Warning analysis

#### extra columns

In [21]:
missed_names = df[(~df.warning.isna()) & (df.warning.str.contains("Extra"))].warning.str.strip("Extra columns in patient data:").drop_duplicates().to_list()

names = set()
for x in missed_names:
    names.update(x for x in x.split(",") if x)
    
names

{' activeremoteupdatedate',
 ' admissiontohospitaldatemmmyy',
 ' admissiontohospitaldatemmyy',
 ' complication_screening',
 ' complication_screening_',
 ' complication_screening_date',
 ' complication_screening_results',
 ' est_strips_pmoth',
 ' family_support_',
 ' family_support_scale',
 ' ge100',
 ' hospitalisation_date1',
 ' instantmeterreceiv',
 ' instantmeterreceiveddate',
 ' insulin_required_month',
 ' insulin_required_y',
 ' insurancestatusnanssfeqeqpending',
 ' lastclinicvisitไปโรงพยาบาลdate',
 ' lastremotefollowupสายเขา',
 ' lastremotefollowupโดยโทรศัพทdate',
 ' meter',
 ' meter_received_',
 ' meter_received_date',
 ' meterissued',
 ' new',
 ' observations1',
 ' patientcontactnumber',
 ' remote_followup',
 ' remotefu',
 ' virtualremotecontactdate',
 ' virtualremotecontactyn',
 '_dosage',
 '_screening',
 '_screening_results',
 '_strips_pmoth',
 '_visit',
 'hospitaldatemmmyy',
 'veremoteupdateyn'}

#### invalid values

In [22]:
subdf = df[(~df.warning.isna()) & (df.warning.str.contains("Could not convert value", regex=False))]

subdf.warning.str.strip("Could not convert value ").unique().tolist()

['- in column bmi for patient: MY_PN004',
 '- in column fbg_baseline_mmol for patient: MY_PN004',
 '0ct/19 in column bmi_date for patient: MY_PN011',
 'il in column hba1c_baseline for patient: MY_SW008',
 'NA or Hospitalisation Date in column hospitalisation_date for patient: MY_SW001',
 'NA or Hospitalisation Date in column hospitalisation_date for patient: MY_SW002',
 'NA or Hospitalisation Date in column hospitalisation_date for patient: MY_SW003',
 'NA or Hospitalisation Date in column hospitalisation_date for patient: MY_SW004',
 'NA or Hospitalisation Date in column hospitalisation_date for patient: MY_SW005',
 'NA or Hospitalisation Date in column hospitalisation_date for patient: MY_SW009',
 'NA or Hospitalisation Date in column hospitalisation_date for patient: MY_SW010',
 'NA or Hospitalisation Date in column hospitalisation_date for patient: MY_SW011',
 'Nil in column last_clinic_visit_date for patient: MY_SW008',
 '25-29/7/22 in column hospitalisation_date for patient: MY_S

In [23]:
subdf.file.unique().tolist()

['2020_Penang General Hospital A4D Tracker_DC_patient_raw.log',
 '2019_Sarawak General Hospital A4D Tracker_DC_patient_raw.log',
 '2022_Sultanah Bahiyah A4D Tracker_DC_patient_raw.log',
 '2021_Vietnam National Children_s Hospital A4D Tracker_patient_raw.log',
 '2022_Khon Kaen Hospital A4D Tracker_patient_raw.log',
 "2017_Vietnam National Children's Hospital A4D Tracker_patient_raw.log",
 '2019_Lao Friends Hospital for Children A4D Tracker_DC_patient_raw.log',
 '2019_Penang General Hospital A4D Tracker_DC_patient_raw.log',
 '2019_Sultanah Bahiyah Hospital A4D Tracker_DC_patient_raw.log',
 '2019_Mahosot Hospital A4D Tracker_patient_raw.log',
 '2021_Penang General Hospital A4D Tracker_DC_patient_raw.log',
 '2020_Lao Friends Hospital for Children A4D Tracker_DC_patient_raw.log',
 '2022_Siriraj Hospital A4D Tracker_patient_raw.log',
 '2022_Jayavarman VII Hospital A4D Tracker_patient_raw.log',
 '2018_Vietnam National Children_s Hospital A4D Tracker_patient_raw.log',
 '2021_Mahosot Hospital A

#### invalid values (outside range)

In [24]:
subdf = df[(~df.warning.isna()) & (df.warning.str.contains("Found invalid value", regex=False))]

subdf.warning.str.strip("Found invalid value ").str.split(expand=True)[[0,3]].drop_duplicates()

Unnamed: 0,0,3
12,s,blood_pressure_mmhg
186,18.5,hba1c_baseline
190,200,fbg_baseline_mmol
662,31,age
663,32,age
674,3.9,hba1c_baseline
1679,11,blood_pressure_sys_mmhg
1683,12,blood_pressure_sys_mmhg
1696,15,blood_pressure_sys_mmhg
1700,17,blood_pressure_sys_mmhg


In [25]:
subdf.file.unique().tolist()

['2020_Penang General Hospital A4D Tracker_DC_patient_raw.log',
 '2019_Sarawak General Hospital A4D Tracker_DC_patient_raw.log',
 '2021_Vietnam National Children_s Hospital A4D Tracker_patient_raw.log',
 "2017_Vietnam National Children's Hospital A4D Tracker_patient_raw.log",
 '2019_Penang General Hospital A4D Tracker_DC_patient_raw.log',
 '2019_Mahosot Hospital A4D Tracker_patient_raw.log',
 '2022_Siriraj Hospital A4D Tracker_patient_raw.log',
 '2022_Jayavarman VII Hospital A4D Tracker_patient_raw.log',
 '2018_Vietnam National Children_s Hospital A4D Tracker_patient_raw.log',
 '2022_Sarawak General Hospital A4D Tracker_DC_patient_raw.log',
 '2021_Mahosot Hospital A4D Tracker_DC_patient_raw.log',
 "2022_Mandalay Children's Hospital A4D Tracker_patient_raw.log",
 '2020_Sarawak General Hospital A4D Tracker_DC_patient_raw.log',
 '2022_Sunprasitthiprasong Hospital A4D Tracker_patient_raw.log',
 '2022_Preah Kossamak Hospital A4D Tracker_patient_raw.log',
 '2022_Phattalung Hospital A4D Track

#### invalid values (not allowed)

In [29]:
subdf = df[(~df.warning.isna()) & (df.warning.str.contains("not in the list of allowed values", regex=False))]

subdf.warning.str.split(expand=True)[[3, 6]].drop_duplicates()

Unnamed: 0,3,6
234,"Basal-bolus,NA",insulin_regimen
243,Basal-bolus,for
246,"NA,NA",insulin_regimen
318,"Pendang,",column
330,"Kedah,",column
342,Jitra,column
354,"Jitra,",column
366,Alor,for
378,"Changloon,",column
686,Namdinh,province


In [27]:
subdf.file.unique().tolist()

['2022_Sultanah Bahiyah A4D Tracker_DC_patient_raw.log',
 '2021_Vietnam National Children_s Hospital A4D Tracker_patient_raw.log',
 '2022_Khon Kaen Hospital A4D Tracker_patient_raw.log',
 "2017_Vietnam National Children's Hospital A4D Tracker_patient_raw.log",
 '2022_Chulalongkorn Hospital A4D Tracker_patient_raw.log',
 '2019_Sultanah Bahiyah Hospital A4D Tracker_DC_patient_raw.log',
 '2019_Mahosot Hospital A4D Tracker_patient_raw.log',
 '2021_Penang General Hospital A4D Tracker_DC_patient_raw.log',
 '2020_Lao Friends Hospital for Children A4D Tracker_DC_patient_raw.log',
 '2022_Siriraj Hospital A4D Tracker_patient_raw.log',
 '2022_Putrajaya Hospital A4D Tracker_DC_patient_raw.log',
 '2022_Jayavarman VII Hospital A4D Tracker_patient_raw.log',
 '2018_Vietnam National Children_s Hospital A4D Tracker_patient_raw.log',
 '2022_Surat Thani A4D Tracker_patient_raw.log',
 '2022_Sarawak General Hospital A4D Tracker_DC_patient_raw.log',
 '2021_Mahosot Hospital A4D Tracker_DC_patient_raw.log',
 '