In [1]:
from pathlib import Path

import pandas as pd
from collections import defaultdict

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [4]:
dir = "/Users/miay/Library/CloudStorage/OneDrive-Personal/1 Projects/CorrelAid/2023 Q1/A4D/data/logs"

In [5]:
files = list(Path(dir).glob("*.log"))
len(files)

73

In [6]:
data = defaultdict(list)
for file in files:
    with open(file, "r", encoding="utf-8") as fstream:
        for line in fstream.read().splitlines():
            error = None
            warn = None
            if "ERROR" in line:
                error = line.split("\t")[-1]
                
            if "WARN" in line:
                warn = line.split("\t")[-1]
            
            if error or warn: 
                data["file"].append(file.name)
                data["error"].append(error)
                data["warning"].append(warn)

In [7]:
df = pd.DataFrame(data)

In [8]:
df

Unnamed: 0,file,error,warning
0,2020_Penang General Hospital A4D Tracker_DC_product.log,,"Jan20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2019-12-31, 2019-12-31, 2019-12-31"
1,2020_Penang General Hospital A4D Tracker_DC_product.log,,"Feb20 number of dates in product_entry_date that don't match the month/year on the sheet is 4: 2020-01-31, 2020-01-31, 2020-03-04, 2020-01-31"
2,2020_Penang General Hospital A4D Tracker_DC_product.log,,"Mar20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-02-29, 2020-02-29, 2020-02-29"
3,2020_Penang General Hospital A4D Tracker_DC_product.log,,"Apr20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-03-31, 2020-03-31, 2020-03-31"
4,2020_Penang General Hospital A4D Tracker_DC_product.log,,"May20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-04-30, 2020-04-30, 2020-04-30"
...,...,...,...
332,2020_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log,,"Aug20 number of dates in product_entry_date that don't match the month/year on the sheet is 6: 2020-07-31, 2020-09-30, 2020-07-31, 2020-09-30, 2020-07-31, 2020-09-30"
333,2020_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log,,"Sep20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-08-31, 2020-08-31, 2020-08-31"
334,2020_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log,,"Oct20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-09-30, 2020-09-30, 2020-09-30"
335,2020_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log,,"Nov20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-10-31, 2020-10-31, 2020-10-31"


In [9]:
df.to_csv(Path(dir) / "error_report.csv", index=False)

In [10]:
df.file.nunique()

48

In [12]:
df.file.unique()

array(['2020_Penang General Hospital A4D Tracker_DC_product.log',
       '2019_Lao Friends Hospital for Children A4D Tracker_DC_product.log',
       '2022_Mukdahan Hospital A4D Tracker_product.log',
       '2022_Lao Friends Hospital for Children A4D Tracker_DC_patient.log',
       '2022_Chiang Mai Maharaj Nakorn A4D Tracker_patient.log',
       '2018_Penang General Hospital A4D Tracker_DC_product.log',
       '2020_Sarawak General Hospital A4D Tracker_DC_patient.log',
       '2021_Lao Friends Hospital for Children A4D Tracker_DC_patient.log',
       '2020_Mahosot Hospital A4D Tracker_DC_patient.log',
       '2021_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log',
       '2021_Sarawak General Hospital A4D Tracker_DC_patient.log',
       '2022_Pahol  Polpayuhasena A4D Tracker_product.log',
       '2022_Sultanah Bahiyah A4D Tracker_DC_product.log',
       '2019_Mahosot Hospital A4D Tracker_product.log',
       '2022_Sarawak General Hospital A4D Tracker_DC_product.log',
       "2021_Li

## Error

In [13]:
df[~df.error.isna()]

Unnamed: 0,file,error,warning
59,2021_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process patient data. Error = Error: nrow(df_patient) > 0 is not TRUE .,
116,2022_Penang General Hospital A4D Tracker_DC_patient.log,Could not process patient data. Error = Error in if (header_cols[2] == header_cols_2[2]) {: missing value where TRUE/FALSE needed .,
224,2020_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process patient data. Error = Error: nrow(df_patient) > 0 is not TRUE .,
233,2022_Sunprasitthiprasong Hospital A4D Tracker_product.log,Could not process product data. Error = Error in start_df_msd:end_df_msd: argument of length 0 .,


In [14]:
# how many files could not be processed in %?
len(df[(~df.error.isna()) & (df.error.str.contains("Could not process"))]) / len(files) * 100
                                                        

5.47945205479452

In [15]:
# missing patient id
df[(~df.error.isna()) & (df.error.str.contains("patient_id"))].file.tolist()

[]

In [16]:
# month list is empty?
print("\n".join(l.split(".")[0] for l in df[(~df.error.isna()) & (df.error.str.contains("month_list"))].file.tolist()))




In [17]:
# cannot find patient data
for year in range(2017,2023):
    subdf = df[(~df.error.isna()) & (df.error.str.contains("readxl::cell_limits"))]
    print(subdf[subdf.file.str.startswith(str(year))].file.tolist())

[]
[]
[]
[]
[]
[]


In [18]:
# check problem with num_na_rows
subdf = df[(~df.error.isna()) & (df.error.str.contains("num_na_rows"))]
subdf

Unnamed: 0,file,error,warning


In [19]:
# check start_df_msd:end_df_msd
subdf = df[(~df.error.isna()) & (df.error.str.contains("start_df_msd"))]
subdf.file.tolist()

['2022_Sunprasitthiprasong Hospital A4D Tracker_product.log']

# Warnings

In [20]:
df[~df.warning.isna()]

Unnamed: 0,file,error,warning
0,2020_Penang General Hospital A4D Tracker_DC_product.log,,"Jan20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2019-12-31, 2019-12-31, 2019-12-31"
1,2020_Penang General Hospital A4D Tracker_DC_product.log,,"Feb20 number of dates in product_entry_date that don't match the month/year on the sheet is 4: 2020-01-31, 2020-01-31, 2020-03-04, 2020-01-31"
2,2020_Penang General Hospital A4D Tracker_DC_product.log,,"Mar20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-02-29, 2020-02-29, 2020-02-29"
3,2020_Penang General Hospital A4D Tracker_DC_product.log,,"Apr20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-03-31, 2020-03-31, 2020-03-31"
4,2020_Penang General Hospital A4D Tracker_DC_product.log,,"May20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-04-30, 2020-04-30, 2020-04-30"
...,...,...,...
332,2020_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log,,"Aug20 number of dates in product_entry_date that don't match the month/year on the sheet is 6: 2020-07-31, 2020-09-30, 2020-07-31, 2020-09-30, 2020-07-31, 2020-09-30"
333,2020_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log,,"Sep20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-08-31, 2020-08-31, 2020-08-31"
334,2020_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log,,"Oct20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-09-30, 2020-09-30, 2020-09-30"
335,2020_Sultanah Bahiyah Hospital A4D Tracker_DC_product.log,,"Nov20 number of dates in product_entry_date that don't match the month/year on the sheet is 3: 2020-10-31, 2020-10-31, 2020-10-31"


In [21]:
missed_names = df[(~df.warning.isna()) & (df.warning.str.startswith("Non-matching column names found"))].warning.str.split(":",expand=True)[1].drop_duplicates()

In [22]:
names = set()
for row in missed_names:
    names.update(w.strip(".") for w in row.split(","))

In [23]:
names

{' complicationscreeningdropdown',
 ' currentmonthcomplicationscreeningdropdown',
 ' educationvocation',
 ' estimatedinsulinrequiredperyearvialbox',
 ' hospitalisationduetodiabetesemergencydate',
 ' hospitalisationduetodiabetesemergencyorglucosecontrolreason',
 ' instantmeterissued',
 ' instantmeterreceiveddate',
 ' lastclinicvisitไปโรงพยาบาลdate',
 ' lastremotefollowupโดยโทรศัพทdate',
 ' newmeter',
 ' remotefu',
 ' updatedhba1chba1cdate',
 ' virtualremotecontactyn',
 'admissiontohospitaldatemmmyy',
 'admissiontohospitaldatemmyy',
 'complicationscreeningdropdown',
 'complicationscreeningselect',
 'complicationscreeningselectfordropdown',
 'currentmonthcomplicationscreeningdropdown',
 'estimatedinsulinrequiredperyearvialbox',
 'hospitalisationduetodiabetesemergencyorglucosecontroldate',
 'hospitalisationduetodiabetesemergencyorglucosecontrolreason',
 'instantmeterreceiveddate',
 'lastremotefollowupสายเขาdate',
 'meter',
 'patientobservations',
 'remarkscomplicationsfamilysupport',
 'upd