In [1]:
from pathlib import Path

import pandas as pd
from collections import defaultdict

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [2]:
dir = "/Users/miay/Library/CloudStorage/OneDrive-Personal/1 Projects/CorrelAid/2023 Q1/A4D/data/logs"

In [3]:
files = list(Path(dir).glob("*.log"))
len(files)

39

In [4]:
data = defaultdict(list)
for file in files:
    with open(file, "r", encoding="utf-8") as fstream:
        for line in fstream.read().splitlines():
            error = None
            warn = None
            if "ERROR" in line:
                error = line.split("\t")[-1]
                
            if "WARN" in line:
                warn = line.split("\t")[-1]
            
            if error or warn: 
                data["file"].append(file.name)
                data["error"].append(error)
                data["warning"].append(warn)

In [5]:
df = pd.DataFrame(data)

In [6]:
df

Unnamed: 0,file,error,warning
0,2022_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: lostpatientssummarylostdate,lostpatientssummarystatusout."
1,2020_Sarawak General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: updatedhba1chba1cdate,updatedfbgfbgmmoll,updatedfbgfbgdate."
2,2020_Sarawak General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: updatedhba1chba1cdate,updatedfbgfbgmmoll,updatedfbgfbgdate."
3,2020_Sarawak General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: updatedhba1chba1cdate,updatedfbgfbgmmoll,updatedfbgfbgdate."
4,2020_Sarawak General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: updatedhba1chba1cdate,updatedfbgfbgmmoll,updatedfbgfbgdate."
...,...,...,...
239,2020_Penang General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: educationvocation,admissiontohospitaldatemmmyy,remarkscomplicationsfamilysupport."
240,2020_Penang General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: educationvocation,admissiontohospitaldatemmyy,remarkscomplicationsfamilysupport."
241,2020_Penang General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: educationvocation,admissiontohospitaldatemmyy,remarkscomplicationsfamilysupport."
242,2020_Penang General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: educationvocation,admissiontohospitaldatemmyy,remarkscomplicationsfamilysupport."


In [16]:
df.to_csv(Path(dir) / "error_report.csv", index=False)

In [7]:
df.file.nunique()

18

## Error

In [8]:
df[~df.error.isna()]

Unnamed: 0,file,error,warning
11,2021_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2021_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,
23,2021_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2022_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,
93,2018_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2018_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,
94,2018_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2019_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,
95,2018_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2020_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,
96,2018_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2021_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,
108,2018_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2022_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,
157,2020_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2020_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,
158,2020_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2021_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,
170,2020_Lao Friends Hospital for Children A4D Tracker_DC_patient.log,Could not process 2022_Lao Friends Hospital for Children A4D Tracker_DC. Error = Error: row_min < row_max is not TRUE .,


In [9]:
# how many files could not be processed in %?
len(df[(~df.error.isna()) & (df.error.str.contains("Could not process"))]) / len(files) * 100
                                                        

35.8974358974359

In [10]:
# missing patient id
df[(~df.error.isna()) & (df.error.str.contains("patient_id"))].file.tolist()

[]

In [11]:
# month list is empty?
print("\n".join(l.split(".")[0] for l in df[(~df.error.isna()) & (df.error.str.contains("month_list"))].file.tolist()))




In [12]:
# cannot find patient data
for year in range(2017,2023):
    subdf = df[(~df.error.isna()) & (df.error.str.contains("readxl::cell_limits"))]
    print(subdf[subdf.file.str.startswith(str(year))].file.tolist())

[]
[]
[]
[]
[]
[]


In [13]:
# check problem with num_na_rows
subdf = df[(~df.error.isna()) & (df.error.str.contains("num_na_rows"))]
subdf

Unnamed: 0,file,error,warning


In [14]:
# check start_df_msd:end_df_msd
subdf = df[(~df.error.isna()) & (df.error.str.contains("start_df_msd"))]
subdf.file.tolist()

[]

# Warnings

In [18]:
df[~df.warning.isna()]

Unnamed: 0,file,error,warning
0,2022_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: lostpatientssummarylostdate,lostpatientssummarystatusout."
1,2020_Sarawak General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: updatedhba1chba1cdate,updatedfbgfbgmmoll,updatedfbgfbgdate."
2,2020_Sarawak General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: updatedhba1chba1cdate,updatedfbgfbgmmoll,updatedfbgfbgdate."
3,2020_Sarawak General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: updatedhba1chba1cdate,updatedfbgfbgmmoll,updatedfbgfbgdate."
4,2020_Sarawak General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: updatedhba1chba1cdate,updatedfbgfbgmmoll,updatedfbgfbgdate."
...,...,...,...
239,2020_Penang General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: educationvocation,admissiontohospitaldatemmmyy,remarkscomplicationsfamilysupport."
240,2020_Penang General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: educationvocation,admissiontohospitaldatemmyy,remarkscomplicationsfamilysupport."
241,2020_Penang General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: educationvocation,admissiontohospitaldatemmyy,remarkscomplicationsfamilysupport."
242,2020_Penang General Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: educationvocation,admissiontohospitaldatemmyy,remarkscomplicationsfamilysupport."


In [15]:
missed_names = df[(~df.warning.isna()) & (df.warning.str.startswith("Non-matching column names found"))].warning.str.split(":",expand=True)[1].drop_duplicates()

In [16]:
names = set()
for row in missed_names:
    names.update(w.strip(".") for w in row.split(","))

In [17]:
names

{' complicationscreeningdropdown',
 ' currentmonthcomplicationscreeningdropdown',
 ' educationvocation',
 ' hospitalisationduetodiabetesemergencydate',
 ' lostpatientssummarylostdate',
 ' newmeter',
 ' recommendedtestingfrequencyperday',
 ' updatedhba1chba1cdate',
 ' virtualremotecontactyn',
 'admissiontohospitaldatemmmyy',
 'admissiontohospitaldatemmyy',
 'complicationscreening',
 'complicationscreeningcurrentmonthtesting',
 'complicationscreeningcurrentmonthtestingselectfordropdown',
 'complicationscreeningselect',
 'complicationscreeningselectfordropdown',
 'currentmonthcomplicationscreening',
 'currentmonthhospitalisationduetodiabetesemergencyorglucosecontroldate',
 'currentmonthhospitalisationduetodiabetesemergencyorglucosecontrolreason',
 'hospitalisationduetodiabetesemergencyorglucosecontroldate',
 'hospitalisationduetodiabetesemergencyorglucosecontrolreason',
 'lostpatientssummarystatusout',
 'meter',
 'observationcategoryselect',
 'otherpatientobservations',
 'patientobservati