In [1]:
from pathlib import Path

import pandas as pd
from collections import defaultdict

pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', None)

In [16]:
dir = "/Volumes/USB SanDisk 3.2Gen1 Media/a4d/a4dphase2_upload/output/logs"

In [17]:
files = list(Path(dir).glob("*.log"))
len(files)

47

In [18]:
data = defaultdict(list)
for file in files:
    with open(file, "r", encoding="utf-8") as fstream:
        for line in fstream.read().splitlines():
            error = None
            warn = None
            if "ERROR" in line:
                error = line.split("\t")[-1]
                
            if "WARN" in line:
                warn = line.split("\t")[-1]
            
            if error or warn: 
                data["file"].append(file.name)
                data["error"].append(error)
                data["warning"].append(warn)

In [19]:
df = pd.DataFrame(data)

In [20]:
df

Unnamed: 0,file,error,warning
0,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: recommendedtestingfrequencyperday,hospitalisationduetodiabetesemergencyorglucosecontrolreason,hospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningselect,patientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport."
1,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: recommendedtestingfrequencyperday,hospitalisationduetodiabetesemergencyorglucosecontrolreason,hospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningselectfordropdown,complicationscreening,otherpatientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport."
2,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: virtualremotecontactyn,virtualremotecontactdate,recommendedtestingfrequencyperday,hospitalisationduetodiabetesemergencyorglucosecontrolreason,hospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningselectfordropdown,complicationscreening,otherpatientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport."
3,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: virtualremotecontactyn,virtualremotecontactdate,recommendedtestingfrequencyperday,currentmonthhospitalisationduetodiabetesemergencyorglucosecontrolreason,currentmonthhospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningcurrentmonthtestingselectfordropdown,complicationscreeningcurrentmonthtesting,otherpatientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport."
4,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: virtualremotecontactyn,virtualremotecontactdate,recommendedtestingfrequencyperday,currentmonthhospitalisationduetodiabetesemergencyorglucosecontrolreason,currentmonthhospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningcurrentmonthtestingselectfordropdown,complicationscreeningcurrentmonthtesting,otherpatientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport,meter."
...,...,...,...
251,2022_Sultanah Bahiyah A4D Tracker_DC_patient.log,,Non-matching column names found: lastremotefollowupdate.
252,2022_Sultanah Bahiyah A4D Tracker_DC_patient.log,,Non-matching column names found: lastremotefollowupdate.
253,2022_Sultanah Bahiyah A4D Tracker_DC_patient.log,,Non-matching column names found: lastremotefollowupdate.
254,2022_Sultanah Bahiyah A4D Tracker_DC_patient.log,,Non-matching column names found: lastremotefollowupdate.


In [21]:
df.to_csv(Path(dir) / "error_report.csv", index=False)

In [22]:
df.file.nunique()

18

## Error

In [23]:
df[~df.error.isna()]

Unnamed: 0,file,error,warning


In [24]:
# how many files could not be processed in %?
len(df[(~df.error.isna()) & (df.error.str.contains("Could not process"))]) / len(files) * 100
                                                        

0.0

In [25]:
# missing patient id
df[(~df.error.isna()) & (df.error.str.contains("patient_id"))].file.tolist()

[]

In [26]:
# month list is empty?
print("\n".join(l.split(".")[0] for l in df[(~df.error.isna()) & (df.error.str.contains("month_list"))].file.tolist()))




In [27]:
# cannot find patient data
for year in range(2017,2023):
    subdf = df[(~df.error.isna()) & (df.error.str.contains("readxl::cell_limits"))]
    print(subdf[subdf.file.str.startswith(str(year))].file.tolist())

[]
[]
[]
[]
[]
[]


In [28]:
# check problem with num_na_rows
subdf = df[(~df.error.isna()) & (df.error.str.contains("num_na_rows"))]
subdf

Unnamed: 0,file,error,warning


In [29]:
# check start_df_msd:end_df_msd
subdf = df[(~df.error.isna()) & (df.error.str.contains("start_df_msd"))]
subdf.file.tolist()

[]

# Warnings

In [30]:
df[~df.warning.isna()]

Unnamed: 0,file,error,warning
0,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: recommendedtestingfrequencyperday,hospitalisationduetodiabetesemergencyorglucosecontrolreason,hospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningselect,patientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport."
1,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: recommendedtestingfrequencyperday,hospitalisationduetodiabetesemergencyorglucosecontrolreason,hospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningselectfordropdown,complicationscreening,otherpatientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport."
2,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: virtualremotecontactyn,virtualremotecontactdate,recommendedtestingfrequencyperday,hospitalisationduetodiabetesemergencyorglucosecontrolreason,hospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningselectfordropdown,complicationscreening,otherpatientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport."
3,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: virtualremotecontactyn,virtualremotecontactdate,recommendedtestingfrequencyperday,currentmonthhospitalisationduetodiabetesemergencyorglucosecontrolreason,currentmonthhospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningcurrentmonthtestingselectfordropdown,complicationscreeningcurrentmonthtesting,otherpatientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport."
4,2021_Putrajaya Hospital A4D Tracker_DC_patient.log,,"Non-matching column names found: virtualremotecontactyn,virtualremotecontactdate,recommendedtestingfrequencyperday,currentmonthhospitalisationduetodiabetesemergencyorglucosecontrolreason,currentmonthhospitalisationduetodiabetesemergencyorglucosecontroldate,complicationscreeningcurrentmonthtestingselectfordropdown,complicationscreeningcurrentmonthtesting,otherpatientobservations,observationcategoryselect,suggestionsforpotentialadditionalsupportorinterventionfroma4degcomplicationsorfamilysupport,meter."
...,...,...,...
251,2022_Sultanah Bahiyah A4D Tracker_DC_patient.log,,Non-matching column names found: lastremotefollowupdate.
252,2022_Sultanah Bahiyah A4D Tracker_DC_patient.log,,Non-matching column names found: lastremotefollowupdate.
253,2022_Sultanah Bahiyah A4D Tracker_DC_patient.log,,Non-matching column names found: lastremotefollowupdate.
254,2022_Sultanah Bahiyah A4D Tracker_DC_patient.log,,Non-matching column names found: lastremotefollowupdate.


In [15]:
missed_names = df[(~df.warning.isna()) & (df.warning.str.startswith("Non-matching column names found"))].warning.str.split(":",expand=True)[1].drop_duplicates()

In [16]:
names = set()
for row in missed_names:
    names.update(w.strip(".") for w in row.split(","))

In [17]:
names

{' complicationscreeningdropdown',
 ' currentmonthcomplicationscreeningdropdown',
 ' educationvocation',
 ' hospitalisationduetodiabetesemergencydate',
 ' lostpatientssummarylostdate',
 ' newmeter',
 ' recommendedtestingfrequencyperday',
 ' updatedhba1chba1cdate',
 ' virtualremotecontactyn',
 'admissiontohospitaldatemmmyy',
 'admissiontohospitaldatemmyy',
 'complicationscreening',
 'complicationscreeningcurrentmonthtesting',
 'complicationscreeningcurrentmonthtestingselectfordropdown',
 'complicationscreeningselect',
 'complicationscreeningselectfordropdown',
 'currentmonthcomplicationscreening',
 'currentmonthhospitalisationduetodiabetesemergencyorglucosecontroldate',
 'currentmonthhospitalisationduetodiabetesemergencyorglucosecontrolreason',
 'hospitalisationduetodiabetesemergencyorglucosecontroldate',
 'hospitalisationduetodiabetesemergencyorglucosecontrolreason',
 'lostpatientssummarystatusout',
 'meter',
 'observationcategoryselect',
 'otherpatientobservations',
 'patientobservati