In [22]:
import pandas as pd
from pathlib import Path
from collections import Counter
import os
import csv

# Collect the log files in a stable order. Glob results come back in whatever
# order the filesystem lists them, and that order decides how tied header
# counts are printed further down; sorting the paths keeps every readout
# reproducible from one run (or machine) to the next.
log_dir = Path("../logs")
log_files = sorted(log_dir.glob("Log*.csv"))
print(f"found {len(log_files)} logs")

# Start to deal with the differing headers - bearing in mind that there are
# some non-ascii symbols like degree signs.

# One entry per successfully read log: the first line of the file, stripped.
raw_headers = []

for file in log_files:
    try:
        # 'utf-8-sig' decodes plain UTF-8 exactly like 'utf-8', but also drops
        # a leading byte-order mark. With plain 'utf-8' a BOM would stay glued
        # to the first column name (strip() does not remove it) and make an
        # otherwise identical header count as a separate header set.
        with file.open('r', encoding='utf-8-sig') as f:
            # strip() removes the trailing newline including carriage returns
            # (visual inspection of some logs showed these occasionally).
            first_line = f.readline().strip()
    except (OSError, UnicodeError) as e:
        # Best effort: report an unreadable or undecodable file and carry on
        # with the remaining logs.
        print(f"failed to read {file}: {e}")
    else:
        raw_headers.append(first_line)

# Count how often each distinct raw header line occurs, for general information.
header_count = Counter(raw_headers)

# Read out what the unique headers are, most frequent first. Some are in
# different orders, and some contain different columns.
print("\n=== raw header sets by count ===")
for header, count in header_count.most_common():
    # Only the first 80 characters are printed so the cell output stays
    # manageable - the full headers and counts go into a file once they have
    # been standardised a bit.
    print(f"{count} | {header[:80]}...")

# Make column order irrelevant: split each header on commas, sort the column
# names alphabetically and join them back into a single string.
standardised = [",".join(sorted(raw.split(','))) for raw in raw_headers]

# Tally the order-independent header sets.
standardised_count = Counter(standardised)

print("\n=== standardised header sets by count")
for norm, count in standardised_count.most_common():
    # Truncated to 80 characters, same as the raw readout.
    print(f"{count} | {norm[:80]}...")

# Sure enough, sorting the columns changed the counts a bit. That tells me
# that some of the logs have the same columns in a different order.
# Drop the standardised headers into a file for a visual once-over.
header_rows = list(standardised_count.items())
standardised_count_df = pd.DataFrame.from_records(
    header_rows,
    columns=["standardised_header", "count"],
)
# Most common header set first.
standardised_count_df = standardised_count_df.sort_values(by="count", ascending=False)

# The file is meant to be inspected in Sublime Text with rainbowcsv, so
# quoting is turned off; an escape character is supplied for the writer to
# use in place of quotes.
header_path = Path("standardised_header_set.csv")
standardised_count_df.to_csv(
    header_path,
    index=False,
    quoting=csv.QUOTE_NONE,
    escapechar='\\',
)

found 124 logs

=== raw header sets by count ===
69 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
11 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
10 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
8 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
5 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
5 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
5 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
4 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
2 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
2 | Time,Coolant temperature ° C,Exhaust gas temperature in front of the DPF ° C,Eng...
1 | Time,Boost pressure bar...
1 | Time,Exhaust gas temperature in f