<a href="https://colab.research.google.com/github/Britjit/hackathon/blob/main/hack_away.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install lz4 zstandard gzip chardet


[31mERROR: Could not find a version that satisfies the requirement gzip (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for gzip[0m[31m
[0m

In [None]:
pip install lz4 zstandard chardet



In [None]:
"Generated with the help of OpenAIs ChatGPT Googles Gemini "
" Made by Gabriel,Britney,Alex and Devam"
import gzip
import csv
import re
import statistics
import struct
import zlib  # assuming the file is compressed
import os
from tqdm import tqdm
import sys  # for directing tqdm to stderr

# ----------------------------
# SAP data type parsers
# ----------------------------
def parse_char(data):
    """Decode a CHAR field from UTF-8 bytes and trim surrounding whitespace.

    Raises UnicodeDecodeError when the bytes are not valid UTF-8.
    """
    text = data.decode('utf-8')
    return text.strip()

def parse_numc(data):
    """Decode a NUMC field: UTF-8 decode, then strip whitespace.

    No digit validation is performed; the text is returned as-is.
    """
    text = data.decode('utf-8')
    return text.strip()

def parse_date(data):
    """Decode a DATE field: UTF-8 decode, then strip whitespace.

    The value is returned as raw text; it is not converted to a date object.
    """
    text = data.decode('utf-8')
    return text.strip()

def parse_int(data):
    """Read 4 bytes as a big-endian signed 32-bit integer and return it as text.

    Raises struct.error when data is not exactly 4 bytes long.
    """
    (value,) = struct.unpack('>i', data)
    return str(value)

def parse_packed(data):
    """Decode a packed-decimal (BCD) field into a digit string.

    Every byte holds two 4-bit nibbles. All nibbles except the last are
    decimal digits; the last nibble is the sign and is never part of the
    number. Leading zeros are preserved and no decimal point is inserted.

    NOTE(review): 0xD (and the alternate 0xB) are treated as "negative",
    following the usual packed-decimal convention - confirm against the
    actual SAP extract format.

    Args:
        data: raw bytes of the field.

    Returns:
        The digits as a string, prefixed with '-' for negative values;
        '' for empty input.

    Raises:
        ValueError: a digit position holds a nibble greater than 9.
    """
    if not data:
        return ''

    nibbles = []
    for byte in data:
        nibbles.append((byte >> 4) & 0x0F)
        nibbles.append(byte & 0x0F)

    # The trailing nibble is the sign, not a digit. Stringifying it (as the
    # previous version did) turned e.g. 0xC into "12" and left a stray "1".
    sign_nibble = nibbles.pop()
    if any(nibble > 9 for nibble in nibbles):
        raise ValueError(f"invalid packed-decimal digits: {bytes(data).hex()}")

    digits = ''.join(str(nibble) for nibble in nibbles)
    return '-' + digits if sign_nibble in (0x0B, 0x0D) else digits

# Dispatch table: DATATYPE name -> function that decodes that field's raw bytes.
SAP_TYPE_PARSERS = dict(
    CHAR=parse_char,
    NUMC=parse_numc,
    DATE=parse_date,
    INT4=parse_int,
    DEC=parse_packed,
)

# ----------------------------
# Example EKKO fields
# ----------------------------
# Record layout in field order; each entry is (FIELDNAME, DATATYPE, LENGTH),
# where LENGTH is the number of bytes sliced out of a record for that field.
FIELDS = [
    {'FIELDNAME': name, 'DATATYPE': sap_type, 'LENGTH': width}
    for name, sap_type, width in (
        ('EBELN', 'CHAR', 10),
        ('BUKRS', 'CHAR', 4),
        ('LIFNR', 'NUMC', 10),
        ('AEDAT', 'DATE', 8),
    )
]

# ----------------------------
# Read SAP file safely
# ----------------------------
def read_sap_file(file_path):
    """Load a file fully into memory, gunzipping it when it starts with the gzip magic.

    Diagnostics (detected compression, first payload bytes, and the first four
    payload bytes read as a big-endian unsigned integer) are written to stderr.

    Args:
        file_path: path of the file to read.

    Returns:
        The (decompressed) file content as bytes.
    """
    with open(file_path, 'rb') as f:
        magic = f.read(2)
        f.seek(0)  # rewind so the reader below sees the file from the start
        if magic == b'\x1f\x8b':  # gzip
            print("Compression: gzip", file=sys.stderr)
            with gzip.open(f, 'rb') as gz:
                data = gz.read()
        else:
            print("Compression: none or unknown", file=sys.stderr)
            data = f.read()
    # These are the first bytes of the payload (after any decompression).
    print(f"Magic bytes: {data[:4].hex()}", file=sys.stderr)

    # Attempt SAP version detection (first 4 bytes example).
    # struct.unpack needs exactly 4 bytes, so skip it for empty or tiny files
    # instead of crashing before the caller gets the data.
    if len(data) >= 4:
        sap_version = struct.unpack('>I', data[:4])[0]
        print(f"SAP file version: {sap_version}", file=sys.stderr)
    else:
        print("SAP file version: unavailable (fewer than 4 bytes of data)", file=sys.stderr)

    return data

# ----------------------------
# Parse SAP file with progress
# ----------------------------
def parse_sap_file(file_path, fields):
    """Cut a file into fixed-width records and decode every field.

    The content returned by read_sap_file is split into consecutive records
    whose width is the sum of the LENGTH entries in fields. Each field slice
    is decoded by the parser registered for its DATATYPE in SAP_TYPE_PARSERS
    (parse_char when the type is not registered).

    Args:
        file_path: path handed to read_sap_file.
        fields: list of dicts with FIELDNAME, DATATYPE and LENGTH keys,
            in record order.

    Returns:
        A list with one {FIELDNAME: decoded string} dict per complete record;
        an empty list when the field lengths do not add up to a positive width.
    """
    raw_data = read_sap_file(file_path)
    record_length = sum(f['LENGTH'] for f in fields)
    if record_length <= 0:
        # Without a positive width the record count below would divide by zero.
        print("Error: field definitions describe a zero-length record.", file=sys.stderr)
        return []

    num_records, leftover = divmod(len(raw_data), record_length)
    records = []

    print(f"Total records detected: {num_records}", file=sys.stderr)
    if leftover:
        print(f"Warning: {leftover} trailing byte(s) do not form a full record and are ignored.",
              file=sys.stderr)

    for i in tqdm(range(num_records), desc="Parsing records", file=sys.stderr):
        offset = i * record_length
        record_bytes = raw_data[offset:offset + record_length]
        record_dict = {}
        pos = 0
        for f in fields:
            field_bytes = record_bytes[pos:pos + f['LENGTH']]
            parser = SAP_TYPE_PARSERS.get(f['DATATYPE'], parse_char)
            try:
                record_dict[f['FIELDNAME']] = parser(field_bytes)
            except Exception:
                # Keep undecodable fields as hex instead of losing the record.
                # Unlike a bare except, this lets KeyboardInterrupt/SystemExit through.
                record_dict[f['FIELDNAME']] = field_bytes.hex()
            pos += f['LENGTH']
        records.append(record_dict)

    return records

# ----------------------------
# Export to CSV safely
# ----------------------------
def export_to_csv(records, fields, output_file):
    """Write records to a UTF-8 CSV in long format: one row per field value.

    The header is FIELDNAME, DATATYPE, LENGTH, VALUE. For every record, one
    row is emitted per entry of fields, in field order.

    Args:
        records: list of dicts keyed by field name.
        fields: list of dicts with FIELDNAME, DATATYPE and LENGTH keys.
        output_file: destination path (overwritten).

    Raises:
        KeyError: a record has no value for one of the field names.
    """
    header = ['FIELDNAME', 'DATATYPE', 'LENGTH', 'VALUE']
    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(header)
        for record in records:
            writer.writerows(
                [spec['FIELDNAME'], spec['DATATYPE'], spec['LENGTH'], record[spec['FIELDNAME']]]
                for spec in fields
            )

# ----------------------------
# Main
# ----------------------------
def main():
    """Parse the hard-coded EKKO extract and write it to a CSV in the working directory.

    Prints an error to stderr and does nothing else when the input file is missing.
    """
    input_file = '/content/MM_MM_EKKO_20240624_113031_0 1.gz'
    output_file = 'parsed_ekko_sap_template.csv'

    if os.path.exists(input_file):
        records = parse_sap_file(input_file, FIELDS)
        export_to_csv(records, FIELDS, output_file)
        print(f"Successfully parsed {len(records)} records to '{output_file}'.", file=sys.stderr)
    else:
        print(f"Error: input file '{input_file}' not found.", file=sys.stderr)


# ----------------------------
if __name__ == "__main__":
    main()


Compression: gzip
Magic bytes: 10070065
SAP file version: 268894309
Total records detected: 2274
Parsing records: 100%|██████████| 2274/2274 [00:00<00:00, 163193.55it/s]
Successfully parsed 2274 records to 'parsed_ekko_sap_template.csv'.


In [None]:
import os
from glob import glob
from concurrent.futures import ProcessPoolExecutor, as_completed
from tqdm import tqdm
import sys

# --- Keep your existing SAP functions (parse_sap_file, export_to_csv, etc.) ---


def process_one_file(file_path, output_folder):
    """Worker function to parse and export one SAP file.

    The output name is the input's base name with a trailing ".gz" removed
    and "_parsed.csv" appended.

    Args:
        file_path: path of the input file.
        output_folder: directory that receives the CSV.

    Returns:
        A one-line status string: success with both paths, or the error text.
        Exceptions are reported in the string rather than raised, so one bad
        file does not abort the whole batch.
    """
    base_name = os.path.basename(file_path)
    # Strip only the suffix: str.replace(".gz", "") would also remove any
    # ".gz" that happens to appear in the middle of the file name.
    if base_name.endswith(".gz"):
        base_name = base_name[:-len(".gz")]
    output_file = os.path.join(output_folder, f"{base_name}_parsed.csv")

    try:
        records = parse_sap_file(file_path, FIELDS)
        export_to_csv(records, FIELDS, output_file)
        return f"✅ {file_path} -> {output_file}"
    except Exception as e:
        return f"❌ Error processing {file_path}: {e}"


def process_all_sap_files_parallel(input_folder, output_folder, max_workers=4):
    """Run process_one_file over every *.gz file in input_folder using a process pool.

    Creates output_folder if needed. Each worker's status line is printed to
    stderr as soon as that file finishes.

    Args:
        input_folder: directory searched (non-recursively) for "*.gz".
        output_folder: directory passed to every worker.
        max_workers: pool size handed to ProcessPoolExecutor.
    """
    os.makedirs(output_folder, exist_ok=True)
    pattern = os.path.join(input_folder, "*.gz")
    sap_files = glob(pattern)

    if len(sap_files) == 0:
        print("No SAP files found!", file=sys.stderr)
        return

    print(f"Found {len(sap_files)} files. Processing with {max_workers} workers...\n", file=sys.stderr)

    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        pending = []
        for path in sap_files:
            pending.append(pool.submit(process_one_file, path, output_folder))
        progress = tqdm(as_completed(pending), total=len(pending), desc="Processing files", file=sys.stderr)
        for finished in progress:
            print(finished.result(), file=sys.stderr)


def main():
    """Parse every .gz file in /content/ into /content/parsed_results/.

    The pool size passed on is os.cpu_count().
    """
    source_dir = "/content/"
    target_dir = "/content/parsed_results/"
    worker_count = os.cpu_count()
    process_all_sap_files_parallel(source_dir, target_dir, max_workers=worker_count)


if __name__ == "__main__":
    main()

Found 1 files. Processing with 2 workers...

Processing files:   0%|          | 0/1 [00:00<?, ?it/s]Compression: gzip
Magic bytes: 10070065
SAP file version: 268894309
Total records detected: 2274
Parsing records: 100%|██████████| 2274/2274 [00:00<00:00, 134862.03it/s]
✅ /content/MM_MM_EKKO_20240624_113031_0 1.gz -> /content/parsed_results/MM_MM_EKKO_20240624_113031_0 1_parsed.csv
Processing files: 100%|██████████| 1/1 [00:00<00:00, 16.08it/s]


In [None]:
import gzip
import csv
import struct
import os
import sys
import traceback
from tqdm import tqdm
from concurrent.futures import ProcessPoolExecutor, as_completed

# ----------------------------
# SAP data type parsers
# ----------------------------
def parse_char(data):
    """Decode a CHAR field as UTF-8, dropping undecodable bytes, then strip whitespace."""
    text = data.decode('utf-8', errors='ignore')
    return text.strip()
def parse_numc(data):
    """Decode a NUMC field as UTF-8, dropping undecodable bytes, then strip whitespace."""
    text = data.decode('utf-8', errors='ignore')
    return text.strip()
def parse_date(data):
    """Decode a DATE field as UTF-8, dropping undecodable bytes, then strip whitespace."""
    text = data.decode('utf-8', errors='ignore')
    return text.strip()
def parse_int(data):
    """Return an integer field as decimal text.

    Exactly 4 bytes are read as a big-endian signed 32-bit integer; any other
    length is read as a big-endian unsigned integer.
    """
    if len(data) == 4:
        (value,) = struct.unpack('>i', data)
        return str(value)
    return str(int.from_bytes(data, 'big'))
def parse_packed(data):
    """Decode a packed-decimal (BCD) field into a digit string.

    Every byte holds two 4-bit nibbles. All nibbles except the last are
    decimal digits; the last nibble is the sign and is never part of the
    number. Leading zeros are preserved and no decimal point is inserted.

    NOTE(review): 0xD (and the alternate 0xB) are treated as "negative",
    following the usual packed-decimal convention - confirm against the
    actual SAP extract format.

    Returns the digits as a string ('-' prefixed when negative), or '' for
    empty input. Raises ValueError when a digit position holds a nibble > 9.
    """
    if not data:
        return ''

    nibbles = []
    for byte in data:
        nibbles.append((byte >> 4) & 0x0F)
        nibbles.append(byte & 0x0F)

    # The trailing nibble is the sign, not a digit. Stringifying it (as the
    # previous version did) turned e.g. 0xC into "12" and left a stray "1".
    sign_nibble = nibbles.pop()
    if any(nibble > 9 for nibble in nibbles):
        raise ValueError(f"invalid packed-decimal digits: {bytes(data).hex()}")

    digits = ''.join(str(nibble) for nibble in nibbles)
    return '-' + digits if sign_nibble in (0x0B, 0x0D) else digits

# Dispatch table: DATATYPE name -> function that decodes that field's raw bytes.
SAP_TYPE_PARSERS = dict(
    CHAR=parse_char,
    NUMC=parse_numc,
    DATE=parse_date,
    INT4=parse_int,
    DEC=parse_packed,
)

# ----------------------------
# Example field definition (replace with your structure)
# ----------------------------
# Record layout in field order; each entry is (FIELDNAME, DATATYPE, LENGTH),
# where LENGTH is the number of bytes sliced out of a record for that field.
FIELDS = [
    {'FIELDNAME': name, 'DATATYPE': sap_type, 'LENGTH': width}
    for name, sap_type, width in (
        ('EBELN', 'CHAR', 10),
        ('BUKRS', 'CHAR', 4),
        ('LIFNR', 'NUMC', 10),
        ('AEDAT', 'DATE', 8),
    )
]

# ----------------------------
# File reading with magic byte detection
# ----------------------------
def read_sap_file(file_path):
    """Read a file into memory, gunzipping it when it starts with the gzip magic.

    A short report (file name, first 8 raw bytes, compression, payload size)
    is printed to stdout.

    Returns:
        (data, magic, compression) where magic is the first 8 bytes of the
        file on disk and compression is 'gzip' or 'none'. On any failure the
        error and traceback are printed and (None, None, None) is returned.
    """
    try:
        with open(file_path, 'rb') as handle:
            magic = handle.read(8)  # read first 8 bytes
            handle.seek(0)
            is_gzip = magic[:2] == b'\x1f\x8b'
            if is_gzip:
                with gzip.open(handle, 'rb') as gz:
                    data = gz.read()
            else:
                data = handle.read()
        compression = 'gzip' if is_gzip else 'none'

        report = (
            f"[INFO] File: {os.path.basename(file_path)}",
            f"       Magic bytes: {magic}",
            f"       Compression: {compression}",
            f"       File size: {len(data)} bytes",
        )
        for line in report:
            print(line)

        return data, magic, compression
    except Exception as e:
        print(f"[ERROR] Failed to read '{file_path}': {e}", file=sys.stderr)
        traceback.print_exc()
        return None, None, None

# ----------------------------
# SAP file parser
# ----------------------------
def parse_sap_file(file_path, fields):
    """Cut a file into fixed-width records and decode every field.

    The record width is the sum of the LENGTH entries in fields. Each field
    slice goes through the parser registered for its DATATYPE in
    SAP_TYPE_PARSERS (parse_char by default); a field whose parser raises is
    stored as hex. A record that fails for any other reason is skipped with
    a warning on stderr.

    Returns:
        A list of {FIELDNAME: value} dicts; [] when the file could not be
        read, is empty, or the record width is 0.
    """
    data, _magic, _compression = read_sap_file(file_path)
    if not data:
        return []

    record_length = sum(spec['LENGTH'] for spec in fields)
    if record_length == 0:
        print("[ERROR] Record length is 0, invalid field definitions.")
        return []

    num_records = len(data) // record_length
    records = []

    print(f"[INFO] Detected {num_records} records (record length {record_length})")

    progress = tqdm(range(num_records), desc=f"Parsing {os.path.basename(file_path)}", file=sys.stderr)
    for i in progress:
        try:
            start = i * record_length
            record_bytes = data[start:start + record_length]
            record_dict = {}
            pos = 0
            for spec in fields:
                width = spec['LENGTH']
                field_bytes = record_bytes[pos:pos + width]
                parser = SAP_TYPE_PARSERS.get(spec['DATATYPE'], parse_char)
                try:
                    value = parser(field_bytes)
                except Exception:
                    # Keep the raw bytes visible instead of dropping the field.
                    value = field_bytes.hex()
                record_dict[spec['FIELDNAME']] = value
                pos += width
            records.append(record_dict)
        except Exception as e:
            print(f"[WARN] Skipping record {i}: {e}", file=sys.stderr)

    return records

# ----------------------------
# CSV export
# ----------------------------
def export_to_csv(records, fields, output_file):
    """Write records to a UTF-8 CSV in wide format: one column per field, one row per record.

    Values missing from a record are written as empty strings. Failures are
    reported on stderr and not raised.
    """
    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            column_names = [spec['FIELDNAME'] for spec in fields]
            writer.writerow(column_names)
            writer.writerows(
                [record.get(name, '') for name in column_names]
                for record in records
            )
        print(f"[SUCCESS] Wrote {len(records)} records → {output_file}")
    except Exception as e:
        print(f"[ERROR] CSV export failed: {e}", file=sys.stderr)

# ----------------------------
# Parallel file processing
# ----------------------------
def process_file(file_path, output_dir):
    """Parse one file with the module-level FIELDS layout and export it as CSV.

    The CSV is written to output_dir as "<input file name>_parsed.csv".

    Returns:
        (file_path, number_of_records); the count is 0 when an exception
        escaped parsing or exporting (it is printed to stderr).
    """
    try:
        records = parse_sap_file(file_path, FIELDS)
        csv_name = os.path.basename(file_path) + "_parsed.csv"
        export_to_csv(records, FIELDS, os.path.join(output_dir, csv_name))
    except Exception as e:
        print(f"[ERROR] Failed processing {file_path}: {e}", file=sys.stderr)
        return (file_path, 0)
    return (file_path, len(records))

# ----------------------------
# Main
# ----------------------------
def main(input_path, output_dir="parsed_output", max_workers=4):
    """Parse one file, or every .gz/.sap/.bin file in a folder, in a process pool.

    Args:
        input_path: a file, or a directory scanned non-recursively.
        output_dir: directory for the CSVs; created if missing.
        max_workers: pool size handed to ProcessPoolExecutor.

    Prints a per-file record count summary at the end. Returns None; an
    invalid input_path is reported and nothing is processed.
    """
    os.makedirs(output_dir, exist_ok=True)

    if os.path.isdir(input_path):
        wanted_suffixes = ('.gz', '.sap', '.bin')
        file_list = []
        for entry in os.listdir(input_path):
            if entry.lower().endswith(wanted_suffixes):
                file_list.append(os.path.join(input_path, entry))
    elif os.path.isfile(input_path):
        file_list = [input_path]
    else:
        print(f"[ERROR] Invalid input path: {input_path}")
        return

    print(f"[INFO] Processing {len(file_list)} file(s) with {max_workers} workers")

    results = []
    with ProcessPoolExecutor(max_workers=max_workers) as executor:
        # Map each future back to its file so failures can be attributed.
        pending = {}
        for path in file_list:
            pending[executor.submit(process_file, path, output_dir)] = path
        for finished in as_completed(pending):
            try:
                results.append(finished.result())
            except Exception as e:
                print(f"[ERROR] {pending[finished]}: {e}", file=sys.stderr)

    print("\n[SUMMARY]")
    for path, count in results:
        print(f" - {os.path.basename(path)} → {count} records")


if __name__ == "__main__":
    # Example: change input path to your folder or file
    input_path = "/content/MM_MM_EKKO_20240624_113031_0 1.gz"
    main(input_path)

[INFO] Processing 1 file(s) with 4 workers
[INFO] File: MM_MM_EKKO_20240624_113031_0 1.gz
       Magic bytes: b'\x1f\x8b\x08\x00\xba)\xfdh'
       Compression: gzip
       File size: 72798 bytes
[INFO] Detected 2274 records (record length 32)


Parsing MM_MM_EKKO_20240624_113031_0 1.gz: 100%|██████████| 2274/2274 [00:00<00:00, 105528.17it/s]


[SUCCESS] Wrote 2274 records → parsed_output/MM_MM_EKKO_20240624_113031_0 1.gz_parsed.csv

[SUMMARY]
 - MM_MM_EKKO_20240624_113031_0 1.gz → 2274 records


In [None]:
import os
import gzip
import csv
import struct
import sys
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import traceback

# Optional compressors
try:
    import lz4.frame
except ImportError:
    lz4 = None
try:
    import zstandard as zstd
except ImportError:
    zstd = None

# ----------------------------
# SAP data type parsers
# ----------------------------
def parse_char(data):
    """Decode a CHAR field as UTF-8 (undecodable bytes become U+FFFD) and strip whitespace."""
    decoded = data.decode('utf-8', errors='replace')
    return decoded.strip()

def parse_numc(data):
    """Decode a NUMC field as UTF-8 (undecodable bytes become U+FFFD) and strip whitespace."""
    decoded = data.decode('utf-8', errors='replace')
    return decoded.strip()

def parse_date(data):
    """Decode a DATE field as UTF-8 (undecodable bytes become U+FFFD) and strip whitespace."""
    decoded = data.decode('utf-8', errors='replace')
    return decoded.strip()

def parse_int(data):
    """Return 4 bytes as a big-endian signed 32-bit integer in decimal text.

    Input of any other length is returned as its hex representation.
    """
    if len(data) == 4:
        (number,) = struct.unpack('>i', data)
        return str(number)
    return data.hex()

def parse_packed(data):
    """Decode a packed-decimal (BCD) field into a digit string.

    Every byte holds two 4-bit nibbles. All nibbles except the last are
    decimal digits; the last nibble is the sign and is never part of the
    number. Leading zeros are preserved and no decimal point is inserted.

    NOTE(review): 0xD (and the alternate 0xB) are treated as "negative",
    following the usual packed-decimal convention - confirm against the
    actual SAP extract format.

    Returns the digits as a string ('-' prefixed when negative), or '' for
    empty input. Raises ValueError when a digit position holds a nibble > 9.
    """
    if not data:
        return ''

    nibbles = []
    for byte in data:
        nibbles.append((byte >> 4) & 0x0F)
        nibbles.append(byte & 0x0F)

    # The trailing nibble is the sign, not a digit. Stringifying it (as the
    # previous version did) turned e.g. 0xC into "12" and left a stray "1".
    sign_nibble = nibbles.pop()
    if any(nibble > 9 for nibble in nibbles):
        raise ValueError(f"invalid packed-decimal digits: {bytes(data).hex()}")

    digits = ''.join(str(nibble) for nibble in nibbles)
    return '-' + digits if sign_nibble in (0x0B, 0x0D) else digits

# Dispatch table: DATATYPE name -> function that decodes that field's raw bytes.
SAP_TYPE_PARSERS = dict(
    CHAR=parse_char,
    NUMC=parse_numc,
    DATE=parse_date,
    INT4=parse_int,
    DEC=parse_packed,
)

# ----------------------------
# Example EKKO fields (adjust per table)
# ----------------------------
# Record layout in field order; each entry is (FIELDNAME, DATATYPE, LENGTH),
# where LENGTH is the number of bytes sliced out of a record for that field.
FIELDS = [
    {'FIELDNAME': name, 'DATATYPE': sap_type, 'LENGTH': width}
    for name, sap_type, width in (
        ('EBELN', 'CHAR', 10),
        ('BUKRS', 'CHAR', 4),
        ('LIFNR', 'NUMC', 10),
        ('AEDAT', 'DATE', 8),
    )
]

# ----------------------------
# Read SAP file safely with compression detection
# ----------------------------
def read_sap_file(file_path):
    """Read a file into memory, decompressing it based on its leading magic bytes.

    gzip is always supported. lz4 and zstd are only attempted when the
    optional module imported at the top of the file is available; otherwise
    (or for any other content) the raw bytes are returned unchanged.

    Args:
        file_path: path of the file to read.

    Returns:
        The (decompressed) content as bytes.
    """
    with open(file_path, 'rb') as f:
        magic = f.read(4)
        f.seek(0)  # rewind so each branch reads the file from the start
        data = None

        # gzip
        if magic[:2] == b'\x1f\x8b':
            print(f"[INFO] {file_path}: detected gzip compression", file=sys.stderr)
            with gzip.open(f, 'rb') as gz:
                data = gz.read()
        # lz4
        elif lz4 and magic[:4] == b'\x04\x22\x4D\x18':
            print(f"[INFO] {file_path}: detected lz4 compression", file=sys.stderr)
            data = lz4.frame.decompress(f.read())
        # zstd
        elif zstd and magic[:4] == b'\x28\xb5\x2f\xfd':
            print(f"[INFO] {file_path}: detected zstd compression", file=sys.stderr)
            # ZstdDecompressor.decompress() only works when the frame header
            # records the content size; frames written by streaming
            # compressors omit it. The decompressobj() interface handles both.
            dctx = zstd.ZstdDecompressor()
            data = dctx.decompressobj().decompress(f.read())
        # uncompressed
        else:
            print(f"[INFO] {file_path}: no known compression detected", file=sys.stderr)
            data = f.read()

    # Logged after decompression: these are the first bytes of the payload.
    print(f"[INFO] {file_path}: magic bytes {data[:8].hex()}", file=sys.stderr)
    return data

# ----------------------------
# Parse SAP file
# ----------------------------
def parse_sap_file(file_path, fields):
    """Cut a file into fixed-width records and decode every field.

    The record width is the sum of the LENGTH entries in fields; bytes left
    over after the last complete record are not parsed. A field whose parser
    raises is stored as the hex of its raw bytes.

    Returns:
        A list of {FIELDNAME: value} dicts, or [] when the record width is 0.
    """
    raw_data = read_sap_file(file_path)
    record_length = sum(spec['LENGTH'] for spec in fields)
    if record_length == 0:
        print(f"[WARN] {file_path}: record length 0", file=sys.stderr)
        return []

    num_records = len(raw_data) // record_length
    print(f"[INFO] {file_path}: {num_records} records detected", file=sys.stderr)

    def decode_record(record_bytes):
        # Walk the layout left to right, slicing one field at a time.
        decoded = {}
        cursor = 0
        for spec in fields:
            chunk = record_bytes[cursor:cursor + spec['LENGTH']]
            parser = SAP_TYPE_PARSERS.get(spec['DATATYPE'], parse_char)
            try:
                value = parser(chunk)
            except Exception:
                value = chunk.hex()
            decoded[spec['FIELDNAME']] = value
            cursor += spec['LENGTH']
        return decoded

    records = []
    for i in tqdm(range(num_records), desc=f"Parsing {os.path.basename(file_path)}", file=sys.stderr):
        start = i * record_length
        records.append(decode_record(raw_data[start:start + record_length]))

    return records

# ----------------------------
# Export to CSV
# ----------------------------
def export_to_csv(records, fields, output_file):
    """Write records to a UTF-8 CSV in long format: one row per field value.

    Columns are FIELDNAME, DATATYPE, LENGTH, VALUE. LENGTH is measured from
    the decoded value (UTF-8 byte count for strings, character count of
    str(value) otherwise), not taken from the field definition. Failures are
    reported on stderr with a traceback and not raised.
    """
    def measured_length(value):
        # calculate actual byte length
        if isinstance(value, str):
            return len(value.encode('utf-8', errors='replace'))
        return len(str(value))

    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['FIELDNAME', 'DATATYPE', 'LENGTH', 'VALUE'])

            for record in records:
                for spec in fields:
                    value = record.get(spec['FIELDNAME'], "")
                    width = measured_length(value)
                    writer.writerow([spec['FIELDNAME'], spec['DATATYPE'], width, value])
        print(f"[SUCCESS] CSV written: {output_file}")
    except Exception as e:
        print(f"[ERROR] Failed to write CSV: {e}", file=sys.stderr)
        traceback.print_exc()

# ----------------------------
# Process multiple files in parallel
# ----------------------------
def process_file(file_path, fields, output_dir):
    """Parse one file and export it to output_dir as "parsed_<file name>.csv".

    Nothing is written when no records were parsed. Any exception is
    reported on stderr with a traceback instead of being raised.
    """
    try:
        records = parse_sap_file(file_path, fields)
        if records:
            target = os.path.join(output_dir, f"parsed_{os.path.basename(file_path)}.csv")
            export_to_csv(records, fields, target)
        else:
            print(f"[WARN] {file_path}: no records parsed")
    except Exception as e:
        print(f"[ERROR] {file_path}: {e}", file=sys.stderr)
        traceback.print_exc()

def main():
    """Parse every input file in the hard-coded folder using a small thread pool.

    Files ending in ".csv" are skipped: they are outputs of earlier runs, and
    feeding them back in produced bogus "records" and files such as
    parsed_parsed_ekko_sap_template.csv.csv. A missing input folder is
    reported instead of raising.
    """
    input_dir = "/content/"  # change to your folder
    output_dir = "./parsed_csv"

    if not os.path.isdir(input_dir):
        print(f"[ERROR] Input folder does not exist: {input_dir}", file=sys.stderr)
        return
    os.makedirs(output_dir, exist_ok=True)

    # list all candidate files (non-recursive), in a stable order
    files = []
    for name in sorted(os.listdir(input_dir)):
        path = os.path.join(input_dir, name)
        if not os.path.isfile(path):
            continue
        if name.lower().endswith(".csv"):
            print(f"[INFO] Skipping CSV file: {path}")
            continue
        files.append(path)

    print(f"[INFO] Found {len(files)} files in {input_dir}")

    # parallel processing
    with ThreadPoolExecutor(max_workers=4) as executor:
        futures = [executor.submit(process_file, f, FIELDS, output_dir) for f in files]
        for future in futures:
            future.result()  # wait for completion


if __name__ == "__main__":
    main()


[INFO] Found 3 files in /content/


[INFO] /content/parsed_ekko_sap_template.csv: no known compression detected
[INFO] /content/MM_MM_EKKO_20240624_113031_0 1.gz: detected gzip compression
[INFO] /content/parsed_ekko_sap_template.csv: magic bytes 4649454c444e414d
[INFO] /content/parsed_ekko_sap_template.csv: 8896 records detected
[INFO] /content/MM_MM_EKKO_20240624_113031_0 1: no known compression detected
[INFO] /content/MM_MM_EKKO_20240624_113031_0 1: magic bytes 100700650008ee01
[INFO] /content/MM_MM_EKKO_20240624_113031_0 1: 2274 records detected
[INFO] /content/MM_MM_EKKO_20240624_113031_0 1.gz: magic bytes 100700650008ee01
[INFO] /content/MM_MM_EKKO_20240624_113031_0 1.gz: 2274 records detected
Parsing parsed_ekko_sap_template.csv:   0%|          | 0/8896 [00:00<?, ?it/s]

Parsing MM_MM_EKKO_20240624_113031_0 1.gz:   0%|          | 0/2274 [00:00<?, ?it/s][A[A
Parsing MM_MM_EKKO_20240624_113031_0 1.gz: 100%|██████████| 2274/2274 [00:00<00:00, 31048.39it/s]
Parsing parsed_ekko_sap_template.csv: 100%|██████████| 889

[SUCCESS] CSV written: ./parsed_csv/parsed_MM_MM_EKKO_20240624_113031_0 1.gz.csv
[SUCCESS] CSV written: ./parsed_csv/parsed_MM_MM_EKKO_20240624_113031_0 1.csv
[SUCCESS] CSV written: ./parsed_csv/parsed_parsed_ekko_sap_template.csv.csv


In [None]:
import os
import gzip
import csv
import struct
import sys
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
import traceback

# Optional compressors
try:
    import lz4.frame
except ImportError:
    lz4 = None
try:
    import zstandard as zstd
except ImportError:
    zstd = None

# ----------------------------
# SAP data type parsers
# ----------------------------
def parse_char(data):
    """Decode a CHAR field as UTF-8 (undecodable bytes become U+FFFD) and strip whitespace."""
    decoded = data.decode('utf-8', errors='replace')
    return decoded.strip()

def parse_numc(data):
    """Decode a NUMC field as UTF-8 (undecodable bytes become U+FFFD) and strip whitespace."""
    decoded = data.decode('utf-8', errors='replace')
    return decoded.strip()

def parse_date(data):
    """Decode a DATE field as UTF-8 (undecodable bytes become U+FFFD) and strip whitespace."""
    decoded = data.decode('utf-8', errors='replace')
    return decoded.strip()

def parse_int(data):
    """Return 4 bytes as a big-endian signed 32-bit integer in decimal text.

    Input of any other length is returned as its hex representation.
    """
    if len(data) == 4:
        (number,) = struct.unpack('>i', data)
        return str(number)
    return data.hex()

def parse_packed(data):
    """Decode a packed-decimal (BCD) field into a digit string.

    Every byte holds two 4-bit nibbles. All nibbles except the last are
    decimal digits; the last nibble is the sign and is never part of the
    number. Leading zeros are preserved and no decimal point is inserted.

    NOTE(review): 0xD (and the alternate 0xB) are treated as "negative",
    following the usual packed-decimal convention - confirm against the
    actual SAP extract format.

    Returns the digits as a string ('-' prefixed when negative), or '' for
    empty input. Raises ValueError when a digit position holds a nibble > 9.
    """
    if not data:
        return ''

    nibbles = []
    for byte in data:
        nibbles.append((byte >> 4) & 0x0F)
        nibbles.append(byte & 0x0F)

    # The trailing nibble is the sign, not a digit. Stringifying it (as the
    # previous version did) turned e.g. 0xC into "12" and left a stray "1".
    sign_nibble = nibbles.pop()
    if any(nibble > 9 for nibble in nibbles):
        raise ValueError(f"invalid packed-decimal digits: {bytes(data).hex()}")

    digits = ''.join(str(nibble) for nibble in nibbles)
    return '-' + digits if sign_nibble in (0x0B, 0x0D) else digits

# Dispatch table: DATATYPE name -> function that decodes that field's raw bytes.
SAP_TYPE_PARSERS = dict(
    CHAR=parse_char,
    NUMC=parse_numc,
    DATE=parse_date,
    INT4=parse_int,
    DEC=parse_packed,
)

# ----------------------------
# Example EKKO fields (adjust per table)
# ----------------------------
# Record layout in field order; each entry is (FIELDNAME, DATATYPE, LENGTH),
# where LENGTH is the number of bytes sliced out of a record for that field.
FIELDS = [
    {'FIELDNAME': name, 'DATATYPE': sap_type, 'LENGTH': width}
    for name, sap_type, width in (
        ('EBELN', 'CHAR', 10),
        ('BUKRS', 'CHAR', 4),
        ('LIFNR', 'NUMC', 10),
        ('AEDAT', 'DATE', 8),
    )
]

# ----------------------------
# Read SAP file safely with compression detection
# ----------------------------
def read_sap_file(file_path):
    """Read a file into memory, decompressing it based on its leading magic bytes.

    gzip is always supported. lz4 and zstd are only attempted when the
    optional module imported at the top of the file is available; otherwise
    (or for any other content) the raw bytes are returned unchanged.

    Args:
        file_path: path of the file to read.

    Returns:
        The (decompressed) content as bytes.
    """
    with open(file_path, 'rb') as f:
        magic = f.read(4)
        f.seek(0)  # rewind so each branch reads the file from the start
        data = None

        if magic[:2] == b'\x1f\x8b':  # gzip
            print(f"[INFO] {file_path}: detected gzip compression", file=sys.stderr)
            with gzip.open(f, 'rb') as gz:
                data = gz.read()
        elif lz4 and magic[:4] == b'\x04\x22\x4D\x18':  # lz4
            print(f"[INFO] {file_path}: detected lz4 compression", file=sys.stderr)
            data = lz4.frame.decompress(f.read())
        elif zstd and magic[:4] == b'\x28\xb5\x2f\xfd':  # zstd
            print(f"[INFO] {file_path}: detected zstd compression", file=sys.stderr)
            # ZstdDecompressor.decompress() only works when the frame header
            # records the content size; frames written by streaming
            # compressors omit it. The decompressobj() interface handles both.
            dctx = zstd.ZstdDecompressor()
            data = dctx.decompressobj().decompress(f.read())
        else:
            print(f"[INFO] {file_path}: no known compression detected", file=sys.stderr)
            data = f.read()

    # Logged after decompression: these are the first bytes of the payload.
    print(f"[INFO] {file_path}: magic bytes {data[:8].hex()}", file=sys.stderr)
    return data

# ----------------------------
# Parse SAP file with validation
# ----------------------------
def parse_sap_file(file_path, fields, sample_size=None):
    """Cut a file into fixed-width records and decode every field.

    The record width is the sum of the LENGTH entries in fields. A warning
    is printed when the data does not divide evenly into records; the
    trailing partial record is not parsed. A field whose parser raises is
    stored as "[ERROR: <hex of raw bytes>]".

    Args:
        file_path: path handed to read_sap_file.
        fields: list of dicts with FIELDNAME, DATATYPE and LENGTH keys.
        sample_size: when truthy, parse at most this many records.

    Returns:
        A list of {FIELDNAME: value} dicts, or [] when the record width is 0.
    """
    raw_data = read_sap_file(file_path)
    record_length = sum(spec['LENGTH'] for spec in fields)
    if record_length == 0:
        print(f"[WARN] {file_path}: record length 0", file=sys.stderr)
        return []

    num_records, remainder = divmod(len(raw_data), record_length)
    if remainder != 0:
        print(f"[WARN] {file_path}: incomplete last record ignored", file=sys.stderr)

    if sample_size:
        num_records = min(num_records, sample_size)

    def decode_record(record_bytes):
        # Walk the layout left to right, slicing one field at a time.
        decoded = {}
        cursor = 0
        for spec in fields:
            chunk = record_bytes[cursor:cursor + spec['LENGTH']]
            parser = SAP_TYPE_PARSERS.get(spec['DATATYPE'], parse_char)
            try:
                value = parser(chunk)
            except Exception:
                value = f"[ERROR: {chunk.hex()}]"
            decoded[spec['FIELDNAME']] = value
            cursor += spec['LENGTH']
        return decoded

    records = []
    for i in tqdm(range(num_records), desc=f"Parsing {os.path.basename(file_path)}", file=sys.stderr):
        start = i * record_length
        record_bytes = raw_data[start:start + record_length]
        if len(record_bytes) < record_length:
            print(f"[WARN] {file_path}: skipping incomplete record at index {i}", file=sys.stderr)
            continue
        records.append(decode_record(record_bytes))

    return records

# ----------------------------
# Export to CSV
# ----------------------------
def export_to_csv(records, fields, output_file):
    """Write records to a UTF-8 CSV in long format: one row per field value.

    Columns are FIELDNAME, DATATYPE, LENGTH, VALUE. LENGTH is measured from
    the decoded value (UTF-8 byte count for strings, character count of
    str(value) otherwise), not taken from the field definition. Failures are
    reported on stderr with a traceback and not raised.
    """
    def measured_length(value):
        if isinstance(value, str):
            return len(value.encode('utf-8', errors='replace'))
        return len(str(value))

    try:
        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerow(['FIELDNAME', 'DATATYPE', 'LENGTH', 'VALUE'])

            for record in records:
                for spec in fields:
                    value = record.get(spec['FIELDNAME'], "")
                    width = measured_length(value)
                    writer.writerow([spec['FIELDNAME'], spec['DATATYPE'], width, value])
        print(f"[SUCCESS] CSV written: {output_file}")
    except Exception as e:
        print(f"[ERROR] Failed to write CSV: {e}", file=sys.stderr)
        traceback.print_exc()

# ----------------------------
# Process single file
# ----------------------------
def process_file(file_path, fields, output_dir, sample_size=None):
    """Parse one file and export it to output_dir as "parsed_<file name>.csv".

    sample_size is forwarded to parse_sap_file. Nothing is written when no
    records were parsed. Any exception is reported on stderr with a
    traceback instead of being raised.
    """
    try:
        records = parse_sap_file(file_path, fields, sample_size)
        if records:
            target = os.path.join(output_dir, f"parsed_{os.path.basename(file_path)}.csv")
            export_to_csv(records, fields, target)
        else:
            print(f"[WARN] {file_path}: no records parsed")
    except Exception as e:
        print(f"[ERROR] {file_path}: {e}", file=sys.stderr)
        traceback.print_exc()

# ----------------------------
# Main: handle single file or folder
# ----------------------------
def main():
    """Parse the hard-coded input, which may be a single file or a folder.

    A single file is processed directly; for a folder, every regular file in
    it (non-recursive) is processed on a pool of 4 threads. CSVs are written
    to ./parsed_csv, which is created if missing.
    """
    input_path = "/content/MM_MM_EKKO_20240624_113031_0 1.gz"  # change to file or folder
    output_dir = "./parsed_csv"
    os.makedirs(output_dir, exist_ok=True)

    sample_size = None  # e.g., 100 to only process first 100 records

    if os.path.isfile(input_path):
        print(f"[INFO] Processing single file: {input_path}")
        process_file(input_path, FIELDS, output_dir, sample_size)
        return

    if not os.path.isdir(input_path):
        print(f"[ERROR] Path does not exist: {input_path}")
        return

    files = []
    for entry in os.listdir(input_path):
        candidate = os.path.join(input_path, entry)
        if os.path.isfile(candidate):
            files.append(candidate)
    print(f"[INFO] Found {len(files)} files in folder: {input_path}")

    with ThreadPoolExecutor(max_workers=4) as executor:
        pending = [executor.submit(process_file, path, FIELDS, output_dir, sample_size) for path in files]
        for task in pending:
            task.result()  # wait for completion


if __name__ == "__main__":
    main()


[INFO] /content/MM_MM_EKKO_20240624_113031_0 1.gz: detected gzip compression
[INFO] /content/MM_MM_EKKO_20240624_113031_0 1.gz: magic bytes 100700650008ee01
[WARN] /content/MM_MM_EKKO_20240624_113031_0 1.gz: incomplete last record ignored


[INFO] Processing single file: /content/MM_MM_EKKO_20240624_113031_0 1.gz


Parsing MM_MM_EKKO_20240624_113031_0 1.gz: 100%|██████████| 2274/2274 [00:00<00:00, 71541.55it/s]

[SUCCESS] CSV written: ./parsed_csv/parsed_MM_MM_EKKO_20240624_113031_0 1.gz.csv





At the start of the challenge, we had trouble understanding SAP files and getting the first function to run, which made it hard to make any progress on the project. We decided to backtrack and realized that we didn't fully understand the problem and needed to do much more research. This led us to finally realize that we had been trying to decrypt the files when we only needed to gunzip them. This was our first technical challenge.