In [2]:
import os
import logging
import numpy as np
import pandas as pd
from pathlib import Path
import sys
from datetime import datetime
import re
# ---------------- CONFIGURATION ----------------
# Folder that holds the original ".001" reports; converted ".txt" files are
# written next to them. A raw string already keeps backslashes literal, so the
# separators are written once (doubling them inside r"..." puts two
# backslashes into the actual path value).
SOURCE_FOLDER = r"E:\Automation\Card_recon\original reports"   # change this
# Log file name; relative, so it resolves against the current working directory.
LOG_FILE = "conversion.log"
# Minimum share of printable bytes a file needs to be treated as text.
MIN_TEXT_RATIO = 0.75           # heuristic threshold
# -----------------------------------------------


# ---------------- LOGGING SETUP ----------------
# Route INFO-and-above records from the root logger to LOG_FILE, formatted as
# "timestamp | level | message". basicConfig() does nothing when the root
# logger already has handlers, so re-running this cell in the same kernel
# does not attach a second file handler.
logging.basicConfig(
    format="%(asctime)s | %(levelname)s | %(message)s",
    level=logging.INFO,
    filename=LOG_FILE,
)
# Marker line so each run is easy to find in the log file.
logging.info("===== Conversion started =====")
# -----------------------------------------------


def is_likely_text(data: bytes) -> bool:
    """
    Guess whether ``data`` is text from its share of printable bytes.

    A byte counts as printable when it is printable ASCII (32-126) or one of
    tab (9), line feed (10) or carriage return (13).

    Args:
        data: Raw bytes to classify.

    Returns:
        True when the printable share is at least MIN_TEXT_RATIO.
        False otherwise, and always False for empty input.
    """
    # Empty input has no ratio to compute; treat it as "not text".
    if not data:
        return False

    printable_count = 0
    for byte in data:
        if byte in (9, 10, 13) or 32 <= byte <= 126:
            printable_count += 1

    return printable_count / len(data) >= MIN_TEXT_RATIO


def convert_file(input_path: str):
    """
    Write a UTF-8 ``.txt`` copy of ``input_path`` next to the original.

    The output path is the input path with its extension replaced by ".txt".
    Empty files, and files that fail the ``is_likely_text`` heuristic, are
    skipped with a warning in the log and no output is written.

    Args:
        input_path: Path of the file to convert.

    Returns:
        None.

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    base, _ = os.path.splitext(input_path)
    output_path = base + ".txt"

    with open(input_path, "rb") as f:
        raw = f.read()

    # Integrity check
    if len(raw) == 0:
        logging.warning(f"Empty file skipped: {input_path}")
        return

    # Reconstruction detection
    if not is_likely_text(raw):
        logging.warning(
            f"Binary or archive split detected. "
            f"Reconstruction required: {input_path}"
        )
        return

    # Decode strictly first so that data loss is never silent: when the file
    # holds bytes that are not valid UTF-8, record it in the log and only then
    # fall back to dropping them (the written text is the same as before).
    try:
        text = raw.decode("utf-8")
    except UnicodeDecodeError as exc:
        logging.warning(
            f"Invalid UTF-8 bytes dropped while decoding {input_path}: {exc}"
        )
        text = raw.decode("utf-8", errors="ignore")

    # newline="" disables newline translation on write. The text was decoded
    # from raw bytes, so it still carries its original line endings; without
    # this, Windows would rewrite every "\n" as "\r\n" and turn an existing
    # "\r\n" into "\r\r\n", which reads back as doubled lines.
    with open(output_path, "w", encoding="utf-8", newline="") as out:
        out.write(text)

    logging.info(f"Converted: {input_path} -> {output_path}")
    print(f"✔ Converted: {os.path.basename(input_path)}")


def convert_folder(folder_path: str):
    """
    Convert every ``.001`` file directly inside ``folder_path``.

    Matching is case-insensitive on the ".001" suffix and does not descend
    into sub-folders. Progress is printed for each file, and each file is
    handed to ``convert_file``.

    Args:
        folder_path: Directory to scan.

    Returns:
        None.
    """
    # os.listdir() returns entries in arbitrary order and includes
    # directories. Sort for a reproducible processing order, and keep regular
    # files only so a folder that happens to be named "*.001" is never passed
    # to open().
    files = sorted(
        name for name in os.listdir(folder_path)
        if name.lower().endswith(".001")
        and os.path.isfile(os.path.join(folder_path, name))
    )

    if not files:
        print("No .001 files found.")
        return

    total = len(files)
    print(f"Found {total} .001 files\n")

    for index, filename in enumerate(files, start=1):
        full_path = os.path.join(folder_path, filename)
        print(f"[{index}/{total}] Processing {filename}...")
        convert_file(full_path)

    print("\nConversion completed.")


# -------------------- RUN ----------------------
# The guard keeps the batch from running when this code is imported as a
# module rather than executed directly.
if __name__ == "__main__":
    # Fail fast and name the offending path, so a wrong SOURCE_FOLDER can be
    # diagnosed from the error message alone.
    if not os.path.isdir(SOURCE_FOLDER):
        raise FileNotFoundError(
            f"Source folder does not exist: {SOURCE_FOLDER}"
        )

    convert_folder(SOURCE_FOLDER)
    logging.info("===== Conversion finished =====")


Found 5 .001 files

[1/5] Processing TT140T0.2025-02-20-17-52-00.001...
✔ Converted: TT140T0.2025-02-20-17-52-00.001
[2/5] Processing TT140T0.2025-02-20-17-52-08.001...
✔ Converted: TT140T0.2025-02-20-17-52-08.001
[3/5] Processing TT140T0.2025-02-20-17-52-11.001...
✔ Converted: TT140T0.2025-02-20-17-52-11.001
[4/5] Processing TT140T0.2025-02-20-17-58-43.001...
✔ Converted: TT140T0.2025-02-20-17-58-43.001
[5/5] Processing TT461T0.2025-02-20-20-07-20.001...
✔ Converted: TT461T0.2025-02-20-20-07-20.001

Conversion completed.


In [None]:
# ------------------- load all files converted to txt in the folder that starts with tt140 -----------------------
# os.listdir() returns names in arbitrary order; sorting makes the insertion
# order of loaded_data (and of everything derived from it) reproducible
# between runs. Both name checks are case-insensitive.
converted_files = sorted(
    f for f in os.listdir(SOURCE_FOLDER)
    if f.lower().endswith(".txt") and f.lower().startswith("tt140")
)

# ---------------------- load all converted files ----------------------------
# Maps file name -> full text content of that file.
loaded_data = {}
for file in converted_files:
    file_path = os.path.join(SOURCE_FOLDER, file)
    with open(file_path, "r", encoding="utf-8") as f:
        content = f.read()
    # Store after the handle is closed; the content is already fully read.
    loaded_data[file] = content
    print(f"Loaded: {file}")

    

Loaded: TT140T0.2025-02-20-17-52-00.txt
Loaded: TT140T0.2025-02-20-17-52-08.txt
Loaded: TT140T0.2025-02-20-17-52-11.txt
Loaded: TT140T0.2025-02-20-17-58-43.txt


In [6]:
# ------------------- break the files into lines in each file -----------------------
# Maps file name -> list of lines (line terminators removed).
# Note: str.splitlines() splits on every line boundary Python recognises, not
# only "\n" / "\r\n" (for example form feed "\x0c" as well), so a report that
# contains such characters yields extra entries.
broken_data = {}
for file in loaded_data:
    content = loaded_data[file]
    lines = content.splitlines()
    broken_data[file] = lines
    line_count = len(lines)
    print(f"Broken into lines: {file} ({line_count} lines)")
    

Broken into lines: TT140T0.2025-02-20-17-52-00.txt (1273 lines)
Broken into lines: TT140T0.2025-02-20-17-52-08.txt (2781 lines)
Broken into lines: TT140T0.2025-02-20-17-52-11.txt (1331 lines)
Broken into lines: TT140T0.2025-02-20-17-58-43.txt (929 lines)
