<a href="https://colab.research.google.com/github/Chainchigit/Data-Engineer/blob/main/%20text%20sanitizer_version2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!python text_sanitizer.py input.txt --target output.json

python3: can't open file '/content/text_sanitizer.py': [Errno 2] No such file or directory


In [4]:
import argparse
import string
import json

class TextSanitizer:
    """Read a text file, normalise it, and report per-letter frequencies.

    The stages are ``read_input`` -> ``sanitize`` -> ``generate_statistics``
    -> ``output_result``; ``process`` runs all four in that order.

    Attributes:
        source: Path of the text file to read. A falsy value makes
            ``read_input`` print an error and exit.
        target: Optional path of a JSON file to write. When falsy, the
            result is printed to stdout instead.
        text: Raw contents of ``source`` (empty until ``read_input`` runs).
        sanitized_text: ``text`` lower-cased, with each tab replaced by
            four underscores.
        statistics: Maps every ASCII lowercase letter to the number of
            times it occurs in ``sanitized_text``.
    """

    def __init__(self, source: str, target: str = None):
        self.source = source
        self.target = target
        self.text = ""
        self.sanitized_text = ""
        self.statistics = {}

    def read_input(self):
        """Load the contents of ``self.source`` into ``self.text``.

        Raises:
            SystemExit: With exit code 1 when no source path was given or
                the file does not exist; an error message is printed first.
        """
        if not self.source:
            print("Error: No source file provided.")
            # Raise SystemExit directly rather than calling exit(): that
            # helper is injected by the ``site`` module for interactive
            # sessions and is not guaranteed to be present in every runtime.
            raise SystemExit(1)
        try:
            with open(self.source, 'r', encoding='utf-8') as file:
                self.text = file.read()
        except FileNotFoundError:
            print(f"Error: File '{self.source}' not found.")
            raise SystemExit(1) from None

    def sanitize(self):
        """Sanitize the input text (convert to lowercase, replace tabs)."""
        self.sanitized_text = self.text.lower().replace('\t', '____')

    def generate_statistics(self):
        """Count how often each ASCII lowercase letter occurs.

        Every letter a-z gets an entry, including those with a count of 0.
        """
        self.statistics = {char: self.sanitized_text.count(char) for char in string.ascii_lowercase}

    def output_result(self):
        """Write the result as JSON to ``self.target``, or print it to stdout."""
        output_data = {
            "sanitized_text": self.sanitized_text,
            "statistics": self.statistics
        }

        if self.target:
            with open(self.target, 'w', encoding='utf-8') as file:
                # The file is UTF-8, so keep non-ASCII characters literal
                # instead of letting json escape them as \uXXXX sequences.
                json.dump(output_data, file, indent=4, ensure_ascii=False)
            print(f"Output written to '{self.target}'")
        else:
            print("\nSanitized Text:")
            print(self.sanitized_text)
            print("\nCharacter Frequency:")
            for char, count in self.statistics.items():
                print(f"{char}: {count}")

    def process(self):
        """Run the full pipeline: read, sanitize, count, then output."""
        self.read_input()
        self.sanitize()
        self.generate_statistics()
        self.output_result()

if __name__ == "__main__":
    # Command-line entry point: sanitize the file named by the positional
    # ``source`` argument and either print the result or, when ``--target``
    # is given, write it to that path as JSON.
    import sys

    parser = argparse.ArgumentParser(description="Text Sanitizer Application")
    parser.add_argument("source", nargs="?", help="Path to the source text file", default=None)
    parser.add_argument("--target", help="Path to the target output file (optional)", default=None)

    # parse_known_args() tolerates options this script does not define
    # instead of aborting. Surface whatever was skipped: a mistyped flag
    # would otherwise vanish silently, and a stray value following an
    # unknown option can still end up bound to ``source``.
    args, ignored = parser.parse_known_args()
    if ignored:
        print(
            f"Warning: ignoring unrecognized arguments: {' '.join(ignored)}",
            file=sys.stderr,
        )
    sanitizer = TextSanitizer(args.source, args.target)
    sanitizer.process()


Sanitized Text:
{
  "shell_port": 1,
  "iopub_port": 2,
  "stdin_port": 3,
  "control_port": 4,
  "hb_port": 5,
  "ip": "/root/.local/share/jupyter/runtime/kernel-f31399b8-0b70-41c4-820f-3870ae21426b-ipc",
  "key": "",
  "transport": "ipc",
  "signature_scheme": "hmac-sha256",
  "kernel_name": ""
}

Character Frequency:
a: 8
b: 5
c: 7
d: 1
e: 14
f: 2
g: 1
h: 6
i: 7
j: 1
k: 3
l: 7
m: 4
n: 8
o: 12
p: 11
q: 0
r: 15
s: 7
t: 13
u: 4
v: 0
w: 0
x: 0
y: 2
z: 0
