In [2]:
import sys
sys.path.append('/Projects/regionintelligenceai/dev_llm/modules/q_and_a_generator/')

In [15]:
import json
import re
import os
from pathlib import Path
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class Los_Angeles_to_JSON:
    """Convert the newest raw Los Angeles building-code text file into a flat JSON list.

    Every parsed section is a dict with the keys ``section_id``, ``title`` and
    ``content``; all sections are written out as a single JSON array.
    """

    def __init__(self, input_subdir, output_filename):
        """Resolve the input directory and the output file path.

        Args:
            input_subdir: Directory name under ``RAW_DATA_DIR`` that holds the
                ``.txt`` sources.
            output_filename: File name created under
                ``JSON_DATA_DIR / 'los_angeles_building_codes_json'``.
        """
        self.input_dir = RAW_DATA_DIR / input_subdir
        self.output_filename = JSON_DATA_DIR / 'los_angeles_building_codes_json' / output_filename
        # Sections accumulated across every process_file() call on this instance.
        self.all_sections = []

    def extract_section(self, line, current_section, sections):
        """Fold one stripped line into the section currently being built.

        A line that starts with a digit opens a new section; the previous
        section, if any, is appended to ``sections`` first. Inside a section, a
        line that starts with a bracketed tag or is entirely upper case replaces
        the title; any other non-empty line is appended to the content. Lines
        seen before the first section header are ignored.

        Args:
            line: The text line, already stripped of surrounding whitespace.
            current_section: The section dict being filled, or ``None``.
            sections: List that receives each finished section (mutated in place).

        Returns:
            The section dict that following lines belong to, or ``None`` when no
            section has been opened yet.
        """
        # NOTE: the pattern accepts any line with a leading digit, not only
        # dotted ids.
        if re.match(r"\d+(\.\d+)*", line):
            if current_section is not None:
                sections.append(current_section)
            # Split on any whitespace run so tab- or multi-space-separated
            # headers still yield a clean id and title.
            parts = line.split(None, 1)
            return {
                "section_id": parts[0],
                "title": parts[1] if len(parts) > 1 else "",
                "content": "",
            }
        if current_section is None:
            return None
        if re.match(r"\[.*?\]", line) or line.isupper():
            current_section["title"] = line
        elif line:
            current_section["content"] += line + "\n"
        return current_section

    def process_file(self, file_name):
        """Parse one file from ``input_dir`` and add its sections to ``all_sections``.

        Read errors are printed rather than raised; sections completed before
        the error are still kept.

        Args:
            file_name: Name of the file, relative to ``input_dir``.
        """
        sections = []
        current_section = None
        try:
            with open(self.input_dir / file_name, "r", encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    current_section = self.extract_section(line, current_section, sections)
                # Flush the section that was still open at end of file.
                if current_section is not None:
                    sections.append(current_section)
        except FileNotFoundError:
            print(f"File {file_name} not found.")
        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")
        self.all_sections.extend(sections)

    def output_json(self):
        """Write ``all_sections`` to ``output_filename`` as indented JSON.

        The parent directory is created when it does not exist yet. Failures are
        printed rather than raised.
        """
        try:
            target = Path(self.output_filename)
            # A fresh checkout has no output folder; create it instead of failing.
            target.parent.mkdir(parents=True, exist_ok=True)
            with open(target, "w", encoding="utf-8") as f:
                json.dump(self.all_sections, f, indent=4)
        except Exception as e:
            print(f"Error writing to {self.output_filename}: {str(e)}")

    def get_latest_file(self):
        """Return the ``.txt`` file in ``input_dir`` with the greatest ``os.path.getctime``.

        ``getctime`` is the creation time on Windows and the last metadata
        change on Unix.

        Returns:
            The selected path, or ``None`` when the directory has no ``.txt`` files.
        """
        files = list(self.input_dir.glob("*.txt"))
        if not files:
            return None
        return max(files, key=os.path.getctime)

    def process_files(self):
        """Parse the newest ``.txt`` file and write the JSON output.

        Prints a notice and does nothing else when no ``.txt`` file is found.
        """
        latest_file = self.get_latest_file()
        if latest_file:
            self.process_file(latest_file.name)
            self.output_json()
        else:
            print(f"No text files found in {self.input_dir}.")

# Entry point: convert the newest Los Angeles raw text file to JSON.
if __name__ == "__main__":
    # Raw-data sub-directory to scan, and the JSON file name to produce.
    input_subdir, output_filename = 'los_angeles_building_codes', "los_angeles_building_codes.json"

    extractor = Los_Angeles_to_JSON(input_subdir, output_filename)
    extractor.process_files()


In [9]:
import json
import re
import os
from pathlib import Path
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class California_to_JSON:
    """Convert the newest raw California building-code text file into a flat JSON list.

    Every parsed section is a dict with the keys ``section_id``, ``title`` and
    ``content``; all sections are written out as a single JSON array.
    """

    def __init__(self, input_subdir, output_filename):
        """Resolve the input directory and the output file path.

        Args:
            input_subdir: Directory name under ``RAW_DATA_DIR`` that holds the
                ``.txt`` sources.
            output_filename: File name created under
                ``JSON_DATA_DIR / 'california_building_codes_json'``.
        """
        self.input_dir = RAW_DATA_DIR / input_subdir
        self.output_filename = JSON_DATA_DIR / 'california_building_codes_json' / output_filename
        # Sections accumulated across every process_file() call on this instance.
        self.all_sections = []

    def extract_section(self, line, current_section, sections):
        """Fold one stripped line into the section currently being built.

        A line that starts with a digit opens a new section; the previous
        section, if any, is appended to ``sections`` first. Inside a section, a
        line that starts with a bracketed tag or is entirely upper case replaces
        the title; any other non-empty line is appended to the content. Lines
        seen before the first section header are ignored.

        Args:
            line: The text line, already stripped of surrounding whitespace.
            current_section: The section dict being filled, or ``None``.
            sections: List that receives each finished section (mutated in place).

        Returns:
            The section dict that following lines belong to, or ``None`` when no
            section has been opened yet.
        """
        # NOTE: the pattern accepts any line with a leading digit, not only
        # dotted ids.
        if re.match(r"\d+(\.\d+)*", line):
            if current_section is not None:
                sections.append(current_section)
            # Split on any whitespace run so tab- or multi-space-separated
            # headers still yield a clean id and title.
            parts = line.split(None, 1)
            return {
                "section_id": parts[0],
                "title": parts[1] if len(parts) > 1 else "",
                "content": "",
            }
        if current_section is None:
            return None
        if re.match(r"\[.*?\]", line) or line.isupper():
            current_section["title"] = line
        elif line:
            current_section["content"] += line + "\n"
        return current_section

    def process_file(self, file_name):
        """Parse one file from ``input_dir`` and add its sections to ``all_sections``.

        Read errors are printed rather than raised; sections completed before
        the error are still kept.

        Args:
            file_name: Name of the file, relative to ``input_dir``.
        """
        sections = []
        current_section = None
        try:
            with open(self.input_dir / file_name, "r", encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    current_section = self.extract_section(line, current_section, sections)
                # Flush the section that was still open at end of file.
                if current_section is not None:
                    sections.append(current_section)
        except FileNotFoundError:
            print(f"File {file_name} not found.")
        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")
        self.all_sections.extend(sections)

    def output_json(self):
        """Write ``all_sections`` to ``output_filename`` as indented JSON.

        The parent directory is created when it does not exist yet. Failures are
        printed rather than raised.
        """
        try:
            target = Path(self.output_filename)
            # A fresh checkout has no output folder; create it instead of failing.
            target.parent.mkdir(parents=True, exist_ok=True)
            with open(target, "w", encoding="utf-8") as f:
                json.dump(self.all_sections, f, indent=4)
        except Exception as e:
            print(f"Error writing to {self.output_filename}: {str(e)}")

    def get_latest_file(self):
        """Return the ``.txt`` file in ``input_dir`` with the greatest ``os.path.getctime``.

        ``getctime`` is the creation time on Windows and the last metadata
        change on Unix.

        Returns:
            The selected path, or ``None`` when the directory has no ``.txt`` files.
        """
        files = list(self.input_dir.glob("*.txt"))
        if not files:
            return None
        return max(files, key=os.path.getctime)

    def process_files(self):
        """Parse the newest ``.txt`` file and write the JSON output.

        Prints a notice and does nothing else when no ``.txt`` file is found.
        """
        latest_file = self.get_latest_file()
        if latest_file:
            self.process_file(latest_file.name)
            self.output_json()
        else:
            print(f"No text files found in {self.input_dir}.")

# Entry point: convert the newest California raw text file to JSON.
if __name__ == "__main__":
    # Raw-data sub-directory to scan, and the JSON file name to produce.
    input_subdir, output_filename = 'california_building_codes', "california_building_codes.json"

    extractor = California_to_JSON(input_subdir, output_filename)
    extractor.process_files()


No text files found in C:\Projects\regionintelligenceai\dev_llm\modules\q_and_a_generator\data\raw\california_building_codes.


In [13]:
import json
import re
import os
from pathlib import Path
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class Los_Angeles_County_to_JSON:
    """Convert the newest raw Los Angeles County building-code text file into a flat JSON list.

    Every parsed section is a dict with the keys ``section_id``, ``title`` and
    ``content``; all sections are written out as a single JSON array.
    """

    def __init__(self, input_subdir, output_filename):
        """Resolve the input directory and the output file path.

        Args:
            input_subdir: Directory name under ``RAW_DATA_DIR`` that holds the
                ``.txt`` sources.
            output_filename: File name created under
                ``JSON_DATA_DIR / 'los_angeles_county_building_codes_json'``.
        """
        self.input_dir = RAW_DATA_DIR / input_subdir
        self.output_filename = JSON_DATA_DIR / 'los_angeles_county_building_codes_json' / output_filename
        # Sections accumulated across every process_file() call on this instance.
        self.all_sections = []

    def extract_section(self, line, current_section, sections):
        """Fold one stripped line into the section currently being built.

        A line that starts with a digit opens a new section; the previous
        section, if any, is appended to ``sections`` first. Inside a section, a
        line that starts with a bracketed tag or is entirely upper case replaces
        the title; any other non-empty line is appended to the content. Lines
        seen before the first section header are ignored.

        Args:
            line: The text line, already stripped of surrounding whitespace.
            current_section: The section dict being filled, or ``None``.
            sections: List that receives each finished section (mutated in place).

        Returns:
            The section dict that following lines belong to, or ``None`` when no
            section has been opened yet.
        """
        # NOTE: the pattern accepts any line with a leading digit, not only
        # dotted ids.
        if re.match(r"\d+(\.\d+)*", line):
            if current_section is not None:
                sections.append(current_section)
            # Split on any whitespace run so tab- or multi-space-separated
            # headers still yield a clean id and title.
            parts = line.split(None, 1)
            return {
                "section_id": parts[0],
                "title": parts[1] if len(parts) > 1 else "",
                "content": "",
            }
        if current_section is None:
            return None
        if re.match(r"\[.*?\]", line) or line.isupper():
            current_section["title"] = line
        elif line:
            current_section["content"] += line + "\n"
        return current_section

    def process_file(self, file_name):
        """Parse one file from ``input_dir`` and add its sections to ``all_sections``.

        Read errors are printed rather than raised; sections completed before
        the error are still kept.

        Args:
            file_name: Name of the file, relative to ``input_dir``.
        """
        sections = []
        current_section = None
        try:
            with open(self.input_dir / file_name, "r", encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    current_section = self.extract_section(line, current_section, sections)
                # Flush the section that was still open at end of file.
                if current_section is not None:
                    sections.append(current_section)
        except FileNotFoundError:
            print(f"File {file_name} not found.")
        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")
        self.all_sections.extend(sections)

    def output_json(self):
        """Write ``all_sections`` to ``output_filename`` as indented JSON.

        The parent directory is created when it does not exist yet. Failures are
        printed rather than raised.
        """
        try:
            target = Path(self.output_filename)
            # A fresh checkout has no output folder; create it instead of failing.
            target.parent.mkdir(parents=True, exist_ok=True)
            with open(target, "w", encoding="utf-8") as f:
                json.dump(self.all_sections, f, indent=4)
        except Exception as e:
            print(f"Error writing to {self.output_filename}: {str(e)}")

    def get_latest_file(self):
        """Return the ``.txt`` file in ``input_dir`` with the greatest ``os.path.getctime``.

        ``getctime`` is the creation time on Windows and the last metadata
        change on Unix.

        Returns:
            The selected path, or ``None`` when the directory has no ``.txt`` files.
        """
        files = list(self.input_dir.glob("*.txt"))
        if not files:
            return None
        return max(files, key=os.path.getctime)

    def process_files(self):
        """Parse the newest ``.txt`` file and write the JSON output.

        Prints a notice and does nothing else when no ``.txt`` file is found.
        """
        latest_file = self.get_latest_file()
        if latest_file:
            self.process_file(latest_file.name)
            self.output_json()
        else:
            print(f"No text files found in {self.input_dir}.")

# Entry point: convert the newest Los Angeles County raw text file to JSON.
if __name__ == "__main__":
    # Raw-data sub-directory to scan, and the JSON file name to produce.
    input_subdir, output_filename = 'los_angeles_county_building_codes', "los_angeles_county_building_codes.json"

    extractor = Los_Angeles_County_to_JSON(input_subdir, output_filename)
    extractor.process_files()


No text files found in C:\Projects\regionintelligenceai\dev_llm\modules\q_and_a_generator\data\raw\los_angeles_county_building_codes.


In [None]:
import json
import re
import os
from pathlib import Path
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class San_Francisco_to_JSON:
    """Convert the newest raw San Francisco building-code text file into a flat JSON list.

    Every parsed section is a dict with the keys ``section_id``, ``title`` and
    ``content``; all sections are written out as a single JSON array.
    """

    def __init__(self, input_subdir, output_filename):
        """Resolve the input directory and the output file path.

        Args:
            input_subdir: Directory name under ``RAW_DATA_DIR`` that holds the
                ``.txt`` sources.
            output_filename: File name created under
                ``JSON_DATA_DIR / 'san_francisco_building_codes_json'``.
        """
        self.input_dir = RAW_DATA_DIR / input_subdir
        self.output_filename = JSON_DATA_DIR / 'san_francisco_building_codes_json' / output_filename
        # Sections accumulated across every process_file() call on this instance.
        self.all_sections = []

    def extract_section(self, line, current_section, sections):
        """Fold one stripped line into the section currently being built.

        A line that starts with a digit opens a new section; the previous
        section, if any, is appended to ``sections`` first. Inside a section, a
        line that starts with a bracketed tag or is entirely upper case replaces
        the title; any other non-empty line is appended to the content. Lines
        seen before the first section header are ignored.

        Args:
            line: The text line, already stripped of surrounding whitespace.
            current_section: The section dict being filled, or ``None``.
            sections: List that receives each finished section (mutated in place).

        Returns:
            The section dict that following lines belong to, or ``None`` when no
            section has been opened yet.
        """
        # NOTE: the pattern accepts any line with a leading digit, not only
        # dotted ids.
        if re.match(r"\d+(\.\d+)*", line):
            if current_section is not None:
                sections.append(current_section)
            # Split on any whitespace run so tab- or multi-space-separated
            # headers still yield a clean id and title.
            parts = line.split(None, 1)
            return {
                "section_id": parts[0],
                "title": parts[1] if len(parts) > 1 else "",
                "content": "",
            }
        if current_section is None:
            return None
        if re.match(r"\[.*?\]", line) or line.isupper():
            current_section["title"] = line
        elif line:
            current_section["content"] += line + "\n"
        return current_section

    def process_file(self, file_name):
        """Parse one file from ``input_dir`` and add its sections to ``all_sections``.

        Read errors are printed rather than raised; sections completed before
        the error are still kept.

        Args:
            file_name: Name of the file, relative to ``input_dir``.
        """
        sections = []
        current_section = None
        try:
            with open(self.input_dir / file_name, "r", encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    current_section = self.extract_section(line, current_section, sections)
                # Flush the section that was still open at end of file.
                if current_section is not None:
                    sections.append(current_section)
        except FileNotFoundError:
            print(f"File {file_name} not found.")
        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")
        self.all_sections.extend(sections)

    def output_json(self):
        """Write ``all_sections`` to ``output_filename`` as indented JSON.

        The parent directory is created when it does not exist yet. Failures are
        printed rather than raised.
        """
        try:
            target = Path(self.output_filename)
            # A fresh checkout has no output folder; create it instead of failing.
            target.parent.mkdir(parents=True, exist_ok=True)
            with open(target, "w", encoding="utf-8") as f:
                json.dump(self.all_sections, f, indent=4)
        except Exception as e:
            print(f"Error writing to {self.output_filename}: {str(e)}")

    def get_latest_file(self):
        """Return the ``.txt`` file in ``input_dir`` with the greatest ``os.path.getctime``.

        ``getctime`` is the creation time on Windows and the last metadata
        change on Unix.

        Returns:
            The selected path, or ``None`` when the directory has no ``.txt`` files.
        """
        files = list(self.input_dir.glob("*.txt"))
        if not files:
            return None
        return max(files, key=os.path.getctime)

    def process_files(self):
        """Parse the newest ``.txt`` file and write the JSON output.

        Prints a notice and does nothing else when no ``.txt`` file is found.
        """
        latest_file = self.get_latest_file()
        if latest_file:
            self.process_file(latest_file.name)
            self.output_json()
        else:
            print(f"No text files found in {self.input_dir}.")

# Entry point: convert the newest San Francisco raw text file to JSON.
if __name__ == "__main__":
    # Raw-data sub-directory to scan, and the JSON file name to produce.
    input_subdir, output_filename = 'san_francisco_building_codes', "san_francisco_building_codes.json"

    extractor = San_Francisco_to_JSON(input_subdir, output_filename)
    extractor.process_files()


In [8]:
import json
import re
import os
from pathlib import Path
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class San_Jose_to_JSON:
    """Convert the newest raw San Jose building-code text file into a flat JSON list.

    Every parsed section is a dict with the keys ``section_id``, ``title`` and
    ``content``; all sections are written out as a single JSON array.
    """

    def __init__(self, input_subdir, output_filename):
        """Resolve the input directory and the output file path.

        Args:
            input_subdir: Directory name under ``RAW_DATA_DIR`` that holds the
                ``.txt`` sources.
            output_filename: File name created under
                ``JSON_DATA_DIR / 'san_jose_building_codes_json'``.
        """
        self.input_dir = RAW_DATA_DIR / input_subdir
        self.output_filename = JSON_DATA_DIR / 'san_jose_building_codes_json' / output_filename
        # Sections accumulated across every process_file() call on this instance.
        self.all_sections = []

    def extract_section(self, line, current_section, sections):
        """Fold one stripped line into the section currently being built.

        A line that starts with a digit opens a new section; the previous
        section, if any, is appended to ``sections`` first. Inside a section, a
        line that starts with a bracketed tag or is entirely upper case replaces
        the title; any other non-empty line is appended to the content. Lines
        seen before the first section header are ignored.

        Args:
            line: The text line, already stripped of surrounding whitespace.
            current_section: The section dict being filled, or ``None``.
            sections: List that receives each finished section (mutated in place).

        Returns:
            The section dict that following lines belong to, or ``None`` when no
            section has been opened yet.
        """
        # NOTE: the pattern accepts any line with a leading digit, not only
        # dotted ids.
        if re.match(r"\d+(\.\d+)*", line):
            if current_section is not None:
                sections.append(current_section)
            # Split on any whitespace run so tab- or multi-space-separated
            # headers still yield a clean id and title.
            parts = line.split(None, 1)
            return {
                "section_id": parts[0],
                "title": parts[1] if len(parts) > 1 else "",
                "content": "",
            }
        if current_section is None:
            return None
        if re.match(r"\[.*?\]", line) or line.isupper():
            current_section["title"] = line
        elif line:
            current_section["content"] += line + "\n"
        return current_section

    def process_file(self, file_name):
        """Parse one file from ``input_dir`` and add its sections to ``all_sections``.

        Read errors are printed rather than raised; sections completed before
        the error are still kept.

        Args:
            file_name: Name of the file, relative to ``input_dir``.
        """
        sections = []
        current_section = None
        try:
            with open(self.input_dir / file_name, "r", encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    current_section = self.extract_section(line, current_section, sections)
                # Flush the section that was still open at end of file.
                if current_section is not None:
                    sections.append(current_section)
        except FileNotFoundError:
            print(f"File {file_name} not found.")
        except Exception as e:
            print(f"Error processing {file_name}: {str(e)}")
        self.all_sections.extend(sections)

    def output_json(self):
        """Write ``all_sections`` to ``output_filename`` as indented JSON.

        The parent directory is created when it does not exist yet. Failures are
        printed rather than raised.
        """
        try:
            target = Path(self.output_filename)
            # A fresh checkout has no output folder; create it instead of failing.
            target.parent.mkdir(parents=True, exist_ok=True)
            with open(target, "w", encoding="utf-8") as f:
                json.dump(self.all_sections, f, indent=4)
        except Exception as e:
            print(f"Error writing to {self.output_filename}: {str(e)}")

    def get_latest_file(self):
        """Return the ``.txt`` file in ``input_dir`` with the greatest ``os.path.getctime``.

        ``getctime`` is the creation time on Windows and the last metadata
        change on Unix.

        Returns:
            The selected path, or ``None`` when the directory has no ``.txt`` files.
        """
        files = list(self.input_dir.glob("*.txt"))
        if not files:
            return None
        return max(files, key=os.path.getctime)

    def process_files(self):
        """Parse the newest ``.txt`` file and write the JSON output.

        Prints a notice and does nothing else when no ``.txt`` file is found.
        """
        latest_file = self.get_latest_file()
        if latest_file:
            self.process_file(latest_file.name)
            self.output_json()
        else:
            print(f"No text files found in {self.input_dir}.")

# Entry point: convert the newest San Jose raw text file to JSON.
if __name__ == "__main__":
    # Raw-data sub-directory to scan, and the JSON file name to produce.
    input_subdir, output_filename = 'san_jose_building_codes', "san_jose_building_codes.json"

    extractor = San_Jose_to_JSON(input_subdir, output_filename)
    extractor.process_files()


No text files found in C:\Projects\regionintelligenceai\dev_llm\modules\q_and_a_generator\data\raw\san_jose_building_codes.


# The Best Way: nested chapter / section / subsection extraction

In [None]:
class CaliforniaCodeExtractor:
    """Parse numbered ``testing<i>.txt`` files into nested chapter/section JSON files."""

    def __init__(self, input_dir, output_dir):
        """Store the directory to read text files from and the one to write JSON to."""
        self.input_dir = input_dir
        self.output_dir = output_dir

    def extract_from_file(self, filename):
        """Parse ``filename`` into a list of chapter dicts.

        A line whose first characters are ``<digits>.<digits>`` opens a section.
        The text before the first dot of its id names the chapter, and the
        number of dots in the id gives its nesting depth. A section is attached
        to the nearest still-open section with fewer dots, or to the chapter's
        ``sections`` list when there is none. Any other line is appended to the
        content of the most recently opened section; lines before the first
        section are dropped.

        Args:
            filename: Path of the UTF-8 text file to parse.

        Returns:
            A list of ``{"chapter": str, "sections": [...]}`` dicts, where each
            section is ``{"section_id", "title", "content", "subsections"}``.

        Raises:
            OSError: If the file cannot be opened.
        """
        chapters = []
        # Sections that may still receive children, outermost first.
        stack = []

        with open(filename, "r", encoding="utf-8") as f:
            for raw_line in f:
                line = raw_line.strip()
                if not line:
                    continue
                if re.match(r"\d+\.\d+.*", line):
                    # Split on any whitespace run so tab-separated headers work.
                    parts = line.split(None, 1)
                    section_id = parts[0]
                    new_section = {
                        "section_id": section_id,
                        "title": parts[1] if len(parts) > 1 else "",
                        "content": "",
                        "subsections": [],
                    }
                    chapter_id = section_id.split(".")[0]
                    depth = section_id.count(".")

                    # A different leading number starts a new chapter.
                    if not chapters or chapters[-1]["chapter"] != chapter_id:
                        chapters.append({"chapter": chapter_id, "sections": []})
                        stack = []

                    # Close every open section that is not shallower than the
                    # new one, so siblings never end up nested in each other.
                    while stack and depth <= stack[-1]["section_id"].count("."):
                        stack.pop()

                    if stack:
                        stack[-1]["subsections"].append(new_section)
                    else:
                        chapters[-1]["sections"].append(new_section)
                    stack.append(new_section)
                elif stack:
                    # Plain text belongs to the most recently opened section.
                    stack[-1]["content"] += line + "\n"

        return chapters

    def process_files(self, num_files=36):
        """Convert ``testing0.txt`` .. ``testing<num_files-1>.txt`` to JSON.

        Input ``testing<i>.txt`` is written to ``objects<i+1>.json`` in
        ``output_dir``, which is created when missing. Inputs that do not exist
        are reported and skipped instead of aborting the whole run.

        Args:
            num_files: How many consecutively numbered input files to look for.
        """
        os.makedirs(self.output_dir, exist_ok=True)
        for i in range(num_files):
            input_file = os.path.join(self.input_dir, f"testing{i}.txt")
            if not os.path.isfile(input_file):
                print(f"File {input_file} not found.")
                continue

            chapters = self.extract_from_file(input_file)

            output_file = os.path.join(self.output_dir, f"objects{i+1}.json")
            with open(output_file, "w", encoding="utf-8") as f:
                json.dump(chapters, f, indent=4)

# Entry point: batch-convert the numbered California text files to JSON.
if __name__ == "__main__":
    # Source folder of raw text files, and the folder that receives the JSON output.
    INPUT_DIRECTORY, OUTPUT_DIRECTORY = (
        RAW_DATA_DIR / 'california_building_codes',
        JSON_DATA_DIR / 'california_building_codes_json',
    )

    extractor = CaliforniaCodeExtractor(INPUT_DIRECTORY, OUTPUT_DIRECTORY)
    extractor.process_files()

In [None]:
import json
import re
import os
from pathlib import Path
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class CaliforniaCodeExtractor:
    """Parse the newest ``.txt`` file of a directory into nested chapter/section JSON."""

    def __init__(self, input_dir: Path, output_dir: Path):
        """Store the directory to read text files from and the one to write JSON to."""
        self.input_dir = input_dir
        self.output_dir = output_dir

    def get_latest_file(self):
        """Return the ``.txt`` file in ``input_dir`` with the greatest ``os.path.getctime``.

        ``getctime`` is the creation time on Windows and the last metadata
        change on Unix.

        Returns:
            The selected path, or ``None`` when there are no ``.txt`` files or
            the lookup fails (the error is printed).
        """
        try:
            files = list(self.input_dir.glob("*.txt"))
            if not files:
                return None
            return max(files, key=os.path.getctime)
        except Exception as e:
            print(f"Error while getting the latest file: {str(e)}")
            return None

    def extract_from_file(self, filename: Path):
        """Parse ``filename`` into a list of chapter dicts.

        A line whose first characters are ``<digits>.<digits>`` opens a section.
        The text before the first dot of its id names the chapter, and the
        number of dots in the id gives its nesting depth. A section is attached
        to the nearest still-open section with fewer dots, or to the chapter's
        ``sections`` list when there is none. Any other line is appended to the
        content of the most recently opened section; lines before the first
        section are dropped.

        Args:
            filename: Path of the UTF-8 text file to parse.

        Returns:
            A list of ``{"chapter": str, "sections": [...]}`` dicts, where each
            section is ``{"section_id", "title", "content", "subsections"}``.
            On a read error the error is printed and whatever was parsed so far
            is returned.
        """
        chapters = []
        # Sections that may still receive children, outermost first.
        stack = []
        try:
            with filename.open(encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    if re.match(r"\d+\.\d+.*", line):
                        # Split on any whitespace run so tab-separated headers work.
                        parts = line.split(None, 1)
                        section_id = parts[0]
                        new_section = {
                            "section_id": section_id,
                            "title": parts[1] if len(parts) > 1 else "",
                            "content": "",
                            "subsections": [],
                        }
                        chapter_id = section_id.split(".")[0]
                        depth = section_id.count(".")

                        # A different leading number starts a new chapter.
                        if not chapters or chapters[-1]["chapter"] != chapter_id:
                            chapters.append({"chapter": chapter_id, "sections": []})
                            stack = []

                        # Close every open section that is not shallower than
                        # the new one, so siblings never nest in each other.
                        while stack and depth <= stack[-1]["section_id"].count("."):
                            stack.pop()

                        if stack:
                            stack[-1]["subsections"].append(new_section)
                        else:
                            chapters[-1]["sections"].append(new_section)
                        stack.append(new_section)
                    elif stack:
                        # Plain text belongs to the most recently opened section.
                        stack[-1]["content"] += line + "\n"
        except Exception as e:
            print(f"Error reading from {filename}: {str(e)}")
        return chapters

    def process_files(self):
        """Parse the newest ``.txt`` file and write ``<stem>.json`` into ``output_dir``.

        The output directory is created when missing. A missing input or a
        write failure is printed rather than raised.
        """
        latest_file = self.get_latest_file()
        if not latest_file:
            print("No text files found in the directory.")
            return

        chapters = self.extract_from_file(latest_file)

        output_filename = latest_file.stem + ".json"
        output_file = self.output_dir / output_filename

        try:
            # A fresh checkout has no output folder; create it instead of failing.
            self.output_dir.mkdir(parents=True, exist_ok=True)
            with output_file.open("w", encoding="utf-8") as f:
                json.dump(chapters, f, indent=4)
            print(f"Data processed and saved to {output_file}")
        except Exception as e:
            print(f"Error writing to {output_file}: {str(e)}")

# Entry point: convert the newest California raw text file to nested JSON.
if __name__ == "__main__":
    # Source folder of raw text files, and the folder that receives the JSON output.
    INPUT_DIRECTORY, OUTPUT_DIRECTORY = (
        RAW_DATA_DIR / 'california_building_codes',
        JSON_DATA_DIR / 'california_building_codes_json',
    )

    extractor = CaliforniaCodeExtractor(INPUT_DIRECTORY, OUTPUT_DIRECTORY)
    extractor.process_files()


In [None]:
import json
import re
import os
from pathlib import Path
from datetime import datetime
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class Los_Angeles_to_JSON:
    """Parse the newest ``.txt`` file of a directory into nested chapter/section JSON.

    The output file is named ``los_angeles_<YYYY-MM-DD>.json`` using today's date.
    """

    def __init__(self, input_dir: Path, output_dir: Path):
        """Store the directory to read text files from and the one to write JSON to."""
        self.input_dir = input_dir
        self.output_dir = output_dir

    def get_latest_file(self):
        """Return the ``.txt`` file in ``input_dir`` with the greatest ``os.path.getctime``.

        ``getctime`` is the creation time on Windows and the last metadata
        change on Unix.

        Returns:
            The selected path, or ``None`` when there are no ``.txt`` files or
            the lookup fails (the error is printed).
        """
        try:
            files = list(self.input_dir.glob("*.txt"))
            if not files:
                return None
            return max(files, key=os.path.getctime)
        except Exception as e:
            print(f"Error while getting the latest file: {str(e)}")
            return None

    def extract_from_file(self, filename: Path):
        """Parse ``filename`` into a list of chapter dicts.

        A line whose first characters are ``<digits>.<digits>`` opens a section.
        The text before the first dot of its id names the chapter, and the
        number of dots in the id gives its nesting depth. A section is attached
        to the nearest still-open section with fewer dots, or to the chapter's
        ``sections`` list when there is none. Any other line is appended to the
        content of the most recently opened section; lines before the first
        section are dropped.

        Args:
            filename: Path of the UTF-8 text file to parse.

        Returns:
            A list of ``{"chapter": str, "sections": [...]}`` dicts, where each
            section is ``{"section_id", "title", "content", "subsections"}``.
            On a read error the error is printed and whatever was parsed so far
            is returned.
        """
        chapters = []
        # Sections that may still receive children, outermost first.
        stack = []
        try:
            with filename.open(encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    if re.match(r"\d+\.\d+.*", line):
                        # Split on any whitespace run so tab-separated headers work.
                        parts = line.split(None, 1)
                        section_id = parts[0]
                        new_section = {
                            "section_id": section_id,
                            "title": parts[1] if len(parts) > 1 else "",
                            "content": "",
                            "subsections": [],
                        }
                        chapter_id = section_id.split(".")[0]
                        depth = section_id.count(".")

                        # A different leading number starts a new chapter.
                        if not chapters or chapters[-1]["chapter"] != chapter_id:
                            chapters.append({"chapter": chapter_id, "sections": []})
                            stack = []

                        # Close every open section that is not shallower than
                        # the new one, so siblings never nest in each other.
                        while stack and depth <= stack[-1]["section_id"].count("."):
                            stack.pop()

                        if stack:
                            stack[-1]["subsections"].append(new_section)
                        else:
                            chapters[-1]["sections"].append(new_section)
                        stack.append(new_section)
                    elif stack:
                        # Plain text belongs to the most recently opened section.
                        stack[-1]["content"] += line + "\n"
        except Exception as e:
            print(f"Error reading from {filename}: {str(e)}")
        return chapters

    def process_files(self):
        """Parse the newest ``.txt`` file and write a dated JSON file into ``output_dir``.

        The output directory is created when missing. A missing input or a
        write failure is printed rather than raised.
        """
        latest_file = self.get_latest_file()
        if not latest_file:
            print("No text files found in the directory.")
            return

        chapters = self.extract_from_file(latest_file)

        # Today's date, formatted as YYYY-MM-DD, becomes part of the file name.
        current_date = datetime.now().strftime('%Y-%m-%d')

        output_filename = "los_angeles_" + f"{current_date}.json"
        output_file = self.output_dir / output_filename

        try:
            # A fresh checkout has no output folder; create it instead of failing.
            self.output_dir.mkdir(parents=True, exist_ok=True)
            with output_file.open("w", encoding="utf-8") as f:
                json.dump(chapters, f, indent=4)
            print(f"Data processed and saved to {output_file}")
        except Exception as e:
            print(f"Error writing to {output_file}: {str(e)}")

# Entry point: convert the newest Los Angeles raw text file to nested JSON.
if __name__ == "__main__":
    # Source folder of raw text files, and the folder that receives the JSON output.
    INPUT_DIRECTORY, OUTPUT_DIRECTORY = (
        RAW_DATA_DIR / 'los_angeles_building_codes',
        JSON_DATA_DIR / 'los_angeles_building_codes_json',
    )

    extractor = Los_Angeles_to_JSON(INPUT_DIRECTORY, OUTPUT_DIRECTORY)
    extractor.process_files()


In [None]:
import json
import re
import os
from pathlib import Path
from datetime import datetime
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class Los_Angeles_County_to_JSON:
    """Parse the newest ``.txt`` file of a directory into nested chapter/section JSON.

    The output file is named ``los_angeles_county_<YYYY-MM-DD>.json`` using
    today's date.
    """

    def __init__(self, input_dir: Path, output_dir: Path):
        """Store the directory to read text files from and the one to write JSON to."""
        self.input_dir = input_dir
        self.output_dir = output_dir

    def get_latest_file(self):
        """Return the ``.txt`` file in ``input_dir`` with the greatest ``os.path.getctime``.

        ``getctime`` is the creation time on Windows and the last metadata
        change on Unix.

        Returns:
            The selected path, or ``None`` when there are no ``.txt`` files or
            the lookup fails (the error is printed).
        """
        try:
            files = list(self.input_dir.glob("*.txt"))
            if not files:
                return None
            return max(files, key=os.path.getctime)
        except Exception as e:
            print(f"Error while getting the latest file: {str(e)}")
            return None

    def extract_from_file(self, filename: Path):
        """Parse ``filename`` into a list of chapter dicts.

        A line whose first characters are ``<digits>.<digits>`` opens a section.
        The text before the first dot of its id names the chapter, and the
        number of dots in the id gives its nesting depth. A section is attached
        to the nearest still-open section with fewer dots, or to the chapter's
        ``sections`` list when there is none. Any other line is appended to the
        content of the most recently opened section; lines before the first
        section are dropped.

        Args:
            filename: Path of the UTF-8 text file to parse.

        Returns:
            A list of ``{"chapter": str, "sections": [...]}`` dicts, where each
            section is ``{"section_id", "title", "content", "subsections"}``.
            On a read error the error is printed and whatever was parsed so far
            is returned.
        """
        chapters = []
        # Sections that may still receive children, outermost first.
        stack = []
        try:
            with filename.open(encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue
                    if re.match(r"\d+\.\d+.*", line):
                        # Split on any whitespace run so tab-separated headers work.
                        parts = line.split(None, 1)
                        section_id = parts[0]
                        new_section = {
                            "section_id": section_id,
                            "title": parts[1] if len(parts) > 1 else "",
                            "content": "",
                            "subsections": [],
                        }
                        chapter_id = section_id.split(".")[0]
                        depth = section_id.count(".")

                        # A different leading number starts a new chapter.
                        if not chapters or chapters[-1]["chapter"] != chapter_id:
                            chapters.append({"chapter": chapter_id, "sections": []})
                            stack = []

                        # Close every open section that is not shallower than
                        # the new one, so siblings never nest in each other.
                        while stack and depth <= stack[-1]["section_id"].count("."):
                            stack.pop()

                        if stack:
                            stack[-1]["subsections"].append(new_section)
                        else:
                            chapters[-1]["sections"].append(new_section)
                        stack.append(new_section)
                    elif stack:
                        # Plain text belongs to the most recently opened section.
                        stack[-1]["content"] += line + "\n"
        except Exception as e:
            print(f"Error reading from {filename}: {str(e)}")
        return chapters

    def process_files(self):
        """Parse the newest ``.txt`` file and write a dated JSON file into ``output_dir``.

        The output directory is created when missing. A missing input or a
        write failure is printed rather than raised.
        """
        latest_file = self.get_latest_file()
        if not latest_file:
            print("No text files found in the directory.")
            return

        chapters = self.extract_from_file(latest_file)

        # Today's date, formatted as YYYY-MM-DD, becomes part of the file name.
        current_date = datetime.now().strftime('%Y-%m-%d')

        output_filename = "los_angeles_county_" + f"{current_date}.json"
        output_file = self.output_dir / output_filename

        try:
            # A fresh checkout has no output folder; create it instead of failing.
            self.output_dir.mkdir(parents=True, exist_ok=True)
            with output_file.open("w", encoding="utf-8") as f:
                json.dump(chapters, f, indent=4)
            print(f"Data processed and saved to {output_file}")
        except Exception as e:
            print(f"Error writing to {output_file}: {str(e)}")

# Entry point: convert the newest Los Angeles County raw text file to nested JSON.
if __name__ == "__main__":
    # Source folder of raw text files, and the folder that receives the JSON output.
    INPUT_DIRECTORY, OUTPUT_DIRECTORY = (
        RAW_DATA_DIR / 'los_angeles_county_building_codes',
        JSON_DATA_DIR / 'los_angeles_county_building_codes_json',
    )

    extractor = Los_Angeles_County_to_JSON(INPUT_DIRECTORY, OUTPUT_DIRECTORY)
    extractor.process_files()


In [None]:
import json
import re
import os
from pathlib import Path
from datetime import datetime
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class San_Francisco_to_JSON:
    """Convert a San Francisco raw ``.txt`` file into a nested JSON file.

    A line that starts with ``<digits>.<digits>`` opens a new section; every
    other non-blank line is appended to the content of the most recent section.
    """

    # A line opens a new section when it starts with "<digits>.<digits>".
    _SECTION_ID_RE = re.compile(r"\d+\.\d+")

    def __init__(self, input_dir: Path, output_dir: Path):
        """Store the folder to read ``*.txt`` files from and the folder to write JSON to."""
        self.input_dir = input_dir
        self.output_dir = output_dir

    def get_latest_file(self):
        """Return the ``*.txt`` file in ``input_dir`` with the largest ``os.path.getctime``.

        Returns:
            The selected path, or ``None`` when there are no text files or when
            listing/inspecting them raises (the error is printed).
        """
        # NOTE(review): getctime is platform dependent (creation time on Windows,
        # metadata-change time on Unix) - confirm this is the intended notion of "latest".
        try:
            return max(self.input_dir.glob("*.txt"), key=os.path.getctime, default=None)
        except Exception as e:
            print(f"Error while getting the latest file: {str(e)}")
            return None

    def extract_from_file(self, filename: Path):
        """Parse ``filename`` into a list of chapter dictionaries.

        Args:
            filename: Path of the UTF-8 text file to read.

        Returns:
            A list of ``{"chapter": str, "sections": [section]}`` dictionaries,
            where each section is ``{"section_id", "title", "content",
            "subsections"}``. The chapter value is the part of the section id
            before the first dot. If reading fails part-way, the error is
            printed and whatever was parsed so far is returned.
        """
        chapters = []
        # Path from the current chapter's first section down to the most recent section.
        stack = []
        try:
            with filename.open(encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue

                    if not self._SECTION_ID_RE.match(line):
                        # Body text belongs to the most recent section; text that
                        # appears before the first section is dropped.
                        if stack:
                            stack[-1]["content"] += line + "\n"
                        continue

                    # "<id> <title>": everything after the first space is the title.
                    section_id, _, title = line.partition(" ")
                    new_section = {
                        "section_id": section_id,
                        "title": title,
                        "content": "",
                        "subsections": [],
                    }
                    chapter_number = section_id.split(".")[0]

                    # A change in the leading number starts a new chapter.
                    if not stack or chapter_number != stack[0]["section_id"].split(".")[0]:
                        chapters.append({"chapter": chapter_number, "sections": [new_section]})
                        stack = [new_section]
                        continue

                    # Pop until the top of the stack is exactly one level
                    # shallower than the new section. The chapter's first
                    # section is never popped.
                    # NOTE(review): because of that, later sections at the same
                    # depth as the first one (e.g. 1.2 after 1.1) become its
                    # subsections - confirm this nesting is what consumers expect.
                    depth = section_id.count(".")
                    while len(stack) > 1 and depth - stack[-1]["section_id"].count(".") != 1:
                        stack.pop()

                    stack[-1]["subsections"].append(new_section)
                    stack.append(new_section)

        except Exception as e:
            print(f"Error reading from {filename}: {str(e)}")
        return chapters

    def process_files(self):
        """Convert the latest input file and save it as dated JSON.

        Writes ``<output_dir>/san_francisco_<YYYY-MM-DD>.json`` (today's date),
        creating the output directory when it does not exist yet.

        Returns:
            None. Progress and errors are reported with ``print``; a write
            failure is reported rather than raised.
        """
        latest_file = self.get_latest_file()
        if not latest_file:
            print("No text files found in the directory.")
            return

        chapters = self.extract_from_file(latest_file)

        # The output name carries the current date formatted as YYYY-MM-DD.
        current_date = datetime.now().strftime('%Y-%m-%d')
        output_file = self.output_dir / f"san_francisco_{current_date}.json"

        try:
            # Without this, a first run on a fresh checkout fails because the
            # destination folder does not exist yet.
            self.output_dir.mkdir(parents=True, exist_ok=True)
            # Explicit encoding keeps the output independent of the platform locale.
            with output_file.open("w", encoding="utf-8") as f:
                json.dump(chapters, f, indent=4)
            print(f"Data processed and saved to {output_file}")
        except Exception as e:
            print(f"Error writing to {output_file}: {str(e)}")

# Entry point: run the San Francisco converter when this cell/script is executed directly.
if __name__ == "__main__":
    # Input and output folders are sub-paths of the project-level RAW_DATA_DIR / JSON_DATA_DIR.
    INPUT_DIRECTORY = RAW_DATA_DIR / 'san_francisco_building_codes'
    OUTPUT_DIRECTORY = JSON_DATA_DIR / 'san_francisco_building_codes_json'

    # Build the converter and run it once; process_files() reports its outcome via print.
    extractor = San_Francisco_to_JSON(INPUT_DIRECTORY, OUTPUT_DIRECTORY)
    extractor.process_files()


In [None]:
import json
import re
import os
from pathlib import Path
from datetime import datetime
import glob
from src.paths import JSON_DATA_DIR, RAW_DATA_DIR, PROCESSED_DATA_DIR

class San_Jose_to_JSON:
    """Convert a San Jose raw ``.txt`` file into a nested JSON file.

    A line that starts with ``<digits>.<digits>`` opens a new section; every
    other non-blank line is appended to the content of the most recent section.
    """

    # A line opens a new section when it starts with "<digits>.<digits>".
    _SECTION_ID_RE = re.compile(r"\d+\.\d+")

    def __init__(self, input_dir: Path, output_dir: Path):
        """Store the folder to read ``*.txt`` files from and the folder to write JSON to."""
        self.input_dir = input_dir
        self.output_dir = output_dir

    def get_latest_file(self):
        """Return the ``*.txt`` file in ``input_dir`` with the largest ``os.path.getctime``.

        Returns:
            The selected path, or ``None`` when there are no text files or when
            listing/inspecting them raises (the error is printed).
        """
        # NOTE(review): getctime is platform dependent (creation time on Windows,
        # metadata-change time on Unix) - confirm this is the intended notion of "latest".
        try:
            return max(self.input_dir.glob("*.txt"), key=os.path.getctime, default=None)
        except Exception as e:
            print(f"Error while getting the latest file: {str(e)}")
            return None

    def extract_from_file(self, filename: Path):
        """Parse ``filename`` into a list of chapter dictionaries.

        Args:
            filename: Path of the UTF-8 text file to read.

        Returns:
            A list of ``{"chapter": str, "sections": [section]}`` dictionaries,
            where each section is ``{"section_id", "title", "content",
            "subsections"}``. The chapter value is the part of the section id
            before the first dot. If reading fails part-way, the error is
            printed and whatever was parsed so far is returned.
        """
        chapters = []
        # Path from the current chapter's first section down to the most recent section.
        stack = []
        try:
            with filename.open(encoding="utf-8") as f:
                for raw_line in f:
                    line = raw_line.strip()
                    if not line:
                        continue

                    if not self._SECTION_ID_RE.match(line):
                        # Body text belongs to the most recent section; text that
                        # appears before the first section is dropped.
                        if stack:
                            stack[-1]["content"] += line + "\n"
                        continue

                    # "<id> <title>": everything after the first space is the title.
                    section_id, _, title = line.partition(" ")
                    new_section = {
                        "section_id": section_id,
                        "title": title,
                        "content": "",
                        "subsections": [],
                    }
                    chapter_number = section_id.split(".")[0]

                    # A change in the leading number starts a new chapter.
                    if not stack or chapter_number != stack[0]["section_id"].split(".")[0]:
                        chapters.append({"chapter": chapter_number, "sections": [new_section]})
                        stack = [new_section]
                        continue

                    # Pop until the top of the stack is exactly one level
                    # shallower than the new section. The chapter's first
                    # section is never popped.
                    # NOTE(review): because of that, later sections at the same
                    # depth as the first one (e.g. 1.2 after 1.1) become its
                    # subsections - confirm this nesting is what consumers expect.
                    depth = section_id.count(".")
                    while len(stack) > 1 and depth - stack[-1]["section_id"].count(".") != 1:
                        stack.pop()

                    stack[-1]["subsections"].append(new_section)
                    stack.append(new_section)

        except Exception as e:
            print(f"Error reading from {filename}: {str(e)}")
        return chapters

    def process_files(self):
        """Convert the latest input file and save it as dated JSON.

        Writes ``<output_dir>/san_jose_<YYYY-MM-DD>.json`` (today's date),
        creating the output directory when it does not exist yet.

        Returns:
            None. Progress and errors are reported with ``print``; a write
            failure is reported rather than raised.
        """
        latest_file = self.get_latest_file()
        if not latest_file:
            print("No text files found in the directory.")
            return

        chapters = self.extract_from_file(latest_file)

        # The output name carries the current date formatted as YYYY-MM-DD.
        current_date = datetime.now().strftime('%Y-%m-%d')
        output_file = self.output_dir / f"san_jose_{current_date}.json"

        try:
            # Without this, a first run on a fresh checkout fails because the
            # destination folder does not exist yet.
            self.output_dir.mkdir(parents=True, exist_ok=True)
            # Explicit encoding keeps the output independent of the platform locale.
            with output_file.open("w", encoding="utf-8") as f:
                json.dump(chapters, f, indent=4)
            print(f"Data processed and saved to {output_file}")
        except Exception as e:
            print(f"Error writing to {output_file}: {str(e)}")

# Entry point: run the San Jose converter when this cell/script is executed directly.
if __name__ == "__main__":
    # Input and output folders are sub-paths of the project-level RAW_DATA_DIR / JSON_DATA_DIR.
    INPUT_DIRECTORY = RAW_DATA_DIR / 'san_jose_building_codes'
    OUTPUT_DIRECTORY = JSON_DATA_DIR / 'san_jose_building_codes_json'

    # Build the converter and run it once; process_files() reports its outcome via print.
    extractor = San_Jose_to_JSON(INPUT_DIRECTORY, OUTPUT_DIRECTORY)
    extractor.process_files()
