**HTML to TEXT**

In [1]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re

In [2]:
from bs4 import BeautifulSoup, NavigableString, Tag
from tabulate import tabulate

# --- MathML and cell text helpers ---

def process_math(math_elem):
    """
    Process a <math> element.
    If it contains an <mfrac> element, extract the numerator and denominator
    (using their immediate <mrow> children) and join them with a slash.
    Otherwise, simply concatenate all stripped strings.
    """
    mfrac = math_elem.find("mfrac")
    if mfrac:
        mrows = mfrac.find_all("mrow", recursive=False)
        if len(mrows) >= 2:
            numerator = ''.join(mrows[0].stripped_strings)
            denominator = ''.join(mrows[1].stripped_strings)
            return f"{numerator}/{denominator}"
        else:
            return ''.join(mfrac.stripped_strings)
    else:
        return ''.join(math_elem.stripped_strings)

def get_clean_text(cell):
    """
    Extract clean text from a cell element.
    Any nested <math> elements are processed with process_math().
    Other nested tags (like <b>, <i>, <a>, etc.) are handled automatically.
    """
    for math_elem in cell.find_all("math"):
        replacement = process_math(math_elem)
        math_elem.replace_with(replacement)
    return cell.get_text(separator=" ", strip=True)

# --- Table Parsing (handling rowspan/colspan) ---

def parse_html_table(table):
    """
    Parse an HTML table element into a 2D list (matrix) of strings.
    This function handles both 'rowspan' and 'colspan' attributes by building
    a grid where each spanned cell is repeated as needed.
    """
    rows = table.find_all("tr")
    table_data = []
    spanned = {}  # dictionary mapping (row_index, col_index) to cell content

    for r, tr in enumerate(rows):
        row = []
        cells = tr.find_all(["td", "th"])
        col = 0
        for cell in cells:
            # Skip any columns already filled by a spanning cell.
            while (r, col) in spanned:
                row.append(spanned[(r, col)])
                col += 1

            content = get_clean_text(cell)
            try:
                colspan = int(cell.get("colspan", 1))
            except ValueError:
                colspan = 1
            try:
                rowspan = int(cell.get("rowspan", 1))
            except ValueError:
                rowspan = 1

            # Place the content in the current row for the number of columns it spans.
            for i in range(colspan):
                row.append(content)

            # For rowspan > 1, mark the cells in the subsequent rows.
            if rowspan > 1:
                for i in range(1, rowspan):
                    for j in range(col, col + colspan):
                        spanned[(r + i, j)] = content
            col += colspan
        # Append any remaining spanned cells at the end of the row.
        while (r, col) in spanned:
            row.append(spanned[(r, col)])
            col += 1
        table_data.append(row)
    return table_data

def format_table(table):
    """
    Given a BeautifulSoup table element, parse it and return a formatted text table.
    Assumes that if more than one row exists, the first row is a header.
    """
    table_data = parse_html_table(table)
    if not table_data:
        return ""
    if len(table_data) > 1:
        headers = table_data[0]
        data = table_data[1:]
        return tabulate(data, headers=headers, tablefmt="grid")
    else:
        return tabulate(table_data, tablefmt="grid")

# --- Recursive processing to preserve document order ---

# Extend our block-level tags to include top-level elements.
block_tags = {
    'html', 'body', 'p', 'div', 'table', 'tr', 'ul', 'ol', 'li',
    'header', 'footer', 'section', 'article'
}

def process_node(node):
    """
    Recursively process a BeautifulSoup node.
    - If the node is a <br> tag, return a newline.
    - If the node is a table tag, process it with our table formatter.
    - If the node is a NavigableString, return its text.
    - Otherwise, process its children in order.
    """
    if isinstance(node, NavigableString):
        # Return the text as-is (we will trim later)
        return str(node)
    elif isinstance(node, Tag):
        if node.name == "br":
            return "\n"
        elif node.name == "table":
            # For a table, format it and ensure it is surrounded by newlines.
            table_text = format_table(node)
            return "\n" + table_text + "\n"
        else:
            parts = []
            for child in node.children:
                child_text = process_node(child)
                if child_text:
                    parts.append(child_text)
            if node.name in block_tags:
                # For block-level tags, join with newline.
                return "\n".join(part.strip() for part in parts if part.strip())
            else:
                # For inline tags, join with a single space.
                return " ".join(part.strip() for part in parts if part.strip())
    else:
        return ""

def html_to_text(html_content):
    """
    Convert HTML content to plain text while preserving the order of text and tables,
    and handling newlines and spaces precisely.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove unwanted tags like script and style.
    for tag in soup(["script", "style"]):
        tag.decompose()
    # Process the entire document.
    text = process_node(soup)
    # Normalize newlines: remove extra spaces and ensure each line is properly trimmed.
    lines = [line.rstrip() for line in text.splitlines()]
    return "\n".join(line for line in lines if line)

# --- Main execution ---

def ultimate_html_to_text(input_file, output_file='output.txt'):
    # For example, read HTML content from "input.html"
    with open(input_file, "r", encoding="utf-8") as file:
        html_content = file.read()
    text_output = html_to_text(html_content)
    # Write the output to "output.txt"
    with open(output_file, "w", encoding="utf-8") as file:
        file.write(text_output)
    print("Conversion complete. Check the 'output.txt' file.")


In [3]:
html_file = "C:/REGULATA/my_work_space/BC2L/bc2l/ngspice-43_64/ngspice_manual.html"
ultimate_html_to_text(html_file)

Conversion complete. Check the 'output.txt' file.


In [4]:
text_content = open("output.txt", "r").read()

In [5]:
print(text_content)

xml version="1.0" encoding="UTF-8"? html PUBLIC "-//W3C//DTD XHTML 1.1 plus MathML 2.0//EN" "http://www.w3.org/Math/DTD/mathml2/xhtml-math11-f.dtd" Ngspice User's ManualVersion 43plus(ngspice development version)
Ngspice User's Manual Version 43plus (ngspice development version)
Holger Vogt, Giles Atkinson, Paolo Nenzi
Locations
The project and download pages of ngspice may be found at
Ngspice home page
http://ngspice.sourceforge.net/
Project page at SourceForge
http://sourceforge.net/projects/ngspice/
Download page at SourceForge
https://sourceforge.net/projects/ngspice/files/ng-spice-rework/
Git source download
https://sourceforge.net/p/ngspice/ngspice/ci/master/tree/
Status
This manual is a work in progress. Some to-dos are listed in Chapt.
20.3
. More is surely needed. You are invited to report bugs, missing items, wrongly described items, bad English style, etc.
How to use this Manual
The manual is a “work in progress.” It may accompany a specific ngspice release, e.g. ngspice-35 

**Table of Contents**

In [8]:
with open("C:/REGULATA/my_work_space/BC2L/bc2l/ngspice-43_64/ngspice_toc_extracted.txt", "r") as file:
    lines = file.readlines()

subchapters = []
cnt = 0
flag = text_content.find("Chapter 1 Introduction", text_content.find("Chapter 1 Introduction")+1)
for line in lines:
    line = line.strip()
    subchapters.append(line)
    t = findidx(line, text_content, flag)
    if t==-1:
        cnt += 1
    else:
        print(text_content[t:t+100].replace('\n', ' '))
    

print(cnt)

1 Introduction Ngspice is a general-purpose circuit simulation program for nonlinear and linear anal
1.1 Simulation Algorithms Computer-based circuit simulation is often used as a tool by designers, te
1.1.1 Analog Simulation Analog simulation focuses on the linear and non-linear behavior of a circuit
1.1.2 Matrix solvers Since version 42 ngspice offers two matrix solvers. Spice3f5 originally has use
1.1.3 Device Models for Analog Simulation There are three models for bipolar junction transistors, a
1.1.4 Digital Simulation Digital circuit simulation differs from analog circuit simulation in severa
1.1.5 Mixed-Signal Simulation Modern circuits often contain a mix of analog and digital circuits. To
1.1.6 Mixed-Level Simulation (Electronic and TCAD) Ngspice implements mixed-level simulation through
1.2 Supported Analyses The ngspice simulator supports the following different types of analysis: DC 
1.2.1 DC Analysis The DC analysis portion of ngspice determines the dc operating point of t

**SUBCHAPTERS EXTRACTION**

In [7]:
def findidx(pattern, text, after):
    pattern = pattern.replace('(',' ').replace(')',' ').replace(' ','[\n ()]*')
    try:
        pattern = re.search(pattern, text[after:], re.DOTALL|re.IGNORECASE).group()
    except:
        #print(pattern)
        return -1
    return text.find(pattern, after)

In [9]:
dict_of_subchapters = dict()
flag = text_content.find("Chapter 1 Introduction", text_content.find("Chapter 1 Introduction")+1)
length = len(subchapters)

for idx in range(length):
    subchapter = subchapters[idx]
    start = findidx(subchapter, text_content, flag)
    if idx == length-1:
        end = text_content.find("Bibliography", flag)
    else:
        end = findidx(subchapters[idx+1], text_content, flag)
    flag = end
    #print(subchapters[idx], start, end)
    if subchapter.startswith('8') or subchapter.startswith('12') or subchapter.startswith('19'):
        continue
    subchapter = text_content[start:end]
    #print(subchapter[:100].replace('\n', ' '))
    dict_of_subchapters[subchapters[idx]] = subchapter
    #print(f"word count in ({subchapters[idx]}) = {subchapter.count(" ")}")

**SUMMARIZATION OF SUBCHAPTERS**

In [10]:
import os
import openai
from openai import AzureOpenAI  
import tiktoken  # Tokenizer library for OpenAI models  
import dotenv

dotenv.load_dotenv()
# Retrieve environment variables  
api_key = os.getenv("AZURE_4O_API_KEY")  
api_endpoint = os.getenv("AZURE_4O_ENDPOINT")  
deployment_name = os.getenv("AZURE_4O_DEPLOYMENT")  
version = os.getenv("AZURE_4O_API_VERSION")
  
# Create an Azure OpenAI client  
client = AzureOpenAI(  
    api_key=api_key,  
    api_version=version,  # Use the latest supported Azure API version  
    azure_endpoint=api_endpoint,  
)  
  
# Initialize tokenizer  
tokenizer = tiktoken.encoding_for_model("gpt-4")  # Use the appropriate model name  
  
# Function to calculate token count  
def count_tokens(text):  
    return len(tokenizer.encode(text))  
  
# Function to generate responses  
def generate_response(prompt):  
    try:  
        response = client.chat.completions.create(  
            model=deployment_name,  # In Azure, 'model' should be the deployment name  
            messages=[  
                {"role": "system", "content": "You are an AI assistant. You should extract important information from a given passage."},  
                {"role": "user", "content": prompt}  
            ],  
            max_tokens=10000  
        )  
          
        # Extract response content  
        response_content = response.choices[0].message.content  
          
        # Count tokens in the response  
        token_count = response.usage.completion_tokens
          
        return response_content, token_count  
    except Exception as e:  
        return f"Error: {e}", 0

In [11]:
keys = list(dict_of_subchapters.keys())

**Base Prompt**

In [12]:
base_prompt = '''You are given a part of Ngspice manual. follow the instruction: 
                1) Strategically summarize,extract important content from the text, completely avoid unnecessary informations. 
                2) Specifically, only include the exact informations which is not already in your memory.
                3) The targeted use of the summary is to use in netlist code generation/modification/debugging/explannation and components/model/command queries.
                4) strongly include and preserve: codes/syntax/templates, default values and theoretical characteristics for components.
                5) if any example code present, include only every important/unique lines after checking line by line that it is not usually known to your database.
                6) Smartly and precisely include table contents.
                7) summary should be brief, precise, structured but avoid unnecessary formatting.
                Text: "{}"
            '''

**summary of every subchapter**

In [13]:
def create_file_name(input_string):  
    # Remove the last character if it's "."  
    if input_string.endswith("."):  
        input_string = input_string[:-1]
    input_string = input_string.replace("`","")
    input_string = input_string.replace("'","")
    input_string = input_string.replace("-"," ")
    # Replace specified characters with "_"  
    replacements = ["/","   ", "  ", ",", " ", ".","|",":"]  
    for char in replacements:  
        input_string = input_string.replace(char, "_")
    input_string = input_string.replace("__","_")
      
    return input_string+".txt"

In [14]:
for name in keys:
    file_name = create_file_name(name)
    path = "subchapter_summaries/"+file_name
    if os.path.exists(path):
        continue
    prompt = base_prompt.format(dict_of_subchapters[name])
    text_content, number_of_tokens = generate_response(prompt)
    print(file_name, number_of_tokens)
    with open(path, "w", encoding="utf-8") as file:
        file.write(text_content)

2_10_2_Brace_expressions_in_circuit_elements_.txt 100
4_2_1_Gxxxx_Linear_Voltage_Controlled_Current_Sources_(VCCS).txt 224
4_2_2_Exxxx_Linear_Voltage_Controlled_Voltage_Sources_(VCVS).txt 204
4_2_3_Fxxxx_Linear_Current_Controlled_Current_Sources_(CCCS).txt 217
4_2_4_Hxxxx_Linear_Current_Controlled_Voltage_Sources_(CCVS).txt 158
5_1_Bxxxx_Nonlinear_dependent_source_(ASRC).txt 223
5_2_Exxxx_non_linear_voltage_source.txt 199
5_3_Gxxxx_non_linear_current_source.txt 190
11_2_1_NODESET_Specify_Initial_Node_Voltage_Guesses.txt 185
11_2_2_IC_Set_Initial_Conditions.txt 389
11_3_1_AC_Small_Signal_AC_Analysis.txt 546
11_3_2_DC_DC_Transfer_Function.txt 489
11_3_3_DISTO_Distortion_Analysis.txt 746
11_3_4_NOISE_Noise_Analysis.txt 451
11_3_5_OP_Operating_Point_Analysis.txt 725
11_3_6_PZ_Pole_Zero_Analysis.txt 443
11_3_7_SENS_DC_or_Small_Signal_AC_Sensitivity_Analysis.txt 518
11_3_9_TF_Transfer_Function_Analysis.txt 167
11_3_10_TRAN_Transient_Analysis.txt 422
11_3_12_PSS_Periodic_Steady_State_Analysis