# HTML 2 XLSX
Notebook to convert the HTML output of `AnyParser` to an XLSX file.

## Install Libraries

In [1]:
# Uncomment and run the lines below if these packages are not installed yet.
# !pip3 install BeautifulSoup4
# !pip3 install lxml
# !pip3 install openpyxl
# !pip3 install pandas



## Input Data
Load the HTML string produced by `AnyParser`, which is stored in the `html_2_xlsx/input` folder.

In [2]:
from html_2_xlsx.input.html_input import input_html

## Convert HTML to XLSX
The `html_to_excel` function parses the HTML string with **BeautifulSoup**, then uses **pandas** to write each `<table>` it finds to its own sheet of a single XLSX file.

In [3]:
import os
from bs4 import BeautifulSoup
import pandas as pd
from io import StringIO


def html_to_excel(html_string, output_folder, output_filename):
    """Write every ``<table>`` found in an HTML string to one XLSX workbook.

    Each table becomes its own worksheet, named ``Table_1``, ``Table_2``, ...
    in the order ``find_all('table')`` returns them, and is written without
    the DataFrame index.

    Args:
        html_string: HTML markup to scan for ``<table>`` elements.
        output_folder: Directory that receives the workbook. It is created
            (including parents) when missing. An empty string means the
            current working directory.
        output_filename: File name of the workbook inside ``output_folder``.

    Raises:
        ValueError: If ``html_string`` contains no ``<table>`` element.
    """
    soup = BeautifulSoup(html_string, 'html.parser')
    tables = soup.find_all('table')

    # Fail early with a clear message, before any folder or file is created;
    # a workbook needs at least one sheet to be saved.
    if not tables:
        raise ValueError(
            "No <table> elements found in the HTML input; nothing to export."
        )

    # read_html returns a list of DataFrames; keep the first one per element.
    # The markup is wrapped in StringIO because pandas deprecates passing
    # literal HTML strings to read_html.
    dfs = {
        f"Table_{position}": pd.read_html(StringIO(str(table)))[0]
        for position, table in enumerate(tables, start=1)
    }

    # os.makedirs("") raises, so only create the folder when one was given;
    # exist_ok avoids the check-then-create race of a separate exists() test.
    if output_folder:
        os.makedirs(output_folder, exist_ok=True)

    output_file = os.path.join(output_folder, output_filename)
    with pd.ExcelWriter(output_file) as writer:
        for name, df in dfs.items():
            df.to_excel(writer, sheet_name=name, index=False)

    print(f"Excel file saved to {output_file}")


Run `html_to_excel` to convert the HTML to XLSX.

In [4]:
# Destination of the generated workbook: folder first, then the file name.
output_folder = 'html_2_xlsx/output'
output_file = 'html_2_excel_output.xlsx'

# Convert the loaded HTML into the XLSX workbook at the destination above.
html_to_excel(
    html_string=input_html,
    output_folder=output_folder,
    output_filename=output_file,
)

Excel file saved to html_2_xlsx/output/html_2_excel_output.xlsx
