In [2]:
import os
from dotenv import load_dotenv
load_dotenv()

import nest_asyncio
nest_asyncio.apply()
from pprint import pprint

from docx import Document
from docx.shared import Inches, Pt
from docx.enum.text import WD_PARAGRAPH_ALIGNMENT

from langchain.schema import (
    AIMessage,
    HumanMessage,
    SystemMessage
)
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import AsyncHtmlLoader
from langchain.document_transformers import BeautifulSoupTransformer

In [3]:
# Product page(s) to summarise. Kept as a list because this value is handed
# unchanged to AsyncHtmlLoader via write_to_docx -> summarize_weblink.
url = [
    "https://skuterymotocykle.pl/pl/p/CROSS-KXD-607M-1412-manual-E-Start/2809",
]

In [8]:
def summarize_weblink(url):
    """Fetch a product page and return a chat-model summary of it.

    The page(s) at ``url`` are downloaded, reduced to the content of their
    ``<div>`` tags, and the text of the FIRST resulting document is sent to
    the chat model together with a fixed system prompt asking for the top 5
    product features in Polish followed by the price.

    Args:
        url: Passed straight to ``AsyncHtmlLoader``; the notebook supplies a
            one-element list of URL strings.

    Returns:
        The ``content`` attribute of the chat model's reply.

    Note:
        Only index 0 of the transformed documents is used, so any additional
        pages that were loaded are ignored.
    """
    # Download the raw HTML for the requested link(s).
    raw_pages = AsyncHtmlLoader(url).load()

    # Restrict extraction to <div> tags before handing the text to the model.
    cleaned_pages = BeautifulSoupTransformer().transform_documents(
        raw_pages, tags_to_extract=["div"]
    )

    llm = ChatOpenAI()

    instructions = SystemMessage(
        content="You are a helpful assistant that summarize product specification from given input. List top 5 product features in polish. Below the top 5 print price of the product"
    )
    page_text = HumanMessage(content=cleaned_pages[0].page_content)

    reply = llm([instructions, page_text])
    return reply.content



def write_to_docx(filename="output.docx", logo_path="logo.png", urls=None):
    """Build a commercial-offer document ("Oferta handlowa") and save it as .docx.

    Layout produced:
      * header  - the image at ``logo_path``, 1.5 inches wide
      * footer  - a fixed company address line
      * body    - six empty spacer paragraphs, a centred 24 pt title, then the
                  product summary returned by ``summarize_weblink``

    Args:
        filename: Path of the .docx file to write.
        logo_path: Path of the image inserted into the page header.
        urls: Link(s) forwarded to ``summarize_weblink``. When left as
            ``None`` the notebook-level ``url`` variable is used, which keeps
            existing ``write_to_docx()`` calls working unchanged.

    Side effects:
        Calls ``summarize_weblink`` (web fetch plus a chat-model request),
        writes ``filename`` to disk and prints a confirmation line.
    """
    # Resolve the source explicitly instead of reading the global deep inside
    # the body, so callers can generate offers for other links.
    source_urls = url if urls is None else urls

    # Create new document
    doc = Document()

    # Add company logo to the header
    section = doc.sections[0]
    header = section.header
    header.paragraphs[0].add_run().add_picture(logo_path, width=Inches(1.5))  # Adjust width as needed

    # Add address to the footer
    footer = section.footer
    footer.paragraphs[0].text = "Firma XYZ, Miast, ul. Ulica 7, email: biuro@gmail.com"

    # Six empty paragraphs act as vertical spacing above the title
    for _ in range(6):
        doc.add_paragraph()

    # Centred "Oferta handlowa" title in 24 pt font
    oferta_paragraph = doc.add_paragraph()
    oferta_paragraph.alignment = WD_PARAGRAPH_ALIGNMENT.CENTER
    run = oferta_paragraph.add_run("Oferta handlowa")
    run.font.size = Pt(24)

    # Generate the product summary and append it below the title
    text = summarize_weblink(source_urls)
    doc.add_paragraph(text)

    # Save the document under the given filename
    doc.save(filename)
    print(f"Text saved to {filename}")

In [9]:
# Entry point: build the offer document using the default output file name
# and logo path of write_to_docx.
if __name__ == "__main__":
    write_to_docx()

Fetching pages:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching pages: 100%|##########| 1/1 [00:00<00:00,  1.32it/s]


Text saved to output.docx
