In [28]:
from openai import OpenAI
from os import getenv
import pandas as pd
import numpy as np

from playwright.async_api import async_playwright
from bs4 import BeautifulSoup

In [23]:
# Shared OpenAI client used by get_embedding below. The API key is read from
# the OPENAI_API_KEY environment variable instead of being hard-coded.
openai_client = OpenAI(api_key=getenv("OPENAI_API_KEY"))

In [24]:
async def extract_data(url, headless=False):
  """Open ``url`` in Chromium via Playwright and return the page's HTML.

  Args:
    url: Address of the page to open.
    headless: Forwarded to ``chromium.launch``. Defaults to ``False`` (a
      visible browser window), which matches the previously hard-coded value.

  Returns:
    The string produced by ``page.content()`` once the page has loaded.
  """
  async with async_playwright() as p:
    browser = await p.chromium.launch(headless=headless)
    try:
      page = await browser.new_page()

      await page.goto(url)
      await page.wait_for_load_state()

      return await page.content()
    finally:
      # Close the browser even when navigation or content retrieval raises.
      await browser.close()
  
def clean_html(html_content):
  """Return ``html_content`` re-serialized without non-content tags.

  The markup is parsed with BeautifulSoup's ``html.parser``. Every ``script``,
  ``style``, ``svg``, ``path``, ``meta`` and ``link`` element is detached from
  the tree, and the remaining document is decoded back into a string.
  """
  removable_tags = ["script", "style", "svg", "path", "meta", "link"]
  document = BeautifulSoup(html_content, 'html.parser')

  for element in document.find_all(removable_tags):
    element.extract()

  return document.decode()

def text_chunks(text: str, chunk_size: int = 500, overlap: int = 0):
  """Split ``text`` into consecutive slices of at most ``chunk_size`` characters.

  Args:
    text: String to split. An empty string yields an empty list.
    chunk_size: Maximum length of each chunk; must be positive.
    overlap: Number of characters from the end of one chunk that are repeated
      at the start of the next. Must satisfy ``0 <= overlap < chunk_size``.
      The default of ``0`` gives non-overlapping chunks.

  Returns:
    A list of substrings in their original order. Trailing chunks may be
    shorter than ``chunk_size``.

  Raises:
    ValueError: If ``chunk_size`` is not positive, or ``overlap`` is negative
      or not smaller than ``chunk_size``.
  """
  if chunk_size <= 0:
    # A negative size would otherwise silently produce an empty list.
    raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")
  if not 0 <= overlap < chunk_size:
    raise ValueError(
      f"overlap must satisfy 0 <= overlap < chunk_size, got {overlap}"
    )

  # Each chunk starts `step` characters after the previous one.
  step = chunk_size - overlap
  return [text[i:i + chunk_size] for i in range(0, len(text), step)]

def get_embedding(text: str, model="text-embedding-3-small"):
  """Request an embedding for ``text`` through the module-level ``openai_client``.

  Args:
    text: Input passed to ``embeddings.create``.
    model: Name of the embedding model to query.

  Returns:
    The ``embedding`` attribute of the first item in the response's ``data``.
  """
  result = openai_client.embeddings.create(input=text, model=model)
  first_item = result.data[0]
  return first_item.embedding

def cosine_similarity(vec1, vec2):
  """Computes the cosine similarity between two vectors.

  Args:
    vec1: First vector (any array-like accepted by ``np.asarray``).
    vec2: Second vector, same length as ``vec1``.

  Returns:
    ``dot(vec1, vec2) / (norm(vec1) * norm(vec2))``. When either vector has
    zero norm the similarity is undefined and ``0.0`` is returned instead of
    ``nan``.
  """
  vec1 = np.asarray(vec1)
  vec2 = np.asarray(vec2)

  denominator = np.linalg.norm(vec1) * np.linalg.norm(vec2)
  if denominator == 0:
    # Avoid the 0/0 division, which yields nan and a RuntimeWarning.
    return 0.0

  return np.dot(vec1, vec2) / denominator

In [31]:
# Fetch the rendered Mercado Livre listing page for the "mouse sem fio" search,
# strip the non-content tags, and split the remaining HTML into chunks using
# text_chunks' default chunk size.
full_html = await extract_data("https://lista.mercadolivre.com.br/mouse-sem-fio#D[A:mouse%20sem%20fio]")
cleaned_html = clean_html(full_html)
chunks = text_chunks(cleaned_html)

In [32]:
# One row per HTML chunk; later cells add embedding and similarity columns to
# this same frame.
df = pd.DataFrame({ "chunk": chunks })
df.head()

Unnamed: 0,chunk
0,"<!DOCTYPE html>\n<html lang=""pt-BR""><head><nos..."
1,"ontent"">Pular para o conteúdo</span></a><a cla..."
2,"""GET"" role=""search""><label class=""nav-header-v..."
3,"ategorySearch"">Somente em Mouses</label></p><b..."
4,"_journey&amp;me.position=0""><img alt=""Assine m..."


In [37]:
# Embed the natural-language query (Portuguese for "input and search button")
# so it can be compared against the chunk embeddings.
query_embedding = get_embedding("input e botão de busca")

In [33]:
# Embed every chunk. get_embedding is called once per row, so this issues one
# embeddings request per chunk.
df["chunk_embedding"] = df["chunk"].apply(get_embedding)
df.head()

Unnamed: 0,chunk,chunk_embedding
0,"<!DOCTYPE html>\n<html lang=""pt-BR""><head><nos...","[0.03531873971223831, 0.01972978003323078, -0...."
1,"ontent"">Pular para o conteúdo</span></a><a cla...","[0.0337677039206028, 0.010828710161149502, -0...."
2,"""GET"" role=""search""><label class=""nav-header-v...","[0.020205704495310783, 0.021671174094080925, -..."
3,"ategorySearch"">Somente em Mouses</label></p><b...","[0.0009286377462558448, 0.03196325898170471, -..."
4,"_journey&amp;me.position=0""><img alt=""Assine m...","[-0.011209553107619286, 0.010126677341759205, ..."


In [38]:
# Score each chunk against the query embedding, then keep the two rows with
# the highest cosine similarity.
df["similarities"] = df["chunk_embedding"].apply(
  cosine_similarity, args=(query_embedding,)
)
res = df.sort_values(by="similarities", ascending=False).head(2)
res

Unnamed: 0,chunk,chunk_embedding,similarities
2,"""GET"" role=""search""><label class=""nav-header-v...","[0.020205704495310783, 0.021671174094080925, -...",0.668593
134,"SENSOR_RESOLUTION"" enterkeyhint=""go"" id="":Rpig...","[0.031246818602085114, 0.033144205808639526, -...",0.64206
