In [1]:
from bs4 import BeautifulSoup
from bs4.element import Comment
import urllib.request
import openai
from sentence_transformers import SentenceTransformer, util
import os

  from .autonotebook import tqdm as notebook_tqdm


In [15]:
sections = ['navigation', 'page heading', 'buying selections', 'checkout options']
similarityThreshold = 0.3
model = SentenceTransformer('sentence-transformers/all-roberta-large-v1')
url = 'https://boxbox.in/products/copy-of-boxbox-oversized-t-shirt-44-lewis-hamilton-1'

# HELPER FUNCTIONS

In [16]:
openai.api_key = os.getenv("OPENAI_API_KEY")

def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    
    if str(element) == '\n':
        return False
    
    if str(element) == " ":
        return False
    
    return True


def text_from_html(body):
    soup = BeautifulSoup(body, 'html.parser')
    texts = soup.findAll(text=True)
    visible_texts = filter(tag_visible, texts)  
    return texts, u" ".join(t.strip() for t in visible_texts)


def queryOracle(text):
    if len(sections) <= 1:
        sectionsOfInterest = "".join(sections)
    else:
        sectionsOfInterest = ", ".join(sections[:-1]) + " and " + sections[-1]

    completion = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    temperature = 0.8,
    max_tokens = 2000,
    messages = [
        {"role": "system", "content": "You are a webpage analyzer. You find sections of text on an e-commerce webpage that correspond to {}.".format(sectionsOfInterest)},
        {"role": "user", "content": text}
    ]
    )
    return completion


def findOverlap(readableText, lines):
    text = ""
    for count, i in enumerate(lines):
        text += str(count) + ") " + i

    completion = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    temperature = 0.8,
    max_tokens = 2000,
    messages = [
        {"role": "system", "content": "You a text overlap analyzer. You find sections of a paragraph by giving the exact sequence of text that overlap most with a list of sentences."},
        {"role": "user", "content": "Paragraph: {}\n\n\n Sentences: {}".format(readableText, text)}
    ]
    )

    return completion

def bert_sim(text1, text2):
    embeddings = model.encode([text1,text2])
    res = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    return res.numpy()[0][0] 



In [2]:
completion = openai.ChatCompletion.create(
    model = "gpt-3.5-turbo",
    temperature = 0.8,
    max_tokens = 2000,
    messages = [
        {"role": "system", "content": "You a text overlap analyzer. You find sections of a paragraph by giving the exact sequence of text that overlap most with a list of sentences."},
        {"role": "user", "content": "Paragraph: {}\n\n\n Sentences: {}".format("hello world bla bla", "bla bla")}
    ]
    )

In [3]:
completion

<OpenAIObject chat.completion id=chatcmpl-8FrLKFyBKpvMtpZTvaE0zVGPZBOX1 at 0x292e1e270> JSON: {
  "id": "chatcmpl-8FrLKFyBKpvMtpZTvaE0zVGPZBOX1",
  "object": "chat.completion",
  "created": 1698790662,
  "model": "gpt-3.5-turbo-0613",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "The exact sequence of text that overlaps most with the list of sentences is \"bla bla\"."
      },
      "finish_reason": "stop"
    }
  ],
  "usage": {
    "prompt_tokens": 51,
    "completion_tokens": 18,
    "total_tokens": 69
  }
}

# SCRAPE PAGE & FIND CONTENT SECTIONS

In [17]:
html = urllib.request.urlopen(url).read()
elements, readableText = text_from_html(html)
elements = list(filter(tag_visible, elements))
response = queryOracle(readableText).choices[0].message
response

  texts = soup.findAll(text=True)


<OpenAIObject at 0x29532f6b0> JSON: {
  "role": "assistant",
  "content": "Navigation: \n- \"Home\"\n- \"Formula 1\"\n- \"Football\"\n- \"Contact\"\n- \"About Us\"\n- \"Blogs\"\n- \"Smiles\"\n- \"Log in\"\n- \"Instagram\"\n\nPage Heading: \n- \"Formula 1 T-shirts Oversized T-shirts Sweatshirts Hoodies\"\n\nBuying Selections: \n- \"Oversized T-shirt Lewis Hamilton Sketch\"\n- Size options: S, M, L, XL\n- Color options: Black, White\n- Quantity selection\n\nCheckout Options: \n- \"Add to cart\"\n- \"Couldn't load pickup availability\"\n- \"Refresh\"\n- \"FREE SHIPPING across INDIA!\"\n- \"Easy returns and replacements\"\n- \"Payment methods\""
}

In [18]:
capSections = [i.title() for i in sections]

parts = []
inds = []
for count, i in enumerate(capSections):
    p = response.content.find(i, 0)
    start = p+len(i)+1
    inds.append(p+len(i)+1)
    sub = response.content[start : len(response.content)]
    parts.append(sub)
    if (count > 0):
        parts[count-1] = parts[count-1][0: p - inds[count-1]]


parts

[' \n- "Home"\n- "Formula 1"\n- "Football"\n- "Contact"\n- "About Us"\n- "Blogs"\n- "Smiles"\n- "Log in"\n- "Instagram"\n\n',
 ' \n- "Formula 1 T-shirts Oversized T-shirts Sweatshirts Hoodies"\n\n',
 ' \n- "Oversized T-shirt Lewis Hamilton Sketch"\n- Size options: S, M, L, XL\n- Color options: Black, White\n- Quantity selection\n\n',
 ' \n- "Add to cart"\n- "Couldn\'t load pickup availability"\n- "Refresh"\n- "FREE SHIPPING across INDIA!"\n- "Easy returns and replacements"\n- "Payment methods"']

# FIND HTML CODE FOR CONTENT SECTIONS

In [19]:
code = {}
for count, section in enumerate(sections):
    print("Finding HTML Code for {} Section".format(section))
    l = len(parts[count])
    res = []
    vals = []
    for i in elements:
        em = bert_sim(i.text, parts[0])
        res.append(em)
        vals.append(i.text)

    inds = [c for c, i in enumerate(res) if i > similarityThreshold]
    code[section] = [elements[i].parent for i in inds]


Finding HTML Code for navigation Section
Finding HTML Code for page heading Section
Finding HTML Code for buying selections Section
Finding HTML Code for checkout options Section


In [20]:
code['page heading'][0]

<a class="menu-drawer__menu-item list-menu__item link link--text focus-inset" href="/" id="HeaderDrawer-home">
                      Home
                    </a>