# A full business solution

Create a product that uses a company's website to build a brochure for prospective clients, investors, and potential recruits.

In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display, update_display
from openai import OpenAI

In [2]:
# Initialize and constants
# load_dotenv(override=True)  # left disabled: see the note on the next line
api_key = os.getenv('OPENAI_API_KEY')  # OPENAI_API_KEY is in Path variable on local machine

# Shape check only (present, expected prefix, longer than 10 characters);
# it does not prove that the key is accepted by the API.
if not api_key or not api_key.startswith('sk-proj-') or len(api_key) <= 10:
    print("There might be a problem with your API key? Please visit the troubleshooting notebook!")
else:
    print("API key looks good so far")

# Shared client and model name used by the cells below.
openai = OpenAI()
MODEL = 'gpt-4o-mini'

API key looks good so far


In [3]:
#Use class to represent a webpage

class Website:
    """
    A utility class to represent a website that we have scraped.

    Creating an instance performs an HTTP GET on ``url`` and parses the
    response with BeautifulSoup's built-in ``html.parser``.

    Attributes:
        url: The address that was requested.
        title: Text of the page's <title> tag, or "No title found" when the
            tag is missing or does not hold a single string.
        body: Raw response payload (bytes) taken from ``response.content``.
        links: Every non-empty ``href`` found on an <a> tag, exactly as written
            in the page (so entries may be relative or fragment-only).
        text: Text of <body> after script/style/img/input elements are
            removed, fragments separated by newlines; "" when there is no <body>.
    """
    #data structure
    url: str
    title: str
    body: bytes
    links: List[str]
    text: str

    #initialize
    def __init__(self, url, timeout=10):
        """
        Fetch ``url`` and populate the attributes described on the class.

        Args:
            url: Address of the page to scrape.
            timeout: Seconds handed to ``requests.get``. Without a timeout a
                server that never answers would block the notebook forever.
        """
        self.url = url
        response = requests.get(url, timeout=timeout)  # stores response from url
        self.body = response.content  # raw HTML payload
        soup = BeautifulSoup(self.body, 'html.parser')  # parses HTML content

        # .string is None when <title> is empty or wraps nested tags, so check
        # the value itself and not just the presence of the tag.
        title_text = soup.title.string if soup.title else None
        self.title = title_text if title_text else "No title found"

        # if body tag exists - remove some elements since they don't contain meaningful text
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            # clean text from <body>, one fragment per line
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""

        # collect the href of every <a> tag, dropping missing/empty values
        hrefs = (anchor.get('href') for anchor in soup.find_all('a'))
        self.links = [href for href in hrefs if href]

    def get_contents(self):
        """Return the title and cleaned body text as one labelled string."""
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"
            

In [5]:
# Test it with Reddit: constructing Website fetches and parses the page over the network
reddit = Website('http://www.reddit.com/')
# get_contents() returns the page title plus the cleaned body text as a single string
print(reddit.get_contents())

Webpage Title:
Reddit - Heart of the internet
Webpage Contents:
Skip to main content
Open menu
Open navigation
Go to Reddit Home
Get App
Get the Reddit app
Log In
Log in to Reddit
Expand user menu
Open settings menu
Black Mirror S7 trailer
Black Mirror: Season 7 | Official Trailer | Netflix
r/blackmirror
and more
Playboi Carti album date
Boys we made it
r/playboicarti
and more
Rudolph returns to Steelers
[Garofolo] A Steel City reunion: QB Mason Rudolph has agreed to a two-year, $8 million deal with $4.5 million to return to the #Steelers, sources tell me and @RapSheet. Rudolph mulled multiple offers, including a return to the #Titans, but was motivated to head back to Pittsburgh.
r/steelers
and more
Glaser to host Golden Globes
Nikki Glaser to Return as Golden Globes Host for 2026 Ceremony
r/television
and more
Half-Life 2 RTX demo
Half Life 2 RTX Demo coming March 18th
r/HalfLife
and more
Invincible S3 finale
Enough time has passed Amazon
r/Invincible
and more
Hot
Open sort options
B

In [6]:
# Show the non-empty href values collected from the page's <a> tags (may be relative)
reddit.links

['#main-content',
 '/',
 'https://www.reddit.com/login/',
 '/search/?q=%22Black+Mirror%22+AND+%22Season+7%22+AND+trailer&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=1b06cb92-d34d-432a-b810-d2c5974cb6db',
 '/search/?q=subreddit%3A%22playboicarti%22&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=8a2f64d8-a2ee-4cee-b31b-fce1a9e950ed',
 '/search/?q=%22Mason+Rudolph%22+OR+%28subreddit%3Asteelers+%28Rudolph+OR+back%29%29&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=cfabb6a5-6a2c-48de-9db7-86ceea7204a7',
 '/search/?q=Glaser+AND+%22Golden+Globes%22&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=d736b01c-9c6f-4cc2-b82e-7cf579b713cb',
 '/search/?q=%28%22Half-Life%22+OR+%22Half+Life%22%29+AND+RTX&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=113059d8-74fd-457b-ab4f-3e7f09ad4773',
 '/search/?q=subreddit%3A%22Invincible%22&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=0a6f9469-bb30-4ebf-ad49-6eb8ba1216f1',
 '

# Have GPT figure out which links are relevant #
### Use a call to gpt-4o-mini to read links and respond in structured JSON ###

We want the model to decide which links are relevant and to replace relative links such as `/about` with full URLs such as "https://company.com/about".
This is one-shot prompting: the prompt provides a single example of how the model should respond.

In [11]:
# System prompt for the link-selection call. It carries one worked JSON example
# (one-shot prompting) so the model replies in the same shape.
# "Career/jobs" is written with a forward slash: a backslash followed by "j" is
# not a valid escape sequence in a regular string literal and makes Python warn.
link_system_prompt = """
You have a list of links found on a webpage.
You are able to decide which of the links would be most relevant to include in a brochure about the company,
such as links to an about me page, company page, or a Career/jobs page.
You should respond in JSON as in this example: 
{
"links": [
    {"type":"About Page", "url":"https://www.company_name.com/about"},
    {"type":"Career Page","url":"https://www.company_name.com/maybe/other/links/careers"}
    ]
}
"""

  link_system_prompt = """


In [12]:
def get_user_links_prompt(website):
    """
    Build the user message that asks the model to pick brochure-relevant links.

    Args:
        website: Object exposing ``url`` and ``links``; both are interpolated
            into the text as-is (``links`` through its default string form).

    Returns:
        The prompt text. Each line keeps the four-space indent and the
        leading/trailing line breaks of the original template.
    """
    prompt_lines = [
        "",
        f"    Here is the list of links on the website of {website.url}",
        "    Please decide which of these are relevant web links for a brochure about the company.  Respond with the full url and do not include",
        "    terms of Service, Privacy, or email links.",
        "    ",
        f"    The links are here (some may be relative): {website.links}",
        "    ",
    ]
    return "\n".join(prompt_lines)
    

In [14]:
# Preview the user prompt built from the links scraped off the Reddit page
print(get_user_links_prompt(reddit))


    Here is the list of links on the website of http://www.reddit.com/
    Please decide which of these are relevant web links for a brochure about the company.  Respond with the full url and do not include
    terms of Service, Privacy, or email links.

    The links are here (some may be relative): ['#main-content', '/', 'https://www.reddit.com/login/', '/search/?q=%22Black+Mirror%22+AND+%22Season+7%22+AND+trailer&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=1b06cb92-d34d-432a-b810-d2c5974cb6db', '/search/?q=subreddit%3A%22playboicarti%22&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=8a2f64d8-a2ee-4cee-b31b-fce1a9e950ed', '/search/?q=%22Mason+Rudolph%22+OR+%28subreddit%3Asteelers+%28Rudolph+OR+back%29%29&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=cfabb6a5-6a2c-48de-9db7-86ceea7204a7', '/search/?q=Glaser+AND+%22Golden+Globes%22&source=trending&cId=5831e34c-84bc-4e1b-a553-8b09278a3c8c&iId=d736b01c-9c6f-4cc2-b82e-7cf579b713cb', '/search/

In [None]:
# Now let's write the function that asks the model to pick the relevant links

def get_links(url):
    """
    Scrape ``url`` and ask the model which of its links belong in a brochure.

    Args:
        url: Address of the page whose links should be classified.

    Returns:
        The model's reply decoded with ``json.loads``. The system prompt asks
        for an object with a "links" list, but the shape is not validated here.
    """
    page = Website(url)
    chat_messages = [
        {"role": "system", "content": link_system_prompt},
        {"role": "user", "content": get_user_links_prompt(page)},
    ]
    # response_format asks OpenAI to reply with a JSON object; this option is OpenAI-specific.
    reply = openai.chat.completions.create(
        model=MODEL,
        messages=chat_messages,
        response_format={"type": "json_object"},
    )
    return json.loads(reply.choices[0].message.content)

In [22]:
# Run the link selection end to end; the cell output is the parsed JSON reply
get_links("https://reddit.com")

{'links': [{'type': 'About Page', 'url': 'https://www.redditinc.com'},
  {'type': 'Career Page', 'url': 'https://www.redditinc.com/careers'},
  {'type': 'Company Page', 'url': 'https://www.redditinc.com/press'}]}

# Make a brochure #

Here we want to assemble all the details from above into a separate prompt.