# Agentic 1 (Business Brouchre)

In [1]:
import os
import requests
import json
from typing import List
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display

# Initialize and constants
load_dotenv(override=True)
api_key = os.getenv('GEMINI_API_KEY')

if not api_key:
    raise ValueError("No API key was found. Please ensure the GEMINI_API_KEY is set in your .env file.")
elif not api_key.startswith("AIza"):
    raise ValueError("The API key was found but does not start with 'AIza'. Check if it is valid.")
elif api_key.strip() != api_key:
    raise ValueError("The API key contains leading or trailing spaces. Please fix it.")
else:
    print("API key found and looks valid!")

MODEL = 'gemini-pro'  # Gemini model name
API_ENDPOINT = "https://generativelanguage.googleapis.com/v1beta/models/gemini-pro:generateContent"

# Define headers to include the API key and mimic a browser request
headers = {
    "Content-Type": "application/json",
    "x-goog-api-key": api_key,  # Use the API key in the header
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36"
}

# A class to represent a Webpage
class Website:
    """
    A utility class to represent a Website that we have scraped, now with links
    """

    def __init__(self, url):
        self.url = url
        response = requests.get(url, headers=headers)
        self.body = response.content
        soup = BeautifulSoup(self.body, 'html.parser')
        self.title = soup.title.string if soup.title else "No title found"
        if soup.body:
            for irrelevant in soup.body(["script", "style", "img", "input"]):
                irrelevant.decompose()
            self.text = soup.body.get_text(separator="\n", strip=True)
        else:
            self.text = ""
        links = [link.get('href') for link in soup.find_all('a')]
        self.links = [link for link in links if link]

    def get_contents(self):
        return f"Webpage Title:\n{self.title}\nWebpage Contents:\n{self.text}\n\n"

# System prompt for generating the brochure
system_prompt = (
    "You are an assistant that analyzes the contents of several relevant pages from a company website "
    "and creates a short brochure about the company for prospective customers, investors and recruits. "
    "Respond in markdown. Include details of company culture, customers and careers/jobs if you have the information."
)

# Generate user prompt for extracting relevant information
def get_links_user_prompt(website):
    user_prompt = f"Here is the list of links on the website of {website.url} - "
    user_prompt += "please decide which of these are relevant web links for a brochure about the company, respond with the full https URL in JSON format. "
    user_prompt += "Do not include Terms of Service, Privacy, email links.\n"
    user_prompt += "Links (some might be relative links):\n"
    user_prompt += "\n".join(website.links)
    return user_prompt

def get_links(url):
    website = Website(url)
    messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_links_user_prompt(website)}
    ]
    data = {
      "contents": [
        {
          "parts": [
              {
                "text": message["content"]
               } for message in messages
           ]
        }
      ]
    }

    response = requests.post(API_ENDPOINT, headers=headers, json=data)

    if response.status_code == 200:
        try:
            json_response = response.json()
            if 'candidates' in json_response and json_response['candidates']:
                if  'content' in json_response['candidates'][0] and 'parts' in json_response['candidates'][0]['content']:
                    if json_response['candidates'][0]['content']['parts'][0] and 'text' in json_response['candidates'][0]['content']['parts'][0]:
                        result = json_response['candidates'][0]['content']['parts'][0]['text']
                        return json.loads(result)
        except (json.JSONDecodeError, KeyError, TypeError) as e:
             print(f"JSON Decode Error: {e}. Response Text: {response.text}")

    print(f"Error {response.status_code}: {response.text}")
    return {"links": []}

   

# Function to get all details and content of the website
def get_all_details(url):
    result = "Landing page:\n"
    result += Website(url).get_contents()
    links = get_links(url)
    print("Found links:", links)
    for link in links.get("links",[]):
        result += f"\n\n{link['type']}\n"
        result += Website(link["url"]).get_contents()
    return result

# User prompt to build the brochure
def get_brochure_user_prompt(company_name, url):
    user_prompt = f"You are looking at a company called: {company_name}\n"
    user_prompt += f"Here are the contents of its landing page and other relevant pages; use this information to build a short brochure of the company in markdown.\n"
    user_prompt += get_all_details(url)
    user_prompt = user_prompt[:5_000]  # Truncate if more than 5,000 characters
    return user_prompt

# Final function to create the brochure
def create_brochure(company_name, url):
     messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": get_brochure_user_prompt(company_name, url)}
        ]
     data = {
      "contents": [
        {
          "parts": [
              {
                "text": message["content"]
               } for message in messages
           ]
        }
      ]
    }

     response = requests.post(API_ENDPOINT, headers=headers, json=data)
     if response.status_code == 200:
        json_response = response.json()
        if 'candidates' in json_response and json_response['candidates']:
            if  'content' in json_response['candidates'][0] and 'parts' in json_response['candidates'][0]['content']:
                 if json_response['candidates'][0]['content']['parts'][0] and 'text' in json_response['candidates'][0]['content']['parts'][0]:
                    result = json_response['candidates'][0]['content']['parts'][0]['text']
                    display(Markdown(result))
                    return
     print(f"Error {response.status_code}: {response.text}")
     return

API key found and looks valid!


In [2]:
create_brochure("ScikitLearn", "https://scikit-learn.org/stable")

JSON Decode Error: Expecting value: line 1 column 1 (char 0). Response Text: {
  "candidates": [
    {
      "content": {
        "parts": [
          {
            "text": "```json\n{\n  \"relevant_urls\": [\n    \"https://scikit-learn.org/stable/about.html\",\n    \"https://scikit-learn.org/dev/governance.html\"\n  ]\n}\n```"
          }
        ],
        "role": "model"
      },
      "finishReason": "STOP",
      "index": 0,
      "safetyRatings": [
        {
          "category": "HARM_CATEGORY_SEXUALLY_EXPLICIT",
          "probability": "NEGLIGIBLE"
        },
        {
          "category": "HARM_CATEGORY_HATE_SPEECH",
          "probability": "NEGLIGIBLE"
        },
        {
          "category": "HARM_CATEGORY_HARASSMENT",
          "probability": "NEGLIGIBLE"
        },
        {
          "category": "HARM_CATEGORY_DANGEROUS_CONTENT",
          "probability": "NEGLIGIBLE"
        }
      ]
    }
  ],
  "usageMetadata": {
    "promptTokenCount": 1308,
    "candidatesTokenC

## Scikit-Learn

Scikit-Learn is a free software machine learning library for the Python programming language.

### Mission statement
Scikit-Learn exists to improve the world through open source machine learning.

### Values
* Simplicity: We focus on making scikit-learn easy to learn and use.
* Efficiency: We strive to make scikit-learn as fast as possible.
* Reusability: We design scikit-learn to be used in a wide variety of applications.
* Openness: We are committed to open source and believe that scikit-learn should be freely available to everyone.

### Team
Scikit-Learn is developed and maintained by a team of volunteers from around the world. Our team is made up of scientists, engineers, and data enthusiasts who are passionate about making machine learning accessible to everyone.

### Products
Scikit-Learn offers a wide range of machine learning algorithms, including:
* Classification algorithms
* Regression algorithms
* Clustering algorithms
* Dimensionality reduction algorithms
* Model selection algorithms
* Preprocessing algorithms

### Who uses Scikit-Learn?
Scikit-Learn is used by a wide variety of organizations, including:
* Google
* Facebook
* Microsoft
* Amazon
* Netflix
* Airbnb
* Uber
* Lyft
* Spotify
* Pinterest
* Twitter
* LinkedIn
* Reddit
* Quora
* Yelp
* Indeed
* Glassdoor
* Coursera
* Udacity
* Pluralsight
* DataCamp
* edX
* MIT OpenCourseWare
* Stanford Online
* HarvardX

### Join us!
We are always looking for new contributors to help us make Scikit-Learn even better. If you are interested in contributing, please visit our website or join our community on Slack.

### Contact us
If you have any questions, please contact us at info@scikit-learn.org.