In [1]:
%pip install pyppeteer_stealth pyppeteer urllib langchain langchain_openai --upgrade

[31mERROR: Could not find a version that satisfies the requirement urllib (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for urllib[0m[31m
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import asyncio
from pyppeteer import launch
from pyppeteer_stealth import stealth
import urllib.parse

In [3]:
# Explicit user-agent strings, used to make the headless browser harder to detect:
MOBILE_USER_AGENT = (
    "Mozilla/5.0 (Linux; Android 7.0; SM-G892A Build/NRD90M) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/59.0.3071.125 Mobile Safari/537.36"
)
DESKTOP_USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
    "AppleWebKit/537.36 (KHTML, like Gecko) "
    "Chrome/58.0.3029.110 Safari/537.3"
)

# TODO - You must edit this!
# Base domain of your own website (change it to the domain you want to analyse).
DOMAIN = "understandingdata.com"

In [4]:
async def screenshot_full_page(url, screenshot_filename, device_type="desktop"):
    """Save a full-page PNG screenshot of ``url`` taken with headless Chromium.

    Args:
        url: Address of the page to load.
        screenshot_filename: Path the screenshot bytes are written to.
        device_type: ``"mobile"`` emulates a 360x640 touch device using
            ``MOBILE_USER_AGENT``; any other value uses a 1280x800 viewport
            with ``DESKTOP_USER_AGENT``.

    Raises:
        Any exception raised by pyppeteer while launching, navigating or
        capturing propagates to the caller; the browser is closed first.
    """
    browser = await launch(headless=True)
    try:
        page = await browser.newPage()

        if device_type == "mobile":
            await page.emulate(
                {
                    # pyppeteer reads isMobile/hasTouch from the viewport dict;
                    # the same keys at the top level of the options are ignored.
                    "viewport": {
                        "width": 360,
                        "height": 640,
                        "isMobile": True,
                        "hasTouch": True,
                    },
                    "userAgent": MOBILE_USER_AGENT,
                }
            )
        else:
            # emulate() applies the viewport itself, so no separate
            # setViewport() call is needed.
            await page.emulate(
                {
                    "viewport": {"width": 1280, "height": 800},
                    "userAgent": DESKTOP_USER_AGENT,
                }
            )

        await stealth(page)
        await page.goto(url, {"waitUntil": "networkidle0"})

        # Scroll to the bottom so lazy-loaded images are requested, give them
        # time to arrive, then return to the top before capturing.
        await page.evaluate("window.scrollTo(0, document.body.scrollHeight);")
        await page.waitFor(2000)
        await page.evaluate("window.scrollTo(0, 0);")

        # Use fullPage instead of resizing the viewport by hand: calling
        # setViewport() without isMobile switches mobile emulation off and
        # makes pyppeteer reload the page, so the mobile capture would no
        # longer show the emulated (and already scrolled) page.
        screenshot_bytes = await page.screenshot({"fullPage": True})
    finally:
        # Always release the Chromium process, even when a step above fails.
        await browser.close()

    with open(screenshot_filename, "wb") as f:
        f.write(screenshot_bytes)

In [5]:
# List of URLs to take screenshots of (your own site first, then competitors):
urls = [
    "https://understandingdata.com/",  # Replace with your main website URL
    "https://www.dufrain.co.uk/data-solutions/data-engineering/",  # Replace with the first competitor URL
    "https://www.fdmgroup.com/services/technical-services/data-engineering/",  # Replace with the second competitor URL
]

# Make clean names from the urls using urllib (the host part of each URL):
clean_names = [urllib.parse.urlparse(url).netloc for url in urls]

# Validate against the host rather than the whole URL: screenshot filenames are
# built from the host, and later cells pick out "your" screenshots by looking
# for DOMAIN in those filenames. A URL that only mentions DOMAIN in its path or
# query string would pass a whole-URL check yet yield no matching screenshots.
if DOMAIN not in clean_names[0]:
    raise ValueError(f"The first URL must be from the domain {DOMAIN}")

# Loop through the URLs and take screenshots
async def take_screenshot(url, clean_name, device_type):
    """Capture ``url`` for one device type and store it under a derived filename.

    The output file is named ``screenshot_<clean_name>_<device_type>.png`` and a
    progress line is printed before the capture starts.
    """
    output_path = f"screenshot_{clean_name}_{device_type}.png"
    progress_message = f"Taking screenshot of {url} and saving to {output_path}"
    print(progress_message)
    await screenshot_full_page(url, output_path, device_type)

async def main():
    """Capture every URL in ``urls`` for both desktop and mobile, concurrently."""
    device_types = ["desktop", "mobile"]
    # One coroutine per (page, device) pair, in the same order as a nested loop.
    pending_captures = [
        take_screenshot(page_url, page_name, device)
        for page_url, page_name in zip(urls, clean_names)
        for device in device_types
    ]
    await asyncio.gather(*pending_captures)

# Run all captures now; this relies on the notebook's support for top-level `await`.
await main()

Taking screenshot of https://understandingdata.com/ and saving to screenshot_understandingdata.com_desktop.png
Taking screenshot of https://understandingdata.com/ and saving to screenshot_understandingdata.com_mobile.png
Taking screenshot of https://www.dufrain.co.uk/data-solutions/data-engineering/ and saving to screenshot_www.dufrain.co.uk_desktop.png
Taking screenshot of https://www.dufrain.co.uk/data-solutions/data-engineering/ and saving to screenshot_www.dufrain.co.uk_mobile.png
Taking screenshot of https://www.fdmgroup.com/services/technical-services/data-engineering/ and saving to screenshot_www.fdmgroup.com_desktop.png
Taking screenshot of https://www.fdmgroup.com/services/technical-services/data-engineering/ and saving to screenshot_www.fdmgroup.com_mobile.png


----------------------------------------------------------------------

## Making A Call To Vision API

In [4]:
import base64
import glob
from langchain_core.messages import SystemMessage, HumanMessage, AIMessage
from langchain_core.prompts import ChatPromptTemplate
from langchain_openai import ChatOpenAI
from langchain_core.output_parsers import PydanticOutputParser

In [5]:
# Load only the screenshots produced by this notebook (named "screenshot_*.png").
# A bare "*.png" would also pick up unrelated images in the working directory and
# send them to the model as competitor pages. Sorting keeps the order stable.
screenshot_files = sorted(glob.glob("screenshot_*.png"))

# Convert all of them to base64, keyed by filename:
screenshot_base64s = {}
for filename in screenshot_files:
    with open(filename, "rb") as f:
        screenshot_base64s[filename] = base64.b64encode(f.read()).decode("utf-8")

In [6]:
# Defining a pydantic model for output parsing:
from langchain_core.pydantic_v1 import BaseModel
from typing import List, Optional

# One strength or weakness reported for the landing page.
# NOTE(review): documented with comments rather than a docstring on purpose —
# pydantic is expected to copy a model docstring into its generated schema, which
# would change the format instructions built from these models. Confirm before
# converting these comments to docstrings.
class FeedbackAspect(BaseModel):
    aspect: str  # short label for the aspect, e.g. "Clean Design"
    description: str  # explanation of the observation
    recommendations: Optional[List[str]] = None  # optional suggested actions

# Top-level structure the model's JSON reply is parsed into.
# NOTE(review): comments are used instead of a docstring so the schema derived
# from this model (and therefore the prompt text) presumably stays unchanged.
class LandingPageFeedback(BaseModel):
    website_url: str  # URL of the site the feedback is about
    strengths: List[FeedbackAspect]  # required list of positive aspects
    areas_for_improvement: List[FeedbackAspect]  # required list of weaker aspects
    general_feedback: Optional[str] = None  # optional; defaults to None
    additional_comments: Optional[str] = None  # optional; defaults to None

In [7]:
# Define an output parser that turns the model's reply into a LandingPageFeedback
# instance; its format instructions are also embedded in the system prompt below.
output_parser = PydanticOutputParser(pydantic_object=LandingPageFeedback)

In [10]:
def _as_image_parts(filenames):
    """Return one ``image_url`` message part per screenshot filename.

    Args:
        filenames: Keys of ``screenshot_base64s`` to include, in order.

    Returns:
        A list of dicts holding each screenshot as a base64 data URL.
    """
    return [
        {
            "type": "image_url",
            "image_url": {
                # The screenshots are PNG files, so label them as such
                # (they were previously mislabelled as image/jpeg).
                "url": f"data:image/png;base64,{screenshot_base64s[filename]}"
            },
        }
        for filename in filenames
    ]


# Split the screenshots into your own site's pages and the competitors' pages;
# filenames contain the host name, so DOMAIN identifies your own.
own_screenshot_keys = [key for key in screenshot_base64s if DOMAIN in key]
competitor_screenshot_keys = [key for key in screenshot_base64s if DOMAIN not in key]

# Describe every competitor instead of assuming exactly two; for two entries
# this yields the same sentence as before.
competitor_summary = ", and ".join(
    f"{competitor_url} and {competitor_name}"
    for competitor_url, competitor_name in zip(urls[1:], clean_names[1:])
)

prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessage(
            content=(
                f"""Act as a marketing user researcher. 
                    You will receive a set of screenshots of your website and your different websites. 
                    Your website is {urls[0]} and {clean_names[0]}.
                    ---
                    The different websites are {competitor_summary}.
                    ---
                    Please provide a brief analysis of the screenshots and identify any areas for improvement on your website.
                    You are allowed to use the different websites for research purposes.
                    
                    You must output a JSON schema that follows the following format: {output_parser.get_format_instructions()} Only return back JSON.
                    """
            )
        ),
        HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": f"My website is {urls[0]} and {clean_names[0]}. Please find attached both the mobile and desktop version.",
                },
            ]
            + _as_image_parts(own_screenshot_keys),
        ),
        AIMessage(content='Thanks for providing your web pages in both desktop and mobile versions. Before analysing them, I will need to research the different websites to understand the competition. Can you provide some information on the different websites?'),
        HumanMessage(
            content=[
                {
                    "type": "text",
                    "text": "Sure, here are some competitor images"
                }
            ]
            + _as_image_parts(competitor_screenshot_keys)
        ),
    ]
)

In [11]:
# "gpt-4-vision-preview" has been retired by OpenAI, so requests to it fail;
# "gpt-4o" accepts the same text + image message format.
model = ChatOpenAI(model="gpt-4o", max_tokens=1000)

# Pipe the prompt into the model, then parse the reply into LandingPageFeedback.
chain = prompt | model | output_parser

# The prompt has no template variables, so an empty input mapping is passed.
result = chain.invoke({})
print(result)

website_url='https://understandingdata.com/' strengths=[FeedbackAspect(aspect='Clean Design', description='The website features a clean and modern design that is visually appealing and easy to navigate.', recommendations=[]), FeedbackAspect(aspect='Mobile Optimization', description='The website is well optimized for mobile devices, ensuring a good user experience across various screen sizes.', recommendations=[]), FeedbackAspect(aspect='Service Descriptions', description='The services offered are clearly described, providing potential clients with a good understanding of what is available.', recommendations=[])] areas_for_improvement=[FeedbackAspect(aspect='Content Depth', description="Competitors' websites contain more in-depth content regarding their services, which could help in SEO and providing detailed information to clients.", recommendations=['Expand the services section with more detailed descriptions and case studies.', 'Include a blog or resources section with articles, guid

In [12]:
# Check that the chain returned a parsed LandingPageFeedback object, not raw text:
print(type(result))

<class '__main__.LandingPageFeedback'>


In [20]:
# Show the parsed feedback as indented JSON for easier reading:
print(result.json(indent=2))

{
  "website_url": "https://understandingdata.com/",
  "strengths": [
    {
      "aspect": "Clean Design",
      "description": "The website features a clean and modern design that is visually appealing and easy to navigate.",
      "recommendations": []
    },
    {
      "aspect": "Mobile Optimization",
      "description": "The website is well optimized for mobile devices, ensuring a good user experience across various screen sizes.",
      "recommendations": []
    },
    {
      "aspect": "Service Descriptions",
      "description": "The services offered are clearly described, providing potential clients with a good understanding of what is available.",
      "recommendations": []
    }
  ],
  "areas_for_improvement": [
    {
      "aspect": "Content Depth",
      "description": "Competitors' websites contain more in-depth content regarding their services, which could help in SEO and providing detailed information to clients.",
      "recommendations": [
        "Expand the servi

---

## Edge Cases/Next Steps To Improve The Script:

1. Click on cookie banners to accept cookies, edge case: https://www.dufrain.co.uk/data-solutions/data-engineering/
2. Click on any pop ups such as banner ads, edge case: https://www.dufrain.co.uk/data-solutions/data-engineering/
3. Create an x,y coordinate grid overlay on the screenshots so that the vision model can refer to precise positions on the page.