In [2]:
import re
import os
import json
import requests
import boto3
import pandas as pd
from redmail import EmailSender
from bs4 import BeautifulSoup
from pretty_html_table import build_table

In [18]:
# Define base URL and get all products from the page
base_url = "https://www.sunspel.com/"
riv_polo_url = "uk/mens/polo-shirts/riviera-polo.html"
page = requests.get(base_url+riv_polo_url, headers = {'User-agent': 'tmp'})
soup = BeautifulSoup(page.content, "html.parser")
product_list = soup.find_all("li", class_="prd-List_Item")
assert len(product_list) > 0, "No products found from HTML, web page has changed."

In [38]:
# Get product ID, colour, URL, image URL, and price
try:
    prod_type = product_list[0].find("h3", class_="prd-Card_Title").get_text().strip()
except AttributeError as e:
    print("Unable to get product type from product HTML, web page has changed.")
    raise e

try:
    prod_id = int(product_list[0]['id'].split('-')[-1])
except KeyError as e:
    print("Unable to get product ID from product HTML, web page has changed.")
    raise e

try:
    prod_colour = product_list[0].find("p", class_="prd-Card_Colour").get_text().strip()
except AttributeError as e:
    print("Unable to get product colour from product HTML, web page has changed.")
    raise e

try:
    prod_url = base_url + product_list[0].find("a", class_="util-FauxLink_Link")['href']
except Exception as e:
    print("Unable to get product URL from product HTML, web page has changed.")
    raise e

try:
    prod_img_url = ('https:' + product_list[0].find('div', class_='prd-Card_Image').find('div', class_='rsp-Image').find('img', class_='rsp-Image_Image')['src']).split('?')[0]
except Exception as e:
    print("Unable to get product image URL from product HTML, web page has changed.")
    raise e

try:
    prod_price = float(product_list[0].find('p', class_='prd-Card_Price').get_text().strip('£'))
except Exception as e:
    print("Unable to get product price from product HTML, web page has changed.")
    raise e

In [3]:
# Define base URL and get all products from the page
base_url = "https://www.sunspel.com/"
riv_polo_url = "uk/mens/polo-shirts/riviera-polo.html"
page = requests.get(base_url+riv_polo_url, headers = {'User-agent': 'tmp'})
soup = BeautifulSoup(page.content, "html.parser")
product_list = soup.find_all("li", class_="prd-List_Item")
assert len(product_list) > 0, "No products found from HTML, web page has changed."

# Loop through each product
prod_dict = {}
for prod in product_list:
    
    # Get the product type
    try:
        prod_type = prod.find("h3", class_="prd-Card_Title").get_text().strip()
    except KeyError as e:
        print("Unable to get product type from product HTML, web page has changed.")
        raise e
    
    # If it isn't the one I want then move on
    if len(prod_type.split(' ')) != 3:
        continue
    
    # Get product ID, colour, URL, image URL, and price
    try:
        prod_id = int(prod['id'].split('-')[-1])
    except KeyError as e:
        print("Unable to get product ID from product HTML, web page has changed.")
        raise e

    try:
        prod_colour = prod.find("p", class_="prd-Card_Colour").get_text().strip()
    except KeyError as e:
        print("Unable to get product colour from product HTML, web page has changed.")
        raise e

    try:
        prod_url = base_url + prod.find("a", class_="util-FauxLink_Link")['href']
    except Exception as e:
        print("Unable to get product URL from product HTML, web page has changed.")
        raise e

    try:
        prod_img_url = ('https:' + product_list[0].find('div', class_='prd-Card_Image').find('div', class_='rsp-Image').find('img', class_='rsp-Image_Image')['src']).split('?')[0]
    except Exception as e:
        print("Unable to get product image URL from product HTML, web page has changed.")
        raise e

    try:
        prod_price = float(prod.find('p', class_='prd-Card_Price').get_text().strip('£'))
    except Exception as e:
        print("Unable to get product price from product HTML, web page has changed.")
        raise e
    
    # Store in dictionary
    prod_dict[prod_id] = {}
    prod_dict[prod_id]['url'] = prod_url
    prod_dict[prod_id]['img_url'] = prod_img_url
    prod_dict[prod_id]['price'] = prod_price
    prod_dict[prod_id]['colour'] = prod_colour

In [9]:
def convert_dict_to_dataframe(prod_dict: dict):
    """Converts a dictionary into a Pandas DataFrame.
    
    Args:
        prod_dict (dict): Dictionary to convert into dataframe
        
    Returns:
        Pandas DataFrame
    """
    # Specify the dtypes of columns to cast
    convert_dict = {
        'url': str,
        'price': float,
        'colour': str
    }

    # Convert the dict into a dataframe
    prod_df = pd.DataFrame.from_records(prod_dict).T.reset_index()
    prod_df = prod_df.drop(columns=['index'])
    
    # Check required keys are in dict
    try:
        assert set(prod_df.columns).intersection(convert_dict.keys()) == set(convert_dict.keys())
    except AssertionError as e:
        missing_keys = set(convert_dict.keys()).difference(prod_dict.keys())
        print(f"Missing following keys in scraped product dictionary: {', '.join(missing_keys)}")
        raise e

    # Cast columns
    for col, dtype in convert_dict.items():
        prod_df[col] = prod_df[col].astype(dtype)

    return prod_df

In [10]:
convert_dict_to_dataframe(prod_dict)

Unnamed: 0,url,img_url,price,colour
0,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Black
1,https://www.sunspel.com//products/riviera-polo...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Gingerbread
2,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Navy
3,https://www.sunspel.com//products/riviera-polo...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Lagoon Blue
4,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Grey Melange
5,https://www.sunspel.com//products/mens-short-s...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Charcoal
6,https://www.sunspel.com//products/riviera-polo...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Lead
7,https://www.sunspel.com//products/riviera-polo...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Candy Red
8,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,White
9,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Archive White


In [7]:
convert_dict_to_dataframe(prod_dict)

Missing following keys in scraped product dictionary: colour, price, url


AssertionError: 