In [1]:
import re
import os
import json
import requests
import boto3
import pandas as pd
from redmail import EmailSender
from bs4 import BeautifulSoup
from pretty_html_table import build_table

In [2]:
def convert_dict_to_dataframe(prod_dict: dict):
    """Converts a dictionary into a Pandas DataFrame.
    
    Args:
        prod_dict (dict): Dictionary to convert into dataframe
        
    Returns:
        Pandas DataFrame
    """
    # Specify the dtypes of columns to cast
    convert_dict = {
        'url': str,
        'price': float,
        'colour': str
    }

    # Convert the dict into a dataframe
    prod_df = pd.DataFrame.from_records(prod_dict).T.reset_index()
    prod_df = prod_df.drop(columns=['index'])
    
    # Check required keys are in dict
    try:
        assert set(prod_df.columns).intersection(convert_dict.keys()) == set(convert_dict.keys())
    except AssertionError as e:
        missing_keys = set(convert_dict.keys()).difference(prod_dict.keys())
        print(f"Missing following keys in scraped product dictionary: {', '.join(missing_keys)}")
        raise e

    # Cast columns
    for col, dtype in convert_dict.items():
        prod_df[col] = prod_df[col].astype(dtype)

    return prod_df


def size_availability(url: str):
    """Checks the sizes available from a product URL.
    
    Args:
        url (str): URL of the product page to check
        
    Returns:
        list of in-stock sizes
    """
    page = requests.get(url, headers = {'User-agent': 'tmp'})
    soup = BeautifulSoup(page.content, "html.parser")
    
    # Try and see if this works
    try:
        prods_json_txt = soup.find('script', attrs={'type': "application/json", 'data-variant-selects-el':'data'}).text
        size_details = json.loads(prods_json_txt)
        
    except AttributeError:
        # If not then fall back to a different approach, if that fails then just return a string
        try:
            prods_json_txt = soup.find('script', attrs={'type': "application/json", 'data-variant-radios-el':'data'}).text
            size_details = json.loads(prods_json_txt)
            
        except AttributeError:
            return ['Failed to get sizes']

    avail_dict = {}
    for detail in size_details:
        size = detail['title']
        avail = detail['available']

        if avail not in avail_dict:
            avail_dict[avail] = [size]
        else:
            avail_dict[avail].append(size)

    return avail_dict.get(True, ['None'])

In [3]:
# Define base URL and get all products from the page
base_url = "https://www.sunspel.com/"
riv_polo_url = "uk/mens/polo-shirts/riviera-polo.html"
page = requests.get(base_url+riv_polo_url, headers = {'User-agent': 'tmp'})
soup = BeautifulSoup(page.content, "html.parser")
product_list = soup.find_all("li", class_="prd-List_Item")
assert len(product_list) > 0, "No products found from HTML, web page has changed."

In [8]:
# Get product ID, colour, URL, image URL, and price
try:
    prod_type = product_list[0].find("h3", class_="prd-Card_Title").get_text().strip()
except AttributeError as e:
    print("Unable to get product type from product HTML, web page has changed.")
    raise e

try:
    prod_id = int(product_list[0]['id'].split('-')[-1])
except KeyError as e:
    print("Unable to get product ID from product HTML, web page has changed.")
    raise e

try:
    prod_colour = product_list[0].find("p", class_="prd-Card_Colour").get_text().strip()
except AttributeError as e:
    print("Unable to get product colour from product HTML, web page has changed.")
    raise e

try:
    prod_url = base_url + product_list[0].find("a", class_="util-FauxLink_Link")['href']
except Exception as e:
    print("Unable to get product URL from product HTML, web page has changed.")
    raise e

try:
    prod_img_url = (
        'https:' + (
            product_list[0]
            .find('div', class_='prd-Card_Image')
            .find('div', class_='rsp-Image')
            .find('img', class_='rsp-Image_Image')['src']
            )
        ).split('?')[0]
except Exception as e:
    print("Unable to get product image URL from product HTML, web page has changed.")
    raise e

try:
    prod_price = float(product_list[0].find('p', class_='prd-Card_Price').get_text().strip('£'))
except Exception as e:
    print("Unable to get product price from product HTML, web page has changed.")
    raise e

In [9]:
prod_img_url

'https://cdn.shopify.com/s/files/1/0636/5154/5319/products/mpol1026-whaa-1_13.jpg'

In [5]:
# Loop through each product
prod_dict = {}
for prod in product_list:
    
    # Get the product type
    try:
        prod_type = prod.find("h3", class_="prd-Card_Title").get_text().strip()
    except KeyError as e:
        print("Unable to get product type from product HTML, web page has changed.")
        raise e
    
    # If it isn't the one I want then move on
    if len(prod_type.split(' ')) != 3:
        continue
    
    # Get product ID, colour, URL, image URL, and price
    try:
        prod_id = int(prod['id'].split('-')[-1])
    except KeyError as e:
        print("Unable to get product ID from product HTML, web page has changed.")
        raise e

    try:
        prod_colour = prod.find("p", class_="prd-Card_Colour").get_text().strip()
    except KeyError as e:
        print("Unable to get product colour from product HTML, web page has changed.")
        raise e

    try:
        prod_url = base_url + prod.find("a", class_="util-FauxLink_Link")['href']
    except Exception as e:
        print("Unable to get product URL from product HTML, web page has changed.")
        raise e

    try:
        prod_img_url = ('https:' + product_list[0].find('div', class_='prd-Card_Image').find('div', class_='rsp-Image').find('img', class_='rsp-Image_Image')['src']).split('?')[0]
    except Exception as e:
        print("Unable to get product image URL from product HTML, web page has changed.")
        raise e

    try:
        prod_price = float(prod.find('p', class_='prd-Card_Price').get_text().strip('£'))
    except Exception as e:
        print("Unable to get product price from product HTML, web page has changed.")
        raise e
    
    # Store in dictionary
    prod_dict[prod_id] = {}
    prod_dict[prod_id]['url'] = prod_url
    prod_dict[prod_id]['img_url'] = prod_img_url
    prod_dict[prod_id]['price'] = prod_price
    prod_dict[prod_id]['colour'] = prod_colour

In [6]:
# Convert product dictionary into dataframe
df = convert_dict_to_dataframe(prod_dict)

# Calculate discount
df['discount (%)'] = (100 * (1- (df['price'] / df['price'].max()))).round(2)

# Get sizes in-stock for each product
df['sizes available'] = df['url'].apply(lambda x: ', '.join(size_availability(x)))

In [7]:
df

Unnamed: 0,url,img_url,price,colour,discount (%),sizes available
0,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Black,0.0,"XS, S, M, L, XL, XXL"
1,https://www.sunspel.com//products/riviera-polo...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Gingerbread,0.0,"XS, S, XL"
2,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Navy,0.0,"XS, S, M, L, XL, XXL"
3,https://www.sunspel.com//products/riviera-polo...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Lagoon Blue,0.0,"XS, S, XXL"
4,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Grey Melange,0.0,"XS, S, M, L, XL"
5,https://www.sunspel.com//products/mens-short-s...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Charcoal,0.0,"XS, S, L, XXL"
6,https://www.sunspel.com//products/riviera-polo...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Lead,0.0,"XS, S, XXL"
7,https://www.sunspel.com//products/riviera-polo...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Candy Red,0.0,"S, M, L, XL, XXL"
8,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,White,0.0,"XS, S, M, L, XXL"
9,https://www.sunspel.com//products/mens-cotton-...,https://cdn.shopify.com/s/files/1/0636/5154/53...,115.0,Archive White,0.0,"XS, M, L, XXL"


In [31]:
for url in df['url']:
    try:
        size_availability(url)
    except Exception as e:
        print(url)
        raise e

In [44]:
page = requests.get('https://www.sunspel.com//products/mens-cotton-riviera-polo-shirt-black-mpol1026', headers = {'User-agent': 'tmp'})
soup = BeautifulSoup(page.content, "html.parser")

# print(soup.prettify())

# script data-variant-selects-el="data" type="application/json"

try:
    tmp = json.loads(soup.find('script', attrs={'type': "application/json", 'data-variant-selects-el':'data'}).text)
except AttributeError as e:
    
    try:
        tmp = json.loads(soup.find('script', attrs={'type': "application/json", 'data-variant-radios-el':'data'}).text)
    except AttributeError as e:
        print('2')
        raise e

avail_dict = {}
# size_details = soup.find_all('span', class_="nosto_sku")
for detail in tmp:
    size = detail['title']
    avail = detail['available']

    if avail not in avail_dict:
        avail_dict[avail] = [size]
    else:
        avail_dict[avail].append(size)

In [41]:
print(soup.prettify())

<!DOCTYPE html>
<html lang="en">
 <head>
  <!--
Elevar Data Layer V2

This file is automatically updated and should not be edited directly.

https://knowledge.getelevar.com/how-to-customize-data-layer-version-2

Updated: 2022-08-30 09:01:49+00:00
Version: 2.38.0
-->
  <!-- Google Tag Manager -->
  <script>
   window.dataLayer = window.dataLayer || [];
  </script>
  <script>
   (function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({"gtm.start":
  new Date().getTime(),event:"gtm.js"});var f=d.getElementsByTagName(s)[0],
  j=d.createElement(s),dl=l!="dataLayer"?"&l="+l:"";j.async=true;j.src=
  "https://www.googletagmanager.com/gtm.js?id="+i+dl;f.parentNode.insertBefore(j,f);
})(window,document,"script","dataLayer","GTM-KD9LBH5");
  </script>
  <!-- End Google Tag Manager -->
  <script id="elevar-gtm-suite-config" type="application/json">
   {"gtm_id": "GTM-KD9LBH5", "event_config": {"cart_reconcile": true, "cart_view": true, "checkout_complete": true, "checkout_step": true, "collection_view": true