In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import math

# Functions

### Extract Raw HTML of a Non-Dynamic Web Page

In [2]:
def extract(shop):
  '''Functions to extract HTML of the whole page depending on the shop'''
  headers = {"User-Agent":"Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36"}

  if shop == "Aritzia":
    url = "https://www.aritzia.com/en/sale?lastViewed=1000"
  
  elif shop == "Dynamite":
    url = "https://www.dynamiteclothing.com/ca/d/sale/newly-added/?start=0&sz=23"

  r = requests.get(url,headers)
  return r.text

### Aritzia Pandas Dataframe Generator

In [3]:
def pandasDataframeAritzia():
  rawHTML = extract("Aritzia")
  soup = BeautifulSoup(rawHTML,"html.parser")

  objectsList=soup.find_all("li",class_="ar-product-grid__tile")

  titlesList=[]
  regularPricesList=[]
  salePricesList=[]
  salePercentagesList=[]
  houseBrandsList=[]
  companiesList=[]
  imagesList=[]


  #iterate over each sale object
  for object in objectsList:

    titleObject = object.find("a",class_="ws-normal")
    if  titleObject != None:
      titlesList.append(titleObject.text)

    regularPriceObject = object.find("span", class_="")
    if regularPriceObject != None:
      regularPricesList.append(float(regularPriceObject.text[1:]))

    salePriceObject=object.find("span",class_="js-product__sales-price")
    if salePriceObject != None:
      salePrice = float(salePriceObject.text.split()[0][1:])
      salePricesList.append(salePrice)

      salePercentage=salePriceObject.text.split()[1].replace("(","").replace(")","").replace("%","").replace("−","").replace("-","")
      if salePercentage=="": salePercentage = 0
      salePercentagesList.append(int(salePercentage))

    houseBrandObject = object.find("div",class_="product-brand")
    if houseBrandObject != None:
      houseBrand = houseBrandObject.text.strip()
      houseBrandsList.append(houseBrand)
      companiesList.append("Aritzia")
    
    imageObject = object.find("img", class_="w-auto")
    if imageObject != None:
      imagesList.append(imageObject["data-mouseout-img"])
  

    
  #print(imagesList)
  #print(titlesList) 
  #print(regularPricesList) 
  #print(salePricesList) 
  #print(salePercentagesList)
  #print(houseBrandsList)
  pandasDict={"Product":titlesList, "Company":companiesList,"House Brand":houseBrandsList,"Regular Price ($)":regularPricesList,"Sale Price ($)":salePricesList,"Sale Percentage (%)": salePercentagesList, "Image":imagesList}

  df=pd.DataFrame(pandasDict)
  #drop regular prices
  df=df.drop(df[df['Sale Percentage (%)'] == 0].index).reset_index(drop=True)
  
  return df

### Convert Pandas Dataframe to a List of Dictionaries

In [4]:
def dataDict(pandasDf):
  "functions to convert pandas dataframe to a data list of products info dictionaries "

  dataList = pandasDf.to_dict("records")

  return dataList

# Running Code

### Generate Aritzia Data

In [5]:
pandasDfAritzia=pandasDataframeAritzia()
pandasDfAritzia

Unnamed: 0,Product,Company,House Brand,Regular Price ($),Sale Price ($),Sale Percentage (%),Image
0,utility denim shirt,Aritzia,Babaton,98.0,58.80,40,https://assets.aritzia.com/image/upload/medium...
1,williamsburg top,Aritzia,Babaton,58.0,34.80,40,https://assets.aritzia.com/image/upload/medium...
2,paloma skirt,Aritzia,Sunday Best,88.0,52.80,40,https://assets.aritzia.com/image/upload/medium...
3,cache cargo pant,Aritzia,Tna,128.0,63.99,50,https://assets.aritzia.com/image/upload/medium...
4,eleventh linen trench coat,Aritzia,Wilfred,298.0,178.80,40,https://assets.aritzia.com/image/upload/medium...
...,...,...,...,...,...,...,...
427,moto - cab,Aritzia,Suicoke,295.0,146.99,50,https://assets.aritzia.com/image/upload/medium...
428,the ganna™ cropped shirt jacket,Aritzia,Wilfred Free,178.0,70.99,60,https://assets.aritzia.com/image/upload/medium...
429,the ex boyfriend mid thigh short,Aritzia,Denim Forum,88.0,35.20,60,https://assets.aritzia.com/image/upload/medium...
430,formation turtleneck,Aritzia,Babaton,118.0,58.99,50,https://assets.aritzia.com/image/upload/medium...


In [6]:
dataAritzia = dataDict(pandasDfAritzia)
dataAritzia

[{'Product': 'utility denim shirt',
  'Company': 'Aritzia',
  'House Brand': 'Babaton',
  'Regular Price ($)': 98.0,
  'Sale Price ($)': 58.8,
  'Sale Percentage (%)': 40,
  'Image': 'https://assets.aritzia.com/image/upload/medium/s24_01_a02_117081_32511_off_a.jpg'},
 {'Product': 'williamsburg top',
  'Company': 'Aritzia',
  'House Brand': 'Babaton',
  'Regular Price ($)': 58.0,
  'Sale Price ($)': 34.8,
  'Sale Percentage (%)': 40,
  'Image': 'https://assets.aritzia.com/image/upload/medium/s24_01_a03_106702_3030_off_a.jpg'},
 {'Product': 'paloma skirt',
  'Company': 'Aritzia',
  'House Brand': 'Sunday Best',
  'Regular Price ($)': 88.0,
  'Sale Price ($)': 52.8,
  'Sale Percentage (%)': 40,
  'Image': 'https://assets.aritzia.com/image/upload/medium/s24_02_a07_106863_28554_off_a.jpg'},
 {'Product': 'cache cargo pant',
  'Company': 'Aritzia',
  'House Brand': 'Tna',
  'Regular Price ($)': 128.0,
  'Sale Price ($)': 63.99,
  'Sale Percentage (%)': 50,
  'Image': 'https://assets.aritzia.c

### Generate Dynamite Data

In [14]:
def getItemsNum (shop):
  '''Function to get items number'''
  if shop == "Dynamite":
    url = "https://www.dynamiteclothing.com/ca/d/sale/newly-added"
  driver = webdriver.Chrome(ChromeDriverManager().install())
  #time.sleep(5)
  driver.get(url)
  #time.sleep(5)
  soup = BeautifulSoup(driver.page_source, 'html.parser')
  driver.close()

  items_n=int(soup.find("p",class_="css-dn5yeb e1yiaaga0").text.split()[0])
  return items_n


In [15]:
def calculateSalePercentage(regularPrice,salePrice):
  return round(((regularPrice*100)-(salePrice*100))/(regularPrice*100)*100)


In [69]:
def pandasDataframeDynamite(): 
  '''Function to create pandas dataframe from dynamic webpage'''
  from selenium import webdriver

  items_n = getItemsNum("Dynamite")
  
  #url = f"https://www.dynamiteclothing.com/ca/d/sale/newly-added/?start=0&sz={items_n}"
  url = f"https://www.dynamiteclothing.com/ca/d/sale/newly-added/?start=0&sz=230"

  
  driver = webdriver.Chrome(ChromeDriverManager().install())
  time.sleep(5)
  driver.get(url)
  time.sleep(100)
  

  #with open('trial.txt', 'r') as file:
    #text = file.read().replace('\n', '')

  soup = BeautifulSoup(driver.page_source, 'html.parser')
  driver.close()

  objectsList=soup.find_all("div",class_="css-y9x9wf")

  titlesList=[]
  regularPricesList=[]
  salePricesList=[]
  companiesList=[]
  imagesList=[]
  salePercentagesList=[]
  i=1

  for object in objectsList:

    titleObject = object.find("p",class_="PLP_p_ProductName")
    if  titleObject != None:
      titlesList.append(titleObject.text)

    regularPriceObject = object.find("span",class_="sr-only")
    if regularPriceObject != None:
      regularPrice=float(regularPriceObject.text.strip().replace("$",""))
      regularPricesList.append(regularPrice)

    salePriceObject = object.find("p",class_="PLP_p_SalePrice")
    if salePriceObject != None:
      salePrice=float(salePriceObject.text.strip().replace("$",""))
      salePricesList.append(salePrice)
      salePercentagesList.append(calculateSalePercentage(regularPrice,salePrice))
      companiesList.append("Dynamite")

    imageObject = object.find("button", class_="syte")
    if imageObject != None:
      imagesList.append(imageObject['data-image-src'])
      

  print(len(titlesList))
  print(len(regularPricesList))
  print(len(salePricesList))
  print(imagesList)
  print(len(salePercentagesList))
  print(len(objectsList))
  driver.quit()
      
  pandasDict={"Product":titlesList, "Company":companiesList,"Regular Price ($)":regularPricesList,"Sale Price ($)":salePricesList,"Sale Percentage (%)": salePercentagesList, "Image":imagesList}

  df=pd.DataFrame(pandasDict)
  return df
  
  



In [70]:
pandasDfDynamite = pandasDataframeDynamite()
pandasDfDynamite

230
230
230
['https://www.dynamiteclothing.com/dw/image/v2/BDRP_PRD/on/demandware.static/-/Sites-root_dynamite_catalog/default/dwc49e2337/images/100087362/100087362_7FF_1920x2880.jpg?sw=320&sh=480', 'https://www.dynamiteclothing.com/dw/image/v2/BDRP_PRD/on/demandware.static/-/Sites-root_dynamite_catalog/default/dwb8d9f3ac/images/100089279/100089279_4KA_1920x2880.jpg?sw=320&sh=480', 'https://www.dynamiteclothing.com/dw/image/v2/BDRP_PRD/on/demandware.static/-/Sites-root_dynamite_catalog/default/dw727424da/images/100087465/100087465_4KA_1920x2880.jpg?sw=320&sh=480', 'https://www.dynamiteclothing.com/dw/image/v2/BDRP_PRD/on/demandware.static/-/Sites-root_dynamite_catalog/default/dwd49e06b1/images/100088861/100088861_04N_1920x2880.jpg?sw=320&sh=480', 'https://www.dynamiteclothing.com/dw/image/v2/BDRP_PRD/on/demandware.static/-/Sites-root_dynamite_catalog/default/dwb65b0826/images/100086637/100086637_04N_1920x2880.jpg?sw=320&sh=480', 'https://www.dynamiteclothing.com/dw/image/v2/BDRP_PRD/on

Unnamed: 0,Product,Company,Regular Price ($),Sale Price ($),Sale Percentage (%),Image
0,V-Neck Slip Mini Dress,Dynamite,59.95,30.0,50,https://www.dynamiteclothing.com/dw/image/v2/B...
1,Calie Crinkled Cami,Dynamite,34.95,20.0,43,https://www.dynamiteclothing.com/dw/image/v2/B...
2,Malika Crinkled Wide Leg Pants,Dynamite,59.95,30.0,50,https://www.dynamiteclothing.com/dw/image/v2/B...
3,Everlee Poplin Mini Skirt,Dynamite,49.95,20.0,60,https://www.dynamiteclothing.com/dw/image/v2/B...
4,Popcorn Textured Cami Bodysuit,Dynamite,39.95,25.0,37,https://www.dynamiteclothing.com/dw/image/v2/B...
...,...,...,...,...,...,...
225,Square Neck Rib Cami,Dynamite,19.95,10.0,50,https://www.dynamiteclothing.com/dw/image/v2/B...
226,Scuba Funnel Neck Sweatshirt,Dynamite,49.95,40.0,20,https://www.dynamiteclothing.com/dw/image/v2/B...
227,One Shoulder Floral Ruched Mini Dress,Dynamite,59.95,30.0,50,https://www.dynamiteclothing.com/dw/image/v2/B...
228,Havyn Boatneck Jersey Crop Top,Dynamite,39.95,30.0,25,https://www.dynamiteclothing.com/dw/image/v2/B...


In [11]:

dataDynamite=dataDict(pandasDfDynamite)
dataDynamite

AttributeError: 'NoneType' object has no attribute 'to_dict'