#   Imports

In [1]:
import pandas as pd
import numpy as np
import requests
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By 
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException
from selenium.common.exceptions import ElementNotInteractableException
import time

# Extract links of product items

When loading a specific category page on the Asos website, the page shows only the first 72 items.
At the bottom of the page there is a button named 'Load More'. When you click on this button, 72 more items are loaded.
In order to get the links of all the items in each category, we had to continuously click on the 'Load More' button. The function below performs this action.

In [2]:
def get_category_links(category_link):
    """Collect the links found on a fully expanded category page.

    Opens ``category_link`` in Chrome, clicks the 'Load More' button until
    it is no longer present, then gathers the ``href`` of every ``<a>`` tag
    inside the product grid container.

    Args:
        category_link: URL of the category page to open.

    Returns:
        A list with the ``href`` attribute of each anchor in the grid
        (an entry is ``None`` when an anchor has no ``href``).

    Raises:
        selenium.common.exceptions.NoSuchElementException: if the product
            grid container is not found on the page.
    """
    driver = webdriver.Chrome()  # initialize the web driver
    try:
        driver.get(category_link)
        time.sleep(1)
        while True:
            # Wait for the previous batch of items to load before looking
            # for the button again.
            time.sleep(1)
            load_more_buttons = driver.find_elements(By.XPATH, "//a[@class='loadButton_i3U2b']")
            if not load_more_buttons:
                break
            load_more_buttons[0].click()
        mtag = driver.find_element(By.XPATH, "//div[@class='content_mnXK_']")
        # Reading attributes only queries the already loaded page, so no
        # delay is needed between anchors.
        return [link.get_attribute("href") for link in mtag.find_elements(By.TAG_NAME, "a")]
    finally:
        # Always close the browser, even if the scrape fails half-way.
        driver.quit()

# Extract details from each item 

Parameters: 
  links: an array of links, one for each product
  properties: a dictionary that maps each detail of the item to the XPath used to retrieve it
  arr: an array that holds the category, the gender and the helper XPaths that assist in retrieving some of the details
  
There was a lot of data, and Asos imposes restrictions. Therefore, we couldn't extract all of our data in one go. We worked in intervals of categories in order to divide the extraction. We added a time sleep function in order to reduce the traffic on the Asos servers and stop the company from blocking our attempts.

The function extracts the following parameters: price, color, number of likes (that the item received), brand, fabric type, discount percentage of the price, customer reviews, clothing fit, whether the item is on sale, and whether the item is selling fast.

In [3]:

def _scrape_item(driver, properties, arr):
    """Read every requested property from the page currently open in *driver*.

    Returns a dict with one entry per key of ``properties``. A property
    that cannot be read (element missing, not interactable, or its click
    intercepted) stays ``np.nan``.
    """
    # Properties whose text is read only after clicking a button first.
    buttons_to_click = {"brand": arr[2], "material type": arr[3]}
    row = dict.fromkeys(properties, np.nan)
    for prop, xpath in properties.items():
        try:
            if prop in buttons_to_click:
                driver.find_element(By.XPATH, buttons_to_click[prop]).click()
                row[prop] = driver.find_element(By.XPATH, xpath).text

            elif prop == "Percentage discount":
                # Only look for the discount when the arr[4] container exists.
                if driver.find_elements(By.XPATH, arr[4]):
                    row[prop] = driver.find_element(By.XPATH, xpath).text

            elif prop == "Fit":
                driver.find_element(By.XPATH, arr[5]).click()
                time.sleep(0.2)
                product_details = driver.find_element(By.XPATH, "//div[@id='productDescriptionDetails']")
                tags = product_details.find_elements(By.TAG_NAME, "li")
                # The value kept is the text of the last list item.
                if tags:
                    row[prop] = tags[-1].text

            else:
                row[prop] = driver.find_element(By.XPATH, xpath).text
        except (
            ElementNotInteractableException,
            ElementClickInterceptedException,
            NoSuchElementException,
        ):
            # Leave the NaN placeholder for this property.
            continue
        # Short pause after each successful read to limit the request rate.
        time.sleep(0.2)
    return row


def extract_items(links, properties, arr, start=1000):
    """Scrape the product pages in ``links`` into a DataFrame.

    Args:
        links: sequence of product page URLs.
        properties: dict mapping a column name to the XPath of the element
            whose text fills that column. Must contain a "price" key,
            which is used to derive the "in sale" column.
        arr: helper list. ``arr[0]`` and ``arr[1]`` are written to the
            "category" and "sex" columns; ``arr[2]``, ``arr[3]`` and
            ``arr[5]`` are XPaths of buttons clicked before reading
            "brand", "material type" and "Fit"; ``arr[4]`` is the XPath
            whose presence gates reading "Percentage discount".
        start: index of the first link to process. Defaults to 1000, the
            offset that was previously hard-coded.

    Returns:
        A DataFrame with one row per scraped page, one column per key of
        ``properties``, plus "in sale" (1 when the price text contains
        'Now', else 0), "category" and "sex". "Percentage discount" is set
        to 0 for rows that are not on sale.
    """
    driver = webdriver.Chrome()
    rows = []
    try:
        for link in links[start:]:
            driver.get(link)
            # Skip pages that contain this element (presumably the
            # out-of-stock marker, judging by the original variable name).
            if driver.find_elements(By.XPATH, '//*[@id="g5v49"]'):
                continue
            rows.append(_scrape_item(driver, properties, arr))
            time.sleep(0.3)
    finally:
        # Always close the browser, even if the scrape fails half-way.
        driver.quit()

    # Build the frame once instead of concatenating inside the loop.
    # dtype=object keeps the columns able to hold both text and numbers.
    df = pd.DataFrame(rows, columns=list(properties), dtype=object)

    # put 1 for every product that is in sale
    df["in sale"] = df["price"].astype(str).str.contains("Now", regex=False).astype(int)
    df["category"] = arr[0]
    df["sex"] = arr[1]
    df.loc[df["in sale"] == 0, "Percentage discount"] = 0

    return df

In the following cell, we call a function that extracts all of the links of a specific category. The data extraction of each category took approximately two hours (depending on the number of items). Each time, we manually sent the link of one category to the function. Every once in a while, Asos blocked us and the program did not finish. Therefore, in order to be more in control, we ran each category separately.

In [74]:
# Collect the item links of the women's hoodies & sweatshirts category page.
current_category_links = get_category_links("https://www.asos.com/women/hoodies-sweatshirts/cat/?cid=11321")

The next line was meant to check whether the number of items we extracted matches the number of items that are on the page. We did this before running the extraction function.

In [75]:
# Number of links collected, to compare with the item count shown on the page.
len(current_category_links)

1829

In the following cell, we created a dictionary that holds the XPaths of the item details that we want to extract. In addition, we used an array of helper XPaths to extract data from the HTML list. We also used the array to manually input the category and the gender, because these are known in advance for each category.

In [97]:
# Column name -> XPath of the element whose text fills that column.
xpath_dic = {
    "price": "//span[@data-testid='current-price']",
    "color": "//p[@class='aKxaq']",
    "likes": "//span[@class='BFMOG']",
    "brand": '//*[@id="productDescriptionDetails"]/div/div/a[2]/strong',
    "material type": '//div[@id="productDescriptionAboutMe"]',
    "Percentage discount": ' //span[@data-testid="percentage-discount"]',
    'Rate': '//div[@data-testid="overall-rating"]',
    "Fit": "//div[@id='productDescriptionDetails']",
    "Saling fast": '//*[@id="product-gallery"]/div[2]/div[1]/span',
}

# Helper values read by extract_items by position.
xpath_arr = [
    "Tops",  # [0] value written to the "category" column
    "Women",  # [1] value written to the "sex" column
    '//*[@id="pdp-ssr-product-description"]/div/ul/li[1]/div/h2/button',  # [2] clicked before reading "brand"
    '//button[@aria-controls="productDescriptionAboutMe"]',  # [3] clicked before reading "material type"
    "//div[@id='pdp-react-critical-app']",  # [4] must exist before reading "Percentage discount"
    '//*[@id="pdp-ssr-product-description"]/div/ul/li[1]/div/h2/button',  # [5] clicked before reading "Fit"
]

df = extract_items(current_category_links, xpath_dic, xpath_arr)
df


Unnamed: 0,price,color,likes,brand,material type,Percentage discount,Rate,Fit,Saling fast,in sale,category,sex
0,Now 236.01 ILS,RED,363,,"Fleece: soft, cosy and insulating\n\nMain: 100...",(-40%),,Regular fit,,1,Tops,Women
1,Now 270.28 ILS,BLACK,1K,,Sweatshirt fabric: soft and warm\n\nMain: 100%...,(-29%),4.3,Oversized fit,,1,Tops,Women
2,Now 165.32 ILS,BLACK,132,,,(-29%),,Regular fit,,1,Tops,Women
3,Now 238.79 ILS,BLACK,390,,,(-29%),,Regular fit,,1,Tops,Women
4,Now 217.80 ILS,BLUE,712,,Sweatshirt fabric: soft and warm\n\nMain: 80% ...,(-24%),4.5,Oversized fit,,1,Tops,Women
...,...,...,...,...,...,...,...,...,...,...,...,...
786,Now 167.94 ILS,Grey heather / white,121,Nike,Soft sweatshirt fabric,(-39%),4.7,Regular fit,,1,Tops,Women
787,445.00 ILS,Black,546,Napapijri,,0,,Regular fit,,0,Tops,Women
788,445.00 ILS,Off White,263,Napapijri,Sweatshirt fabric: soft and warm\n\nMain: 100%...,0,,Regular fit,,0,Tops,Women
789,500.00 ILS,Off White,439,Napapijri,Sweatshirt fabric: soft and warm\n\nMain: 100%...,0,,Relaxed fit,,0,Tops,Women


When we received the data frame and saw that it matched our expectations, we saved it as a file on our computer.

In [99]:
# Save the scraped category to a CSV file, without the index column.
df.to_csv("C:\\Users\\amitx\\Desktop\\d\\Women Hoodies & Sweatshirts3.csv", index=False)

After we finished extracting the data from each of the categories, we loaded all of the files into data frames and merged them into one file with all of the data.

In [5]:
# Load each saved CSV file into its own DataFrame
# (presumably one file per scraping run of a category — verify against the files on disk).
data = pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Hoodies & Sweatshirts3.csv")
data1=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Tops5.csv")
data2 = pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Tops6.csv")
data3=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Tracksuits & Joggers.csv")
data4=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Hoodies & Sweatshirts1.csv")
data5=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Tops1.csv")
data6=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Tops2.csv")
data7=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Tops3.csv")
data8=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Tops4.csv")
data9=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Women Coats & Jackets.csv")
data10=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\jack&coats.csv")
data11=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\jack&coats0.csv")
data12=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\Jeans.csv")
data13=pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\T-shirts & Vests.csv")

In [4]:
# Load the "asos data.csv" file (presumably the merged dataset — the merge step is not shown here).
FullData =pd.read_csv("C:\\Users\\amitx\\Desktop\\d\\asos data.csv")

In [109]:
# Write FullData back to "asos data.csv", without the index column.
FullData.to_csv("C:\\Users\\amitx\\Desktop\\d\\asos data.csv", index=False)