# IMPORT LIBRARIES

In [21]:
# !/usr/bin/env python3
# !pip install tqdm
# ! pip install selenium
# ! pip install bs4
# ! pip install alive_progress

In [22]:
import time
import random
import pandas as pd
from alive_progress import alive_bar
import re
import requests # to get image from the web
import shutil # to save it locally
from os.path import exists
from os.path import expanduser
from tqdm import tqdm
import json
import os

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchWindowException

from bs4 import BeautifulSoup
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# SETTINGS

- NOTE: If we want 10,000 items from four categories (shirts, sweaters, dresses, outwerwear), we need about 2,500 per category
- so set max_products = 2,500
- max_pages = 21 (bc 120 items per pages as default)
- for each front_page_url, make sure to save a copy of the link used here and it should have items ordered from newest to oldest

In [23]:
save_folder = "C:/Users/Esra/Desktop/Deep_Learning/Image_Classification/Fashion/Classify_ThreadUp_Images/data/threadup/dress/"
max_products = "all" #set to "all" if want to take all products on scraped pages
front_page_url = "https://www.thredup.com/women/dresses?category_tags=dresses&department_tags=women&sort=newest_first&page=1"
max_pages = 21

# SCRAPE PAGE FUNCTION

In [24]:
def scrape_page(my_url):
    print("Scraping URL:", my_url)
    firefox_options = webdriver.FirefoxOptions()
    firefox_options.add_argument('--headless')
    firefox_options.add_argument('window-size=1920x1080')
    driver = webdriver.Firefox(options=firefox_options)
    
    try:
        driver.get(my_url)
        time.sleep(random.randrange(5, 10))
        
        # Attempt to close the pop-up window
        try:
            close_popup = driver.find_element(By.CSS_SELECTOR, 'button.u-right-2x.u-top-2x.u-absolute.u-z-2.hover\\:u-opacity-50')
            close_popup.click()
            print("Pop-up closed")
        except Exception:
            print("No pop-up found or error closing pop-up:")
        
        # Allow additional time for the page to stabilize after closing the pop-up
        time.sleep(2)
        
        page_contents = driver.page_source
    except NoSuchWindowException:
        print("Browser window was lost. Unable to scrape:", my_url)
        page_contents = ""
    finally:
        driver.quit()

    if page_contents:
        return BeautifulSoup(page_contents, 'html.parser')
    else:
        return None
    

# GET LINKS TO INDIVIDUAL ITEM PAGES FROM MAIN PAGE

In [26]:
def get_item_links(front_page_url, max_pages = 1):
    product_links = []
    # Everytime range increases, items increase by 50.
    for page_number in range(max_pages + 1):
        print(f"KSN: Page {page_number}...")
        #See if page exists
        try:
            url_page = front_page_url[:-1] + str(page_number)
            print(url_page)
            main_page_items = scrape_page(url_page)
        except:
            print('exception')
            break #exit for loop if a page doesn't exist (presumably means past last page for this item) 
        
        if main_page_items:
            #Pull all href links
            url_front = "https://www.thredup.com"
            all_products = main_page_items.find_all(attrs={"data-inp-label": "link-item-card"})


            for product in all_products: #get all product links
                product_link = url_front + product["href"]
                product_links.append(product_link)
                
    return product_links

            
product_links = get_item_links(front_page_url, max_pages)


print(f"Product links found")


KSN: Page 0...
https://www.thredup.com/women/dresses?category_tags=dresses&department_tags=women&sort=newest_first&page=0
Scraping URL: https://www.thredup.com/women/dresses?category_tags=dresses&department_tags=women&sort=newest_first&page=0
No pop-up found or error closing pop-up:
KSN: Page 1...
https://www.thredup.com/women/dresses?category_tags=dresses&department_tags=women&sort=newest_first&page=1
Scraping URL: https://www.thredup.com/women/dresses?category_tags=dresses&department_tags=women&sort=newest_first&page=1
No pop-up found or error closing pop-up:
KSN: Page 2...
https://www.thredup.com/women/dresses?category_tags=dresses&department_tags=women&sort=newest_first&page=2
Scraping URL: https://www.thredup.com/women/dresses?category_tags=dresses&department_tags=women&sort=newest_first&page=2
Pop-up closed
KSN: Page 3...
https://www.thredup.com/women/dresses?category_tags=dresses&department_tags=women&sort=newest_first&page=3
Scraping URL: https://www.thredup.com/women/dresses?c

In [27]:
len(product_links)

2618

# FUNCTIONS TO SCRAPE THE DETAILS of ITEMS

In [28]:
#Get Product Id from product page (prepare function)

def get_product_id(product_link):
    product_id = product_link.split("?query")[0]   
    product_id = product_id.split("/")[-1]
    
    return product_id

    

In [29]:
#Get product image from product page (prepare functions)

def get_image_link(scraped_page):
    try:
        images = scraped_page.findAll(
            "img", {"class": lambda L: L and L.startswith("u-rounded-4 u-cursor-pointer")} 
        )
        
        image_link = images[0].get("src")
        print(image_link)
    except Exception:
        print("No image link found")
        image_link = ""
        
    return image_link

def save_image(image_link, product_id, save_folder):
    filename = save_folder + "item" + product_id +".jpg"

    # Open the url image, set stream to True, this will return the stream content.
    r = requests.get(image_link, stream = True)

     # Check if the image was retrieved successfully
    if r.status_code == 200:
        # Set decode_content value to True, otherwise the downloaded image file's size will be zero.
        r.raw.decode_content = True
        # Open a local file with wb ( write binary ) permission.
        with open(filename,'wb') as f:
            shutil.copyfileobj(r.raw, f)
        print("image saved")
    else:
        print('Image Couldn\'t be retrieved')

In [30]:
def get_size_and_brand(scraped_page):
    size = scraped_page.find_all("div", attrs={"class": "u-text-16"}) #structure of find_all input is (element, attrs={})
    brand = size[0].parent.find_previous_sibling().find("a")["title"]
    size = size[0].text
    # print(size)
    # print(brand)
    return size, brand


In [31]:
def get_category(scraped_page):
    category = scraped_page.find_all("span", attrs={"class": "u-text-16"}) #structure of find_all input is (element, attrs={})
    category = category[0].text
    # print(category)
    return category

In [32]:
def get_price(scraped_page):
    price = scraped_page.find_all("span", attrs={"class": "price"}) 
    price = price[0].text
    # print(price)
    return price

In [33]:
def get_condition(scraped_page):
    condition_header = scraped_page.find("h2", string="Condition")
    condition = condition_header.find_next_sibling()
    condition = condition.text
    # print(condition)
    return condition

In [34]:
def get_material_and_features(scraped_page):
    details_header = scraped_page.find("h2", string="Item details")
    details = details_header.parent.find_next_sibling().find_all('li')
    material = details[0].text
    features = details[1].text
    
    # print(material)
    # print(features)

    return material, features


In [35]:
def get_measurements_and_fit(scraped_page):
    try:
        measurements_and_fit_header = scraped_page.find("h2", string="Size & fit")
        measurements_and_fit = measurements_and_fit_header.find_next_sibling().find_all('li')
        measurements = measurements_and_fit[0].text
        measurements = measurements.replace("How we measure","")
        fit = measurements_and_fit[1].text
    except:
        measurements = None
        fit = None
    
    # print(measurements)
    # print(fit)

    return measurements, fit

In [36]:
def save_json_files(product, product_id, save_folder):
    filename = save_folder + "json_files/item" + product_id +".json"

    with open(filename, 'w') as f:
        json.dump(product, f, indent=4)

# Get all product info from product page

In [37]:
def get_product_info(product_link, save_folder):
    """Extract and save product information from a given URL.
    
    Args:
        product_link (str): URL of the product page to scrape.
        save_folder (str): Directory path to save the product's image and JSON data.
        
    Returns:
        dict: A dictionary containing product details or None if an error occurs or data cannot be scraped.
    """
    try:
        scraped_page = scrape_page(product_link)
        if not scraped_page:
            return None
        
        product_id = get_product_id(product_link)
        image_link = get_image_link(scraped_page)
        if not image_link:
            return None

        save_image(image_link, product_id, save_folder)
        size, brand = get_size_and_brand(scraped_page)
        category = get_category(scraped_page)
        condition = get_condition(scraped_page)
        material, features = get_material_and_features(scraped_page)
        measurements, fit = get_measurements_and_fit(scraped_page)

        product_dict = {
            "product_link": product_link,
            "product_id": product_id,
            "size": size,
            "brand": brand,
            "category": category,
            "condition": condition,
            "material": material,
            "features": features,
            "measurements": measurements,
            "fit": fit
        }

        save_json_files(product_dict, product_id, save_folder)
        return product_dict
    except Exception as e:
        return None


# PRODUCT LOOP TO GET ALL OF EACH PRODUCT'S INFO

In [None]:
if max_products == "all":
    max_products = len(product_links)

products = []
for product_link in tqdm(product_links[:max_products]):
    print(f"--------------\ngetting info for product: {product_link}")
    product = get_product_info(product_link, save_folder)
    if product:
        products.append(product)
    #Pause for random duration to not trigger bot blocker
    time.sleep(random.randrange(5, 10))
    


In [40]:
#Save json

# Specify the filename
filename = 'scraped_product_details.json'

with open(save_folder + filename, 'w') as f:
    json.dump(products, f, indent=4)
