In [57]:
pip install selenium


Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [58]:
# importing required libraries
from selenium import webdriver
from time import sleep
from bs4 import BeautifulSoup
import re
import random
import unicodedata
import pandas as pd
import time

In [None]:
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

In [59]:
# Global Constants
# Name of the CSV file that main() writes the scraped product rows to
CSV_FILENAME = 'gap_women_jeans.csv'
# PLP stands for Product Listing Page
# It is the page on which the list of products is available; main() opens this URL first
PLP_URL = "https://www.gap.com/browse/category.do?cid=5664&nav=meganav%3AWomen%3ACategories%3AJeans#pageId=0&department=136"

In [60]:
# Initialize the WebDriver
def initialize_driver():
    """
    Create the browser session used by the scraper.

    :return: a new Selenium Chrome WebDriver instance built with default options
    """
    return webdriver.Chrome()

In [61]:
# Random sleep to mimic human behavior
def random_sleep(min_time=4, max_time=10):
    """
    Pause execution for a random number of seconds to mimic human behavior.

    The delay is drawn with random.uniform, so it can be any fractional value
    in the range and not just a whole number.

    :param min_time: lower bound of the delay in seconds, default value is 4
    :param max_time: upper bound of the delay in seconds, default value is 10

    delays program execution for a random amount of time between min_time and max_time
    """
    sleep(random.uniform(min_time, max_time))

In [62]:
# Scroll down the page to load more products
def scroll(driver, timeout=30):
    """
    Scrolls through the webpage to ensure all dynamic content is loaded.

    After every scroll to the bottom the function waits for the document height
    to change. It always waits one fixed pause first; if the height is still
    unchanged it keeps polling until `timeout` seconds have passed since the
    scroll. Scrolling stops once a scroll produces no height change within
    that window.

    :param driver: Selenium WebDriver instance
    :param timeout: Maximum time in seconds to wait for the page to load new
                    content after each scroll. Values smaller than the fixed
                    pause mean the height is checked exactly once per scroll.
    """
    scroll_pause_time = 2.5  # Minimum wait after each scroll before the height is checked
    poll_interval = 0.5      # Delay between repeated height checks while waiting for content
    height_script = "return document.body.scrollHeight"

    last_height = driver.execute_script(height_script)

    while True:
        # Scroll down to the bottom of the page
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        deadline = time.monotonic() + timeout

        # Wait for new content to load
        time.sleep(scroll_pause_time)
        new_height = driver.execute_script(height_script)

        # A slow page may need longer than one pause; keep checking until the
        # height changes or the per-scroll timeout is used up
        while new_height == last_height and time.monotonic() < deadline:
            time.sleep(poll_interval)
            new_height = driver.execute_script(height_script)

        if new_height == last_height:
            # The height never changed, so there are no more products to load
            break
        last_height = new_height
        # Random sleep between scrolls to mimic human behavior
        random_sleep(1, 3)

In [63]:
# Get BeautifulSoup object from the current page
def get_soup(driver):
    """
    Parse the page currently loaded in the browser.

    :param driver: WebDriver instance
    :return: BeautifulSoup object built from driver.page_source with the lxml parser
    """
    html = driver.page_source
    return BeautifulSoup(html, 'lxml')

In [64]:
# Extract product url from a product element
def extract_pdp_url(product):
    """
    pdp stands for product description page
    it is the page in which the whole information about the product is
    present
    product url is present in the form -
    'https://www.gap.com/browse/product.do?pid=774933022&cid=11900&pcid=11900&vid=1&nav=meganav%3AMen%3AJust%20Arrived%3ANew%20Arrivals&cpos=116&cexp=2859&kcid=CategoryIDs%3D11900&ctype=Listing&cpid=res23090805504869997736471#pdp-page-content'
    we need to extract the part till the value of pid (inclusive)
    the rest of the url is not needed and can even break the url at a
    later date

    the url is cut at the first '&', so this relies on pid being the first
    query parameter as in the example above

    :param product: product element
    :return: pdp url which is a string, or 'Not available' when the element
             has no link or the link has no href
    """
    try:
        href = product.find('a').get('href')
        # Keep only the part before the first '&'
        url = href.split('&')[0]
    except Exception:
        # Best effort for missing link/href; Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate so a long run can be stopped
        url = 'Not available'
    return url

In [65]:
def extract_product_demo(soup):
    """
    Read the product demographic (e.g., men, women, boys, baby) from the product page.
    It is taken from the first <a> tag carrying the class 'pdp-mfe-1lmagf7'.

    :param soup: BeautifulSoup object
    :return: product demographic as a string; 'Not available' when no such
             tag is found, 'error' when the lookup raises
    """
    try:
        demo_anchor = soup.find('a', class_='pdp-mfe-1lmagf7')
        if not demo_anchor:
            return 'Not available'
        return demo_anchor.get_text()
    except Exception as err:
        # Report the problem and hand back a sentinel value
        print(f"Error extracting product demographic: {err}")
        return 'error'

In [66]:
def extract_product_type(soup):
    """
    Read the product type (e.g., jeans, t-shirts, shirts) from the product page.
    It is taken from the second <a> tag carrying the class 'pdp-mfe-1lmagf7'.

    :param soup: BeautifulSoup object
    :return: product type as a string; 'Not available' when fewer than two
             such tags exist, 'error' when the lookup raises
    """
    try:
        anchors = soup.find_all('a', class_='pdp-mfe-1lmagf7')
        # The first match is the demographic, the second one is the product type
        if len(anchors) > 1:
            return anchors[1].get_text()
        return 'Not available'
    except Exception as err:
        # Report the problem and hand back a sentinel value
        print(f"Error extracting product type: {err}")
        return 'error'

In [67]:
#Extract product name from the product page
def extract_product_name(soup):
    """
    the h1 tag which contains the product has a different class name
    for each product
    but every h1 tag has the class name starting with pdp-mfe-

    :param soup: BeautifulSoup object
    :return: product name which is a string, or 'Not available' when no
             matching h1 is found
    """
    try:
        headings = soup.select('h1[class^="pdp-mfe-"]')
        # The first matching heading is taken as the product name
        product_name = headings[0].text
    except Exception:
        # Best effort for a missing heading; Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate
        product_name = 'Not available'
    return product_name

In [68]:
def extract_prices(soup):
    """
    Extracts the selling price and max retail price from the product page,
    accommodating different class identifiers for different website configurations.

    Both prices are returned without surrounding whitespace and without a
    leading/trailing '$'. Whitespace is removed before the '$' so that text
    such as ' $59.95' is cleaned as well. Parenthesised text such as a discount
    note is removed from the selling price only.

    :param soup: BeautifulSoup object
    :return: selling price and max retail price as strings; both are
             'Not available' when no selling price element is found, and the
             max retail price equals the selling price when the page shows
             no separate regular price
    """
    try:
        # Try the known selling price containers first, then fall back to any
        # div whose class mentions product-price or pdp-pricing
        selling_price_element = (
            soup.find('div', class_="product-price--pdp__highlight")
            or soup.find('div', class_="pdp-pricing--highlight pdp-pricing__selected pdp-mfe-1jiw3bl")
            or soup.find('div', class_=re.compile(r'product-price|pdp-pricing'))
        )
        # Drop parenthesised text, then trim whitespace before trimming the '$'
        selling_price = re.sub(r'\([^()]*\)', '', selling_price_element.text)
        selling_price = selling_price.strip().strip('$').strip()

        # Try to find the max retail price for both cases
        max_retail_price_element = (
            soup.find('div', class_="product-price--pdp__regular")
            or soup.find('div', class_="product-price__strike pdp-mfe-vo1pn1")
        )
        if max_retail_price_element:
            max_retail_price = max_retail_price_element.text.strip().strip('$').strip()
        else:
            # Use the selling price if no separate max price is found
            max_retail_price = selling_price
    except AttributeError:
        # Raised when no selling price element exists (attribute access on None)
        selling_price = 'Not available'
        max_retail_price = 'Not available'

    return selling_price, max_retail_price

In [69]:
# Extract product rating from the product page
def extract_star_value(soup):
    """
    the span with class pdp-mfe-3jhqep contains the star rating in the
    form - 5 stars, x are filled
    we need to extract the value of x

    :param soup: BeautifulSoup object
    :return: star value which is a string, or 'Not available' when the span
             is missing or its text does not have the expected form
    """
    try:
        rating_text = soup.find('span', class_='pdp-mfe-3jhqep').text
        # The part after the comma is ' x are filled'; splitting on single
        # spaces puts x at index 1
        star_value = rating_text.split(',')[1].split(' ')[1]
    except Exception:
        # Best effort for missing/unexpected markup; Exception (not a bare
        # except) lets KeyboardInterrupt and SystemExit propagate
        star_value = 'Not available'
    return star_value

In [70]:
# Extract the number of product ratings from the product page
def extract_ratings_count(soup):
    """
    the div with class pdp-mfe-17iathi contains the number of ratings
    in the form - x ratings
    we need to extract the value of x

    :param soup: BeautifulSoup object
    :return: ratings count which is a string, or 'Not available' when the
             div is missing
    """
    try:
        count_text = soup.find('div', class_='pdp-mfe-17iathi').text
        # The count is the first space-separated token
        ratings_count = count_text.split(' ')[0]
    except Exception:
        # Best effort for missing markup; Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate
        ratings_count = 'Not available'
    return ratings_count

In [71]:
# Extract product color from the product page
def extract_color(soup):
    """
    the span with class swatch-label__value contains the color of the
    product

    :param soup: BeautifulSoup object
    :return: color which is a string, or 'Not available' when the span is
             missing
    """
    try:
        color = soup.find('span', class_='swatch-label__value').text
    except Exception:
        # Best effort for missing markup; Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate
        color = 'Not available'
    return color

In [72]:
# Extract available sizes from the product page
def extract_available_sizes(soup):
    """
    each div with class pdp-mfe-1kg10fj pdp-dimension
    pdp-dimension--should-display-redesign-in-stock holds one available size
    the text of every such div is collected into a list

    in cases where no such div is present, a list with 'Not applicable' as
    the only element is returned
    this can be seen in case of accessories such as bags

    :param soup: BeautifulSoup object
    :return: available sizes which is a list; ['Not available'] when the
             lookup itself fails
    """
    try:
        size_elements = soup.find_all(
            'div',
            class_='pdp-mfe-1kg10fj pdp-dimension pdp-dimension--should-display-redesign-in-stock'
        )
        available_sizes = [size.text for size in size_elements]
    except Exception:
        # Best effort for unexpected markup; Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate
        available_sizes = ['Not available']
    if not available_sizes:
        available_sizes = ['Not applicable']
    return available_sizes

In [73]:
# Extract product details from the product page
def _collect_detail_items(list_element, strip_chars, skip_markers=()):
    """
    Return the NFKD-normalized text of every <li> inside list_element.

    :param list_element: element whose <li> children hold the detail lines
    :param strip_chars: characters removed from the right end of each line
    :param skip_markers: substrings; a line whose raw text contains any of
                         them is left out
    :return: list of cleaned detail strings
    """
    items = []
    for detail in list_element.find_all('li'):
        raw_text = detail.text
        if any(marker in raw_text for marker in skip_markers):
            continue
        items.append(unicodedata.normalize("NFKD", raw_text).rstrip(strip_chars))
    return items


def extract_details(soup):
    """
    the product details are present in the form of a list
    there are three sets of details - fit and sizing, product details,
    fabric and care
    each set of details is present in a ul tag with class name starting
    with product-information-item__list

    when exactly three such lists are found they are read in that order
    otherwise the first two lists are read as product details and fabric and
    care, and fit and sizing is reported as ['Not applicable']

    fit and sizing lines containing 'wearing' are skipped
    product detail lines containing '#', 'P.A.C.E.' or 'pace' are skipped
    every kept line is NFKD-normalized and has its trailing '.' removed
    (trailing spaces too, except for product details)
    NOTE(review): NFKD alone may not drop zero width space characters
    (u200b) - confirm whether they need explicit removal

    :param soup: BeautifulSoup object
    :return: fit and sizing, product details, fabric and care which are
     lists; each is ['Not available'] when the lists cannot be read
    """
    product_skip_markers = ('#', 'P.A.C.E.', 'pace')

    try:
        details_elements = soup.select('ul[class^="product-information-item__list"]')
        if len(details_elements) == 3:
            fit_list, product_list, fabric_list = details_elements
            fit_sizing = _collect_detail_items(fit_list, '. ', ('wearing',))
        else:
            fit_sizing = ['Not applicable']
            product_list = details_elements[0]
            fabric_list = details_elements[1]

        product_details = _collect_detail_items(product_list, '.', product_skip_markers)
        # Each fabric and care line is added exactly once in both layouts
        fabric_care = _collect_detail_items(fabric_list, '. ')
    except Exception:
        # Best effort for unexpected markup; Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate
        fit_sizing = ['Not available']
        product_details = ['Not available']
        fabric_care = ['Not available']

    return [fit_sizing, product_details, fabric_care]

In [74]:
def main():
    """
    Scrape every product listed on PLP_URL and write the results to CSV_FILENAME.

    Steps, in order:
    - initialize the WebDriver and open PLP_URL
      (PLP means product listing page, the page on which the list of products is available)
    - scroll down the page to load more products
    - get the BeautifulSoup object from the current page, find each product
      element and extract its product url into a list
    - visit each product url that is not 'Not available', extract the product
      information and keep it as one row
    - print the progress, which is the position of the current product in the url list
    - after going through every url, build a pandas dataframe from the rows
      and write it to the CSV file
    - quit the WebDriver; this happens even when an earlier step raises, so
      the browser is not left running

    The steps above correspond to the sections of the function body, which
    are separated by blank lines.
    """
    columns = ['Product_URL', 'Product_Demographic', 'Product_Type', 'Product_Name',
               'Selling_Price', 'Max_Retail_Price', 'Rating',
               'Rating_Count', 'Color', 'Available_Sizes',
               'Fit_Sizing', 'Product_Details', 'Fabric_Care']

    driver = initialize_driver()
    try:
        driver.get(PLP_URL)

        scroll(driver)

        soup = get_soup(driver)
        product_info = soup.find_all('div', class_='category-page-1wcebst')
        pdp_url_list = [extract_pdp_url(product) for product in product_info]

        # Rows are collected in a list and turned into a dataframe once at the
        # end instead of growing the dataframe one row at a time
        rows = []
        for index, pdp_url in enumerate(pdp_url_list, start=1):
            if pdp_url == 'Not available':
                continue

            driver.get(pdp_url)
            random_sleep()
            soup = get_soup(driver)
            selling_price, max_retail_price = extract_prices(soup)
            fit_sizing, product_details, fabric_care = extract_details(soup)

            rows.append({
                'Product_URL': pdp_url,
                'Product_Demographic': extract_product_demo(soup),
                'Product_Type': extract_product_type(soup),
                'Product_Name': extract_product_name(soup),
                'Selling_Price': selling_price,
                'Max_Retail_Price': max_retail_price,
                'Rating': extract_star_value(soup),
                'Rating_Count': extract_ratings_count(soup),
                'Color': extract_color(soup),
                'Available_Sizes': ', '.join(extract_available_sizes(soup)),
                'Fit_Sizing': fit_sizing,
                'Product_Details': product_details,
                'Fabric_Care': fabric_care,
            })

            print(f"Processed product {index}")

        # Passing columns keeps the header and column order even when no rows were collected
        df = pd.DataFrame(rows, columns=columns)
        df.to_csv(CSV_FILENAME, index=False)
        print(f"Data written to {CSV_FILENAME}")
    finally:
        driver.quit()


# Run the scraper when this code is executed as the top-level program.
# The guard keeps main() from running when these definitions are imported from another module.
if __name__ == "__main__":
    main()
  

Processed product 1
Processed product 2
Processed product 3
Processed product 4
Processed product 5
Processed product 6
Processed product 7
Processed product 8
Processed product 157
Processed product 158
Processed product 159
Processed product 160
Processed product 161
Processed product 162
Processed product 163
Processed product 164
Processed product 165
Processed product 166
Processed product 167
Processed product 168
Processed product 169
Processed product 170
Processed product 171
Processed product 172
Processed product 173
Processed product 174
Processed product 175
Processed product 176
Processed product 177
Processed product 178
Processed product 179
Processed product 180
Processed product 181
Data written to gap_women_jeans.csv
