# Objective
To scrape following data from the curator site (The Bay):
* product_title
* product_description
* product_brand
* product_images
* product_price
* product_material
* product_rating

# Importing Libraries

In [12]:
from bs4 import BeautifulSoup
import requests

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

import pandas as pd
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
import time
import re
from collections import Counter

# Scraping the data

## Scraped products url

![Product Page](images/products.png)

In [2]:
# website: thebay

no_pages = 2

links_list = []

def get_url(pageNo):
    option = webdriver.ChromeOptions()

    # Run the argument with incognito
    # option.add_argument(' — incognito')
    driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)

    driver.get('https://www.thebay.com/c/women/womens-clothing?start='+str(96*(pageNo-1))+'&sz=24')
    
    
    
    # Wait 30 seconds for page to load
    timeout = 30
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.ID, "welcome-email-modal")))
    except TimeoutException:
        print("Timed out waiting for page to load")
        browser.quit()
    # close the sign-in pop up
    driver.find_element_by_id('consent-close').click()
    
    time.sleep(3)
    
    # scroll to the bottom to load the page fully
    for i in range(1,4):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(4)

    soup = BeautifulSoup(driver.page_source, "html.parser")

    links = soup.find_all('a', attrs={'class':'thumb-link'})

    # Loop for extracting links
    for link in links:
        links_list.append(link.get('href'))
    return links_list

###  Ran the scraper and stored url in list

In [3]:
results = []
for i in range(1, no_pages+1):
    results.append(get_url(i))
x = lambda l: [item for sublist in l for item in sublist]
product_url_list = x(results)

  driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)


## Scraped products to gather the informations mentioned above
![Scraped product](images/product_scrap.png)

In [4]:
option = webdriver.ChromeOptions()

driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)


# Creating lists of features interested
product_title_list = list()
product_description_list = list()
product_brand_list = list()
product_image_list =list()
product_price_list = list()
product_rating_list = list()


# Getting the start time to track on time required
start = time.time()
product_number = 0

driver.set_window_size(1000,900)
driver.get('https://www.thebay.com'+product_url_list[0])

# Wait 30 seconds for page to load and extract the element after it loads
    
timeout = 30
try:
    WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.ID, "welcome-email-modal")))
except TimeoutException:
    print('Timed out waiting for page to load')
    
    # ----------------------------click close------------------------------------
driver.find_element_by_id('consent-close').click()

# -------------------------------Web Scraping-------------------------------
for link in product_url_list: 
    # Open the url
    driver.set_window_size(1000,900)
    driver.get('https://www.thebay.com'+link)

    # Wait 30 seconds for page to load and extract the element after it loads
    
    timeout = 30
    try:
        WebDriverWait(driver, timeout).until(EC.visibility_of_element_located((By.ID,'maincontent')))
    except TimeoutException:
        print('Timed out waiting for page to load')
        
    time.sleep(3)
    
    soup = BeautifulSoup(driver.page_source, "html.parser")

    # -------------------------------Product title-------------------------------
    # find_elements_by_id returns an array of selenium objects.
    try:
        product_title = soup.find('h1', attrs={'class':'product-name h2'}).text
    except:
        product_title = 'NA'
#     print("product title ",product_title)
    product_title_list.append(product_title)

    # -------------------------------Product price-------------------------------
    # This will return the product price
    try:
        product_price = soup.find('span', attrs={'class':'price'}).text
    except:
        product_price = 'NA'
#     print("product price ",product_price)
    product_price_list.append(product_price)

    # -------------------------------Description---------------------------------
    # This will return the description of the product
    try:
        product_description = soup.find('div', attrs={'class':'value content'}).text
    except:
        product_description = 'NA'
#     print("product description ",product_description)
    product_description_list.append(product_description)
  
    # -------------------------------Brand---------------------------------------
    # This will return the brand of the product
    try:
        product_brand = soup.find('a', attrs={'class':'product-brand adobelaunch__brand'}).text
    except:
        product_brand = 'NA'
#     print("product brand ",product_brand)
    product_brand_list.append(product_brand)
    
    # -------------------------------Images---------------------------------------
    # This will return the images of the product
    for img in soup.find_all('div', attrs={'class':'primary-images-wrapper col-12 col-md-10'}):
        image = img.find_all('img')
    product_image = list()    
    for img in image:
#         print(img.get('src'))
        product_image.append(img.get('src'))
    product_image = ' '.join(product_image)
    product_image_list.append(product_image)
    
    
        # -------------------------------Rating---------------------------------------
    # This will return the rating of the product out of 5
    try:
        product_rating = soup.find('span', attrs={'class':'tt-c-reviews-summary__rating-number'}).text
    except:
        product_rating = 'NA'
#     print("product rating ",product_rating)
    product_rating_list.append(product_rating)
    
    time.sleep(5)
    product_number += 1
    print('The product number {} has been scraped successfully'.format(product_number))
    
    
    
# Let us make a panda dataframe of information scraped
data = {'url':product_url_list,'product_title': product_title_list,'price':product_price_list,'description':product_description_list,
       'brand':product_brand_list,'images':product_image_list,'rating':product_rating_list}
df_product = pd.DataFrame.from_dict(data)
df_product.index.name = 'id'
display(df_product)

# Generate time tracker print
end = time.time()
print("For {} links, the time taken is {}".format(len(product_url_list), end-start))

  driver = webdriver.Chrome(executable_path='chromedriver', chrome_options=option)


The product number 1 has been scraped successfully
The product number 2 has been scraped successfully
The product number 3 has been scraped successfully
The product number 4 has been scraped successfully
The product number 5 has been scraped successfully
The product number 6 has been scraped successfully
The product number 7 has been scraped successfully
The product number 8 has been scraped successfully
The product number 9 has been scraped successfully
The product number 10 has been scraped successfully
The product number 11 has been scraped successfully
The product number 12 has been scraped successfully
The product number 13 has been scraped successfully
The product number 14 has been scraped successfully
The product number 15 has been scraped successfully
The product number 16 has been scraped successfully
The product number 17 has been scraped successfully
The product number 18 has been scraped successfully
The product number 19 has been scraped successfully
The product number 20

The product number 158 has been scraped successfully
The product number 159 has been scraped successfully
The product number 160 has been scraped successfully
The product number 161 has been scraped successfully
The product number 162 has been scraped successfully
The product number 163 has been scraped successfully
The product number 164 has been scraped successfully
The product number 165 has been scraped successfully
The product number 166 has been scraped successfully
The product number 167 has been scraped successfully
The product number 168 has been scraped successfully
The product number 169 has been scraped successfully
The product number 170 has been scraped successfully
The product number 171 has been scraped successfully
The product number 172 has been scraped successfully
The product number 173 has been scraped successfully
The product number 174 has been scraped successfully
The product number 175 has been scraped successfully
The product number 176 has been scraped succes

The product number 313 has been scraped successfully
The product number 314 has been scraped successfully
The product number 315 has been scraped successfully
The product number 316 has been scraped successfully
The product number 317 has been scraped successfully
The product number 318 has been scraped successfully
The product number 319 has been scraped successfully
The product number 320 has been scraped successfully
The product number 321 has been scraped successfully
The product number 322 has been scraped successfully
The product number 323 has been scraped successfully
The product number 324 has been scraped successfully
The product number 325 has been scraped successfully
The product number 326 has been scraped successfully
The product number 327 has been scraped successfully
The product number 328 has been scraped successfully
The product number 329 has been scraped successfully
The product number 330 has been scraped successfully
The product number 331 has been scraped succes

Unnamed: 0_level_0,product_title,price,description,brand,image1,rating
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Lane Button-Front Check Shacket,\n\n\n\nPrice reduced from\n\n\n$109\n\n\nto\n...,\nLive your cozy dreams in this check shacket ...,ONLY,https://image.s5a.com/is/image/TheBay/57150985...,
1,Relaxed-Knit Quarter-Zip Sweater,\n\n\n\nPrice reduced from\n\n\n$99.99\n\n\nto...,\nA slouchy pocketed silhouette coupled with b...,Mango,https://image.s5a.com/is/image/TheBay/84454383...,
2,High-Waist Distressed Straight Jeans,\n\n\n\nPrice reduced from\n\n\n$89.95\n\n\nto...,\nCut in a waist-emphasizing high rise silhoue...,Levi's,https://image.s5a.com/is/image/TheBay/80032282...,
3,Jessica Plaid Wool-Blend Shacket,\n\n\n\nPrice reduced from\n\n\n$84\n\n\nto\n\...,"\nAn amalgamation of a shirt and jacket, this ...",Design Lab,https://image.s5a.com/is/image/TheBay/88287016...,5.0
4,Ottoman Slouchy Tunic,\n\n\n\nPrice reduced from\n\n\n$204\n\n\nto\n...,\nKnitted cotton-blend tunic with a unique fun...,Free People,https://image.s5a.com/is/image/TheBay/19038057...,3.7
...,...,...,...,...,...,...
371,Ribbed Slouchy Cashmere Sweater,\n\n\n$329\n\n\n,\nLayering at its most luxurious. A slightly s...,Club Monaco,https://image.s5a.com/is/image/TheBay/19469803...,
372,Boatneck Fit-&-Flare Dress,\n\n\n$119.99\n\n\n,\nThis fit-and-flare midi dress features a fla...,Lauren Ralph Lauren,https://image.s5a.com/is/image/TheBay/19593411...,
373,Faux Fur-Trim Hooded Puffer Coat,\n\n\n\nPrice reduced from\n\n\n$349\n\n\nto\n...,\nCozy puffer coat featuring a detachable zip ...,Anne Klein,https://image.s5a.com/is/image/TheBay/72308835...,
374,Down Faux-Fur-Trim Hooded Parka Coat,\n\n\n\nPrice reduced from\n\n\n$299\n\n\nto\n...,\nDown-filled parka with a quilted design and ...,Anne Klein,https://image.s5a.com/is/image/TheBay/72308836...,5.0


For 376 links, the time taken is 3636.1308279037476


# Correcting the columns before exporting

## Correction of 'product link' column

In [9]:
#correction of link, added the https before the text as the craped url didn't have that
df_product['url'] = df_product['url'].apply(lambda x: 'https://www.thebay.com'+ x)

## Images url
The url of all the images is present in images column separated by white space. Below is the code for saving images in database/locally

In [None]:
# import urllib.request
# for i in range(len(df['image1'][:5])):
#     for image_url in range (len(df['image1'][i].split())):
# #         print(df['image1'][i].split()[image_url])
#         urllib.request.urlretrieve(df['image1'][i].split()[image_url], "image{}_of_product{}.jpg".format(image_url, i))

## Parsing price before and after sale
![price](images/price.png)

In [13]:
# created columns for price before and after sale, and one column to identify if the product was on sale
df_product['price_before_sale'] = df_product['price'].apply(lambda x: float(re.findall('[$][0-9.]*', x)[0].split('$')[1]))
# Sale 1 means the product was on sale and 0 indicates not on sale
df_product['sale_flag'] = df_product['price'].apply(lambda x: 1 if len(re.findall('[$][0-9.]*', x)) == 2 else 0)
df_product['price_after_sale'] = df_product['price'].apply(lambda x: float(re.findall('[$][0-9.]*', x)[-1].split('$')[1]))

In [17]:
df_product[['price_before_sale','Sale','price_after_sale']].head()

Unnamed: 0_level_0,price_before_sale,Sale,price_after_sale
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,109.0,1,76.3
1,99.99,1,74.99
2,89.95,1,59.99
3,84.0,1,58.8
4,204.0,1,163.2


The price of product is in US dollars. We can see columns of price before sale, sale, and price after sale. 

## Parsed details, specifications, and material information from description
![Description](images/description.png)

In [14]:
# parsed details, specs and material information from description column
df_product['details'] = df_product['description'].apply(lambda x:  x.split('.')[0].split('\n')[1])
df_product['specifications'] = df_product['description'].apply(lambda x: ', '.join(re.findall('[a-zA-Z][^0-9A-Z]*', x.split('Style')[0].split('SIZE')[0].split('.')[-1])))
df_product['material'] = df_product['description'].apply(lambda x: ', '.join(list(map(str, set(re.findall('[0-9]*[%][^A-Z]*', x))))))

In [18]:
df_product[['Details', 'Specifications', 'Material']].head()

Unnamed: 0_level_0,Details,Specifications,Material
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,Live your cozy dreams in this check shacket de...,"Spread collar, Long sleeves, Front button clos...",100% polyester
1,A slouchy pocketed silhouette coupled with blo...,"Rib-knit mockneck, Dropped shoulders, Long blo...",49% viscose/28% elastane/23% polyamide
2,Cut in a waist-emphasizing high rise silhouett...,"Five-pocket style, Zip fly with front button c...",100% cotton
3,"An amalgamation of a shirt and jacket, this sh...","Spread collar, Long sleeves with buttoned cuff...",
4,Knitted cotton-blend tunic with a unique funne...,"Funnel neck, Long sleeves, Rib-knit cuffs and ...",


In [19]:
# added NA for columns were no material was found. It also shows NA for products whose % of material was not mentioned
df_product['Material'] = df_product['Material'].apply(lambda x: 'NA' if len(x)== 0 else x)

## A more effiecent way of parsing material can be using a list of materials and extracting the material if mentioned

## Parsing materials information from Material column

In [21]:
# made a list of all material present in material column
materials = list()
for i in range(len(df_product['Material'])):
    materials.append(re.findall('[a-z][a-z]*', (df_product['Material'][i])))
all_material = x(materials)

In [22]:
# top 5 materials used to produce apparels
common_materials = Counter(all_material).most_common(5)
common_materials

[('polyester', 222),
 ('cotton', 88),
 ('viscose', 68),
 ('elastane', 42),
 ('nylon', 38)]

In [24]:
material_list = [item[0] for item in common_materials]

#---------------------- function to get the percentage value of each material------------------------
# for i in range (len(material_list)):
#     df_product['material_{}_percentage'.format(material_list[i])] = df_product['Material'].apply(lambda x: int(re.findall('[0-9][0-9]*', x.rpartition(material_list[i])[0])[-1]) if (material_list[i] in x) else 0)

#created new columns with common materials, 1 if material is present and 0 if not present
for i in range (len(material_list)):
    df_product['material_{}_flag'.format(material_list[i])] = df_product['description'].apply(lambda x: 1 if (material_list[i] in x) else 0)

In [37]:
df_product.dtypes

url                               object
product_title                     object
brand                             object
images                            object
rating                            object
price_before_sale                float64
sale_flag                          int64
price_after_sale                 float64
details                           object
specifications                    object
material                          object
material_polyester_flag            int64
material_cotton_flag               int64
material_viscose_flag              int64
material_elastane_flag             int64
material_nylon_flag                int64
material_polyester_percentage      int64
dtype: object

In [31]:
# Specified the column types of following as string
df_product[['url', 'product_title', 'brand', 'images', 'details', 'specifications', 'material']] = df_product[['url', 'product_title', 'brand', 'images', 'details', 'specifications', 'material']].astype(str)

In [32]:
# drop useless columns
df_product = df_product.drop(columns=['price', 'description'])

In [38]:
df_product.head()

Unnamed: 0_level_0,url,product_title,brand,images,rating,price_before_sale,sale_flag,price_after_sale,details,specifications,material,material_polyester_flag,material_cotton_flag,material_viscose_flag,material_elastane_flag,material_nylon_flag,material_polyester_percentage
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
0,https://www.thebay.com/product/only-lane-butto...,Lane Button-Front Check Shacket,ONLY,https://image.s5a.com/is/image/TheBay/57150985...,,109.0,1,76.3,Live your cozy dreams in this check shacket de...,"Spread collar, Long sleeves, Front button clos...",100% polyester,1,0,0,0,0,100
1,https://www.thebay.com/product/mango-relaxed-k...,Relaxed-Knit Quarter-Zip Sweater,Mango,https://image.s5a.com/is/image/TheBay/84454383...,,99.99,1,74.99,A slouchy pocketed silhouette coupled with blo...,"Rib-knit mockneck, Dropped shoulders, Long blo...",49% viscose/28% elastane/23% polyamide,0,0,1,1,0,0
2,https://www.thebay.com/product/levis-high-wais...,High-Waist Distressed Straight Jeans,Levi's,https://image.s5a.com/is/image/TheBay/80032282...,,89.95,1,59.99,Cut in a waist-emphasizing high rise silhouett...,"Five-pocket style, Zip fly with front button c...",100% cotton,0,1,0,0,0,0
3,https://www.thebay.com/product/design-lab-jess...,Jessica Plaid Wool-Blend Shacket,Design Lab,https://image.s5a.com/is/image/TheBay/88287016...,5.0,84.0,1,58.8,"An amalgamation of a shirt and jacket, this sh...","Spread collar, Long sleeves with buttoned cuff...",,0,0,0,0,1,0
4,https://www.thebay.com/product/free-people-ott...,Ottoman Slouchy Tunic,Free People,https://image.s5a.com/is/image/TheBay/19038057...,3.7,204.0,1,163.2,Knitted cotton-blend tunic with a unique funne...,"Funnel neck, Long sleeves, Rib-knit cuffs and ...",,0,1,0,0,1,0


# Export and Save

In [39]:
# -------------------------------EXPORT and SAVE-------------------------------
# Exporting the data into csv
df_product.to_csv('product_info_thebay.csv')

# Future recommendations
1. We can group products by brand and use NLP to determine public sentiment for that particular brand using comments
2. Similarly, we can determine the trendy specs and brands by exploring the description using NLP. 
3. We can also explore eco friendly and sustainable tags in description of the apparels and group them to analyze the brand, specs, and reviews.

# References
* https://github.com/VincentTatan/Web-Scraping/blob/master/Selenium%20Web%20Scraping/amazon/scraping-amazon.py
* https://www.datacamp.com/community/tutorials/amazon-web-scraping-using-beautifulsoup