# Intro

The purpose of this notebook is to scrape products from a real retail website in order to evaluate approaches to product retrieval by matching.

In [165]:
import itertools
import pickle
import urllib.request
from dataclasses import dataclass
from time import sleep
from typing import List, Optional
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
from bs4.element import Tag
from tqdm import tqdm

# Define domain

For each product:

- We use the first product image found on the product page
- We collect every available color of the product as a separate product

In [148]:
@dataclass
class Product(object):
    """One scraped product page (a single colour variant of a garment).

    Attributes:
        url: Address of the product page the data was read from.
        image_url: ``src`` of the first matching product image on the page.
        name: Product name as shown on the page.
        section: Name of the site section the product was listed under.
    """

    url: str
    image_url: str
    name: str
    section: str

    @staticmethod
    def from_url(product_url: str,
                 section: str,
                 timeout: float = 30.0) -> Optional['Product']:
        """Download a product page and build a ``Product`` from it.

        Args:
            product_url: Address of the product page to fetch.
            section: Section name to record on the resulting product.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            The parsed product, or ``None`` when the page cannot be fetched
            (network error, timeout, HTTP error status) or does not contain
            both a product name and at least one product image.
        """
        try:
            response = requests.get(product_url, timeout=timeout)
            # Do not try to parse error pages as if they were products.
            response.raise_for_status()
        except requests.RequestException:
            return None

        soup = BeautifulSoup(response.text, 'html.parser')
        name_tag = name_in_product_page(soup)
        image_tags = images_in_product_page(soup)
        if name_tag is None or not image_tags:
            return None

        return Product(url=product_url,
                       section=section,
                       name=name_tag.text,
                       # Only the first image found is kept.
                       image_url=image_tags[0].get('src'))


def name_in_product_page(product_soup: BeautifulSoup) -> Optional[Tag]:
    """Locate the element that holds the product name on a product page.

    Args:
        product_soup: Parsed HTML of a product page.

    Returns:
        The first ``<span>`` carrying the ``nj-namecomponent-name`` class, or
        ``None`` when the page has no such element. The name itself is the
        element's ``.text``.
    """
    return product_soup.find('span', {'class': 'nj-namecomponent-name'})
        
        
def images_in_product_page(product_soup: BeautifulSoup) -> List[Tag]:
    """Collect the product images shown on a product page.

    Args:
        product_soup: Parsed HTML of a product page.

    Returns:
        Every ``<img>`` with the ``aino-image`` class whose ``src`` attribute
        is present and contains ``'nudiejeans'``, in document order. The
        result is empty when nothing matches.
    """
    def is_nudiejeans_src(src):
        # Images without a src attribute are passed in as None.
        return src is not None and 'nudiejeans' in src

    # find_all is the current name of the deprecated BS3-style findAll.
    return product_soup.find_all(
        'img',
        {'class': 'aino-image', 'src': is_nudiejeans_src},
    )


def color_urls(product_url: str, root_url: str) -> List[str]:
    """Return the page URL of every colour variant linked from a product page.

    Args:
        product_url: Address of the product page to inspect.
        root_url: Site root that relative colour links are resolved against.

    Returns:
        One absolute URL per colour link found on the page, in document
        order. Links without an ``href`` are skipped. The list is empty when
        the page has no colour links.
    """
    response = requests.get(product_url)
    soup = BeautifulSoup(response.text, 'html.parser')

    # NOTE(review): '_19uix' looks like a build-generated class name and may
    # change when the site is redeployed - verify before re-scraping.
    color_links = soup.find_all('a', {'class': '_19uix'})

    # A single trailing slash makes urljoin treat the root as a directory, so
    # both 'path' and '/path' hrefs resolve without producing 'root//path'.
    base_url = root_url.rstrip('/') + '/'
    hrefs = (color_link.get('href') for color_link in color_links)
    return [urljoin(base_url, href) for href in hrefs if href]


def listing_to_products(listing_tag: Tag,
                        section: str,
                        root_url: str = 'https://www.nudiejeans.com',
                        sleep_between_products_ms: int = 100) -> List[Product]:
    """Scrape every colour variant of the product shown in one listing tile.

    Args:
        listing_tag: Listing element whose first ``<a>`` links to the product.
        section: Section name recorded on each resulting product.
        root_url: Site root that relative links are resolved against.
        sleep_between_products_ms: Pause after each product request, in
            milliseconds, to avoid hammering the site.

    Returns:
        One ``Product`` per colour variant, or a single product when the page
        lists no colour links. Variants for which ``Product.from_url`` yields
        ``None`` are left out. The list is empty when the listing has no
        usable link.
    """
    link = listing_tag.find('a')
    path = link.get('href') if link is not None else None
    if not path:
        # Nothing to follow for this tile.
        return []

    # Trailing slash on the base avoids 'root//path' for hrefs starting with '/'.
    url = urljoin(root_url.rstrip('/') + '/', path)

    # A product without colour links is treated as its own only variant.
    variant_urls = color_urls(url, root_url) or [url]

    sleep_seconds = sleep_between_products_ms / 1000.0
    products = []
    for variant_url in variant_urls:
        product = Product.from_url(variant_url, section)
        if product is not None:
            products.append(product)
        sleep(sleep_seconds)

    return products

# List sections

In [147]:
sections = [
    'https://www.nudiejeans.com/jeans',
    'https://www.nudiejeans.com/selection/pants',
    'https://www.nudiejeans.com/selection/shorts',
    'https://www.nudiejeans.com/selection/denim-jackets',
    'https://www.nudiejeans.com/selection/jackets',
    'https://www.nudiejeans.com/selection/shirts'
]

# Parse products from listing

In [156]:
def section_to_products(section_url) -> List[Product]:
    """Scrape all products listed on one section page.

    Args:
        section_url: Address of the section (listing) page.

    Returns:
        A flat list with the products of every listing tile on the page, in
        page order. The section name stored on each product is the last path
        segment of ``section_url``.
    """
    response = requests.get(section_url)
    section_soup = BeautifulSoup(response.text, 'html.parser')
    # Ignore a trailing slash so '.../jeans/' still yields 'jeans', not ''.
    section_name = section_url.rstrip('/').split('/')[-1]

    # NOTE(review): '_1WLsJ' looks like a build-generated class name and may
    # change when the site is redeployed - verify before re-scraping.
    listings = section_soup.find_all('div', {'class': '_1WLsJ'})
    products_per_listing = (
        listing_to_products(listing, section_name)
        for listing in tqdm(listings, desc=f'Products from {section_name}')
    )

    return list(itertools.chain.from_iterable(products_per_listing))

# Scrape the sections one after another; the result holds one list of
# products per section, in the same order as `sections`.
all_products = list(map(section_to_products, sections))

Products from jeans: 100%|██████████| 102/102 [06:10<00:00,  3.63s/it]
Products from pants: 100%|██████████| 18/18 [00:57<00:00,  3.22s/it]
Products from shorts: 100%|██████████| 9/9 [00:12<00:00,  1.41s/it]
Products from denim-jackets: 100%|██████████| 27/27 [01:32<00:00,  3.44s/it]
Products from jackets: 100%|██████████| 24/24 [00:42<00:00,  1.75s/it]
Products from shirts: 100%|██████████| 44/44 [01:58<00:00,  2.70s/it]


Flatten the per-section lists into a single list of products.

In [160]:
# Merge the per-section lists into one flat list of products.
all_products = list(itertools.chain.from_iterable(all_products))

In [163]:
# Report how many products ended up in the flattened list.
print(f'We have {len(all_products)} products')

We have 790 products


# Store products

In [166]:
# Serialize the product list to 'products.pickle' with the newest pickle protocol.
with open('products.pickle', 'wb') as pickle_file:
    pickle.dump(all_products, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)