# Notebook Overview

# Imports

In [1]:
from bs4 import BeautifulSoup # For Webscraping
import requests
import re

import pandas as pd

In [2]:
# # Import custom functions
# import sys
# sys.path.append('../')

# from custom_functions import functions_for_parsing

# Which pages are allowed?

In [3]:
# Find the url for next's robots.txt
robots_txt = 'https://www.next.co.uk/robots.txt'

# Get the text from robots.txt page
response = requests.get(robots_txt)
allow_and_disallow = response.text

# Create list of each line (string -> list)
allow_and_disallow = allow_and_disallow.split("\r\n")

# Filter for only pages that are allowed or disallowed
allow = [i for i in allow_and_disallow if i.startswith('Allow')]
disallow = [i for i in allow_and_disallow if i.startswith('Disallow')]

# Filter for only pages that are clothing
allow_clothing = [i for i in allow if not i.startswith('Allow: /homeware')]
allow_clothing = [i for i in allow_clothing if not i.startswith('Allow: /shop/department-homeware')]

In [4]:
def items_starting_with_string(_list, _string):
    '''Return items in list which begin with string'''
    return [i for i in _list if i.startswith(_string)]

def items_containing_string(_list, _string):
    '''Return items in list with the string'''
    return [i for i in _list if _string in i]

In [5]:
items_starting_with_string(disallow, 'Disallow: /shop/gender-men')

['Disallow: /shop/gender-men-gender-women*',
 'Disallow: /shop/gender-men-gender-oldergirls*',
 'Disallow: /shop/gender-men-gender-youngergirls*',
 'Disallow: /shop/gender-men-gender-women-gender-youngergirls*',
 'Disallow: /shop/gender-men-gender-unisex*',
 'Disallow: /shop/gender-men/*',
 'Disallow: /shop/gender-men-productaffiliation-casualshirts*',
 'Disallow: /shop/gender-men-productaffiliation-formalshirts*',
 'Disallow: /shop/gender-men-productaffiliation-shirts*']

In [6]:
items_containing_string(disallow, 'sizetype')

['Disallow: /shop/gender-*/sizetype-*sizetype*',
 'Disallow: /shop/gender-women/sizetype-*/*',
 'Disallow: /shop/sizetype-*/*sizetype-*',
 'Disallow: /shop/*sizetype-*/*sizetype-*',
 'Disallow: /shop/*-sizetype-*-sizetype-*',
 'Disallow: /shop/sizetype-*-sizetype-*',
 'Disallow: /shop/*/*-sizetype-*-sizetype-*',
 'Disallow: /shop/*/sizetype-*-sizetype-*']

In [7]:
allow_clothing

['Allow: /shop/gender-men-brand-nike-0$',
 'Allow: /shop/gender-women-brand-nike-0$',
 'Allow: /shop/gender-men-brand-adidas-0$',
 'Allow: /shop/gender-women-brand-adidas-0$',
 'Allow: /shop/gender-men-brand-converse-0$',
 'Allow: /shop/gender-women-brand-converse-0$',
 'Allow: /shop/gender-men-brand-skechers-0$',
 'Allow: /shop/gender-women-brand-skechers-0$',
 'Allow: /shop/gender-men-brand-underarmour-0$',
 'Allow: /shop/gender-women-brand-underarmour-0$',
 'Allow: /shop/gender-women-category-dresses/size-16$',
 'Allow: /shop/gender-women-category-dresses/size-20$',
 'Allow: /shop/gender-women-category-dresses/size-22$',
 'Allow: /shop/gender-women-category-dresses/size-4$',
 'Allow: /shop/gender-women-category-dresses/size-6$',
 'Allow: /shop/gender-women-category-dresses/size-12$',
 'Allow: /shop/gender-women-category-dresses/size-24$',
 'Allow: /shop/gender-women-category-dresses/size-8$',
 'Allow: /shop/gender-women-category-dresses/size-10$',
 'Allow: /shop/gender-women-categor

Great pages to use:
* `/shop/gender-women/sizetype-*$` 
    * [Link](https://www.next.co.uk/shop/gender-women/sizetype-)
    * https://www.next.co.uk/shop/gender-men/sizetype-
* `/shop/gender-men/sizetype-*$'` [Link](https://www.next.co.uk/shop/gender-men/sizetype-)

# Create Beautiful Soup Object

## Test on Women's clothing 

In [8]:
url_women = 'https://www.next.co.uk/shop/gender-women/sizetype-'

In [9]:
# Make a get request to retrieve the page
html_page = requests.get(url_women)

# Pass the page contents to beautiful soup for parsing
soup = BeautifulSoup(html_page.content, 'html.parser')

In [10]:
# Display the html code
print(soup.prettify())

<!DOCTYPE html>
<html dir="ltr" lang="en-gb">
 <head>
  <link as="font" crossorigin="" href="//www.next.co.uk/content/Fonts/AzoSans-Regular-webfont.woff" rel="preload" type="font/woff"/>
  <link as="font" crossorigin="" href="//www.next.co.uk/content/Fonts/AzoSans-Medium-webfont.woff" rel="preload" type="font/woff"/>
  <link as="font" crossorigin="" href="//www.next.co.uk/content/Fonts/AzoSans-Light-webfont.woff" rel="preload" type="font/woff"/>
  <link as="style" href="/CSS/fonts/Desktop/Fonts.min.css" rel="preload"/>
  <link href="/CSS/fonts/Desktop/Fonts.min.css" rel="stylesheet"/>
  <link as="font" href="/content/Fonts/AzoSans-Regular-webfont.woff" rel="preload" type="font/woff"/>
  <link as="font" href="/content/Fonts/AzoSans-Medium-webfont.woff" rel="preload" type="font/woff"/>
  <link as="script" href="https://xcdn.next.co.uk/content/platmod/vendors/react/16.13.1/umd/react.production.min.js" rel="preload"/>
  <link as="script" href="https://xcdn.next.co.uk/content/platmod/vendor

### Retrieve all women's products

In [11]:
all_clothing_womens = soup.find_all('article', class_='Item Fashion')
product_one = all_clothing_womens[0] 
product_one # Preview the first entry

<article class="Item Fashion" data-brand="Next" data-colour="Black" data-department="Womenswear" data-itemnumber="201339" data-itemposition="1" id="i1">
<section class="Details firstRow">
<div class="Info">
<h2 class="Title">
<a class="TitleText" data-desc="Black Super Stretch Soft Sculpt Pull-On Denim Leggings" data-label="Black Super Stretch Soft Sculpt Pull-On Denim Leggings" data-usespan="False" href="//www.next.co.uk/g9434s13/201339#201339" title="Black Super Stretch Soft Sculpt Pull-On Denim Leggings">
<span class="Desc">
                            Black Super Stretch Soft Sculpt Pull-On Denim Leggings
                        </span>
</a>
</h2>
<div class="Price">
<a href="//www.next.co.uk/g9434s13/201339#201339" title="Black Super Stretch Soft Sculpt Pull-On Denim Leggings">
£22                        </a>
</div>
<div class="Rating rating-50">
<a href="//www.next.co.uk/g9434s13/201339#201339" title="Black Super Stretch Soft Sculpt Pull-On Denim Leggings">
</a>
</div>
<div class

### Find the title of product one

In [12]:
product_one.find("h2", class_="Title")

<h2 class="Title">
<a class="TitleText" data-desc="Black Super Stretch Soft Sculpt Pull-On Denim Leggings" data-label="Black Super Stretch Soft Sculpt Pull-On Denim Leggings" data-usespan="False" href="//www.next.co.uk/g9434s13/201339#201339" title="Black Super Stretch Soft Sculpt Pull-On Denim Leggings">
<span class="Desc">
                            Black Super Stretch Soft Sculpt Pull-On Denim Leggings
                        </span>
</a>
</h2>

In [13]:
product_one.find('a')

<a class="TitleText" data-desc="Black Super Stretch Soft Sculpt Pull-On Denim Leggings" data-label="Black Super Stretch Soft Sculpt Pull-On Denim Leggings" data-usespan="False" href="//www.next.co.uk/g9434s13/201339#201339" title="Black Super Stretch Soft Sculpt Pull-On Denim Leggings">
<span class="Desc">
                            Black Super Stretch Soft Sculpt Pull-On Denim Leggings
                        </span>
</a>

In [14]:
product_one_title = product_one.find('a').attrs['title']
product_one_title

'Black Super Stretch Soft Sculpt Pull-On Denim Leggings'

### Find the price of product one

In [15]:
product_one.find("div", class_="Price")

<div class="Price">
<a href="//www.next.co.uk/g9434s13/201339#201339" title="Black Super Stretch Soft Sculpt Pull-On Denim Leggings">
£22                        </a>
</div>

In [16]:
product_one.find("div", class_="Price").find('a')

<a href="//www.next.co.uk/g9434s13/201339#201339" title="Black Super Stretch Soft Sculpt Pull-On Denim Leggings">
£22                        </a>

In [17]:
product_one.find("div", class_="Price").find('a').text

'\r\n£22                        '

In [18]:
product_one.find("div", class_="Price").find('a').text.strip()

'£22'

In [19]:
product_one_price = product_one.find("div", class_="Price").find('a').text.strip().strip('£')
product_one_price

'22'

### Get Product Name

In [20]:
def retrive_first_product_price():
    # Retrieve all products
    all_products = soup.find_all('article', class_='Item Fashion')
    # Pick out the first product
    product_one = all_products[0]
    # Pick out the price of the first product, which is in the strong tag
    product_one_price = product_one.find("strong")
    
    # Print the result
    print('Product Price: {}'.format(product_one_price.get_text()))
    # Remove the $ sign
    print(product_one_price.get_text().strip().strip('$'))

if __name__ == '__main__':
    retrive_first_product_price()

AttributeError: 'NoneType' object has no attribute 'get_text'

### Build Dictionary of product names and prices

In [21]:
def product_prices():
    # Retrieve all products
    all_products = soup.find_all('article', class_='Item Fashion')
    # Empty Dictionary
    products = []

    # Build Dictionary with key as name of each product, which is under the "p" tag. and value as the price
    
    for product in all_products:
        name_and_price = {}
        name_and_price['Name'] = product.find("a").attrs['title']
        name_and_price['Price'] = int(product.find("div", class_="Price").find('a').text.strip().strip('£'))
        products.append(name_and_price)
        #         products[product.find("a").attrs['title']] = [int(product.find("div", class_="Price").find('a').text.strip().strip('£'))]
    
    # Sort by name
#     products_sorted = dict(sorted(products.items(), key=lambda item: item[1]))
    
    return products

In [22]:
product_prices()

[{'Name': 'Black Super Stretch Soft Sculpt Pull-On Denim Leggings',
  'Price': 22},
 {'Name': 'Black Oversized Soft Shacket', 'Price': 20},
 {'Name': 'Black Tree Graphic Christmas Sweatshirt', 'Price': 25},
 {'Name': 'Light Pink Womens Christmas Jumper', 'Price': 34},
 {'Name': 'Black Teddy Borg Coat', 'Price': 68},
 {'Name': 'Khaki Emma Willis Rib High Neck Jumper', 'Price': 34},
 {'Name': 'Black Womens Christmas Jumper', 'Price': 30},
 {'Name': 'Black/Red Womens Christmas Jumper', 'Price': 30},
 {'Name': 'Red Padded Coat', 'Price': 50},
 {'Name': 'Oatmeal Tree Graphic Christmas Sweatshirt', 'Price': 26},
 {'Name': 'Black Roll Neck Dress', 'Price': 35},
 {'Name': 'Charcoal Rainbow Cotton Blend Pyjamas', 'Price': 25},
 {'Name': 'Black Embellished Star Dolman Long Sleeve Top', 'Price': 18},
 {'Name': 'Black Forever Comfort® Rider Knee High Material Mix Boots',
  'Price': 56},
 {'Name': 'Nike Run Black/Gold Revolution 5 Premium Trainers', 'Price': 48},
 {'Name': 'Red Womens Christmas Jum

In [23]:
df = pd.DataFrame.from_dict(product_prices())
df

Unnamed: 0,Name,Price
0,Black Super Stretch Soft Sculpt Pull-On Denim ...,22
1,Black Oversized Soft Shacket,20
2,Black Tree Graphic Christmas Sweatshirt,25
3,Light Pink Womens Christmas Jumper,34
4,Black Teddy Borg Coat,68
5,Khaki Emma Willis Rib High Neck Jumper,34
6,Black Womens Christmas Jumper,30
7,Black/Red Womens Christmas Jumper,30
8,Red Padded Coat,50
9,Oatmeal Tree Graphic Christmas Sweatshirt,26
