In [1]:
def strip_string_to_digits_only(string):
    '''Takes a string and returns only the digits'''
    digits = int(''.join([i for i in string if i.isdigit()]))
    return digits

In [2]:
def sku_meta_data(url):
    '''Returns a Counter of the total, new, in stock, restocked and out of stock values from a url page'''
    
    # Make a get request to retrieve the page
    html_page = requests.get(url)

    # Pass the page contents to beautiful soup for parsing
    soup = BeautifulSoup(html_page.content, 'html.parser')

    # Container that contains all the information we need
    container = soup.find('div', class_='Filters')
    
    # Total SKU
    total_sku = container.find('div', class_='ResultCount').div.text
    total_sku = strip_string_to_digits_only(total_sku)

    # New SKU
    new_sku = container.find('li', class_='Option newin').text
    new_sku = strip_string_to_digits_only(new_sku)

    # In Stock SKU
    is_sku = container.find('li', class_='Option available').text
    is_sku = strip_string_to_digits_only(is_sku)
    is_pct = round(is_sku/total_sku*100,rounding)

    # Restocked SKU
    restocked_sku = container.find('li', class_='Option backinstock').text
    restocked_sku = strip_string_to_digits_only(restocked_sku)

    # Out of Stock SKU
    oos_sku = total_sku - is_sku
    oos_pct = round(oos_sku/total_sku*100,rounding)
    
    return Counter({'Total': total_sku, 'New': new_sku, 'In Stock': is_sku, 'Restocked': restocked_sku, 'Out of Stock': oos_sku})

# Notebook Overview

# Information to scrape:

* Total SKU
* Total SKU in stock vs OOS

# Imports

In [3]:
from bs4 import BeautifulSoup # Webscraping
import requests
import re

import pandas as pd # Dataframes

from collections import Counter # Combining dictionary values
from datetime import date # Input date the data is scraped

In [4]:
rounding = 2

# Creat Beautiful Soup Object

## Women's clothing

In [5]:
url_women = 'https://www.next.co.uk/shop/gender-women/sizetype-'

# Make a get request to retrieve the page
html_page = requests.get(url_women)

# Pass the page contents to beautiful soup for parsing
soup = BeautifulSoup(html_page.content, 'html.parser')
# Container that contains all the information we need
container = soup.find('div', class_='Filters')

# Total SKU
total_sku = container.find('div', class_='ResultCount').div.text
total_sku = strip_string_to_digits_only(total_sku)

# New SKU
new_sku = container.find('li', class_='Option newin').text
new_sku = strip_string_to_digits_only(new_sku)

# In Stock SKU
is_sku = container.find('li', class_='Option available').text
is_sku = strip_string_to_digits_only(is_sku)
is_pct = round(is_sku/total_sku*100,rounding)

# Restocked SKU
restocked_sku = container.find('li', class_='Option backinstock').text
restocked_sku = strip_string_to_digits_only(restocked_sku)

# Out of Stock SKU
oos_sku = total_sku - is_sku
oos_pct = round(oos_sku/total_sku*100,rounding)

print(f'''
SKU
---
Total:\t\t{total_sku}
New:\t\t{new_sku}
In Stock:\t{is_sku}\t({is_pct}%)
Restocked:\t{restocked_sku}
Out of Stock:\t{oos_sku}\t({oos_pct}%)
''')


SKU
---
Total:		52846
New:		6249
In Stock:	45464	(86.03%)
Restocked:	303
Out of Stock:	7382	(13.97%)



In [13]:
sku_meta_data('https://www.next.co.uk/shop/gender-women/sizetype-')

TypeError: 'Counter' object is not callable

## Men's Clothing

In [6]:
url_men = 'https://www.next.co.uk/shop/gender-men/sizetype-'

# Make a get request to retrieve the page
html_page = requests.get(url_men)

# Pass the page contents to beautiful soup for parsing
soup = BeautifulSoup(html_page.content, 'html.parser')

# Container that contains all the information we need
container = soup.find('div', class_='Filters')

# Total SKU
total_sku = container.find('div', class_='ResultCount').div.text
total_sku = strip_string_to_digits_only(total_sku)

# New SKU
new_sku = container.find('li', class_='Option newin').text
new_sku = strip_string_to_digits_only(new_sku)

# In Stock SKU
is_sku = container.find('li', class_='Option available').text
is_sku = strip_string_to_digits_only(is_sku)
is_pct = round(is_sku/total_sku*100,rounding)

# Restocked SKU
restocked_sku = container.find('li', class_='Option backinstock').text
restocked_sku = strip_string_to_digits_only(restocked_sku)

# Out of Stock SKU
oos_sku = total_sku - is_sku
oos_pct = round(oos_sku/total_sku*100,rounding)

print(f'''
SKU
---
Total:\t\t{total_sku}
New:\t\t{new_sku}
In Stock:\t{is_sku}\t({is_pct}%)
Restocked:\t{restocked_sku}
Out of Stock:\t{oos_sku}\t({oos_pct}%)
''')


SKU
---
Total:		25090
New:		2131
In Stock:	20476	(81.61%)
Restocked:	213
Out of Stock:	4614	(18.39%)



In [7]:
# Dataframe to store the results
meta_data_df = pd.DataFrame(columns=('Date', 'Total', 'New', 'In Stock', 'Restocked', 'Out of Stock'))
meta_data_df

Unnamed: 0,Date,Total,New,In Stock,Restocked,Out of Stock


In [8]:
# Timestamp of date of scraping
todays_date = {'Date': date.today()}
# todays_meta_data['Date'] = today

# Combine the meta_data for sku from webpages scraped
womens_meta_data = sku_meta_data('https://www.next.co.uk/shop/gender-women/sizetype-')
mens_meta_data = sku_meta_data('https://www.next.co.uk/shop/gender-men/sizetype-')
sku_meta_data = mens_meta_data + womens_meta_data

# Combine the Date with the meta_data
todays_meta_data = {**todays_date, **sku_meta_data}
todays_meta_data

{'Date': datetime.date(2020, 12, 21),
 'Total': 77936,
 'New': 8380,
 'In Stock': 65940,
 'Restocked': 516,
 'Out of Stock': 11996}

In [9]:
# Update meta data df
meta_data_df = meta_data_df.append(todays_meta_data, ignore_index=True)
meta_data_df

Unnamed: 0,Date,Total,New,In Stock,Restocked,Out of Stock
0,2020-12-21,77936,8380,65940,516,11996


In [10]:
# Save the updated df
meta_data_df.to_csv('../data/meta_data')

In [11]:
# Append the new data to meta_data csv
meta_data_df.to_csv('../data/meta_data.csv', mode='a', header=False)

Need to write function to create blank csv if none exists, 
then append to that csv
then repeat every X hours

In [12]:
# code from tutorial:
if __name__ == '__main__':
    while True:
        find_jobs()
        time_wait = 10
        print(f'Waiting {time_wait} minutes...')
        time.sleep(time_wait * 60)

NameError: name 'find_jobs' is not defined