In [1]:
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import json
from tqdm import tqdm
import pandas as pd

In [2]:
def list_of_brands():
    """Fetch the men's clearance search page and extract the brand filter options.

    The page embeds a JavaScript assignment of the form
    ``saleclearance.pageState.filters = <JSON>;``. The JSON text between that
    assignment and the following ``saleclearance.pageState.filters_TotalResults``
    statement is cut out of the prettified HTML and parsed.

    Returns:
        tuple: ``(map_brands, brands_)`` where ``map_brands`` maps each filter
        option's ``Value`` to its lower-cased ``Name`` and ``brands_`` is the
        list of ``Value`` strings in page order.

    Raises:
        ValueError: if the expected markers are not present in the page, or if
            the extracted text is not valid JSON (``json.JSONDecodeError`` is a
            ``ValueError`` subclass).
    """
    url_2 = "https://www.next.co.uk/clearance/search?w=*&af=gender:men"
    start_marker = "saleclearance.pageState.filters = "
    end_marker = "saleclearance.pageState.filters_TotalResults"

    page = requests.get(url_2)
    soup = BeautifulSoup(page.text, 'html.parser').prettify()

    marker_pos = soup.find(start_marker)
    end_pos = soup.find(end_marker)
    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently produce garbage, so fail loudly instead.
    if marker_pos == -1 or end_pos == -1 or end_pos < marker_pos:
        raise ValueError("filter markers not found in page source: " + url_2)

    # strip() removes surrounding whitespace; [:-1] drops the trailing
    # character of the assignment (presumably the terminating ';').
    filters_json = soup[marker_pos + len(start_marker):end_pos].strip()[:-1]
    list_of_filters = json.loads(filters_json)

    map_brands = {}
    brands_ = []
    # NOTE(review): index 3 is assumed to be the brand filter -- confirm
    # against the live page, since the position is hard-coded.
    for obj_brand in list_of_filters[3]['FilterOptions']:
        map_brands[obj_brand['Value']] = obj_brand['Name'].lower()
        brands_.append(obj_brand['Value'])

    return map_brands, brands_

In [3]:
def find_total_results(soup):
    """Extract the total result count embedded in the page source.

    Looks for the statement
    ``saleclearance.pageState.filters_TotalResults = <number>;`` and returns
    the number as an int.

    Args:
        soup: the page source as a string (the callers pass prettified HTML).

    Returns:
        int: the value assigned to ``filters_TotalResults``.

    Raises:
        ValueError: if the marker or its terminating ``;`` is missing, or if
            the text between them is not an integer.
    """
    text_for_total = "saleclearance.pageState.filters_TotalResults = "

    marker_pos = soup.find(text_for_total)
    if marker_pos == -1:
        # Without this check, -1 + len(marker) would point at an unrelated
        # offset and the function would parse arbitrary text.
        raise ValueError("total-results marker not found in page source")

    value_start = marker_pos + len(text_for_total)
    value_end = soup.find(';', value_start)
    if value_end == -1:
        raise ValueError("total-results statement is not terminated by ';'")

    return int(soup[value_start:value_end])

In [4]:
def source_to_data(soup):
    """Cut the product-results JSON text out of the page source.

    Returns the text between ``saleclearance.pageState.results = `` and
    ``saleclearance.pageState.isLandingPage = false``, stripped of surrounding
    whitespace and with its final character removed (presumably the ``;``
    that ends the assignment).

    Args:
        soup: the page source as a string (the callers pass prettified HTML).

    Returns:
        str: the extracted text, intended to be fed to ``json.loads``.

    Raises:
        ValueError: if either marker is missing or they appear out of order.
    """
    start = "saleclearance.pageState.results = "
    end = "saleclearance.pageState.isLandingPage = false"

    marker_pos = soup.find(start)
    end_pos = soup.find(end)
    # str.find returns -1 on a miss; slicing with it would return an
    # arbitrary piece of the page instead of signalling the problem.
    if marker_pos == -1 or end_pos == -1:
        raise ValueError("results markers not found in page source")

    start_pos = marker_pos + len(start)
    if end_pos < start_pos:
        raise ValueError("results markers appear out of order in page source")

    return str(soup[start_pos:end_pos]).strip()[:-1]

In [5]:
def parse_products_of_brand(url, brand):
    """Scrape every results page for one brand and append rows to global ``data``.

    The first request is used only to read the total result count; each
    results page is then fetched separately with an ``srt`` offset.

    Args:
        url: the brand-filtered search URL (without paging parameters).
        brand: the brand label stored in each appended row.

    Side effects:
        Appends ``[name, brand, original_price, price, image_url]`` lists to
        the module-level ``data`` list, which must already exist.

    A page whose embedded product JSON cannot be extracted or parsed is
    reported with ``print`` and skipped; the remaining pages are still
    processed.
    """
    # presumably the number of products the site returns per page -- confirm
    page_size = 24

    page = requests.get(url)
    soup = BeautifulSoup(page.text, 'html.parser').prettify()
    total_results = find_total_results(soup)

    # Ceiling division, but always fetch at least one page (also when the
    # reported total is 0).
    no_pages = max(1, (total_results + page_size - 1) // page_size)

    for page_no in range(1, no_pages + 1):
        str_no = (page_no - 1) * page_size
        url_page = url + f"%20&srt={str_no}#searchpos_{page_no}"
        per_page = requests.get(url_page)
        soup_per_page = BeautifulSoup(per_page.text, 'html.parser').prettify()

        try:
            list_of_products = json.loads(source_to_data(soup_per_page))
        except ValueError as e:
            # json.JSONDecodeError is a ValueError subclass. Skip this page so
            # that an undefined or previous page's product list is never
            # reused.
            print(e)
            continue

        for product in list_of_products:
            first_option = product['ItemOptions'][0]
            original_price = first_option['OriginalPrice']
            price = first_option['Price']
            image_url = product['SearchImage'] + product['Image']
            data.append([product['Name'], brand, float(original_price), float(price), image_url])

In [6]:
def add_to_df():
    """Build the global ``df`` from the global ``data`` rows and save it as CSV.

    Rebinds the module-level name ``df`` and writes ``output.csv`` (without
    the index column) to the current working directory.
    """
    global df
    column_names = ['Name', 'Brand', 'Original Price', 'Discounted Price', 'Image Url']
    df = pd.DataFrame(data, columns=column_names)
    df.to_csv('output.csv', index=False)

In [7]:
def main():
    """Scrape the men's clearance listing brand by brand into global ``data``.

    Resets the module-level ``data`` list, fetches the brand filter values,
    and calls ``parse_products_of_brand`` once per brand with a progress bar.
    This function does not build the DataFrame or write the CSV; call
    ``add_to_df()`` separately for that.
    """
    global data
    data = []

    url_main = "https://www.next.co.uk/clearance/search?w=*&af=gender:men"
    map_brands, brands = list_of_brands()

    for brand in tqdm(brands):
        # The brand filter value is appended to the existing "af" parameter.
        link = f"{url_main}%20{brand}%20gender:men"
        parse_products_of_brand(link, map_brands[brand])

In [8]:
# Run the full scrape when this cell executes as the main module; it
# populates the global `data` list.
if __name__ == '__main__':
    main()

100%|██████████| 84/84 [02:54<00:00,  2.08s/it]


In [9]:
# Number of product rows collected in the global `data` list.
len(data)

2401

In [10]:
# Build the DataFrame from the scraped rows (same columns as add_to_df, but
# without writing output.csv).
df = pd.DataFrame(data, columns=['Name', 'Brand', 'Original Price', 'Discounted Price', 'Image Url'])

In [11]:
# Display the scraped products table.
df

Unnamed: 0,Name,Brand,Original Price,Discounted Price,Image Url
0,Ted Baker Francj Debonair Slim Fit Wool Jacket,ted baker,289.0,120.0,https://xcdn.next.co.uk/COMMON/Items/Default/D...
1,Ted Baker Regdebj Wool Birdseye Suit Jacket,ted baker,289.0,120.0,https://xcdn.next.co.uk/COMMON/Items/Default/D...
2,Ted Baker Franct Debonair Slim Fit Wool Trousers,ted baker,139.0,65.0,https://xcdn.next.co.uk/COMMON/Items/Default/D...
3,Ted Baker Diaryt Debonair Semi Plain Wool Trou...,ted baker,139.0,65.0,https://xcdn.next.co.uk/COMMON/Items/Default/D...
4,Ted Baker Purple Shecan Floral Cotton Blend Shirt,ted baker,95.0,45.0,https://xcdn.next.co.uk/COMMON/Items/Default/D...
...,...,...,...,...,...
2396,Sik Silk Logo Joggers,sik silk,45.0,18.0,https://xcdn.next.co.uk/COMMON/Items/Default/D...
2397,RI Black Sandals,the north face,32.0,16.0,https://xcdn.next.co.uk/COMMON/Items/Default/D...
2398,Slim Fit TimberlandÂ® Blue Long Sleeve Stretch...,timberland,40.0,40.0,https://xcdn.next.co.uk/COMMON/Items/Default/D...
2399,TOMS Grey Washed Canvas Espadrilles,toms,50.0,22.0,https://xcdn.next.co.uk/COMMON/Items/Default/D...


In [12]:
# Show the dtype pandas assigned to each column.
df.dtypes

Name                 object
Brand                object
Original Price      float64
Discounted Price    float64
Image Url            object
dtype: object