In [1]:
from bs4 import BeautifulSoup
import requests
import json
import math
import time
import pandas as pd
from nested_lookup import nested_lookup
from tqdm import tqdm
from requests_ip_rotator import ApiGateway



In [2]:
def parseNextCache(url, session):
    page = session.get(url)
    soup = BeautifulSoup(page.text, 'html') 
    script = soup.find('script', {'id': '__NEXT_DATA__'})
    data = json.loads(script.text)
    return data

In [3]:
def parseIndexPages(url):

    # # Create gateway object and initialise in AWS
    gateway = ApiGateway(url)
    gateway.start()

    # Assign gateway to session
    session = requests.Session()
    session.mount(url, gateway)

    data = parseNextCache(url, session)
    
    _first_page_results = nested_lookup("results", data)[0]
    _paging_info = _first_page_results["pageInfo"]
    total_pages = _paging_info["pageCount"] or math.ceil(_paging_info["total"] / _paging_info["limit"])
    product_previews = [edge["node"] for edge in _first_page_results["edges"]]
    product_sales = [salesInfo['lastSale'] for salesInfo in nested_lookup("salesInformation", data)]

    for i in tqdm(range(total_pages)):
        current_url = f"{url}&page={i+2}"
        data = parseNextCache(current_url, session)
        _page_results = nested_lookup("results", data)[0]
        product_previews.extend([edge["node"] for edge in _page_results["edges"]])
        product_sales.extend([salesInfo['lastSale'] for salesInfo in nested_lookup("salesInformation", data)])
    
    gateway.shutdown()
    return product_previews, product_sales
    


In [4]:

url = "https://stockx.com/search/sneakers/top-selling?s=indigo"
preview, sales = parseIndexPages(url)
df  = pd.DataFrame(preview)
df['last sale'] = sales

Starting API gateways in 10 regions.
Using 10 endpoints with name 'https://stockx.com/search/sneakers/top-selling?s=indigo - IP Rotate API' (10 new).


100%|██████████| 12/12 [00:06<00:00,  1.74it/s]


Deleting gateways for site 'https://stockx.com/search/sneakers/top-selling?s=indigo'.
Deleted 10 endpoints with for site 'https://stockx.com/search/sneakers/top-selling?s=indigo'.


Check if gateways are open: aws apigateway get-rest-apis --output text

In [10]:
df.to_csv(path_or_buf="/Users/FranklinZhao/TensorFlowProjects/ImageBasedSneakerPrediction/data/raw" + "output.csv", sep='\t')

In [118]:
# run to close gateways
gateway = ApiGateway("https://stockx.com/search/sneakers/top-selling?s=indigo")
gateway.shutdown()

Deleting gateways for site 'https://stockx.com/search/sneakers/top-selling?s=indigo'.
Deleted 10 endpoints with for site 'https://stockx.com/search/sneakers/top-selling?s=indigo'.


['5ze6q7wxhd',
 'rtvp6qt67i',
 'tx28wwmntb',
 'c6j93974k7',
 '4vh4di1yk5',
 'd90lz0it88',
 'h0mk1zj2ne',
 'b55lvyp321',
 'bfcjd2400k',
 '7axfh6hkkh']