In [1]:
import json
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlencode

In [2]:
# Store API keys as environment variable
import os
from getpass import getpass

# Prompt for the ScrapeOps key only when it is not already present in the
# environment, so re-running this cell (or running the notebook with the
# variable pre-set) neither blocks on input nor overwrites an existing key.
if not os.environ.get('SCRAPEOPS_API_KEY'):
    os.environ['SCRAPEOPS_API_KEY'] = getpass('Enter scrapeops api key: ')

Enter scrapeops api key: ··········


In [71]:
def create_walmart_product_search_page_url(key_word: str, page_num: int):
    """Build the Walmart search-results URL for a keyword and page number.

    The query string always asks for ``best_seller`` ordering and sets
    ``affinityOverride`` to ``default``; ``key_word`` and ``page_num`` are
    URL-encoded into the ``q`` and ``page`` parameters.
    """
    base_url = 'https://www.walmart.com/search?'
    query_params = [
        ('q', key_word),
        ('sort', 'best_seller'),
        ('page', page_num),
        ('affinityOverride', 'default'),
    ]
    return base_url + urlencode(query_params)

def create_walmart_product_item_page_url(product):
    """Return the absolute walmart.com URL for one search-result item.

    Args:
        product: Mapping describing a search-result item; its
            ``canonicalUrl`` value is appended to the site root.

    Returns:
        ``'https://www.walmart.com'`` followed by the canonical path with any
        query string (everything from the first ``?``) removed. When
        ``canonicalUrl`` is missing or ``None`` the bare site root is returned.
    """
    # ``or ''`` also covers an explicit None value, which a ``.get`` default
    # alone would let through and then fail on ``.split``.
    canonical_path = product.get('canonicalUrl') or ''
    return 'https://www.walmart.com' + canonical_path.split('?')[0]

def get_proxy_url(url, residential='false'):
    """Wrap ``url`` in a ScrapeOps proxy endpoint URL.

    The API key is read from the ``SCRAPEOPS_API_KEY`` environment variable
    at call time. ``residential`` is forwarded as given, and ``country`` is
    always ``us``.
    """
    query = urlencode({
        'api_key': os.environ.get('SCRAPEOPS_API_KEY'),
        'url': url,
        'residential': residential,
        'country': 'us',
    })
    return 'https://proxy.scrapeops.io/v1/?' + query

# Request headers carrying an iPad-style User-Agent string.
# NOTE(review): no requests.get call in the visible cells passes this dict;
# confirm whether it is still needed.
headers = {
    "User-Agent": "Mozilla/5.0 (iPad; CPU OS 12_2 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148",
}

def get_product_url_list(product_keyword: str, num_pages: int = 1, timeout: float = 120) -> list:
    """Collect product-page URLs from Walmart search results.

    Pages ``1..num_pages`` of the search for ``product_keyword`` are fetched
    through the ScrapeOps proxy (``residential='true'``), and product links
    are read from the first item stack in the page's ``__NEXT_DATA__`` JSON.

    Args:
        product_keyword: Search phrase.
        num_pages: Number of result pages to request, starting at page 1.
        timeout: Seconds to wait for each proxy response. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        Product URLs in the order encountered. Items that carry no
        ``canonicalUrl`` are skipped. A page that fails (non-200 status,
        missing JSON blob, or any exception) contributes nothing; the status
        code or error is printed and the loop moves on. Paging stops early
        when a page yields no product URLs.
    """
    product_url_list = []
    for page in range(1, num_pages + 1):
        try:
            walmart_search_url = create_walmart_product_search_page_url(product_keyword, page)
            print(f"scraping page {page} at url: {walmart_search_url}")
            response = requests.get(
                get_proxy_url(walmart_search_url, residential='true'),
                timeout=timeout,
            )
            if response.status_code != 200:
                print(response.status_code)
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            script_tag = soup.find("script", {"id": "__NEXT_DATA__"})
            if script_tag is None:
                continue

            json_blob = json.loads(script_tag.get_text())
            product_list = json_blob["props"]["pageProps"]["initialData"]["searchResult"]["itemStacks"][0]["items"]
            # Skip entries with no canonical path: they would otherwise become
            # the bare site root and be scraped later as if they were products.
            product_urls = [
                create_walmart_product_item_page_url(product)
                for product in product_list
                if product.get('canonicalUrl')
            ]
            product_url_list.extend(product_urls)
            if not product_urls:
                break
        except Exception as ex:
            # Best-effort scraping: report the failure and try the next page.
            print("Error: ", ex)

    return product_url_list

def get_review_data(review_item: dict, product_id) -> dict:
    """Flatten one raw review into the record shape used by this notebook.

    ``product_id`` is stored under ``productId``; every other field is copied
    from ``review_item`` with ``.get``, so absent fields come back as ``None``.
    """
    # (output key, key in the raw review) pairs, in output order after the ids.
    copied_fields = (
        ('rating', 'rating'),
        ('submissionDate', 'reviewSubmissionTime'),
        ('reviewTitle', 'reviewTitle'),
        ('reviewText', 'reviewText'),
        ('reviewAuthor', 'userNickname'),
    )
    review = {'id': review_item.get('reviewId'), 'productId': product_id}
    for out_key, raw_key in copied_fields:
        review[out_key] = review_item.get(raw_key)
    return review

def _dict_or_empty(value):
    """Return ``value`` unchanged unless it is ``None``; then return ``{}``."""
    return {} if value is None else value


def _build_full_description(description_data: dict) -> str:
    """Join the long description and any product highlights into one text block."""
    text = "Product Description: \n" + str(description_data.get('longDescription'))
    highlights = description_data.get('productHighlights')
    if highlights:
        text += "\nProduct Highlights: \n"
        for item in highlights:
            text += f"{item.get('name')}: {item.get('value')},\n"
    return text


def _parse_product_page(json_blob: dict, url: str):
    """Turn one product page's ``__NEXT_DATA__`` blob into ``(metadata, reviews)``."""
    page_data = json_blob["props"]["pageProps"]["initialData"]["data"]
    raw_product_data = page_data["product"]
    # Treat a missing or None reviews/description section as empty so the
    # product is still recorded instead of being dropped by the caller's
    # exception handler.
    raw_review_data = page_data.get("reviews") or {}
    raw_description_data = page_data.get("idml") or {}

    product_id = raw_product_data.get('id')
    image_info = _dict_or_empty(raw_product_data.get('imageInfo'))
    price_info = _dict_or_empty(raw_product_data.get('priceInfo'))
    current_price = _dict_or_empty(price_info.get('currentPrice'))
    was_price = _dict_or_empty(price_info.get('wasPrice'))
    return_policy = _dict_or_empty(raw_product_data.get('returnPolicy'))
    return_window = _dict_or_empty(return_policy.get('returnWindow'))

    product_metadata = {
        'id': product_id,
        'url': url,
        'type': raw_product_data.get('type'),
        'name': raw_product_data.get('name'),
        'brand': raw_product_data.get('brand'),
        'manufacturerName': raw_product_data.get('manufacturerName'),
        'sellerName': raw_product_data.get('sellerName'),
        'sellerAverageRating': raw_product_data.get('sellerAverageRating'),
        'thumbnailUrl': image_info.get('thumbnailUrl'),
        'currentPrice': current_price.get('price'),
        'printPrice': was_price.get('price'),
        # Prefer the was-price unit (original behaviour); fall back to the
        # current-price entry when there is no was-price.
        # NOTE(review): assumes currentPrice may also carry 'currencyUnit' - confirm.
        'currencyUnit': was_price.get('currencyUnit') or current_price.get('currencyUnit'),
        'giftingEligibility': raw_product_data.get('giftingEligibility'),
        'returnable': return_policy.get('returnable'),
        'returnWindow': return_window.get('value'),
        'returnPolicy': return_policy.get('returnPolicyText'),
    }

    reviews = [
        get_review_data(item, product_id)
        for item in raw_review_data.get('customerReviews') or []
    ]
    # 'topPostiveReviewId' (sic) is kept verbatim: it is part of the emitted
    # record's key set and renaming it would break existing consumers.
    highlighted_reviews = (
        ('topPositiveReview', 'topPostiveReviewId'),
        ('topNegativeReview', 'topNegativeReviewId'),
    )
    for source_key, id_key in highlighted_reviews:
        top_review = raw_review_data.get(source_key)
        if top_review is not None:
            reviews.append(get_review_data(top_review, product_id))
            product_metadata[id_key] = top_review.get('reviewId')
        else:
            product_metadata[id_key] = None

    product_metadata.update({
        'averageOverallRating': raw_review_data.get('averageOverallRating'),
        'ratingValueFiveCount': raw_review_data.get('ratingValueFiveCount'),
        'ratingValueFourCount': raw_review_data.get('ratingValueFourCount'),
        'ratingValueThreeCount': raw_review_data.get('ratingValueThreeCount'),
        'ratingValueTwoCount': raw_review_data.get('ratingValueTwoCount'),
        'ratingValueOneCount': raw_review_data.get('ratingValueOneCount'),
        'totalReviewCount': raw_review_data.get('totalReviewCount'),
        'shortDescription': raw_description_data.get('shortDescription'),
        'fullDescription': _build_full_description(raw_description_data),
    })
    return product_metadata, reviews


def get_product_item_data(items_url_list: list, num_items: int = 3, timeout: float = 120):
    """Scrape product metadata and reviews for the first ``num_items`` URLs.

    Each URL is fetched through the ScrapeOps proxy (``residential='false'``)
    and parsed from the page's ``__NEXT_DATA__`` JSON blob.

    Args:
        items_url_list: Product page URLs.
        num_items: How many URLs from the front of the list to scrape.
        timeout: Seconds to wait for each proxy response. Without a timeout
            ``requests.get`` can block indefinitely on a stalled connection.

    Returns:
        ``(product_data_list, review_data_list)``: one metadata dict per
        successfully parsed product, and the review dicts gathered from those
        products. A URL that fails (non-200 status, missing JSON blob, or any
        exception) adds nothing to either list; the status code or error is
        printed and the loop moves on.
    """
    product_data_list = []
    review_data_list = []
    for url in items_url_list[:num_items]:
        try:
            print(f"scraping product item data from url: {url}")
            response = requests.get(get_proxy_url(url, residential='false'), timeout=timeout)
            if response.status_code != 200:
                # Skip this URL. Parsing anyway would either hit an undefined
                # variable or reuse the previous product's HTML under this URL.
                print(response.status_code)
                continue

            soup = BeautifulSoup(response.text, "html.parser")
            script_tag = soup.find("script", {"id": "__NEXT_DATA__"})
            if script_tag is None:
                continue

            json_blob = json.loads(script_tag.get_text())
            product_metadata, product_reviews = _parse_product_page(json_blob, url)
            # Append only after the whole page has parsed, so a failure cannot
            # leave reviews behind without their product record.
            product_data_list.append(product_metadata)
            review_data_list.extend(product_reviews)
        except Exception as ex:
            # Best-effort scraping: report the failure and try the next URL.
            print('Error: ', ex)

    return product_data_list, review_data_list



In [17]:
# Scrape page 1 of the "iphone 13" search results into a list of product URLs.
x = get_product_url_list("iphone 13", 1)

scraping page 1 at url: https://www.walmart.com/search?q=iphone+13&sort=best_seller&page=1&affinityOverride=default


In [75]:
# Hard-coded list of 25 product URLs. This assignment replaces whatever the
# scraping cell stored in `x`, so later cells run against this fixed list.
# NOTE(review): presumably a saved copy of an earlier scrape result - confirm.
x = ['https://www.walmart.com/ip/AT-T-iPhone-13-128GB-Midnight/549396996',
 'https://www.walmart.com/ip/Verizon-iPhone-13-128GB-Midnight/772471610',
 'https://www.walmart.com/ip/AT-T-iPhone-13-Pro-256GB-Graphite/657377016',
 'https://www.walmart.com/ip/Verizon-iPhone-13-Pro-256GB-Graphite/778945307',
 'https://www.walmart.com/ip/Straight-Talk-Apple-iPhone-13-Pro-Max-128GB-Graphite-Prepaid-Smartphone-Locked-to-Straight-Talk/889062044',
 'https://www.walmart.com/ip/Walmart-Family-Mobile-Apple-iPhone-13-128GB-Midnight-Prepaid-Smartphone-Locked-to-Walmart-Family-Mobile/130350152',
 'https://www.walmart.com/ip/Simple-Mobile-Apple-iPhone-13-128GB-Midnight-Prepaid-Smartphone/829730484',
 'https://www.walmart.com/ip/Restored-Apple-iPhone-13-128-GB-Fully-Unlocked-Midnight-Refurbished/352285146',
 'https://www.walmart.com/ip/Total-By-Verizon-Apple-iPhone-13-128GB-Midnight-Prepaid-Smartphone-Locked-to-Total-by-Verizon/211654357',
 'https://www.walmart.com/ip/Straight-Talk-Apple-iPhone-13-Pro-128GB-Graphite-Prepaid-Smartphone-Locked-to-Straight-Talk/253784511',
 'https://www.walmart.com/ip/Restored-Apple-iPhone-13-Mini-128GB-Fully-Unlocked-Blue-Refurbished/558978516',
 'https://www.walmart.com/ip/Restored-Apple-iPhone-13-Mini-128GB-Midnight-Unlocked-Refurbished/141543487',
 'https://www.walmart.com/ip/Straight-Talk-Apple-iPhone-13-Pro-128GB-Gold-Prepaid-Smartphone-Locked-to-Straight-Talk/556756614',
 'https://www.walmart.com/ip/Restored-Apple-iPhone-13-Pro-Graphite-Unlocked-Refurbished/780230612',
 'https://www.walmart.com/ip/Restored-Apple-iPhone-13-128GB-Unlocked-Refurbished/1227157204',
 'https://www.walmart.com/ip/Restored-Apple-iPhone-13-Pro-Max-128GB-Gold-Unlocked-Refurbished/1326632182',
 'https://www.walmart.com/ip/Walmart-Family-Mobile-Apple-iPhone-13-Pro-Max-128GB-Graphite-Prepaid-Smartphone-Locked-to-Walmart-Family-Mobile/591387888',
 'https://www.walmart.com/ip/Restored-Apple-iPhone-13-Pro-256GB-Graphite-Unlocked-Refurbished/1945491001',
 'https://www.walmart.com/ip/Total-By-Verizon-Apple-iPhone-13-Pro-Max-128GB-Graphite-Prepaid-Smartphone-Locked-to-Total-by-Verizon/866139391',
 'https://www.walmart.com/ip/Straight-Talk-Apple-iPhone-13-128GB-Midnight-Prepaid-Smartphone-Locked-to-Straight-Talk/454408250',
 'https://www.walmart.com/ip/Pre-Owned-Apple-iPhone-13-Pro-Max-128GB-Sierra-Blue-Unlocked-Excellent-Condition/1711181226',
 'https://www.walmart.com/ip/RESTORED-IPHONE-13-PRO-MAX-UNLOCKED-REFURBISHED-GRADE-A-128-GB-ALPINE-GREEN/1309418111',
 'https://www.walmart.com/ip/Straight-Talk-Apple-iPhone-13-128GB-Blue-Prepaid-Smartphone-Locked-to-Straight-Talk/881595680',
 'https://www.walmart.com/ip/Simple-Mobile-Apple-iPhone-13-Pro-Max-128GB-Graphite-Prepaid-Smartphone-Locked-to-Simple-Mobile/647338100',
 'https://www.walmart.com/ip/AT-T-iPhone-13-mini-128GB-Blue/634815121']
x

['https://www.walmart.com/ip/AT-T-iPhone-13-128GB-Midnight/549396996',
 'https://www.walmart.com/ip/Verizon-iPhone-13-128GB-Midnight/772471610',
 'https://www.walmart.com/ip/AT-T-iPhone-13-Pro-256GB-Graphite/657377016',
 'https://www.walmart.com/ip/Verizon-iPhone-13-Pro-256GB-Graphite/778945307',
 'https://www.walmart.com/ip/Straight-Talk-Apple-iPhone-13-Pro-Max-128GB-Graphite-Prepaid-Smartphone-Locked-to-Straight-Talk/889062044',
 'https://www.walmart.com/ip/Walmart-Family-Mobile-Apple-iPhone-13-128GB-Midnight-Prepaid-Smartphone-Locked-to-Walmart-Family-Mobile/130350152',
 'https://www.walmart.com/ip/Simple-Mobile-Apple-iPhone-13-128GB-Midnight-Prepaid-Smartphone/829730484',
 'https://www.walmart.com/ip/Restored-Apple-iPhone-13-128-GB-Fully-Unlocked-Midnight-Refurbished/352285146',
 'https://www.walmart.com/ip/Total-By-Verizon-Apple-iPhone-13-128GB-Midnight-Prepaid-Smartphone-Locked-to-Total-by-Verizon/211654357',
 'https://www.walmart.com/ip/Straight-Talk-Apple-iPhone-13-Pro-128GB-Gr

In [70]:
# Single product URL kept in `y`.
# NOTE(review): `y` is not passed to any function in the visible cells;
# confirm whether it is still needed.
y = ["https://www.walmart.com/ip/Barbie-Color-Reveal-Scented-Sweet-Fruit-Fashion-Doll-with-Accessories-Color-Change-Styles-May-Vary/1618685574"]
y

['https://www.walmart.com/ip/Barbie-Color-Reveal-Scented-Sweet-Fruit-Fashion-Doll-with-Accessories-Color-Change-Styles-May-Vary/1618685574']

In [76]:
# Scrape the first 5 URLs in `x`: `a` receives the product records and `b`
# the review records returned by get_product_item_data.
a, b = get_product_item_data(x, 5)

scraping product item data from url: https://www.walmart.com/ip/AT-T-iPhone-13-128GB-Midnight/549396996
scraping product item data from url: https://www.walmart.com/ip/Verizon-iPhone-13-128GB-Midnight/772471610
scraping product item data from url: https://www.walmart.com/ip/AT-T-iPhone-13-Pro-256GB-Graphite/657377016
scraping product item data from url: https://www.walmart.com/ip/Verizon-iPhone-13-Pro-256GB-Graphite/778945307
scraping product item data from url: https://www.walmart.com/ip/Straight-Talk-Apple-iPhone-13-Pro-Max-128GB-Graphite-Prepaid-Smartphone-Locked-to-Straight-Talk/889062044


In [77]:
a

[{'id': '2OU1ULRE8ESG',
  'url': 'https://www.walmart.com/ip/AT-T-iPhone-13-128GB-Midnight/549396996',
  'type': 'Cell Phones',
  'name': 'AT&T iPhone 13 128GB Midnight',
  'brand': 'Apple',
  'manufacturerName': None,
  'sellerName': 'Walmart.com',
  'sellerAverageRating': None,
  'thumbnailUrl': 'https://i5.walmartimages.com/seo/AT-T-iPhone-13-128GB-Midnight_05c6c9f4-df18-4a14-b607-e0f5c69c2544.88a057d63437c13a21504856f1b872d0.jpeg',
  'currentPrice': None,
  'printPrice': 797.87,
  'currencyUnit': 'USD',
  'giftingEligibility': False,
  'returnable': True,
  'returnWindow': 14,
  'returnPolicy': 'Free 14-day returns',
  'topPostiveReviewId': None,
  'topNegativeReviewId': None,
  'averageOverallRating': 3.8333,
  'ratingValueFiveCount': 3,
  'ratingValueFourCount': 1,
  'ratingValueThreeCount': 1,
  'ratingValueTwoCount': 0,
  'ratingValueOneCount': 1,
  'totalReviewCount': 6,
  'shortDescription': 'iPhone 13. The most advanced dual-camera system ever on iPhone. Lightning-fast A15 B

In [81]:
b

[{'id': '270497516',
  'productId': '2OU1ULRE8ESG',
  'rating': 5,
  'submissionDate': '10/14/2021',
  'reviewTitle': 'how iphones are better',
  'reviewText': 'IPhones are the best phones on the market compared to any other cell phone brand. This is because IPhones have a better touch ID. And a much better face id. The  Iphone is also at less risk of downloading apps with malware. The Iphones have more space than androids because they can hold more photos and videos and if you do run out of space you can buy more. It is expensive though. Critics may say that Iphones are not customizable like androids and it is true but iphones have better tech things and more useful apps',
  'reviewAuthor': 'Adrian'},
 {'id': '272820743',
  'productId': '2OU1ULRE8ESG',
  'rating': 5,
  'submissionDate': '12/9/2021',
  'reviewTitle': None,
  'reviewText': 'Got it before shipping confirmation date and all good.',
  'reviewAuthor': 'Shreyans'},
 {'id': '294901627',
  'productId': '2OU1ULRE8ESG',
  'ratin

In [79]:
len(b)

35

In [80]:
len(a)

5