# Insert Taxonomy Data as Collection

In [1]:
import requests, json, time
from Crypto.Hash import SHA256
from Crypto.PublicKey import RSA
from Crypto.Signature import PKCS1_v1_5
import base64
from urllib.parse import urljoin

# --- Walmart I/O request parameters for the taxonomy call -----------------
consumerId = '7915ddea-aa6c-405d-a2ed-6d2dc09836f9'
# Current time in milliseconds since the Unix epoch, as a string. It is
# captured once when this cell runs and becomes part of the signed payload.
epoxTime = str(int(time.time()*1000))
keyVersion = '1'

rootURL = "https://developer.api.walmart.com"
URL = "/api-proxy/service/affil/product/v2/taxonomy"

# Header values that take part in the signature.
hashDict = {
    'WM_CONSUMER.ID': consumerId,
    'WM_CONSUMER.INTIMESTAMP': epoxTime,
    'WM_SEC.KEY_VERSION': keyVersion,
}
# Payload to sign: the header values taken in sorted header-name order
# (ID, INTIMESTAMP, KEY_VERSION), each one followed by a newline.
sortedHashString = ''.join(hashDict[name] + '\n' for name in sorted(hashDict))
encodedHashString = sortedHashString.encode()

# Load the RSA private key used to sign the payload. A missing or unreadable
# key file is deliberately allowed to raise here: printing the error and
# carrying on would leave `key` undefined (or stale from an earlier run) and
# only fail later, in the signing cell, with a less helpful error.
with open('../WM_IO_private_key.pem', 'r') as f:
    key = RSA.importKey(f.read())

In [2]:
# Sign the request: hash the encoded header string with SHA-256, sign that
# digest with the RSA private key (PKCS#1 v1.5), then base64-encode the raw
# signature so it can be sent as a header value.
hasher = SHA256.new(encodedHashString)
signer = PKCS1_v1_5.new(key)
signature = signer.sign(hasher)
signature_enc = base64.b64encode(signature).decode('utf-8')

# Authentication headers: the three signed values plus the signature itself.
headers = {
    'WM_CONSUMER.ID': consumerId,
    'WM_CONSUMER.INTIMESTAMP': epoxTime,
    'WM_SEC.AUTH_SIGNATURE': signature_enc,
    'WM_SEC.KEY_VERSION': keyVersion,
}

# Issue the GET request against rootURL + URL; the response stays in `r`
# for the cells that follow.
endpoint = urljoin(rootURL, URL)
r = requests.get(endpoint, headers=headers)

In [3]:
import pymongo

In [4]:
import os

# MongoDB connection string. Set the MONGODB_URI environment variable to
# supply credentials without editing the notebook.
# SECURITY / FIXME: the fallback below embeds a username and password in the
# notebook. It is kept only so existing runs keep working; rotate that
# password and remove the literal once MONGODB_URI is configured.
MONGODB_URI = os.environ.get("MONGODB_URI") or "mongodb+srv://DSCI551:Dsci551@cluster0.ydii8.mongodb.net/Walmart?retryWrites=true&w=majority"

client = pymongo.MongoClient(MONGODB_URI)
# Handle to the "Walmart" database used by every insert in this notebook.
db = client.Walmart

In [5]:
# Store each category from the taxonomy response as one document in the
# `taxonomy` collection.
# NOTE(review): re-running this cell inserts the same categories again.
collection = db['taxonomy']
taxonomy_categories = r.json()['categories']
insert_taxonomy = collection.insert_many(taxonomy_categories)

In [6]:
# Sanity check: confirm the "Walmart" database is now listed on the cluster.
if "Walmart" in client.list_database_names():
    print("You successfully created Walmart database.")
else:
    # The message names the database that was actually checked (it previously
    # said "test database", which did not match the condition above).
    print("Walmart database was not created.")

You successfully created Walmart database.


# Insert Paginated Data as Collection

## Request the first page

In [7]:
import requests, json, time
from Crypto.Hash import SHA256
from Crypto.PublicKey import RSA
from Crypto.Signature import PKCS1_v1_5
import base64
from urllib.parse import urljoin

# --- Walmart I/O request parameters for the paginated item call -----------
consumerId = '7915ddea-aa6c-405d-a2ed-6d2dc09836f9'
# Fresh timestamp (milliseconds since the Unix epoch, as a string); it is
# part of the signed payload built below.
epoxTime = str(int(time.time()*1000))
keyVersion = '1'

# Header values that take part in the signature.
hashDict = {
    'WM_CONSUMER.ID': consumerId,
    'WM_CONSUMER.INTIMESTAMP': epoxTime,
    'WM_SEC.KEY_VERSION': keyVersion,
}
# Payload to sign: the header values taken in sorted header-name order
# (ID, INTIMESTAMP, KEY_VERSION), each one followed by a newline.
sortedHashString = ''.join(hashDict[name] + '\n' for name in sorted(hashDict))
encodedHashString = sortedHashString.encode()

# Load the RSA private key used for signing. Errors are allowed to propagate
# instead of being printed and ignored: if this section is run on its own,
# a swallowed error would leave `key` undefined and the signing cell would
# fail later with a NameError that hides the real cause.
with open('../WM_IO_private_key.pem', 'r') as f:
    key = RSA.importKey(f.read())

In [8]:
# Endpoint for the paginated item listing. The query string below carries a
# lastDoc / remainingHits cursor -- presumably copied from the `nextPage`
# value of an earlier response in order to resume a crawl (verify before
# reuse). To start from the very first page, use the bare path instead:
# URL = "/api-proxy/service/affil/product/v2/paginated/items"
URL = (
    "/api-proxy/service/affil/product/v2/paginated/items"
    "?count=200&lastDoc=10314200&remainingHits=180236124"
)
rootURL = "https://developer.api.walmart.com"

In [9]:
# Build the signature for this request: SHA-256 digest of the encoded header
# string, signed with the RSA private key using PKCS#1 v1.5.
hasher = SHA256.new(encodedHashString)
signer = PKCS1_v1_5.new(key)
signature = signer.sign(hasher)

# Base64 text form of the signature, suitable for an HTTP header.
signature_enc = base64.b64encode(signature).decode('utf-8')

headers = {
    'WM_CONSUMER.ID': consumerId,
    'WM_CONSUMER.INTIMESTAMP': epoxTime,
    'WM_SEC.AUTH_SIGNATURE': signature_enc,
    'WM_SEC.KEY_VERSION': keyVersion,
}

# Request the page addressed by rootURL + URL; the pagination loop further
# down starts from this response.
first_page_url = urljoin(rootURL, URL)
r = requests.get(first_page_url, headers=headers)

In [10]:
# Display the first item of the fetched page to see which fields a product
# record carries.
first_page_items = r.json()['items']
first_page_items[0]

{'itemId': 10314202,
 'parentItemId': 10314202,
 'name': "TOM'S OF MAINE Fluoride Whitening Toothpaste Fresh Mint 24 PC",
 'salePrice': 10.95,
 'upc': '071249015681',
 'categoryPath': 'Home Page/Beauty/Makeup/Lip',
 'shortDescription': "TOM'S OF MAINE Fluoride Whitening Toothpaste Fresh Mint 24 PC",
 'longDescription': 'Lp Generic Loreal Endless Kissable Lipstick',
 'brandName': "Tom's of Maine",
 'thumbnailImage': 'https://i5.walmartimages.com/asr/21af6062-81b0-43b7-ba5a-84e0ebccc99b_1.85371883edfd64a4f32fa54478e322c4.jpeg?odnHeight=100&odnWidth=100&odnBg=ffffff',
 'mediumImage': 'https://i5.walmartimages.com/asr/21af6062-81b0-43b7-ba5a-84e0ebccc99b_1.85371883edfd64a4f32fa54478e322c4.jpeg?odnHeight=180&odnWidth=180&odnBg=ffffff',
 'largeImage': 'https://i5.walmartimages.com/asr/21af6062-81b0-43b7-ba5a-84e0ebccc99b_1.85371883edfd64a4f32fa54478e322c4.jpeg?odnHeight=450&odnWidth=450&odnBg=ffffff',
 'productTrackingUrl': 'https://goto.walmart.com/c/|PUBID|/568844/9383?veh=aff&sourceid=imp

## Iterate through pages and store them into the database

In [None]:
import requests, json, time, pandas as pd
from Crypto.Hash import SHA256
from Crypto.PublicKey import RSA
from Crypto.Signature import PKCS1_v1_5
import base64
from urllib.parse import urljoin
import sys

from pyspark import SparkConf, SparkContext
from pyspark import SQLContext
import json

# Spark configuration with the master set to "local[*]".
conf = SparkConf()
conf.setMaster("local[*]")

# Reuse the running SparkContext if there is one, otherwise create it.
sc = SparkContext.getOrCreate(conf)
sc.setLogLevel("ERROR")

# Page counter; the pagination loop below increments it once per request.
i = 0

In [None]:
# Walk the paginated item listing: each loop iteration requests one page,
# keeps only the items that are in stock and available online, and stores
# them in the `products` collection.
#
# NOTE(review): the items of the page already held in `r` when this cell
# starts are never inserted -- the loop requests `nextPage` before storing
# anything. Confirm whether that page was stored by an earlier run.

# ------ loop-invariant setup (done once, not once per page) ------
consumerId = '7915ddea-aa6c-405d-a2ed-6d2dc09836f9'
keyVersion = '1'
rootURL = "https://developer.api.walmart.com"
MAX_PAGES = 6000          # stop once the page counter `i` exceeds this
REQUEST_TIMEOUT = 60      # seconds; without a timeout a stalled request hangs forever

# Re-read the signing key once. If the file cannot be read, report it and keep
# using the `key` loaded by an earlier cell (which must exist, since `r` was
# obtained with it).
try:
    with open('../WM_IO_private_key.pem', 'r') as f:
        key = RSA.importKey(f.read())
except IOError as e:
    print(e)

collection = db.products

# Parse the current response once; `page` always mirrors `r.json()`.
page = r.json()
while page['nextPageExist']:
    i += 1
    sys.stdout.write('\r'+str(i))

    # ------ API access BEGIN --------
    # The signature has to be rebuilt for every request because the API
    # needs a current timestamp in the signed payload.
    epoxTime = str(int(time.time()*1000))

    hashDict = { 'WM_CONSUMER.ID' : consumerId,
                'WM_CONSUMER.INTIMESTAMP' : epoxTime,
                'WM_SEC.KEY_VERSION' : keyVersion
                }
    sortedHashString = hashDict['WM_CONSUMER.ID'] +'\n'+ hashDict['WM_CONSUMER.INTIMESTAMP'] +'\n'+ hashDict['WM_SEC.KEY_VERSION']+'\n'
    encodedHashString = sortedHashString.encode()

    # Relative URL of the next page, as returned by the previous response.
    URL = page['nextPage']

    hasher = SHA256.new(encodedHashString)
    signer = PKCS1_v1_5.new(key)
    signature = signer.sign(hasher)

    signature_enc = str(base64.b64encode(signature),'utf-8')

    headers = { 'WM_CONSUMER.ID' : consumerId,
                'WM_CONSUMER.INTIMESTAMP' : epoxTime,
                'WM_SEC.AUTH_SIGNATURE' : signature_enc,
                'WM_SEC.KEY_VERSION' : keyVersion
                }

    r = requests.get(urljoin(rootURL, URL), headers=headers, timeout=REQUEST_TIMEOUT)
    # Surface HTTP errors here instead of as a KeyError on the JSON body.
    r.raise_for_status()
    page = r.json()
    # ------ API access END --------

    # Keep only items that are in stock and purchasable online.
    inputRDD = sc.parallelize(page['items']) \
                    .filter(lambda x: x.get('stock') == "Available"
                                      and x.get('availableOnline') == True) \
                    .collect()

    # insert_many rejects an empty list, so only insert when something is left.
    if inputRDD:
        insert_page_items = collection.insert_many(inputRDD)

    # Page cap; checked on every iteration, including pages that had nothing
    # to insert.
    if i > MAX_PAGES:
        break

147

In [13]:
# Display the JSON body of the most recent response held in `r`.
# NOTE(review): this dumps the whole page; its `nextPage` value is presumably
# what gets copied into URL to resume the crawl -- confirm.
r.json()

{'nextPage': '/api-proxy/service/affil/product/v2/paginated/items?count=200&lastDoc=22099601&remainingHits=179035724',
 'format': 'json',
 'totalPages': 792754,
 'nextPageExist': True,
 'items': [{'itemId': 22097965,
   'parentItemId': 22097924,
   'name': 'Berkley Trilene XT Monofilament Filler Spools',
   'salePrice': 21.39,
   'upc': '028632623107',
   'categoryPath': 'Home Page/Sports & Outdoors/Outdoor Sports/Fishing/Fishing Tackle/Fishing Line',
   'shortDescription': 'Berkley Trilene XT Fishing Line:Tough against all rough or sharp objectsSuperior strength to fight fish in heavy coverManageable for flipping, pitching or castingMaterial: nylon',
   'longDescription': "Built to deliver the super strength and durability needed when fishing around rocky ledges, heavy weeds, docks and rough brush piles, this abrasion-resistant Trilene XT Extra Tough monofilament line excels in heavy cover. An excellent high-performing, low-memory line that casts very well and won't put a large dent i