In [88]:
import requests
from bs4 import BeautifulSoup as bs
from datetime import datetime
import re
import pickle

In [89]:
class AmazonDataFetcher:
    """Scrape product details and customer reviews from an Amazon storefront.

    Parameters
    ----------
    session : object exposing ``get(url)`` whose result has a ``content``
        attribute (e.g. a ``requests.Session``).
    BASE_URL : str
        Storefront root without trailing slash, e.g. ``"https://amazon.ca"``.
    default_parser : str
        Parser name handed to BeautifulSoup for every page (e.g. ``"lxml"``).
    """

    def __init__(self, session, BASE_URL, default_parser):
        self.session = session
        self.BASE_URL = BASE_URL
        self.DEFAULT_PARSER = default_parser
        self.PRODUCT_REVIEWS = "%s/product-reviews" % self.BASE_URL
        self.PRODUCT_PAGE = "%s/dp" % (self.BASE_URL)
        # Commas are accepted so that a thousands-separated amount such as
        # "CDN$ 1,299.99" is captured whole instead of stopping at "1".
        self.price_pat = r'CDN\$ ([0-9.,]+)'

    def convert_price(self, price_string):
        """Return the numeric part of a ``"CDN$ <amount>"`` string, as a string.

        Thousands separators are removed (``"CDN$ 1,299.99"`` -> ``"1299.99"``).

        Raises
        ------
        IndexError
            If ``price_string`` contains no ``CDN$`` amount.
        """
        return re.findall(self.price_pat, price_string)[0].replace(",", "")

    def parse_product_page(self, page_content):
        """Extract ``price`` and ``name`` from a product detail page.

        Raises ``AttributeError`` when the ``priceblock_ourprice`` or
        ``productTitle`` span is not present in the page.
        """
        souped = bs(page_content, self.DEFAULT_PARSER)
        return {
            "price": self.convert_price(souped.find("span", {"id": "priceblock_ourprice"}).text),
            "name": souped.find("span", {"id": "productTitle"}).text.strip()
        }

    def fetch_product_page(self, URL):
        """Download ``URL`` and return its parsed product details."""
        req = self.session.get(URL)
        return self.parse_product_page(req.content)

    def fetch_product_page_from_id(self, product_id):
        """Fetch product details for ``product_id`` via the ``/dp/<id>`` page."""
        return self.fetch_product_page("%s/%s" % (self.PRODUCT_PAGE, product_id,))

    def convert_date(self, date_string):
        """Parse a review date of the form ``"on August 17, 2016"``.

        Surrounding whitespace is ignored. Raises ``ValueError`` when the text
        does not match the expected format.
        """
        return datetime.strptime(date_string.strip(), "on %B %d, %Y")

    def parse_review_box(self, review_box):
        """Turn one review element into a dict of date, body and author link."""
        return {
            "review_date": self.convert_date(review_box.find("span", {"data-hook": "review-date"}).text),
            "review_body": review_box.find("span", {"data-hook": "review-body"}).text,
            "review_author": review_box.find("a", {"data-hook": "review-author"})["href"]
        }

    def parse_page_content(self, page_content):
        """Parse one page of reviews.

        Returns
        -------
        dict
            ``{"reviews": [...], "next_page": href or None}``. ``next_page`` is
            ``None`` when the page has no "next" pagination link, and
            ``reviews`` is empty when the review list container is missing.
        """
        souped = bs(page_content, self.DEFAULT_PARSER)

        review_list = souped.find("div", {"id": "cm_cr-review_list"})
        if review_list is None:
            review_boxes = []
        else:
            review_boxes = review_list.find_all("div", {"class": "review"})

        # find() returns None when the pagination item is absent, so it must be
        # checked before looking for the link nested inside it.
        last_item = souped.find("li", {"class": "a-last"})
        next_button = last_item.find("a") if last_item is not None else None

        return {
            "reviews": [self.parse_review_box(x) for x in review_boxes],
            "next_page": next_button["href"] if next_button else None
        }

    def fetch_reviews_from_id(self, product_id):
        """Fetch and parse the first review page of ``product_id``."""
        return self.fetch_reviews_on_page("%s/%s" % (self.PRODUCT_REVIEWS, product_id,))

    def fetch_reviews_on_page(self, URL):
        """Download ``URL`` and parse it as a single review page."""
        req = self.session.get(URL)
        return self.parse_page_content(req.content)

    def fetch_all_reviews_from_id(self, product_id, max_pages = -1):
        """Fetch reviews for ``product_id``, following pagination.

        ``max_pages`` is forwarded to :meth:`fetch_all_reviews`.
        """
        return self.fetch_all_reviews(
            "%s/%s" % (self.PRODUCT_REVIEWS, product_id),
            max_pages=max_pages
        )

    def fetch_all_reviews(self, URL, max_pages=-1):
        """Collect reviews starting at ``URL`` and following "next" links.

        Parameters
        ----------
        URL : str
            Absolute URL of the first review page.
        max_pages : int
            Stop after this many pages. The default ``-1`` never equals the
            page counter, so every page is followed; ``0`` fetches nothing.

        Returns
        -------
        dict
            ``{"reviews": [...]}`` with the reviews of all visited pages.
        """
        url = URL
        reviews = []
        pages = 0
        while (url and pages != max_pages):
            pages += 1
            data = self.fetch_reviews_on_page(url)
            reviews += data["reviews"]
            if (not data["next_page"]):
                break

            # next_page is a site-relative href, so prefix the storefront root.
            url = "%s%s" % (self.BASE_URL,data["next_page"],)
        return {
            "reviews": reviews
        }

    def fetch_all_data_from_id(self, product_id):
        """Return every review plus the product details for ``product_id``.

        Note that ``"reviews"`` holds the dict returned by
        :meth:`fetch_all_reviews_from_id`, i.e. ``{"reviews": [...]}``.
        """
        return {
            "reviews": self.fetch_all_reviews_from_id(product_id),
            "details": self.fetch_product_page_from_id(product_id)
        }

In [90]:
class WalmartDataFetcher:
    """Thin client for the store, availability and search endpoints under BASE_URL.

    ``session`` only needs a ``get(url, params=...)`` method whose result
    offers ``json()``.
    """

    def __init__(self, session):
        self.session = session
        self.BASE_URL = "https://stock.ga/wm"
        self.NEAREST_STORE_URL = "%s/nearest-store.php" % (self.BASE_URL)
        self.AVAILABILITY_URL = "%s/availability.php" % (self.BASE_URL)
        self.SEARCH_URL = "%s/search.php" % (self.BASE_URL)

    def fetch_availability(self, upc, storeId):
        """Return the decoded availability response for ``upc`` at ``storeId``."""
        query = {
            "storeId": storeId,
            "upc": upc,
            "src": "upc"
        }
        response = self.session.get(self.AVAILABILITY_URL, params=query)
        return response.json()

    def fetch_availability_multi_store(self, upc, stores):
        """Look up ``upc`` in each store; returns one ``{store, details}`` per store."""
        availability = []
        for store in stores:
            details = self.fetch_availability(upc, store.get("storeId"))
            availability.append({"store": store, "details": details})
        return availability

    def fetch_nearest_store(self, lon, lat, numstores=1):
        """Return the decoded nearest-store response for the given coordinates."""
        query = {
            "numstores": numstores,
            "lattitude": lat,
            "longitude": lon,
            "showpickuplocation": "false",
            "mystore": ""
        }
        response = self.session.get(self.NEAREST_STORE_URL, params=query)
        return response.json()

    def fetch_all_nearest_stores(self, coordinates):
        """Run one nearest-store lookup per ``{"lon": ..., "lat": ...}`` entry."""
        lookups = []
        for point in coordinates:
            lookups.append(self.fetch_nearest_store(point["lon"], point["lat"]))
        return lookups

    def parse_search_results(self, search_results):
        """Return the ``sku`` of the first entry under ``"result"``."""
        first_hit = search_results.get("result")[0]
        return first_hit.get("sku")

    def fetch_search_and_data(self, query, stores):
        """Search for ``query`` and check its first hit's availability in ``stores``."""
        search_payload = self.fetch_search_results(query)
        sku = self.parse_search_results(search_payload)
        return self.fetch_availability_multi_store(sku, stores)

    def fetch_search_results(self, query):
        """Return the decoded search response for ``query``."""
        search_params = {
            "q": query,
            "n": "60",
            "p": "1"
        }
        response = self.session.get(self.SEARCH_URL, params=search_params)
        return response.json()

#### Parsing text into proper format

In [91]:
# Every location in `data` spans two lines:
#   "<place and postal code>, Canada"
#   "Latitude: <lat> | Longitude: <lon>"
# The three groups capture the place text, the latitude and the longitude.
pat = r'(.*), Canada\nLatitude: ([0-9.\-]+) \| Longitude: ([0-9.\-]+)'
data = '''
Winnipeg, MB R3B 1B9, Canada
Latitude: 49.899861 | Longitude: -97.139199
Regina, SK S4P 3C8, Canada
Latitude: 50.451903 | Longitude: -104.614313
Toronto, ON M5H 2N2, Canada
Latitude: 43.652549 | Longitude: -79.383512
Montreal, QC H2Y 1C6, Canada
Latitude: 45.508848 | Longitude: -73.554642
Halifax, NS B3J 3A5, Canada
Latitude: 44.64747 | Longitude: -63.572177
Fredericton, NB E3B 1B5, Canada
Latitude: 45.963394 | Longitude: -66.643218
St. John’s, NL A1C 1J3, Canada
Latitude: 47.560938 | Longitude: -52.713105
Charlottetown, PE C1A 4B7, Canada
Latitude: 46.235603 | Longitude: -63.12992
Edmonton, AB T5J 3E9, Canada
Latitude: 53.544979 | Longitude: -113.49349
Vancouver, BC V5Y 1V4, Canada
Latitude: 49.261057 | Longitude: -123.114077
'''

# One (place, latitude, longitude) tuple of strings per location record.
stores = re.compile(pat).findall(data)

In [92]:
# The `requests` module itself stands in for a session here: the fetcher only
# ever calls `.get(url, params=...)` on whatever it is given.
walmartDataFetcher = WalmartDataFetcher(session=requests)

In [93]:
def _to_store_record(match):
    """Label one (location, latitude, longitude) regex match as a dict."""
    return {
        "location": match[0],
        "lat": match[1],
        "lon": match[2]
    }


# Replace the raw regex tuples with keyed records for the store lookups below.
stores = [_to_store_record(match) for match in stores]

In [94]:
# One nearest-store request per coordinate record; responses keep input order.
all_store_details = walmartDataFetcher.fetch_all_nearest_stores(coordinates=stores)

### Sanitize data

In [95]:
# Each lookup requested a single store, so keep the first entry of its "stores" list.
all_store_details = [lookup.get("stores")[0] for lookup in all_store_details]

In [102]:
class Main:
    """Combine Amazon product data with Walmart availability across stores.

    Parameters
    ----------
    all_stores : list
        Store records handed to the Walmart fetcher for every availability
        lookup; each is expected to provide ``.get("storeId")``.
    """

    def __init__(self, all_stores):
        # A single session (with a browser-like user agent) is shared by both fetchers.
        self.session = requests.Session()
        self.session.headers.update({"user-agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.67 Safari/537.36"})
        self.walmartDataFetcher = WalmartDataFetcher(self.session)
        self.amazonDataFetcher = AmazonDataFetcher(self.session, "https://amazon.ca", "lxml")
        self.all_store_details = all_stores

    def fetch_product_info_by_id(self, amazonId, walmartId):
        """Fetch Amazon data for ``amazonId`` and Walmart availability for ``walmartId``.

        Returns a dict with an ``"amazon"`` entry (reviews and details) and a
        ``"walmart"`` entry (availability per configured store).
        """
        return {
            "amazon": self.amazonDataFetcher.fetch_all_data_from_id(amazonId),
            "walmart": self.walmartDataFetcher.fetch_availability_multi_store(walmartId, self.all_store_details)
        }

    def fetch_product_info_by_amazonId(self, amazonId):
        """Fetch Amazon data, then search Walmart using the Amazon product name.

        The Walmart lookup uses the ``"name"`` found in the Amazon details as
        its search query and checks availability in every configured store.
        """
        result = {
            "amazon": self.amazonDataFetcher.fetch_all_data_from_id(amazonId)
        }

        product_name = result["amazon"]["details"].get("name")
        result["walmart"] = self.walmartDataFetcher.fetch_search_and_data(product_name, self.all_store_details)

        return result

In [103]:
# Wire both fetchers to the store list prepared above.
main = Main(all_stores=all_store_details)

In [105]:
# Fetch Amazon reviews/details for one product, then the matching Walmart availability.
data = main.fetch_product_info_by_amazonId(amazonId="B00IZUKDJK")

In [107]:
# Display the combined result dict ("amazon" and "walmart" entries) as the cell output.
data

{'amazon': {'reviews': {'reviews': [{'review_date': datetime.datetime(2016, 8, 17, 0, 0),
     'review_body': "This deferent smells very clean & I've found it even lasts when the laundry has dried! The inside of the cap shows 1/2/3 for different load sizes but is hard to see. Will buy again, this beats the cheap detergent I normally buy, by far!",
     'review_author': '/gp/profile/amzn1.account.AEMJ7ISM3CAXP3N4NG3QH5ZHBEJA/ref=cm_cr_arp_d_pdp?ie=UTF8'},
    {'review_date': datetime.datetime(2016, 11, 17, 0, 0),
     'review_body': 'Not quite as good as the real thing, but for the price it still cleans better than the bargain brands. Smells good and makes the laundry smell fresh too.If you found this review helpful, please click the button below to let me know. Thanks!',
     'review_author': '/gp/profile/amzn1.account.AHKLH2VG5WGDG5M5UEANSYEHR27Q/ref=cm_cr_arp_d_pdp?ie=UTF8'},
    {'review_date': datetime.datetime(2016, 2, 7, 0, 0),
     'review_body': 'I usually use fragrance free. A