# Web Scraping

Imagine that we are a group of developers who want to get fresh goods price data from a supermarket website daily.

Carrefour link:
https://online.carrefour.com.tw/en/fresh--goods?start=0#

In [44]:
# import section
import requests
from bs4 import BeautifulSoup
import time

In [41]:
# Identify ourselves as a web browser: this User-Agent string is sent with the GET request
user_agent = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'
headers = {'User-Agent': user_agent}

In [4]:
# Use the requests module to perform a GET request for the first listing page.
# timeout: requests waits indefinitely by default, so a stalled server would hang this cell.
r = requests.get('https://online.carrefour.com.tw/en/fresh--goods?start=0#', headers=headers, timeout=30)
# Fail loudly on an HTTP error status (4xx/5xx) instead of parsing an error page as if it were the listing
r.raise_for_status()

In [10]:
# Parse the response body (HTML text) into a BeautifulSoup object with Python's built-in 'html.parser'
html_text = r.text
soup = BeautifulSoup(html_text, features='html.parser')

In [17]:
# Collect every <div> whose class attribute is "hot-recommend-item line" and keep the resulting list
goods_div_attrs = {"class": "hot-recommend-item line"}
lst_goods = soup.find_all("div", attrs=goods_div_attrs)

In [25]:
def _inner_text(ele, div_class, inner_tag):
    """Return the text of the first <inner_tag> found inside the first <div class=div_class> of ele."""
    return ele.find("div", {"class": div_class}).find(inner_tag).text


dct_goods = {}  # filled below with name -> price pairs

# Go through the collected elements one by one: read the goods name from the link
# inside the "commodity-desc" div and the price from the <em> inside the
# "current-price" div, then store the pair as name:price in the dict
for goods_div in lst_goods:
    goods_name = _inner_text(goods_div, "commodity-desc", "a")
    goods_price = _inner_text(goods_div, "current-price", "em")
    dct_goods[goods_name] = goods_price

In [26]:
# Show the dict (goods name -> price text): as the last expression in the cell,
# its value is rendered as the cell output
dct_goods

{'CFBIO Bunashimezi': '$28',
 'CFBIO Bunabii': '$28',
 'CQL Carrot 600g/Bag': '$39',
 'CFBIO Bok Choy': '$39',
 'CFBIO Emokitake': '$10',
 'CFPLB Mushroom 150g': '$42',
 'CFBIO Taiwan Lettuce': '$45',
 'CFBIO Ching Greeng': '$39',
 'CQL Little Cucumber 300g': '$49',
 'CFBIO Green Beans Sprouts': '$19',
 'CFPLB Ginger Root': '$65',
 'CFBIO fungus': '$42',
 'CQL Potato': '$59',
 'CFBIO Bunashimezi and Bunabii': '$39',
 'CQL Fruit Corn ': '$69',
 'Beef Tomato 600g/box': '$69',
 'CQL Papaya Pumpkin': '$79',
 'CQL Puli Waterbamboo': '$109',
 'CFBIO Romaine': '$45',
 'CFBIO King Oyster Mushroom': '$109',
 'CFBIO Soybean sprouts': '$19',
 'CFBIO King Oyster': '$42',
 'CQL Yu-nu Cherry Tomato': '$129',
 'CQL yellow sweet potatoes': '$69'}

## Just in case you want to get all the data from all the pages

In [35]:
# Identify ourselves as a web browser: this User-Agent string is sent with every GET request
user_agent = 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; InfoPath.3)'
headers = {'User-Agent': user_agent}

# Work out how many pages are needed to cover every item; the page count together
# with the page size gives the numbers to substitute in the url
num_goods_total = 1933
num_goods_per_page = 24
# Adding (page size - 1) before the floor division rounds up, so a partly filled last page is counted
num_pages = (num_goods_total + num_goods_per_page - 1) // num_goods_per_page

In [45]:
# For all the pages, get all the item names and prices

dct_goods = {}  # name -> price; an item whose name repeats overwrites the earlier entry
lst_name_price = []  # every (name, price) pair in page order, so repeated names are not lost

# Request and store the data of all pages
for j in range(0, num_pages*num_goods_per_page, num_goods_per_page):  # j is the start offset of each page
    # !!! Very important!!! Pause 15 seconds between requests so that this does not
    # turn into a Denial-of-Service (DoS) attack. The pause comes before every request
    # except the first, so no time is wasted sleeping after the last page.
    if j > 0:
        time.sleep(15)

    # GET request, use % to substitute the start offset into the url.
    # timeout: requests waits indefinitely by default, so one stalled page would hang the whole loop.
    r = requests.get('https://online.carrefour.com.tw/en/fresh--goods?start=%s#' %(j), headers=headers, timeout=30)
    # Fail loudly on an HTTP error status (4xx/5xx) instead of parsing an error page as if it were a listing
    r.raise_for_status()
    soup = BeautifulSoup(r.text, 'html.parser')

    # Find all div tags with class = "hot-recommend-item line", and save the list
    lst_goods = soup.find_all("div", {"class": "hot-recommend-item line"})

    # The page count comes from a hard-coded item total, so it can be out of date:
    # once a page has no items there is nothing left to collect, so stop requesting
    if not lst_goods:
        break

    # For each element in the list, get the name & price of the goods,
    # then create a key value pair of name:price in the dict
    for ele in lst_goods:
        name_goods = ele.find("div", {"class": "commodity-desc"}).find("a").text
        price_goods = ele.find("div", {"class": "current-price"}).find("em").text
        dct_goods[name_goods] = price_goods
        lst_name_price.append((name_goods, price_goods))

# Display the dict
dct_goods

{'CFBIO Bunashimezi': '$28',
 'CFBIO Bunabii': '$28',
 'CQL Carrot 600g/Bag': '$39',
 'CFBIO Bok Choy': '$39',
 'CFBIO Emokitake': '$10',
 'CFPLB Mushroom 150g': '$42',
 'CFBIO Taiwan Lettuce': '$45',
 'CFBIO Ching Greeng': '$39',
 'CQL Little Cucumber 300g': '$49',
 'CFBIO Green Beans Sprouts': '$19',
 'CFPLB Ginger Root': '$65',
 'CFBIO fungus': '$42',
 'CQL Potato': '$59',
 'CFBIO Bunashimezi and Bunabii': '$39',
 'CQL Fruit Corn ': '$69',
 'Beef Tomato 600g/box': '$69',
 'CQL Papaya Pumpkin': '$79',
 'CQL Puli Waterbamboo': '$109',
 'CFBIO Romaine': '$45',
 'CFBIO King Oyster Mushroom': '$109',
 'CFBIO Soybean sprouts': '$19',
 'CFBIO King Oyster': '$42',
 'CQL Yu-nu Cherry Tomato': '$129',
 'CQL yellow sweet potatoes': '$69',
 'CFBIO White Swordbelt Mushroom': '$45',
 'CFBIO Alfalfa sprouts': '$23',
 'CFBIO Oyster Mushroom': '$42',
 'CQL Diamond Pineapple': '$69',
 'CQL Pingdong Lemon': '$59',
 'CFBIO Black Oyster Mushroom': '$42',
 'C-Green Bean': '$55',
 'Imported Potato': '$13'

In [47]:
# Print the whole dict (goods name -> price text) as plain text
print(dct_goods)

{'CFBIO Bunashimezi': '$28', 'CFBIO Bunabii': '$28', 'CQL Carrot 600g/Bag': '$39', 'CFBIO Bok Choy': '$39', 'CFBIO Emokitake': '$10', 'CFPLB Mushroom 150g': '$42', 'CFBIO Taiwan Lettuce': '$45', 'CFBIO Ching Greeng': '$39', 'CQL Little Cucumber 300g': '$49', 'CFBIO Green Beans Sprouts': '$19', 'CFPLB Ginger Root': '$65', 'CFBIO fungus': '$42', 'CQL Potato': '$59', 'CFBIO Bunashimezi and Bunabii': '$39', 'CQL Fruit Corn ': '$69', 'Beef Tomato 600g/box': '$69', 'CQL Papaya Pumpkin': '$79', 'CQL Puli Waterbamboo': '$109', 'CFBIO Romaine': '$45', 'CFBIO King Oyster Mushroom': '$109', 'CFBIO Soybean sprouts': '$19', 'CFBIO King Oyster': '$42', 'CQL Yu-nu Cherry Tomato': '$129', 'CQL yellow sweet potatoes': '$69', 'CFBIO White Swordbelt Mushroom': '$45', 'CFBIO Alfalfa sprouts': '$23', 'CFBIO Oyster Mushroom': '$42', 'CQL Diamond Pineapple': '$69', 'CQL Pingdong Lemon': '$59', 'CFBIO Black Oyster Mushroom': '$42', 'C-Green Bean': '$55', 'Imported Potato': '$13', 'Carrefour BIO Organic Banana

In [48]:
# Show the length of the dict
len(dct_goods) # The length of the dict (number of goods) shows 1705 rather than 1933;
               # my guess is that there are items with the same name: assigning to an
               # existing key replaces its value, so such items are counted only once.
               # We can find a better way to store our data next week.


1705