## Web Scraping

In [1]:
# import library
import requests

In [2]:
# Send a GET request with HTTP Basic credentials; the pair passed via
# `auth` matches the user/pass segments of the URL path.
credentials = ('user', 'pass')
r = requests.get('https://httpbin.org/basic-auth/user/pass', auth=credentials)

In [3]:
# Display the HTTP status code of the basic-auth response
r.status_code

200

In [4]:
# Request another URL; the status code recorded in the next cell is 404
g = requests.get('https://www.google.com/ola')

In [5]:
# Display the HTTP status code of the response stored in g
g.status_code

404

## Make a request to Shoprite.

In [6]:
# Fetch the Shoprite shop page; later cells parse res.content
res = requests.get(url='https://shoprite.ng/shop/')

In [7]:
# Display the HTTP status code of the Shoprite response
res.status_code

200

## Make a Soup

In [8]:
!pip install beautifulsoup4



In [9]:
from bs4 import BeautifulSoup

In [10]:
# Parse the body of the Shoprite response (res) with the 'html.parser' backend
soup = BeautifulSoup(res.content, 'html.parser')

In [11]:
# Text of the first <h2> element in the page (the recorded output is an empty string)
soup.find('h2').get_text()

''

In [12]:
all_product_name = soup.find_all('h2')

# Gather the text of every <h2> element into a single list
names = [heading.get_text() for heading in all_product_name]

# print out all names
print(names)

['', '18 Glenfiddich 750Ml', '2sure Bath Soap Herbal Plus 120g', '2X Snickers 80G', '33 Export 500Ml Can', '5 Alive 780Ml, Berry Blast', '5 Alive 780Ml, Citrus Burst', '7 Up 500Ml NRB, Original', '7Up 50Cl', 'Active Go Milo 20G', 'Active Go Milo 400G Sachet', 'Air Freshener Gel Airwick 45G, Rose', 'Air freshener Ref Airwick 250Ml, Aqua', 'Air freshener Ref Airwick 250Ml, Lav', 'Air Freshner Gel Airwick 45G, Lav', 'Airfryer Model Baf-3501 Binatone 3.5L', 'Airwick 2X250Ml', 'Airwick 2X250Ml, Citrus\xa0', 'Airwick Fmat 250Ml, Citrus\xa0', 'Amazing Day Golden Penny 600G', 'American Aviation 750Ml', 'American Honey 750Ml', 'Ankara & Colour Ariel 2Kg\xa0', 'Ankara Ariel 400G', 'Antiseptic Liquid Savlon 250Ml Pack', 'Antiseptic Liquid Savlon 500Ml Pack', 'Antiseptic Liquid Savlon 500Ml Pack', 'Antiseptic Liquid Savlon 750Ml', 'Aperitif Herb Bitters Orijin 330Ml Can', 'Aperitif Herb Bitters Orijin 750Ml', 'Aperitifs Bitters Confam 750Ml', 'Apperito Bitters  375Ml', 'Apple Caprisonne 100Ml Pouc

In [13]:
all_prices = soup.find_all('bdi')

# Gather the text of every <bdi> element into a single list
prices = [amount.get_text() for amount in all_prices]

# print out all prices
print(prices)

['₦1,799.99', '₦54,999.99', '₦349.99', '₦899.99', '₦349.99', '₦499.99', '₦499.99', '₦199.99', '₦199.99', '₦79.99', '₦1,549.99', '₦399.99', '₦4,199.99', '₦4,199.99', '₦399.99', '₦77,799.99', '₦6,699.99', '₦6,699.99', '₦3,199.99', '₦1,199.99', '₦16,999.99', '₦750.00', '₦3,499.99', '₦1,009.99', '₦1,099.99', '₦1,899.99', '₦1,899.99', '₦2,699.99', '₦399.99', '₦1,799.99', '₦1,699.99', '₦1,499.99', '₦89.99', '₦109.99', '₦2,749.99', '₦2,749.99', '₦1,699.99', '₦4,199.99', '₦1,899.99', '₦3,599.99']


## Refactoring Code

In [15]:
# Request the shop page and keep its raw body in `content` for parsing.
SHOP_URL = 'https://shoprite.ng/shop/'

try:
    # The timeout stops the cell from hanging forever on an unresponsive server.
    res = requests.get(url=SHOP_URL, timeout=30)
except requests.exceptions.RequestException as err:
    # No response object exists when the request itself fails, so there is no
    # status code to report. Show the cause and re-raise so that later cells
    # do not silently parse a stale or missing response.
    print(f'Request to {SHOP_URL} failed: {err}')
    raise

status = res.status_code
if status == 200:
    print(f"You have just successfully make a request to {res.url}.")
else:
    print(f'Status Code: {status}')

# The body is kept whatever the status code was, matching the earlier behavior.
content = res.content

You have just successfully make a request to https://shoprite.ng/shop/.


In [16]:
# Parse the body saved in `content` by the request cell, using the 'html.parser' backend
soup = BeautifulSoup(content, 'html.parser')

In [17]:
# Empty dictionary; a later cell stores the names and prices lists in it
products = {}

In [18]:
# get all the product names: the text of every <h2> element in the parsed page.
# The lookups are written inline because get_contents() is only defined in a
# later cell, so calling it here fails on a fresh kernel (Restart & Run All).
names = [heading.get_text() for heading in soup.find_all('h2')]

# get all the product prices: the text of every <bdi> element
prices = [amount.get_text() for amount in soup.find_all('bdi')]

In [19]:
# Store both lists under the keys that become the DataFrame column names
products["names"] = names
products["prices"] = prices

In [20]:
import pandas as pd

In [21]:
# Build a DataFrame with one column per key of `products`
shop_df = pd.DataFrame(products)

In [22]:
# Preview the first rows of the scraped table
shop_df.head()

Unnamed: 0,names,prices
0,,"₦1,799.99"
1,18 Glenfiddich 750Ml,"₦54,999.99"
2,2sure Bath Soap Herbal Plus 120g,₦349.99
3,2X Snickers 80G,₦899.99
4,33 Export 500Ml Can,₦349.99


## Shoprite Data

In [123]:
# Accumulator for the scraped data: one independent empty list per column
all_products = {
    column: []
    for column in ("names", "prices", "sku", "categories", "tag")
}

In [124]:
# get the content of an HTML tag
def get_contents(tag, document=None):
    """Return the text of every ``tag`` element in a parsed document.

    Parameters
    ----------
    tag : str
        Name of the HTML tag to search for, e.g. ``'h2'``.
    document : optional
        Object exposing ``find_all(tag)`` (such as a BeautifulSoup object).
        When omitted, the notebook-level ``soup`` variable is used, so
        existing ``get_contents(tag)`` calls keep working.

    Returns
    -------
    list of str
        One entry per matching element, in document order. Elements without
        text contribute an empty string.
    """
    if document is None:
        # Fall back to the notebook-level BeautifulSoup object
        document = soup

    # get_text() always returns a string, so the former "!= None" check and
    # its 'Null' fallback could never run; each element is now read once.
    return [item.get_text() for item in document.find_all(tag)]

In [125]:
# get other details of a product from its own page
def get_details(name):
    """Fetch a product page and return its (sku, category, tag) texts.

    The page URL is built by appending ``name`` to
    ``https://shoprite.ng/product/`` and replacing every space with a hyphen.

    Parameters
    ----------
    name : str
        Product name as scraped from the listing page.

    Returns
    -------
    tuple of (str, str, str)
        Text of the ``span`` elements with class ``sku``, ``posted_in`` and
        ``tagged_as``, in that order. The string ``'Null'`` stands in for any
        span the page does not contain.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or exceeds the timeout.
    """
    single_prod_url = f"https://shoprite.ng/product/{name}".replace(' ', '-')
    # The timeout keeps one unresponsive product page from stalling the whole scrape.
    r = requests.get(single_prod_url, timeout=30)

    # Named differently from the notebook-level `soup` to avoid confusing the two.
    product_soup = BeautifulSoup(r.content, 'html.parser')

    def span_text(css_class):
        # Search for the span once, and test with "is not None" instead of "!= None".
        span = product_soup.find('span', class_=css_class)
        return span.get_text() if span is not None else 'Null'

    return (span_text('sku'), span_text('posted_in'), span_text('tagged_as'))

In [126]:
# Scrape the product listing pages. range(1, 2) covers page 1 only; raise the
# upper bound to fetch more pages.
# NOTE: this cell appends to all_products, so re-running it adds the same rows
# again. Re-run the cell that creates all_products first.
for page in range(1, 2):
    url = f"https://shoprite.ng/products/?product-page={page}"
    # The timeout stops the loop from hanging on an unresponsive server.
    res = requests.get(url, timeout=30)

    # Rebind the notebook-level soup: get_contents() reads from it.
    soup = BeautifulSoup(res.content, 'html.parser')

    # get product names
    names = get_contents('h2')

    # get all the product prices
    prices = get_contents('bdi')

    # The first <h2> entry is dropped, as before (presumably a heading that is
    # not a product; verify against the page). It is dropped before fetching
    # details so no request is spent on it.
    product_names = names[1:]

    # One request per product: get_details() returns (sku, category, tag), so
    # a single call supplies all three columns instead of three calls each.
    details = [get_details(name) for name in product_names]
    skus = [sku for sku, _, _ in details]
    categories = [category for _, category, _ in details]
    tags = [tag for _, _, tag in details]

    # Extend the dictionary
    all_products['names'].extend(product_names)
    all_products['prices'].extend(prices)
    all_products['sku'].extend(skus)
    all_products['categories'].extend(categories)
    all_products['tag'].extend(tags)

In [129]:
# Build a DataFrame with one column per key of all_products, then display it
prod_df = pd.DataFrame(all_products)
prod_df

Unnamed: 0,names,prices,sku,categories,tag
0,Chewy Caramel Alpenliebe 6.5G,₦34.99,Candy Milk Chewy Caramel Alpenliebe 6.5G,Category: Confectionery and Snacks,Tag: Candy
1,Milk Chewy Caramel Alpenliebe 6.5G,₦34.99,Milk Chewy Caramel Alpenliebe 6.5G,Category: Candy & bubble Gum,Tag: Alpenliebe
2,Seasoning Powder Jollof Maggi 8G,₦52.99,Seasoning Powder Jollof Maggi 8G,"Category: Condiments, Oils & Spices",Tag: Seasoning
3,Seasoning Powder Chicken Maggi 10G,₦52.99,Seasoning Powder Chicken Maggi 10G,"Category: Condiments, Oils & Spices",Tag: Seasoning
4,Chocolate Slab Milk Dune 5G,₦54.99,Chocolate Slab Milk Dune 5G,Category: Chocolate,Tag: Chocolate
5,Haansbro 39G Malt & Milk,₦59.99,Biscuits Malt & Milk Haansbro 39G,Category: Biscuits & Cookies,Tag: Biscuits
6,Mcvities 16G Crackers,₦59.99,Crackers Mcvities 16G,Category: Biscuits & Cookies,Tag: Crackers
7,Mcvities 17.5G Tea,₦59.99,Biscuits Tea Mcvities 17.5G,Category: Biscuits & Cookies,Tag: Biscuits
8,Water Still Bigi 750Ml,₦64.99,Water Still Bigi 750Ml,Category: Water,Tag: Water
9,Biscuits Nice Coconut Haansbro 57G,₦64.99,Biscuits Nice Coconut Haansbro 57G,Category: Biscuits & Cookies,Tag: Biscuit


In [132]:
# Count the distinct values in the categories column
len(prod_df['categories'].unique())

11