## Web Scraping

In [1]:
# import library
import requests

In [2]:
# send a GET request to httpbin's basic-auth test endpoint, supplying the
# username/password pair that the endpoint path expects
r = requests.get(
    url='https://httpbin.org/basic-auth/user/pass',
    auth=('user', 'pass'),
)

In [3]:
r.status_code

200

In [4]:
# fetch a well-known public page as a second sanity check
g = requests.get(url='https://www.google.com')

In [5]:
g.status_code

200

## Make a request to Shoprite

In [6]:
# download the first page of the Shoprite shop listing
res = requests.get(url='https://shoprite.ng/shop/')

In [7]:
res.status_code

200

## Make a Soup

In [8]:
!pip install beautifulsoup4

Defaulting to user installation because normal site-packages is not writeable


In [9]:
from bs4 import BeautifulSoup

In [10]:
# parse the downloaded HTML bytes with Python's built-in HTML parser
soup = BeautifulSoup(markup=res.content, features='html.parser')

In [11]:
# every <h2> element on the page
all_product_name = soup.find_all('h2')

# text content of each heading, in document order
names = [heading.get_text() for heading in all_product_name]

# print out all names
print(names)

['', '18 Glenfiddich 750Ml', '2sure Bath Soap Herbal Plus 120g', '2X Snickers 80G', '33 Export 500Ml Can', '5 Alive 780Ml, Berry Blast', '5 Alive 780Ml, Citrus Burst', '7 Up 500Ml NRB, Original', '7Up 50Cl', 'Active Go Milo 20G', 'Active Go Milo 400G Sachet', 'Air Freshener Gel Airwick 45G, Rose', 'Air freshener Ref Airwick 250Ml, Aqua', 'Air freshener Ref Airwick 250Ml, Lav', 'Air Freshner Gel Airwick 45G, Lav', 'Airfryer Model Baf-3501 Binatone 3.5L', 'Airwick 2X250Ml', 'Airwick 2X250Ml, Citrus\xa0', 'Airwick Fmat 250Ml, Citrus\xa0', 'Amazing Day Golden Penny 600G', 'American Aviation 750Ml', 'American Honey 750Ml', 'Ankara & Colour Ariel 2Kg\xa0', 'Ankara Ariel 400G', 'Antiseptic Liquid Savlon 250Ml Pack', 'Antiseptic Liquid Savlon 500Ml Pack', 'Antiseptic Liquid Savlon 500Ml Pack', 'Antiseptic Liquid Savlon 750Ml', 'Aperitif Herb Bitters Orijin 330Ml Can', 'Aperitif Herb Bitters Orijin 750Ml', 'Aperitifs Bitters Confam 750Ml', 'Apperito Bitters  375Ml', 'Apple Caprisonne 100Ml Pouc

In [12]:
soup.find_all('bdi')

[<bdi><span class="woocommerce-Price-currencySymbol">₦</span>1,799.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>54,999.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>349.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>899.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>349.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>499.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>499.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>199.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>199.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>79.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>1,549.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>399.99</bdi>,
 <bdi><span class="woocommerce-Price-currencySymbol">₦</span>4,199.99</bdi>,
 <bdi><span class

In [13]:
# every <bdi> element on the page (each wraps a currency symbol and an amount)
all_prices = soup.find_all('bdi')

# text content of each price element, in document order
prices = [price_tag.get_text() for price_tag in all_prices]

# print out all prices
print(prices)

['₦1,799.99', '₦54,999.99', '₦349.99', '₦899.99', '₦349.99', '₦499.99', '₦499.99', '₦199.99', '₦199.99', '₦79.99', '₦1,549.99', '₦399.99', '₦4,199.99', '₦4,199.99', '₦399.99', '₦77,799.99', '₦6,699.99', '₦6,699.99', '₦3,199.99', '₦1,199.99', '₦16,999.99', '₦750.00', '₦3,499.99', '₦1,009.99', '₦1,099.99', '₦1,899.99', '₦1,899.99', '₦2,699.99', '₦399.99', '₦1,799.99', '₦1,699.99', '₦1,499.99', '₦89.99', '₦109.99', '₦2,749.99', '₦2,749.99', '₦1,699.99', '₦4,199.99', '₦1,899.99', '₦3,599.99']


## Refactoring Code

In [14]:
# get the content of an HTML tag
def get_contents(tag, document=None):
    """Return the text of every `tag` element in a parsed document.

    Parameters
    ----------
    tag : str
        Name of the HTML tag to search for, e.g. 'h2' or 'bdi'.
    document : optional
        Parsed document exposing `find_all(tag)` whose results expose
        `get_text()` (for example a BeautifulSoup object). When omitted,
        the notebook-level `soup` variable is used, so existing
        one-argument calls behave exactly as before.

    Returns
    -------
    list
        One text value per matching element, in the order `find_all`
        returned them; an empty list when nothing matches.
    """
    # Passing the document explicitly avoids depending on whichever cell
    # last rebound the global `soup`.
    parsed = soup if document is None else document
    return [element.get_text() for element in parsed.find_all(tag)]

In [15]:
# make a request
# Start from an empty payload so a failed request can never leave `content`
# undefined or pointing at the response of an earlier cell.
content = b""
try:
    # timeout keeps the cell from hanging forever on an unresponsive server
    res = requests.get(url='https://shoprite.ng/shop/', timeout=30)
except requests.exceptions.RequestException as err:
    # No response object exists on this path, so there is no status code to
    # report; show the underlying error instead.
    print(f'Request failed: {err}')
else:
    status = res.status_code
    # the body is kept for any HTTP status, matching the previous behaviour
    content = res.content

    if status == 200:
        print(f"You have successfully made a request to {res.url}.")
    else:
        print(f'Status Code: {status}')

You have successfully made a request to https://shoprite.ng/shop/.


In [16]:
# build a BeautifulSoup object from the downloaded page content
soup = BeautifulSoup(markup=content, features="html.parser")

In [17]:
# start with an empty dictionary that will hold the scraped columns
products = dict()

In [18]:
# product names come from the <h2> tags, product prices from the <bdi> tags
names, prices = map(get_contents, ('h2', 'bdi'))

In [19]:
# store both scraped lists under their column names
products.update(names=names, prices=prices)

In [20]:
import pandas as pd

In [21]:
# one column per dictionary key: "names" and "prices"
shop_df = pd.DataFrame(data=products)

In [22]:
shop_df

Unnamed: 0,names,prices
0,,"₦1,799.99"
1,18 Glenfiddich 750Ml,"₦54,999.99"
2,2sure Bath Soap Herbal Plus 120g,₦349.99
3,2X Snickers 80G,₦899.99
4,33 Export 500Ml Can,₦349.99
5,"5 Alive 780Ml, Berry Blast",₦499.99
6,"5 Alive 780Ml, Citrus Burst",₦499.99
7,"7 Up 500Ml NRB, Original",₦199.99
8,7Up 50Cl,₦199.99
9,Active Go Milo 20G,₦79.99


In [23]:
# accumulator: one list per column, filled page by page
all_products = {"names": [], "prices": []}

# request listing pages 1 through 5
for page in range(1, 6):
    url = f"https://shoprite.ng/shop/?product-page={page}"
    res = requests.get(url)

    # parse this page; get_contents reads the notebook-level `soup`
    soup = BeautifulSoup(res.content, "html.parser")

    # product names (<h2>) and product prices (<bdi>) found on this page
    names, prices = get_contents('h2'), get_contents('bdi')

    all_products['names'] += names
    all_products['prices'] += prices

all_df = pd.DataFrame(all_products)
all_df.shape

(200, 2)

In [24]:
all_df

Unnamed: 0,names,prices
0,,"₦1,799.99"
1,18 Glenfiddich 750Ml,"₦54,999.99"
2,2sure Bath Soap Herbal Plus 120g,₦349.99
3,2X Snickers 80G,₦899.99
4,33 Export 500Ml Can,₦349.99
...,...,...
195,Coffee INST 3 in1 Nescafe 250G,₦999.99
196,Cognac Blue Swift Vsop Martell 750Ml,"₦52,499.99"
197,Cognac Vs Hennessy 750Ml,"₦35,599.99"
198,Cognac Vs Hennessy 750Ml,"₦30,499.99"


In [26]:
def _text_or_null(page_soup, css_class):
    """Return the text of the first <span class=css_class> in page_soup, or 'Null' if there is none."""
    span = page_soup.find('span', class_=css_class)
    return span.get_text() if span is not None else 'Null'


# empty dictionary
all_products = {
    "names": [],
    "prices": [],
    "sku": [],
    "categories": [],
    "tag": []
}

# range(1, 2) requests page 1 only; widen the range to crawl more pages
# NOTE(review): this URL has no /shop/ path segment, unlike the listing URL
# used by the earlier five-page crawl -- confirm that is intended.
for page in range(1, 2):
    url = f"https://shoprite.ng/?product-page={page}"
    # timeout keeps the cell from hanging forever on an unresponsive server
    res = requests.get(url, timeout=30)

    # parse the listing page; get_contents reads the notebook-level `soup`
    soup = BeautifulSoup(res.content, "html.parser")

    # get product name
    names = get_contents('h2')

    # get all the product price
    prices = get_contents('bdi')

    # Created once per page (not once per product) so that every name
    # contributes exactly one entry to each list.
    skus = []
    categories = []
    tags = []

    for name in names:
        # NOTE(review): assumes the product URL slug is the displayed name with
        # spaces replaced by hyphens -- verify against real product links.
        single_prod_url = f"https://shoprite.ng/product/{name}".replace(' ', '-')
        r = requests.get(single_prod_url, timeout=30)

        # a separate name keeps the listing-page `soup` intact
        product_soup = BeautifulSoup(r.content, 'html.parser')

        # 'Null' marks a detail that is absent from the product page
        skus.append(_text_or_null(product_soup, "sku"))
        categories.append(_text_or_null(product_soup, "posted_in"))
        tags.append(_text_or_null(product_soup, "tagged_as"))

    all_products['names'].extend(names)
    all_products['prices'].extend(prices)
    all_products['sku'].extend(skus)
    all_products['categories'].extend(categories)
    all_products['tag'].extend(tags)


#all_df = pd.DataFrame(all_products)
#all_df.head
all_products

{'names': ['The Retailer of Choice Delivering Convenience, Quality, Exceptional Value  to Our Customers and Communities',
  'Best Prices, Great Value',
  'Snack Chin Chin Minimie 900G',
  'Lemon Imported X6',
  'Chips Pringles 165G, Original',
  'Royalty Ginger Nuts Biscuits 300G',
  'Belmont Milk Chocolate Biscuit 300G',
  'Grain Smart Golden Morn 900G Pack',
  'Fox Chunky Chocolate Cookies',
  'Castilo Vino Rosado Rose Wine 750Ml',
  'Apples X 6',
  'Milk Powder Peak 400G Pack',
  'Mich & Kay Plain Greek Yoghurt 400Ml',
  'Goslo Cookies&Cream Ice Cream 320Ml',
  'Chocolate Mini Snickers 150G Pack',
  'Bounty Miniatures 150G',
  'Titus Frozen Fish Gsf 1Kg',
  'Oreo White Chocolate Biscuit 246G',
  'Mens Deo Deep Nivea 200Ml',
  'Nivea Pearl And Beauty Ladies Deodorant 200Ml',
  'Fiesta French Fries Frozen Potatoes 1Kg',
  'Tampax Super Plus Tampons 20S Pack',
  'Natural Sweet Red Four Cousins 750ML',
  'Sapphire Bombay Gin 750Ml',
  'Nasco Corn Flakes 350ml',
  'Ferrero Rocher Chocola