# WEB SCRAPING

There are different ways to scrape data from the net, 1. Using python. 2. Using softwares like Parsehub

Refer: https;//udemy.com/course/data-project-with-beautiful-soup-web-scraping-e-commerce/learn/lecture/28122518 #overview

In [9]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [10]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from lxml import html

In [11]:
website ='https://www.jumia.com.ng/electronics-accessories/'

In [12]:
website

'https://www.jumia.com.ng/electronics-accessories/'

## GET REQUEST

In [13]:
me = requests.get(website)

## Status Code

In [14]:
me.status_code

200

### Soup

In [15]:
soup = BeautifulSoup(me.content, 'html.parser')

In [16]:
soup

<!DOCTYPE html>
<html dir="ltr" lang="en"><head><meta charset="utf-8"/><title>Electronics Here - Buy Electronics Accessories | Jumia Nigeria</title><meta content="product" property="og:type"/><meta content="Jumia Nigeria" property="og:site_name"/><meta content="Electronics Here - Buy Electronics Accessories | Jumia Nigeria" property="og:title"/><meta content="Shop from Biggest Electronics Accessories Collection Today - Best Electronics Online from Jumia Nigeria | Fast Delivery - Free Returns" property="og:description"/><meta content="/electronics-accessories/" property="og:url"/><meta content="https://ng.jumia.is/cms/jumialogonew.png" property="og:image"/><meta content="en_NG" property="og:locale"/><meta content="Electronics Here - Buy Electronics Accessories | Jumia Nigeria" name="title"/><meta content="index, follow" name="robots"/><meta content="Shop from Biggest Electronics Accessories Collection Today - Best Electronics Online from Jumia Nigeria | Fast Delivery - Free Returns" nam

In [17]:
# We need to store the result inside a variable
# We want to get access to all the html elements first
# I want to store the list results inside a variable

me = soup.find_all ('article',{'class': 'prd _fb col c-prd'})

In [33]:
len(me)

40

In [19]:
# Using index, we'll try to print out the first value in me

In [20]:
me[0:40]

[<article class="prd _fb col c-prd"><a class="core" data-brand="ZTE" data-category="Electronics/Accessories/Gadgets" data-dimension23="218569" data-dimension26="4" data-dimension27="4.8" data-dimension28="0" data-dimension37="0" data-dimension43="" data-dimension44="0" data-id="ZT786EA0408V8NAFAMZ" data-list="" data-name="Universal Mobile 3G/4G Wifi / Mifi" data-position="1" data-price="60.80" data-track-onclick="eecProduct" data-track-onview="eecProduct" href="/zte-universal-mobile-3g4g-wifi-mifi-65992760.html"><div class="img-c"></div><div class="info"><h3 class="name">ZTE Universal Mobile 3G/4G Wifi / Mifi</h3><div class="prc">₦ 27,900</div><div class="s-prc-w"><div class="old">₦ 35,900</div><div class="tag _dsct _sm">22%</div></div><div

## Target Necessary Data

In [21]:
# Product name
# Product price
# Product old price
# Product Rating
# Product Discount
# Product URL

## Product Name

In [22]:
# For the name we go back to the name and inspect the tag

me[3].find('h3',{'class':'name'}).get_text().strip()

'HD USB Hidden Camera Pen Video Security Camera DVR'

## Product Price

In [23]:
# we go back to the page and inspect the product price

In [24]:
me[0].find('div',{'class':'prc'}).get_text().strip()

'₦ 27,900'

## Product URL

In [25]:
me[0].find('a').get('href').strip()

'/zte-universal-mobile-3g4g-wifi-mifi-65992760.html'

## Product Rating


In [26]:
me[0].find('div',{'class':'rev'}).get_text()

'4.8 out of 5(4)'

## Product Old Price

In [41]:
me[0].find('div',{'class': 'old'}).get_text().strip()

'₦ 35,900'

## Product Discount

In [42]:
me[0].find('div',{'class': 'tag _dsct _sm'}).get_text().strip()

'22%'

## Picture Image

In [37]:
# Trying to scrape and print an image from the web
# Scraped the image url

Zara = me[0].find('img').get('data-src')

print(Zara)

https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/06/729956/1.jpg?5825


In [38]:
# To print the image fromm the web

import matplotlib.pyplot as plt
import matplotlib.image as mpim

## Putting everything together 


In [29]:
product_name = []
product_oldprice =[]
product_price = []
product_discount =[]
product_rating = []
product_url = []

for result in me:
    '''Product name'''
    try:
        product_name.append(result.find('h3',{'class':'name'})).get_text().strip()
    except:
            product_name.append('n/a')
            
    # product price 
    try:
        product_price.append(result.find('div',{'class':'prc'})).get_text().strip()
    except:
            product_price.append('n/a')
            
    # Product old price
    try:
        product_oldprice.append(result.find('div',{'class': 'old'})).get_text().strip()
    except:
            product_oldprice.append('n/a')
            
    # Product Discount
    try:
        product_discount.append(result.find('div',{'class': 'tag _dsct _sm'})).get_text().strip()
    except:
            product_discount.append('n/a')
        
    # Product rating
    try:
        product_rating.append(result.find('div',{'class':'rev'})).get_text().strip()
    except:
            product_rating.append('n/a')
            
    # Product Url
    try:
        product_url.append(result.find('a').get('href')).strip()
    except:
            product_url.append('n/a')

## Pandas Framework

In [31]:
pandas_overview = pd.DataFrame ({'Product name': product_name,
                                'Old price': product_oldprice,
                                'New price': product_price,
                                'Discount': product_discount,
                                'Rating': product_rating,
                                'Url':product_url})

In [32]:
pandas_overview

Unnamed: 0,Product name,Old price,New price,Discount,Rating,Url
0,[ZTE Universal Mobile 3G/4G Wifi / Mifi],"[₦ 35,900]","[₦ 27,900]",[22%],"[[4.8 out of 5, []], (4)]",/zte-universal-mobile-3g4g-wifi-mifi-65992760....
1,,,,,,
2,[Bosto 1060 Plus Digital Graphic Drawing Paint...,"[₦ 21,915]","[₦ 14,610]",[33%],,/bosto-1060-plus-digital-graphic-drawing-paint...
3,,,,,,
4,[Fixed GSM Wireless Terminal- Gateway GSM850/1...,"[₦ 25,000]","[₦ 17,990]",[28%],,/generic-fixed-gsm-wireless-terminal-gateway-g...
...,...,...,...,...,...,...
75,,,,,,
76,[Hidden DV DVR HD Suit Button Camera 8GB 16GB ...,"[₦ 9,999]","[₦ 9,500]",[5%],"[[3.3 out of 5, []], (4)]",/hidden-dv-dvr-hd-suit-button-camera-8gb-16gb-...
77,,,,,,
78,[CameraPen Business Portable Recorder 6],"[₦ 22,000]","[₦ 12,499]",[43%],,/camerapen-business-portable-recorder-6-generi...


## Export to Excel

In [40]:
pandas_overview.to_excel('Jumi Products.xlsx')

## Multiple Pages

In [None]:
product_name = []
product_oldprice =[]
product_price = []
product_discount =[]
product_rating = []
product_url = []

for i in range(1,100):
    websiite= website ='https://www.jumia.com.ng/electronics-accessories/'
    
    #request
    me = requests.get(website)
    
    # Soup
    soup = BeautifulSoup(me.content, 'html.parser')
    
    # me
   # me = soup.find_all ('article',{'class': 'prd _fb col c-prd'})
    
    me = soup.find_all ('div',{'class': '-paxs row _no-g _4cl-3cm-shs'})
    
   
    # Loop through results
    for result in me:
    '''Product name'''
    
    #product name 
    try:
        product_name.append(result.find('h3',{'class':'name'})).get_text().strip()
    except:
            product_name.append('n/a')
            
    # product price 
    try:
        product_price.append(result.find('div',{'class':'prc'})).get_text().strip()
    except:
            product_price.append('n/a')
            
    # Product old price
    try:
        product_oldprice.append(result.find('div',{'class': 'old'})).get_text().strip()
    except:
            product_oldprice.append('n/a')
            
    # Product Discount
    try:
        product_discount.append(result.find('div',{'class': 'tag _dsct _sm'})).get_text().strip()
    except:
            product_discount.append('n/a')
        
    # Product rating
    try:
        product_rating.append(result.find('div',{'class':'rev'})).get_text().strip()
    except:
            product_rating.append('n/a')
            
    # Product Url
    try:
        product_url.append(result.find('a').get('href')).strip()
    except:
            product_url.append('n/a')

In [None]:
me_overview = pd.DataFrame ({'Product name': product_name,
                                'Old price': product_oldprice,
                                'New price': product_price,
                                'Discount': product_discount,
                                'Rating': product_rating,
                                'Url':product_url})

In [None]:
print(me_overview)