# WEB SCRAPING

There are different ways to scrape data from the net, 1. Using python. 2. Using softwares like Parsehub

Refer: https;//udemy.com/course/data-project-with-beautiful-soup-web-scraping-e-commerce/learn/lecture/28122518 #overview

In [1]:
pip install beautifulsoup4




In [2]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from lxml import html

In [3]:
website ='https://www.jumia.com.ng/electronics-accessories/'

In [4]:
website

'https://www.jumia.com.ng/electronics-accessories/'

## GET REQUEST

In [5]:
me = requests.get(website)

## Status Code

In [6]:
me.status_code

200

### Soup

In [7]:
soup = BeautifulSoup(me.content, 'html.parser')

In [8]:
soup

<!DOCTYPE html>
<html dir="ltr" lang="en"><head><meta charset="utf-8"/><title>Electronics Here - Buy Electronics Accessories | Jumia Nigeria</title><meta content="product" property="og:type"/><meta content="Jumia Nigeria" property="og:site_name"/><meta content="Electronics Here - Buy Electronics Accessories | Jumia Nigeria" property="og:title"/><meta content="Shop from Biggest Electronics Accessories Collection Today - Best Electronics Online from Jumia Nigeria | Fast Delivery - Free Returns" property="og:description"/><meta content="/electronics-accessories/" property="og:url"/><meta content="https://ng.jumia.is/cms/jumialogonew.png" property="og:image"/><meta content="en_NG" property="og:locale"/><meta content="Electronics Here - Buy Electronics Accessories | Jumia Nigeria" name="title"/><meta content="index, follow" name="robots"/><meta content="Shop from Biggest Electronics Accessories Collection Today - Best Electronics Online from Jumia Nigeria | Fast Delivery - Free Returns" nam

In [9]:
# We need to store the result inside a variable
# We want to get access to all the html elements first
# I want to store the list results inside a variable

me = soup.find_all ('article',{'class': 'prd _fb col c-prd'})

In [10]:
len(me)

40

In [11]:
# Using index, we'll try to print out the first value in me

In [12]:
me[0:40]

[<article class="prd _fb col c-prd"><a class="core" data-brand="Generic" data-category="Electronics/Accessories/Gadgets" data-dimension23="283590" data-dimension26="4" data-dimension27="4.8" data-dimension28="0" data-dimension37="0" data-dimension43="" data-dimension44="0" data-id="GE779EL15H0MSNAFAMZ" data-list="" data-name="Security Metal Detector Garrett" data-position="1" data-price="14.20" data-track-onclick="eecProduct" data-track-onview="eecProduct" href="/generic-security-metal-detector-garrett-82285696.html"><div class="img-c"></div><div class="info"><h3 class="name">Security Metal Detector Garrett</h3><div class="prc">₦ 6,500</div><div class="s-prc-w"><div class="old">₦ 9,000</div><div class="tag _dsct _sm">28%</div></div><div cla

## Target Necessary Data

In [13]:
# Product name
# Product price
# Product old price
# Product Rating
# Product Discount
# Product URL

## Product Name

In [14]:
# For the name we go back to the name and inspect the tag

me[3].find('h3',{'class':'name'}).get_text().strip()

'ZTE Universal Mobile 3G/4G Wifi / Mifi'

## Product Price

In [15]:
# we go back to the page and inspect the product price

In [16]:
me[0].find('div',{'class':'prc'}).get_text().strip()

'₦ 6,500'

## Product URL

In [48]:
me[0].find('a').get('href').strip()

'/generic-security-metal-detector-garrett-82285696.html'

## Product Rating


In [18]:
me[0].find('div',{'class':'rev'}).get_text()

'4.8 out of 5(4)'

## Product Old Price

In [19]:
me[0].find('div',{'class': 'old'}).get_text().strip()

'₦ 9,000'

## Product Discount

In [20]:
me[0].find('div',{'class': 'tag _dsct _sm'}).get_text().strip()

'28%'

## Picture Image

In [21]:
# Trying to scrape and print an image from the web
# Scraped the image url

Zara = me[0].find('img').get('data-src')

print(Zara)

https://ng.jumia.is/unsafe/fit-in/300x300/filters:fill(white)/product/69/658228/1.jpg?7708


In [22]:
# To print the image fromm the web

import matplotlib.pyplot as plt
import matplotlib.image as mpim

## Putting everything together 


In [23]:
product_name = []
product_oldprice =[]
product_price = []
product_discount =[]
product_rating = []
product_url = []

for result in me:
    '''Product name'''
    try:
        product_name.append(result.find('h3',{'class':'name'})).get_text().strip()
    except:
            product_name.append('n/a')
            
    # product price 
    try:
        product_price.append(result.find('div',{'class':'prc'})).get_text().strip()
    except:
            product_price.append('n/a')
            
    # Product old price
    try:
        product_oldprice.append(result.find('div',{'class': 'old'})).get_text().strip()
    except:
            product_oldprice.append('n/a')
            
    # Product Discount
    try:
        product_discount.append(result.find('div',{'class': 'tag _dsct _sm'})).get_text().strip()
    except:
            product_discount.append('n/a')
        
    # Product rating
    try:
        product_rating.append(result.find('div',{'class':'rev'})).get_text().strip()
    except:
            product_rating.append('n/a')
            
    # Product Url
    try:
        product_url.append(result.find('a').get('href')).strip()
    except:
            product_url.append('n/a')

## Pandas Framework

In [24]:
pandas_overview = pd.DataFrame ({'Product name': product_name,
                                'Old price': product_oldprice,
                                'New price': product_price,
                                'Discount': product_discount,
                                'Rating': product_rating,
                                'Url':product_url})

In [25]:
pandas_overview

Unnamed: 0,Product name,Old price,New price,Discount,Rating,Url
0,[Security Metal Detector Garrett],"[₦ 9,000]","[₦ 6,500]",[28%],"[[4.8 out of 5, []], (4)]",/generic-security-metal-detector-garrett-82285...
1,,,,,,
2,[ZTE Universal Mobile 3G/4G Wifi / Mifi],"[₦ 32,000]","[₦ 25,790]",[19%],,/zte-universal-mobile-3g4g-wifi-mifi-65308334....
3,,,,,,
4,[TV STREAMING DEVICE CHROME CAST],,"[₦ 8,500]",,,/generic-tv-streaming-device-chrome-cast-98909...
...,...,...,...,...,...,...
75,,,,,,
76,[Gts G T S Professional Cliper],,"[₦ 8,950]",,"[[4.6 out of 5, []], (9)]",/g-t-s-professional-cliper-gts-mpg1223900.html
77,,,,,,
78,[Commscope Cat6 Systimax Gigaspeed Pure Copper...,"[₦ 70,000]","[₦ 60,000]",[14%],,/generic-commscope-cat6-systimax-gigaspeed-pur...


## Export to Excel

In [26]:
pandas_overview.to_excel('Jumi Products.xlsx')

## Multiple Pages

In [27]:
product_name = []
product_oldprice =[]
product_price = []
product_discount =[]
product_rating = []
product_url = []

for i in range(1,100):
    websiite= website ='https://www.jumia.com.ng/electronics-accessories/'
    
    #request
    me = requests.get(website)
    
    # Soup
    soup = BeautifulSoup(me.content, 'html.parser')
    
    # me
   # me = soup.find_all ('article',{'class': 'prd _fb col c-prd'})
    
    me = soup.find_all ('div',{'class': '-paxs row _no-g _4cl-3cm-shs'})
    
   
    # Loop through results
    for result in me:
        '''Product name'''
    
    #product name 
    try:
        product_name.append(result.find('h3',{'class':'name'})).get_text().strip()
    except:
            product_name.append('n/a')
            
    # product price 
    try:
        product_price.append(result.find('div',{'class':'prc'})).get_text().strip()
    except:
            product_price.append('n/a')
            
    # Product old price
    try:
        product_oldprice.append(result.find('div',{'class': 'old'})).get_text().strip()
    except:
            product_oldprice.append('n/a')
            
    # Product Discount
    try:
        product_discount.append(result.find('div',{'class': 'tag _dsct _sm'})).get_text().strip()
    except:
            product_discount.append('n/a')
        
    # Product rating
    try:
        product_rating.append(result.find('div',{'class':'rev'})).get_text().strip()
    except:
            product_rating.append('n/a')
            
    # Product Url
    try:
        product_url.append(result.find('a').get('href')).strip()
    except:
            product_url.append('n/a')

KeyboardInterrupt: 

In [None]:
me_overview = pd.DataFrame ({'Product name': product_name,
                                'Old price': product_oldprice,
                                'New price': product_price,
                                'Discount': product_discount,
                                'Rating': product_rating,
                                'Url':product_url})

In [None]:
print(me_overview)

# Running Through Again

In [28]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [29]:
website = 'https://www.jumia.com.ng/mobile-phone-accessory-kits/'

In [30]:
response = requests.get(website)

In [31]:
response.status_code

200

In [32]:
soup = BeautifulSoup(response.content, 'html.parser')

In [33]:
soup

<!DOCTYPE html>
<html dir="ltr" lang="en"><head><meta charset="utf-8"/><title>Phone Accessory Kits | Buy Phone Accessory Kits Online in Nigeria | Jumia NG</title><meta content="product" property="og:type"/><meta content="Jumia Nigeria" property="og:site_name"/><meta content="Phone Accessory Kits | Buy Phone Accessory Kits Online in Nigeria | Jumia NG" property="og:title"/><meta content="Amazing Deals during Black Friday. Shop for Phone Accessory Kits online on Jumia Nigeria. Discover a great selection of Phone Accessory Kits ✓ Best prices in Nigeria ✓ Enjoy cash on delivery - Order Now!" property="og:description"/><meta content="/mobile-phone-accessory-kits/" property="og:url"/><meta content="https://ng.jumia.is/cms/jumialogonew.png" property="og:image"/><meta content="en_NG" property="og:locale"/><meta content="Phone Accessory Kits | Buy Phone Accessory Kits Online in Nigeria | Jumia NG" name="title"/><meta content="index,follow" name="robots"/><meta content="Amazing Deals during Blac

## Results

In [34]:
results = soup.find_all('a', {'class':'core'})

In [35]:
len(results)

48

In [36]:
results[0]

<a class="core" data-brand="Generic" data-category="Phones &amp; Tablets/Accessories/Accessory Kits" data-dimension23="287153" data-dimension26="" data-dimension27="" data-dimension28="0" data-dimension37="0" data-dimension43="TBOOST|TW22|TW_02" data-dimension44="0" data-id="GE779EA1SJ07KNAFAMZ" data-list="" data-name="Black Replacement Strap For Xiaomi Mi Band 5/6 Silicone Wristband" data-position="1" data-price="3.74" data-track-onclick="eecProduct" data-track-onview="eecProduct" href="/generic-black-replacement-strap-for-xiaomi-mi-band-56-silicone-wristband-106128380.html"><div class="img-c"><img alt="TW22" class="_ni camp" data-lazy="" data-src="https://ng.jumia.is/badges/tw22/1/138x18.png?8911" src="data:image/gif;base64,R0lGODlhAQABA

## Target  Neccessary Data

In [37]:
# Product Name

results[0].find('h3',{'class':'name'}).get_text()

'Black Replacement Strap For Xiaomi Mi Band 5/6 Silicone Wristband'

In [38]:
# New Product Price 

results[0].find('div', {'class':'prc'}).get_text()

'₦ 1,710'

In [39]:
# Old Product Price

results[0].find('div', {'class':'old'}).get_text()

'₦ 2,000'

In [40]:
# DIscount Given 

results[0].find('div', {'class':'tag _dsct _sm'}).get_text() 

'15%'

In [41]:
# Rating Stars

results[4].find('div', {'class':'stars _s'}).get_text()

'3 out of 5'

In [54]:
# Ratings Count

results[2].find('div', {'class':'rev'})
#.get_text()

<div class="rev"><div class="stars _s">4.7 out of 5<div class="in" style="width:94%"></div></div>(6)</div>

In [50]:
# Relative Url

results[2].find('a').get('href')

AttributeError: 'NoneType' object has no attribute 'get'