In [107]:
from bs4 import BeautifulSoup
import requests
import csv
import pandas as pd
import re

In [183]:
rating = re.findall(r'\d.?\d? out of 5', '4.5 out of 5(3840)')
rating

['4.5 out of 5']

In [207]:
def scrape_product_info(prod):
  link = prod.find('a', class_='core')
  prod_id = link['href'].split('-')[-1].replace('.html', '')
  full_prod_id = link['href'].replace('.html', '').replace('/', '')
  brand = link['data-gtm-brand']
  category4 = link['data-ga4-item_category4']

  info = prod.find('div', class_='info')
  name = info.h3.text
  price = info.find('div', class_='prc').text
  print({'price': info.find('div', class_='prc')})

  old_price = info.find('div', class_='old')
  print({'old_price': info.find('div', class_='old')})

  discount = info.find('div', class_='bdg _dsct _sm')
  print({'discount': info.find('div', class_='bdg _dsct _sm')})

  try:
    reviews_info = info.find('div', class_='rev').text
    rating = re.findall(r'\d.?\d? out of 5', reviews_info)[0].replace('out of 5', '').strip()
    total_reviews = re.findall(r"\(\d+\)", reviews_info)[0].replace('(', '').replace(')', '').strip()
    print({'reviews_info': reviews_info}, '\n\n')
  except AttributeError as e:
    print('reviews_info isn\'t available')
    rating = ''
    total_reviews = ''

  prod = {
    'id': prod_id,
    'full_id': full_prod_id,
    'brand': brand,
    'category4': category4,
    'name': name,
    'price': price,
    'old_price': old_price.text if old_price else '',
    'discount': discount.text if discount else '',
    'rating': rating,
    'total_reviews': total_reviews,
  }
  print('In scrape_product_info: ', prod)
  return prod

In [209]:
def save_products(products):
  with open('data/products.csv', 'a') as f:
    writer = csv.DictWriter(f, fieldnames=products[0].keys(), lineterminator='\n')
    writer.writeheader()
    writer.writerows(products)

In [214]:
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36", "Accept-Encoding":"gzip, deflate", "Accept":"text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", "DNT":"1","Connection":"close", "Upgrade-Insecure-Requests":"1"}
# url = f'https://www.jumia.com.ng/catalog/?q=fans&page={page}#catalog-listing'
# response = requests.get(url, headers=headers)
# response

In [219]:
def fetch_products(page):
  pg_products = []
  try:
    url = f'https://www.jumia.com.ng/catalog/?q=fans&page={page}#catalog-listing'
    response = requests.get(url, headers=headers)
    response.raise_for_status()
    soup = BeautifulSoup(response.content, 'html.parser')
    
    for prod in soup.find_all('article', class_='prd'):
      pg_products.append(scrape_product_info(prod))
      
    return pg_products
  except Exception as e:
    print('An error happened: ', e)

In [220]:
def run():
  all_products = []
  MAX_PAGES = 50
  current_page = 1

  while (current_page <= MAX_PAGES):
    all_products.extend(fetch_products(current_page))
    print(f'Page {current_page} done!')
    current_page += 1

  save_products(all_products)
run()

{'price': <div class="prc">₦ 19,400</div>}
{'old_price': <div class="old">₦ 25,000</div>}
{'discount': <div class="bdg _dsct _sm">22%</div>}
{'reviews_info': '4.2 out of 5(270)'} 


In scrape_product_info:  {'id': '334520266', 'full_id': 'ox-spacetek-18-inches-standing-fan-334520266', 'brand': 'Ox', 'category4': 'Household Fans', 'name': 'Ox Spacetek 18 Inches Standing Fan', 'price': '₦ 19,400', 'old_price': '₦ 25,000', 'discount': '22%', 'rating': '4.2', 'total_reviews': '270'}
{'price': <div class="prc">₦ 40,740</div>}
{'old_price': <div class="old">₦ 50,740</div>}
{'discount': <div class="bdg _dsct _sm">20%</div>}
{'reviews_info': '4.3 out of 5(891)'} 


In scrape_product_info:  {'id': '76698431', 'full_id': 'binatone-16-inches-standing-fan-a1691-black-2-years-warranty-76698431', 'brand': 'Binatone', 'category4': 'Household Fans', 'name': 'Binatone 16 Inches Standing Fan (A1691) - Black + 2 Years Warranty', 'price': '₦ 40,740', 'old_price': '₦ 50,740', 'discount': '20%', 'rating': '