<a href="https://colab.research.google.com/github/michael-borck/ISYS5002_portfolio/blob/main/amazon_solution.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Amazon Item search

This notebook is a solution to the

In [1]:
# Usually run of Google Coolab, so install unique packages
!pip install kora -q

#load packages
from bs4 import BeautifulSoup
from kora.selenium import wd
import pandas as pd


def get_url(search_term):
  """Generate a url form a search term"""
  # Base template with placeholder for search term
  template='https://www.amazon.com.au/s?k={}'
  # Amazom search parameter has plus sign for each space
  search_term = search_term.replace(' ', '+')
  url = template.format(search_term)
  # add page number template, based on observing URL change
  url += '&page={}'
  return url


def extract_description(item):
  """Extract the product description"""
  description = item.h2.a.text
  return description


def extract_url(item):
  """Extract and return the URL of the item"""
  # get the HTML <a /> tag
  atag = item.h2.a
  # build the URL
  url = 'https://www.amazon.com.au' + atag.get('href')
  return(url)


def extract_price(item):
  """Extract and return the price of the item"""
  try:   # Not all item have a price
    price_parent = item.find('span', 'a-price')
    price = price_parent.find('span', 'a-offscreen').text
  except AttributeError:
    price = ''
  return price


def extract_rating(item):
  """Extract and return the number of stars"""
  try: # not all items have a star rating
    rating = item.i.text
  except AttributeError:
    rating = ''
  return rating


def extract_review_count(item):
  """Extract and return the number of reviews"""
  try: # not all items have reviews
    review_count = item.find('span', {'class': 'a-size-base'}).text
  except AttributeError:
    review_count = ''
  return review_count


def extract_record(item):
  """Extract and return data for a single record"""
  record = {
      'description': extract_description(item),
      'url': extract_url(item),
      'price': extract_price(item),
      'rating': extract_rating(item),
      'review_count': extract_review_count(item)
  }
  return record


def main():
  """ Run the main program routine"""
  search_term = input("What do you want to search for? ")
  url = get_url(search_term)

  # For each page, for each item on the page, extract the record
  records = [] # list of records (dictionarys)
  for page in range(1,21): # Amazon max of 20 pages
    wd.get(url.format(page))
    soup = BeautifulSoup(wd.page_source, 'html.parser')
    # get the results for the current page
    results = soup.find_all('div', {'data-component-type': 's-search-result'})
    for item in results:
      record = extract_record(item)
      if record['price']: # only want records with a price
        records.append(record)
  # Create the dataframe from the list of dictionarys
  # Column names will be extracted from the dict keys
  df = pd.DataFrame.from_records(records)
  return df

df = main()
# Extracted data into dataframe, could now exort to CSV or SQL database
# Let display() the first few rows to be safe
df.head()

[?25l[K     |█████▊                          | 10 kB 23.7 MB/s eta 0:00:01[K     |███████████▍                    | 20 kB 29.6 MB/s eta 0:00:01[K     |█████████████████               | 30 kB 12.0 MB/s eta 0:00:01[K     |██████████████████████▊         | 40 kB 9.2 MB/s eta 0:00:01[K     |████████████████████████████▍   | 51 kB 5.3 MB/s eta 0:00:01[K     |████████████████████████████████| 57 kB 2.7 MB/s 
[?25h[?25l[K     |█████▉                          | 10 kB 19.3 MB/s eta 0:00:01[K     |███████████▊                    | 20 kB 29.3 MB/s eta 0:00:01[K     |█████████████████▌              | 30 kB 34.8 MB/s eta 0:00:01[K     |███████████████████████▍        | 40 kB 40.6 MB/s eta 0:00:01[K     |█████████████████████████████▎  | 51 kB 46.4 MB/s eta 0:00:01[K     |████████████████████████████████| 56 kB 3.4 MB/s 
[?25hWhat do you want to search for? Laptop


Unnamed: 0,description,url,price,rating,review_count
0,"2021 Newest HP Laptop Computer, 15.6"" FHD 1080...",https://www.amazon.com.au/HP-Laptop-15-6-Dual-...,$745.30,,Only 5 left in stock.
1,"2021 Newest Dell Inspiron 15.6"" HD Business La...",https://www.amazon.com.au/Newest-Inspiron-Busi...,$961.29,4.4 out of 5 stars,88
2,"2020 Newest Lenovo IdeaPad 3 15"" HD Touch Scre...",https://www.amazon.com.au/Lenovo-IdeaPad-Dual-...,$827.63,4.5 out of 5 stars,737
3,HP Stream 14 Pink - Celeron N4000 - 4 GB RAM -...,https://www.amazon.com.au/HP-Stream-14-Pink-Bl...,$523.92,4.6 out of 5 stars,78
4,Acer Chromebook Spin 311 CP311-2H-C3KA Convert...,https://www.amazon.com.au/Chromebook-CP311-2H-...,$453.80,4.6 out of 5 stars,1701
