In [1]:
import concurrent.futures
import re
import time
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from fake_useragent import UserAgent
from selenium import webdriver
from selenium.common.exceptions import ElementNotInteractableException, NoSuchWindowException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

# Scraping product prices from Amazon

### Load the Excel file from the previous step

In [2]:
# Read the price list workbook into a DataFrame and preview its first rows.
df = pd.read_excel(io='pricelist.xls')
df.head(n=5)

Unnamed: 0,Item,Unnamed: 2,Designer,Category,Item Name,Price,Qty Avail,Packaging,UPC/EAN,Case Count,Order Qty,Gender
0,12672,1.0,Abercrombie & Fitch,Regular,Abercrombie & Fitch Fierce 1.7 EDC Sp Men,46.5,56,New In Box,634349765,12,,Men
1,26413,1.0,Al Haramain,Regular,Al Haramain Amber Oud Tobacco Edition 2.0 EDP ...,38.5,more than 360,New In Box,6291100132171,- None -,,Men
2,10689,1.0,Alfred Sung,Regular,Alfred Sung Hei 3.4 EDT Sp Men,12.5,more than 360,New In Box,67724200017,24,,Men
3,13377,1.0,Alfred Sung,Regular,Alfred Sung Shi 3.4 EDP Sp Women,14.5,12,New In Box,67724271116,24,,Women
4,11668,1.0,Animale,Regular,Animale 3.4 EDP Sp Women,21.5,more than 360,New In Box,892456000037,24,,Women


In [3]:
# Inspect the column labels (note: the second column's header is a single space).
df.columns

Index(['Item', ' ', 'Designer', 'Category', 'Item Name', 'Price', 'Qty Avail',
       'Packaging', 'UPC/EAN', 'Case Count', 'Order Qty', 'Gender'],
      dtype='object')

In [4]:
# Summarise each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289 entries, 0 to 1288
Data columns (total 12 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Item        1289 non-null   int64  
 1               1284 non-null   float64
 2   Designer    1289 non-null   object 
 3   Category    1289 non-null   object 
 4   Item Name   1289 non-null   object 
 5   Price       1289 non-null   float64
 6   Qty Avail   1289 non-null   object 
 7   Packaging   1289 non-null   object 
 8   UPC/EAN     1284 non-null   object 
 9   Case Count  1289 non-null   object 
 10  Order Qty   1289 non-null   object 
 11  Gender      1289 non-null   object 
dtypes: float64(2), int64(1), object(9)
memory usage: 121.0+ KB


In [5]:
# Make an array of the UPC values we will use to retrieve items.
# Rows without a usable code are excluded. Some are marked with the literal
# '- None -'; others are empty cells (NaN). NaN compares unequal to every
# string, so the '- None -' test alone would let those rows through.
upc_col = df['UPC/EAN']
upc_arr = upc_col.loc[upc_col.notna() & (upc_col != '- None -')].to_numpy()
upc_arr

array(['634349765', '6291100132171', '067724200017', ..., '888874001541',
       '860002058931', '605930779929'], dtype=object)

In [6]:
# Helper function to search for product
def make_url(search_term):
    """
    Builds an amazon.ae search-results URL for a search term.

    Parameters
    ----------
    search_term : str or int
        Text to search for (e.g. a UPC/EAN code). It is converted to ``str``
        and percent-encoded, so characters such as spaces, '&' or '#' cannot
        corrupt the query string.

    Returns
    -------
    str
        The search URL, e.g. 'https://www.amazon.ae/s?k=3386460036757'.
    """
    # quote_plus leaves digits/letters untouched, so plain UPC codes produce
    # exactly the same URL as simple string interpolation would.
    return f'https://www.amazon.ae/s?k={quote_plus(str(search_term))}'

### Try with one item

In [7]:
# Configure a Chrome session for scraping.
chrome_options = webdriver.ChromeOptions()
# NOTE(review): "[user-agent string]" looks like an unfilled placeholder and is
# sent verbatim as the User-Agent header — confirm whether a real UA was intended.
chrome_options.add_argument("user-agent=[user-agent string]")
# Suppress notification pop-ups, both via the CLI flag and the profile pref.
chrome_options.add_argument('--disable-notifications')
prefs = {"profile.default_content_setting_values.notifications" : 2}
chrome_options.add_experimental_option("prefs",prefs)
chrome_options.add_argument("window-size=1920,1080")

# install() returns the local path of the chromedriver binary (see log output).
driver_manager = ChromeDriverManager().install()
# NOTE(review): passing the driver path positionally matches older Selenium
# signatures; newer releases may require a Service object — confirm the
# installed Selenium version before upgrading.
driver = webdriver.Chrome(driver_manager,
                          options = chrome_options)


url = make_url('3386460036757')
try:
    driver.get(url)
    soup = bs(driver.page_source, 'html.parser')
finally:
    # Always shut the browser down, even if loading or parsing fails,
    # so a failed run does not leave an orphaned Chrome process behind.
    driver.quit()



Current google-chrome version is 97.0.4692
Get LATEST driver version for 97.0.4692
Driver [C:\Users\Fardin\.wdm\drivers\chromedriver\win32\97.0.4692.71\chromedriver.exe] found in cache


In [8]:
# Collect every <span class="a-offscreen"> element; on this page they carry the price text.
soup.find_all('span', class_='a-offscreen')

[<span class="a-offscreen">AED 109.00</span>,
 <span class="a-offscreen">AED 184.38</span>,
 <span class="a-offscreen">AED 109.00</span>]

In [9]:
# Raw text of the first price span (contains a non-breaking space after the currency).
soup.find_all('span', class_='a-offscreen')[0].get_text()

'AED\xa0109.00'

In [10]:
def clean_price(price_text):
    """
    Extracts the first numeric amount from a price string.

    Parameters
    ----------
    price_text : str
        Raw price text such as 'AED 184.38' (any separator between the
        currency and the number is ignored).

    Returns
    -------
    str
        The amount as a plain decimal string with thousands separators
        removed, e.g. '184.38' or '1299.00'.

    Raises
    ------
    IndexError
        If no number is present in ``price_text``. IndexError is used so that
        callers written against the earlier ``findall(...)[0]`` behaviour
        keep working.
    """
    # Digits, optional comma-grouped thousands, optional decimal part.
    # A pattern that demands two or more digits and stops at commas would
    # reject 'AED 5' and turn '1,299.00' into '299.00'.
    regex = r'\d+(?:,\d{3})*(?:\.\d+)?'

    match = re.search(regex, price_text)
    if match is None:
        raise IndexError(f'no price found in {price_text!r}')

    return match.group(0).replace(',', '')

clean_price('AED\xa0184.38')

'184.38'

## Repeat for all items of interest