In [9]:
# Dependencies
from bs4 import BeautifulSoup
import requests
import pymongo
import datetime
import csv

In [2]:
# Initialize PyMongo to work with MongoDBs
conn = 'mongodb://localhost:27017'
client = pymongo.MongoClient(conn)

In [3]:
# Define database and collection
db = client.project_02
collection = db.gpu

In [4]:
# URL of page to be scraped
# In practice, would use the URL including query string to filter for 'in stock' only
url = 'https://www.newegg.com/p/pl?N=100007709%20601357248'

# Retrieve page with the requests module
response = requests.get(url)

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [5]:
# Find all 'div' tags with class 'item-cell' on Newegg search results
results = soup.find_all('div', class_='item-cell')

# Loop through returned results
for result in results:
    try:
        # Get title by returning 'a' tag class 'item-title'
        title = result.find('a', class_='item-title').text
        
        # Get link by returning 'a' tag href
        link = result.a['href']
        
        # Get list price by returning 'li' tag class 'price-current'
        price_string = result.find('li', class_='price-current')
        
        # The price includes extra text, return just numbers under the 'strong' and 'sup' tags, drop commas
        dollars = price_string.find('strong').text.replace(',','')
        cents = price_string.find('sup').text
        price = float(dollars + cents)

        # Run only if title, price, and link are available
        if (title and link and price):
            # Print results
            print('-------------')
            print(title)
            print(price)
            print(link)
            print(datetime.datetime.now())

            # Dictionary to be inserted as a MongoDB document
            post = {
                'title': title,
                'price': price,
                'url': link,
                'time': datetime.datetime.now()
            }
            collection.insert_one(post)

    except Exception as e:
        print(e)

-------------
MSI Gaming GeForce RTX 3090 24GB GDDR6X PCI Express 4.0 SLI Support Video Card RTX 3090 GAMING X TRIO 24G
3377.99
https://www.newegg.com/msi-geforce-rtx-3090-rtx-3090-gaming-x-trio-24g/p/N82E16814137595
2021-12-09 21:08:26.170389
-------------
MSI Ventus GeForce RTX 3090 24GB GDDR6X PCI Express 4.0 SLI Support Video Card RTX 3090 VENTUS 3X 24G OC
3159.99
https://www.newegg.com/msi-geforce-rtx-3090-rtx-3090-ventus-3x-24g-oc/p/N82E16814137596
2021-12-09 21:08:26.184402
-------------
GIGABYTE GeForce RTX 3090 GAMING OC 24G Video Card, GV-N3090GAMING OC-24GD
3212.87
https://www.newegg.com/gigabyte-geforce-rtx-3090-gv-n3090gaming-oc-24gd/p/N82E16814932327
2021-12-09 21:08:26.185403
-------------
ASUS ROG Strix GeForce RTX 3090 24GB GDDR6X PCI Express 4.0 SLI Support Video Card ROG-STRIX-RTX3090-O24G-GAMING
3316.38
https://www.newegg.com/asus-geforce-rtx-3090-rog-strix-rtx3090-o24g-gaming/p/N82E16814126456
2021-12-09 21:08:26.185403
-------------
GIGABYTE AORUS GeForce RTX 3090

In [6]:
# URL of page to be scraped
# In practice, would use the URL including query string to filter for 'exclude out of stock items'
url = 'https://www.bestbuy.com/site/searchpage.jsp?_dyncharset=UTF-8&browsedCategory=abcat0507002&id=pcat17071&iht=n&ks=960&list=y&qp=gpusv_facet%3DGraphics%20Processing%20Unit%20(GPU)~NVIDIA%20GeForce%20RTX%203090&sc=Global&st=categoryid%24abcat0507002&type=page&usc=All%20Categories'

# User-Agent is necessary in the header otherwise access is denied
agent = {'User-Agent':'Mozilla/5.0'}

# Retrieve page with the requests module
response = requests.get(url, headers=agent)

# Create BeautifulSoup object; parse with 'lxml'
soup = BeautifulSoup(response.text, 'lxml')

In [7]:
# Find all 'div' tags with class 'list-item lv' on BestBuy search results
results = soup.find_all('div', class_='list-item lv')

# Loop through returned results
for result in results:
    try:
        # Get title by returning 'a' tag under 'div' tag class 'sku-title'
        title = result.find('div', class_='sku-title').find('a').text
        
        # Get link by returning 'a' tag href; it doesn't include the whole URL so add the domain
        link_string = result.a['href']
        link = 'https://www.bestbuy.com/' + link_string
        
        # Get list price by returning 'span' tag class 'sr-only'; drop the '$' and ',' to convert to float
        price_string = result.find('span', class_='sr-only').text.split('$')[1]
        price = float(price_string.replace(',',''))

        # Run only if title, price, and link are available
        if (title and link and price):
            # Print results
            print('-------------')
            print(title)
            print(price)
            print(link)
            print(datetime.datetime.now())

            # Dictionary to be inserted as a MongoDB document
            post = {
                'title': title,
                'price': price,
                'url': link,
                'time': datetime.datetime.now()
            }
            collection.insert_one(post)

    except Exception as e:
        print(e)

-------------
NVIDIA GeForce RTX 3090 24GB GDDR6X  PCI Express 4.0 Graphics Card - Titanium and Black
1499.99
https://www.bestbuy.com//site/nvidia-geforce-rtx-3090-24gb-gddr6x-pci-express-4-0-graphics-card-titanium-and-black/6429434.p?skuId=6429434
2021-12-09 21:08:27.031955
-------------
GIGABYTE - NVIDIA GeForce RTX 3090 VISION 24G GDDR6 PCI Express 4.0 Graphics Card
2199.99
https://www.bestbuy.com//site/gigabyte-nvidia-geforce-rtx-3090-vision-24g-gddr6-pci-express-4-0-graphics-card/6445108.p?skuId=6445108
2021-12-09 21:08:27.032955
-------------
GIGABYTE - NVIDIA GeForce RTX 3090 GAMING OC 24GB GDDR6X PCI Express 4.0 Graphics Card
2199.99
https://www.bestbuy.com//site/gigabyte-nvidia-geforce-rtx-3090-gaming-oc-24gb-gddr6x-pci-express-4-0-graphics-card/6430623.p?skuId=6430623
2021-12-09 21:08:27.033957
-------------
ASUS - TUF RTX 3090 24GB GDDR6X PCI Express 4.0 Graphics Card - Black
2199.99
https://www.bestbuy.com//site/asus-tuf-rtx-3090-24gb-gddr6x-pci-express-4-0-graphics-card-bl

In [14]:
# Query items in MongoDB collection with price under $2000 and export to CSV
items = db.gpu.find(
    {'price': {'$lt': 2000}},
    {'_id': 0, 'title': 1, 'price': 1, 'url': 1}
)

# Identify dictionary keys for field names
keys = ['title', 'price', 'url']

# file name for exported CSV file
file = 'RTX3090.csv'

# Write query results to CSV
try:
    with open(file, 'w') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=keys)
        writer.writeheader()
        for item in items:
            writer.writerow(item)
            print(item)
except IOError:
    print(e)

{'title': 'GIGABYTE GeForce RTX 3090 EAGLE 24GB Video Card, GV-N3090EAGLE-24GD', 'price': 1499.99, 'url': 'https://www.newegg.com/gigabyte-geforce-rtx-3090-gv-n3090eagle-24gd/p/N82E16814932366'}
{'title': 'ASUS TUF Gaming GeForce RTX 3090 TUF-RTX3090-24G-GAMING Video Card', 'price': 1874.99, 'url': 'https://www.newegg.com/asus-geforce-rtx-3090-tuf-rtx3090-24g-gaming/p/N82E16814126455'}
{'title': 'EVGA GeForce RTX 3090 XC3 BLACK GAMING Video Card, 24G-P5-3971-KR, 24GB GDDR6X, iCX3 Cooling, ARGB LED', 'price': 1639.99, 'url': 'https://www.newegg.com/evga-geforce-rtx-3090-24g-p5-3971-kr/p/N82E16814487527'}
{'title': 'ZOTAC GAMING GeForce RTX 3090 Trinity 24GB GDDR6X 384-bit 19.5 Gbps PCIE 4.0 Gaming Graphics Card, IceStorm 2.0 Advanced Cooling, SPECTRA 2.0 RGB Lighting, ZT-A30900D-10P', 'price': 1899.99, 'url': 'https://www.newegg.com/zotac-geforce-rtx-3090-zt-a30900d-10p/p/N82E16814500503'}
{'title': 'NVIDIA GeForce RTX 3090 24GB GDDR6X  PCI Express 4.0 Graphics Card - Titanium and Black