# Choose a Data Set

You can choose to analyze any data that you would like! Remember, you need 1000 rows of non-null data in order to get 5 points for the "Data" criteria of my [rubric](https://docs.google.com/document/d/1s3wllcF3LLnytxwD8mZ-BCypXKnfaahnizWGNojT-B4/edit?usp=sharing). Consider looking at [Kaggle](https://www.kaggle.com/datasets) or [free APIs](https://free-apis.github.io/#/browse) for datasets of this size. Alternatively, you can scrape the web to make your own dataset! :D

Once you have chosen your dataset, please read your data into a dataframe and call `.info()` below. If you don't call `info` I will give you 0 points for the first criteria described on the [rubric](https://docs.google.com/document/d/1s3wllcF3LLnytxwD8mZ-BCypXKnfaahnizWGNojT-B4/edit?usp=sharing).

In [12]:
import pandas as pd
from bs4 import BeautifulSoup
import requests
import random
import seaborn as sns

In [102]:
class ScrapeAmazonLink():
    def __init__(self, URL : str):
        webpage = requests.get(URL)
        self.soup = BeautifulSoup(webpage.content, "html.parser")
        
        self.title = self.GetProductTitle()
        self.price = self.GetProductCost()
        self.review = self.GetProductReviews()
        
    def GetDescription(self):
        return {"title" : self.title, "price" : self.price, "review" : self.review}
        
    def GetProductTitle(self) -> str:
        try:
            title = self.soup.find("span", attrs={"id": 'productTitle'}) 
            title_value = title.string

            title_string = title_value.strip().replace(',', '')

        except AttributeError:

            title_string = "NA"

        return title_string
    
    def GetProductCost(self) -> str:
        try:
            price = self.soup.find("span", class_="aok-nowrap a-text-strike")
            price_value = price.string

            price_string = price_value.string.strip().replace(',', '')

        except AttributeError:
            price_string = "NA"

        return price_string
    
    def GetProductReviews(self) -> str:
        try:
            stars = soup.find("span", class_="a-icon-alt")
            stars_value = stars.string

            stars_string = stars_value.string.strip().replace(',', '')

        except AttributeError:
            stars_string = "NA"

        return stars_string

In [44]:
ScrapeAmazonLink("https://www.amazon.com/Complete-Cookbook-Young-Chefs/dp/1492670022/ref=zg_d_sccl_2/141-1114911-7553503?pd_rd_w=4n115&content-id=amzn1.sym.7f37c16c-1aa6-48d9-bd2d-34f2cb3ae9e0&pf_rd_p=7f37c16c-1aa6-48d9-bd2d-34f2cb3ae9e0&pf_rd_r=S34WZ4PKRF9XAPMG037E&pd_rd_wg=qrdJz&pd_rd_r=85ff7b58-386e-453c-977e-48eb5575c129&pd_rd_i=1492670022&psc=1")

<__main__.ScrapeAmazonLink at 0x199b02caaf0>

In [86]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
import time

service = webdriver.ChromeService()
driver = webdriver.Chrome(service=service)

# Function to scrape links
def scrape_amazon_links_with_selenium(max_links=5):
    product_links = []
    categories = {
        "electronics" : 0,
        "books" : 0,
        "toys" : 0,
        "kitchen" : 0,
        "beauty" : 0,
        "fitness" : 0,
        "fashion" : 0,
        "gaming" : 0,
        "home-improvement" : 0,
        "pet-supplies" : 0,
    }
    
    base_url = "https://www.amazon.com/s?k="
    try:
        while len(product_links) <= max_links:
            random_category = random.choice(list(categories.keys()))
            print(random_category)
            page = categories[random_category]
            url = f"{base_url}{random_category}&page={page}"
            print(f"Scraping Category: {random_category}, Page: {page} - URL: {url}")
            driver.get(url)
            time.sleep(5)

            elements = driver.find_elements(By.XPATH, '//a[contains(@href, "/dp/")]')
            for element in elements:
                product_url = element.get_attribute("href").split("?")[0]
                if product_url not in product_links:
                    product_links.append(product_url)

                    print(f"Progress: {len(product_links)}")

            categories[random_category] += 1
            
    except:
        print(f"Error during scraping: {e}")

    finally:
        driver.quit()

    return product_links


links = scrape_amazon_links_with_selenium(max_links=1000)
print(f"Scraped {len(links)} product links.")

beauty
Scraping Category: beauty, Page: 0 - URL: https://www.amazon.com/s?k=beauty&page=0
home-improvement
Scraping Category: home-improvement, Page: 0 - URL: https://www.amazon.com/s?k=home-improvement&page=0
Progress: 1
Progress: 2
Progress: 3
Progress: 4
Progress: 5
Progress: 6
Progress: 7
Progress: 8
Progress: 9
Progress: 10
Progress: 11
Progress: 12
Progress: 13
Progress: 14
Progress: 15
Progress: 16
Progress: 17
Progress: 18
Progress: 19
Progress: 20
Progress: 21
Progress: 22
Progress: 23
Progress: 24
Progress: 25
Progress: 26
Progress: 27
Progress: 28
Progress: 29
Progress: 30
Progress: 31
Progress: 32
Progress: 33
Progress: 34
Progress: 35
Progress: 36
Progress: 37
Progress: 38
Progress: 39
Progress: 40
Progress: 41
Progress: 42
Progress: 43
Progress: 44
Progress: 45
Progress: 46
Progress: 47
Progress: 48
Progress: 49
Progress: 50
Progress: 51
Progress: 52
Progress: 53
Progress: 54
Progress: 55
Progress: 56
Progress: 57
Progress: 58
Progress: 59
Progress: 60
Progress: 61
Progre

fitness
Scraping Category: fitness, Page: 0 - URL: https://www.amazon.com/s?k=fitness&page=0
Progress: 546
Progress: 547
Progress: 548
Progress: 549
Progress: 550
Progress: 551
Progress: 552
Progress: 553
Progress: 554
Progress: 555
Progress: 556
Progress: 557
Progress: 558
Progress: 559
Progress: 560
Progress: 561
Progress: 562
Progress: 563
Progress: 564
Progress: 565
Progress: 566
Progress: 567
Progress: 568
Progress: 569
Progress: 570
Progress: 571
Progress: 572
Progress: 573
Progress: 574
Progress: 575
Progress: 576
Progress: 577
Progress: 578
Progress: 579
Progress: 580
Progress: 581
Progress: 582
Progress: 583
Progress: 584
Progress: 585
Progress: 586
Progress: 587
Progress: 588
Progress: 589
Progress: 590
Progress: 591
Progress: 592
Progress: 593
Progress: 594
Progress: 595
Progress: 596
Progress: 597
Progress: 598
Progress: 599
Progress: 600
Progress: 601
Progress: 602
Progress: 603
Progress: 604
Progress: 605
Progress: 606
Progress: 607
Progress: 608
Progress: 609
Progress: 6

In [122]:
file = open("URLs.txt", "w")
file.writelines(links)

In [119]:
product_descriptions = []
for link in links:
    product_descriptions.append(ScrapeAmazonLink(link).GetDescription())
    print(f"progress = {len(product_descriptions)} \n")

progress = 1 

progress = 2 

progress = 3 

progress = 4 

progress = 5 

progress = 6 

progress = 7 

progress = 8 

progress = 9 

progress = 10 

progress = 11 

progress = 12 

progress = 13 

progress = 14 

progress = 15 

progress = 16 

progress = 17 

progress = 18 

progress = 19 

progress = 20 

progress = 21 

progress = 22 

progress = 23 

progress = 24 

progress = 25 

progress = 26 

progress = 27 

progress = 28 

progress = 29 

progress = 30 

progress = 31 

progress = 32 

progress = 33 

progress = 34 

progress = 35 

progress = 36 



KeyboardInterrupt: 

In [115]:
product_descriptions

[{'title': 'Kidde Fire Extinguisher for Home 1-A:10-B:C Dry Chemical Extinguisher Red Mounting Bracket Included 2 Pack',
  'price': '$29.99',
  'review': 'NA'},
 {'title': 'NA', 'price': 'NA', 'review': 'NA'},
 {'title': 'NA', 'price': 'NA', 'review': 'NA'},
 {'title': 'Door Draft Stopper for Bottom of Door Adjustable Weather Stripping Door Seal for Cold Insulation Under Door Sweep for Exterior & Interior Doors Door Bottom Seal Blocker Noise (78inchTransparent)',
  'price': '$12.90',
  'review': 'NA'},
 {'title': 'NA', 'price': 'NA', 'review': 'NA'},
 {'title': '2PC Under Sink Organizer Rack 2 Tier Under Sliding Cabinet Basket Organizer Drawer with 4 Hooks Multi-purpose Under Sink Storage for Bathroom Kitchen Desktop（Black）',
  'price': '$26.99',
  'review': 'NA'},
 {'title': 'NA', 'price': 'NA', 'review': 'NA'},
 {'title': 'NA', 'price': 'NA', 'review': 'NA'},
 {'title': 'NA', 'price': 'NA', 'review': 'NA'},
 {'title': 'Veken 11.8 Inch High Pressure Rain Shower Head Combo with Extensi

In [118]:
filtered_descriptions = []
for description in product_descriptions:
    if description['title'] == 'NA': 
        continue
    if description['price'] == 'NA':
        continue
        
    filtered_descriptions.append(description)
        
filtered_descriptions

[{'title': 'Kidde Fire Extinguisher for Home 1-A:10-B:C Dry Chemical Extinguisher Red Mounting Bracket Included 2 Pack',
  'price': '$29.99',
  'review': 'NA'},
 {'title': 'Door Draft Stopper for Bottom of Door Adjustable Weather Stripping Door Seal for Cold Insulation Under Door Sweep for Exterior & Interior Doors Door Bottom Seal Blocker Noise (78inchTransparent)',
  'price': '$12.90',
  'review': 'NA'},
 {'title': '2PC Under Sink Organizer Rack 2 Tier Under Sliding Cabinet Basket Organizer Drawer with 4 Hooks Multi-purpose Under Sink Storage for Bathroom Kitchen Desktop（Black）',
  'price': '$26.99',
  'review': 'NA'},
 {'title': 'Veken 11.8 Inch High Pressure Rain Shower Head Combo with Extension Arm- Wide Showerhead with 5 Handheld Water Spray - Adjustable Dual Showerhead with Anti-Clog Nozzles - Matte Black',
  'price': '$69.99',
  'review': 'NA'},
 {'title': 'Key Holder for Wall Decorative Key and Mail Holder with Shelf Has Large Hooks for Bags Coats Umbrella – Paulownia Wood Key

# My Question

Given ~1000 Amazon Products what is the average 

# My Analysis

In [5]:
# Analyze here

# My Answer

### Write your answer here.