amazon product scraper added

alwenpy · alwenpy · commit 8d4d3b7d6f26 · 2023-07-30T21:17:19.000+05:30
diff --git a/Amazon_product_scraper/README.md b/Amazon_product_scraper/README.md
@@ -0,0 +1,46 @@
+# Amazon Product Scraper
+
+## Description of package/script
+
+Scrape the details of a product from Amazon using Python and Beautiful Soup. The details include:
+
+- Product Name
+- Product Price
+- Product Rating
+- Product Image
+- Customer Reviews
+
+## Setup instructions
+
+Set up the environment by installing the required packages and libraries listed in the `requirements.txt` file. Run the following command in the terminal:
+
+```bash
+pip install -r requirements.txt
+```
+
+Class - `Product`
+
+Example -
+
+```python
+product = Product(product_name="watch")
+product.get_product()
+```
+
+Return -
+
+```python
+{
+    "data": product_link,
+    "message": "Product data has been fetched",
+}
+```
+
+
+## Modules used (available in requirements.txt)
+
+- requests
+- BeautifulSoup
+
+
+### Developed by [Arvind Srivastav](https://github.com/alwenpy)
diff --git a/Amazon_product_scraper/products.py b/Amazon_product_scraper/products.py
@@ -0,0 +1,177 @@
+import requests
+from bs4 import BeautifulSoup
+
+# scraping amazon product page
+class Product:
+    """Scrape the first Amazon.in search result for a product name.
+
+    Every public method returns a dict ``{"data": ..., "message": ...}``.
+    On any failure ``data`` is ``None`` and ``message`` names what could
+    not be fetched, so callers never have to catch scraping errors.
+    """
+
+    _BASE_URL = "https://www.amazon.in"
+    # Implicit concatenation on purpose: a backslash continuation inside the
+    # literal would embed the next line's indentation in the header value.
+    _HEADERS = {
+        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
+        "(KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
+    }
+    _TIMEOUT = 10  # seconds; requests never times out unless told to
+
+    def __init__(self, product_name: str):
+        self.product_name = product_name
+
+    def _fetch_soup(self, url, params=None):
+        """GET ``url`` and return the parsed page; raise on HTTP errors."""
+        response = requests.get(
+            url, params=params, headers=self._HEADERS, timeout=self._TIMEOUT
+        )
+        response.raise_for_status()
+        return BeautifulSoup(response.content, "html.parser")
+
+    def _product_soup(self):
+        """Return ``(product_link, soup)`` for the first search result."""
+        product_link = self.get_product()["data"]
+        if product_link is None:
+            raise ValueError("no product link available")
+        return product_link, self._fetch_soup(product_link)
+
+    def get_product(self):
+        """Search Amazon.in for the product and return the first result's link.
+
+        Example -
+        ```python
+        product = Product(product_name="watch")
+        product.get_product()
+        ```
+        Return -
+        ```python
+        {
+            "data": product_link,
+            "message": "Product data has been fetched",
+        }
+        ```
+        ``data`` is ``None`` if the search or the page parsing fails.
+        """
+        try:
+            # Hand the query to ``params`` so requests URL-encodes it fully
+            # ("&", "#", ...) instead of only swapping spaces for "+".
+            soup = self._fetch_soup(
+                f"{self._BASE_URL}/s", params={"k": self.product_name}
+            )
+            product = soup.find("div", {"class": "s-product-image-container"})
+            href = product.find("a", {"class": "a-link-normal"})["href"]
+            return {
+                "data": self._BASE_URL + href,
+                "message": "Product data has been fetched",
+            }
+        except Exception:
+            # Best effort by design; unlike a bare ``except`` this lets
+            # KeyboardInterrupt and SystemExit propagate.
+            return {"data": None, "message": "Unable to fetch product's data"}
+
+    # Get product details
+    def get_product_details(self):
+        """Fetch name, price, rating and link of the first search result.
+
+        Example -
+        ```python
+        product = Product(product_name="watch")
+        product.get_product_details()
+        ```
+        Return -
+        ```python
+        {
+            "data": product_details,
+            "message": "Product detail has been fetched",
+        }
+        ```
+        ``product_details`` has the keys ``product_name``, ``product_price``,
+        ``product_rating`` and ``product_link``; ``data`` is ``None`` on failure.
+        """
+        try:
+            product_link, soup = self._product_soup()
+            name = soup.find("span", {"id": "productTitle"})
+            price = soup.find("span", {"class": "a-price-whole"})
+            rating = soup.find("span", {"class": "a-size-base a-color-base"})
+            product_details = {
+                "product_name": name.text.strip(),
+                "product_price": price.text.strip(),
+                "product_rating": rating.text.strip(),
+                "product_link": product_link,
+            }
+            return {
+                "data": product_details,
+                "message": "Product detail has been fetched",
+            }
+        except Exception:
+            return {"data": None, "message": "Unable to fetch product detail"}
+
+    # Get product image
+    def get_product_image(self):
+        """Fetch the main image URL of the first search result.
+
+        Example -
+        ```python
+        product = Product(product_name="watch")
+        product.get_product_image()
+        ```
+        Return -
+        ```python
+        {
+            "data": product_image,
+            "message": "Product image has been fetched",
+        }
+        ```
+        ``data`` is ``None`` if the image could not be found.
+        """
+        try:
+            _, soup = self._product_soup()
+            image = soup.find("img", {"class": "a-dynamic-image a-stretch-horizontal"})
+            return {
+                "data": image["src"],
+                "message": "Product image has been fetched",
+            }
+        except Exception:
+            return {"data": None, "message": "Unable to fetch product image"}
+
+    # Get customer reviews
+    def customer_review(self):
+        """Fetch every customer review shown on the product page.
+
+        Example -
+        ```python
+        product = Product(product_name="watch")
+        product.customer_review()
+        ```
+        Return -
+        ```python
+        {
+            "data": reviews,
+            "message": "Product review has been fetched",
+        }
+        ```
+        ``reviews`` holds one ``[name, rating, title, date, text]`` list per
+        review; ``data`` is ``None`` when no review could be fetched.
+        """
+        try:
+            _, soup = self._product_soup()
+            reviews = []
+            for element in soup.find_all("div", {"data-hook": "review"}):
+                name = element.find("span", {"class": "a-profile-name"}).text
+                star = element.find("i", {"class": "a-icon-star"})
+                rating = star.find("span", {"class": "a-icon-alt"}).text
+                title = element.find("a", {"data-hook": "review-title"}).text.strip()
+                date = element.find("span", {"data-hook": "review-date"}).text
+                text = element.find("span", {"data-hook": "review-body"}).text.strip()
+                # Append instead of overwriting so every review is returned.
+                reviews.append([name, rating, title, date, text])
+            if not reviews:
+                raise ValueError("no reviews found on the product page")
+            return {
+                "data": reviews,
+                "message": "Product review has been fetched",
+            }
+        except Exception:
+            return {"data": None, "message": "Unable to fetch product review"}
diff --git a/Amazon_product_scraper/requirements.txt b/Amazon_product_scraper/requirements.txt
@@ -0,0 +1,7 @@
+beautifulsoup4==4.9.1
+bs4==0.0.1
+lxml==4.9.1
+parse==1.18.0
+pyquery==1.4.1
+requests==2.31.0
+requests-html==0.10.0