Skip to content

Commit 8d4d3b7

Browse files
committed
amazon product scraper added
1 parent 0ee50ad commit 8d4d3b7

File tree

3 files changed

+230
-0
lines changed

3 files changed

+230
-0
lines changed

Amazon_product_scraper/README.md

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
# Amazon Product Scraper
2+
3+
## Description of package/script
4+
5+
Scrape the details of a product from Amazon using Python and Beautiful Soup. The details include:
6+
7+
- Product Name
8+
- Product Price
9+
- Product Rating
10+
- Product Image
11+
- Customer Reviews
12+
13+
## Setup instructions
14+
15+
We will set up the environment and install the required packages and libraries using the requirements.txt file. Run the following command in the terminal to install them.
16+
17+
```bash
18+
pip install -r requirements.txt
19+
```
20+
Class - `Product`

Example -

```python
product = Product(product_name="watch")
product.get_product()
```

Return

```python
return {
    "data": product_link,
    "message": f"Product data has been fetched",
}
```
39+
40+
## Modules used (available in requirements.txt)
41+
42+
- requests
43+
- BeautifulSoup
44+
45+
46+
### Developed by [Arvind Srivastav](https://github.com/alwenpy)

Amazon_product_scraper/products.py

Lines changed: 177 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,177 @@
1+
import requests
2+
from bs4 import BeautifulSoup
3+
4+
# scraping amazon product page
5+
class Product:
    """Scrape details of the first Amazon.in search result for a product name.

    Every public method returns a dict with two keys:

    - ``"data"``: the scraped value, or ``None`` when scraping failed.
    - ``"message"``: a short human-readable status string.

    Scraping is best-effort: network errors and unexpected page layouts are
    reported through the returned dict rather than raised.

    Example::

        product = Product(product_name="watch")
        product.get_product()
    """

    BASE_URL = "https://www.amazon.in"

    # Seconds to wait for Amazon before giving up; without a timeout a
    # stalled connection would block the caller indefinitely.
    REQUEST_TIMEOUT = 10

    # Built with implicit string concatenation so that no source-file
    # indentation leaks into the header value (a backslash continuation
    # inside the literal would embed the next line's leading whitespace).
    HEADERS = {
        "User-Agent": (
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/88.0.4324.96 Safari/537.36"
        )
    }

    def __init__(self, product_name: str):
        """Store the search term used to locate the product."""
        self.product_name = product_name

    def _search_url(self) -> str:
        """Return the Amazon.in search URL for ``self.product_name``."""
        # Local import keeps this class self-contained with respect to the
        # file's existing import block.
        from urllib.parse import quote_plus

        # quote_plus turns spaces into "+" (as before) and also escapes
        # characters such as "&" or "#" that would otherwise break the query.
        return f"{self.BASE_URL}/s?k={quote_plus(self.product_name)}"

    def _fetch_soup(self, url: str):
        """Download ``url`` and return its parsed ``BeautifulSoup`` document."""
        response = requests.get(
            url, headers=self.HEADERS, timeout=self.REQUEST_TIMEOUT
        )
        # Fail fast on HTTP errors instead of parsing an error page.
        response.raise_for_status()
        return BeautifulSoup(response.content, "html.parser")

    def _product_page(self):
        """Return ``(product_link, soup)`` for the first search result.

        Raises:
            LookupError: if the search did not yield a product link.
        """
        product_link = self.get_product()["data"]
        if product_link is None:
            raise LookupError("no product link found for search term")
        return product_link, self._fetch_soup(product_link)

    @staticmethod
    def _parse_review(review_element) -> list:
        """Extract one review as ``[name, rating, title, date, text]``."""
        reviewer_name = review_element.find(
            "span", {"class": "a-profile-name"}
        ).text
        rating = (
            review_element.find("i", {"class": "a-icon-star"})
            .find("span", {"class": "a-icon-alt"})
            .text
        )
        review_title = review_element.find(
            "a", {"data-hook": "review-title"}
        ).text.strip()
        review_date = review_element.find(
            "span", {"data-hook": "review-date"}
        ).text
        review_text = review_element.find(
            "span", {"data-hook": "review-body"}
        ).text.strip()
        return [reviewer_name, rating, review_title, review_date, review_text]

    def get_product(self) -> dict:
        """Find the product page link of the first search result.

        Example::

            product = Product(product_name="watch")
            product.get_product()

        Returns:
            ``{"data": product_link, "message": ...}`` where ``product_link``
            is an absolute ``https://www.amazon.in/...`` URL, or ``None`` if
            the search page could not be fetched or parsed.
        """
        try:
            soup = self._fetch_soup(self._search_url())
            container = soup.find("div", {"class": "s-product-image-container"})
            relative_link = container.find("a", {"class": "a-link-normal"})["href"]
            return {
                "data": self.BASE_URL + relative_link,
                "message": "Product data has been fetched",
            }
        except Exception:
            # Best-effort contract: report failure in the result, but let
            # KeyboardInterrupt/SystemExit propagate (a bare except would not).
            return {
                "data": None,
                "message": "Unable to fetch product's data",
            }

    def get_product_details(self) -> dict:
        """Scrape the name, price and rating from the product page.

        Example::

            product = Product(product_name="watch")
            product.get_product_details()

        Returns:
            ``{"data": product_details, "message": ...}`` where
            ``product_details`` is a dict with the keys ``product_name``,
            ``product_price``, ``product_rating`` and ``product_link``, or
            ``None`` if any of them could not be scraped.
        """
        try:
            product_link, soup = self._product_page()
            product_details = {
                "product_name": soup.find(
                    "span", {"id": "productTitle"}
                ).text.strip(),
                "product_price": soup.find(
                    "span", {"class": "a-price-whole"}
                ).text.strip(),
                "product_rating": soup.find(
                    "span", {"class": "a-size-base a-color-base"}
                ).text.strip(),
                "product_link": product_link,
            }
            return {
                "data": product_details,
                "message": "Product detail has been fetched",
            }
        except Exception:
            return {
                "data": None,
                "message": "Unable to fetch product detail",
            }

    def get_product_image(self) -> dict:
        """Scrape the main image URL from the product page.

        Example::

            product = Product(product_name="watch")
            product.get_product_image()

        Returns:
            ``{"data": product_image, "message": ...}`` where
            ``product_image`` is the ``src`` of the main product image, or
            ``None`` if it could not be scraped.
        """
        try:
            _, soup = self._product_page()
            product_image = soup.find(
                "img", {"class": "a-dynamic-image a-stretch-horizontal"}
            )["src"]
            return {
                "data": product_image,
                "message": "Product image has been fetched",
            }
        except Exception:
            return {
                "data": None,
                "message": "Unable to fetch product image",
            }

    def customer_review(self, all_reviews: bool = False) -> dict:
        """Scrape customer reviews shown on the product page.

        Example::

            product = Product(product_name="watch")
            product.customer_review()
            product.customer_review(all_reviews=True)

        Args:
            all_reviews: When ``False`` (the default, kept for backward
                compatibility) ``data`` is a single review: the last one
                found on the page. When ``True`` ``data`` is a list of every
                review found, in page order.

        Returns:
            ``{"data": review, "message": ...}``. Each review is a list
            ``[reviewer_name, rating, review_title, review_date,
            review_text]``. ``data`` is ``None`` if the page has no reviews
            or could not be scraped.
        """
        try:
            _, soup = self._product_page()
            reviews = [
                self._parse_review(element)
                for element in soup.find_all("div", {"data-hook": "review"})
            ]
            if not reviews:
                # A page without reviews is reported as a failure, as before.
                raise LookupError("no reviews found on product page")
            return {
                "data": reviews if all_reviews else reviews[-1],
                "message": "Product review has been fetched",
            }
        except Exception:
            return {
                "data": None,
                "message": "Unable to fetch product review",
            }
Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
beautifulsoup4==4.9.1
2+
bs4==0.0.1
3+
lxml==4.9.1
4+
parse==1.18.0
5+
pyquery==1.4.1
6+
requests==2.31.0
7+
requests-html==0.10.0

0 commit comments

Comments
 (0)