# Amazon Manga Scrapper

#### Import Libraries 

In [2]:
# For data analysis and manipulation ( pip inst all pandas )
import pandas as pd

# For sending requests to the website ( pip install requests )
import requests

# For reading and writing csv files ( pip install csv )
import csv

# For parsing the html content ( pip install bs4 )
from bs4 import BeautifulSoup as bs

# For parsing the html content ( pip install lxml )
import lxml

# For suspending the ssl certificate verification warning ( pip install urllib3 )
import urllib3

# For generating fake user agent to avoid blocking from the website due to multiple requests from same user agent. ( pip install fake-useragent )
from fake_useragent import UserAgent


In [3]:
# Get the url for the page to be scrapped

url = "https://www.amazon.in/s?k=manga&crid=1J9DIAJWX37ZO&qid=1693917954&sprefix=man%2Caps%2C421&ref=sr_pg_1"


In [20]:
# # Get a fake user agent to avoid getting blocked by the website
# ua = UserAgent()

# # Get a random browser user-agent string
# print(ua.random)


In [4]:
# Headers for the request

headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 \
    (KHTML, like Gecko) Chrome/108.0.0.0 Safari/537.36",
    "language": "en-US,en;q=0.9",
}


In [5]:
# Suspend the warning for the SSL certificate verification using urllib3

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


In [6]:
# Lists to store scraped data

Manga_Name = []
Manga_Price = []
Manga_Rating = []
Manga_Desc = []


In [7]:
# Get the response from the website

try:
    response = requests.get(url, headers=headers, verify=False)

    # Print the response code
    response_code = response.status_code
    print("Response Code:", response_code)

    # Check the response code
    if response_code == 200:
        print("Connection Successful")
    else:
        print("Connection Failed")
except Exception as e:
    print("Error occurred:", e)


Response Code: 200
Connection Successful


In [8]:
# Let's create a soup object containing the html content of the website received from the request

soup = bs(response.content, "html.parser")
# soup.prettify()


In [9]:
# Fetch the link as list of Tag objects

links = soup.find_all(
    "a",
    attrs={
        "class": "a-link-normal s-underline-text s-underline-link-text s-link-style a-text-normal"
    },
)
# print(links)


In [10]:
link = links[0].get("href")


In [11]:
product_url = "https://www.amazon.in" + link
product_url


'https://www.amazon.in/sspa/click?ie=UTF8&spc=MTozMTE2NDMwMjQxNjM2MTI6MTY5MzkxODAyODpzcF9hdGY6MzAwMDE1NTE2ODg3MTMyOjowOjo&url=%2FRepresentation-Japanese-History-Routledge-Contemporary%2Fdp%2F1138857408%2Fref%3Dsr_1_1_sspa%3Fcrid%3D1J9DIAJWX37ZO%26keywords%3Dmanga%26qid%3D1693918028%26sprefix%3Dman%252Caps%252C421%26sr%3D8-1-spons%26sp_csd%3Dd2lkZ2V0TmFtZT1zcF9hdGY%26psc%3D1'

In [12]:
# Let's fetch the name of the manga


names = soup.find_all(
    "span", attrs={"class": "a-size-medium a-color-base a-text-normal"}
)
# print(names)


In [13]:
# Function to extract Product Title
def get_title(soup):
    try:
        # Outer Tag Object
        title = soup.find(
            "span", attrs={"class": "a-size-medium a-color-base a-text-normal"}
        )

        # Inner NavigableString Object
        title_value = title.string

        # Title as a string value
        title_string = title_value.strip()

        # # Printing types of values for efficient understanding
        # print(type(title))
        # print(type(title_value))
        # print(type(title_string))
        # print()
    except AttributeError:
        title_string = ""

    return title_string


# Print the name of the manga
print("Product Title =", get_title(soup))

Product Title = Manga and the Representation of Japanese History (Routledge Contemporary Japan Series)


In [19]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib3.exceptions import InsecureRequestWarning
import math


# Disable insecure request warnings
requests.packages.urllib3.disable_warnings()

# Lists to store scraped data
Product_Name = []
Product_Price = []
Product_Rating = []
Product_Desc = []

# Scrape data from multiple pages (2 to 4)
for i in range(2, 5):  # Change the number to fetch more pages
    url = (
        "https://www.flipkart.com/search?q=manga+comics&sid=bks&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_2_6_na_na_na&otracker1=AS_QueryStore_OrganicAutoSuggest_2_6_na_na_na&as-pos=2&as-type=RECENT&suggestionId=manga+comics%7CBooks&requestId=ac38a896-1938-483a-8fe8-513a470ac068&as-searchtext=manga+&page="
        + str(i)
    )

    # Send request to the URL and get the response
    response = requests.get(url, verify=False)

    # Check if the response is successful (status code 200)
    if response.status_code == 200:
        soup = bs(response.text, "lxml")

        # Find the link to the next page (NEXT Button link)
        link = soup.find("a", attrs={"class": "_1LKTO3"}).get("href")
        product_list = "https://flipkart.com" + link

        # Find the div containing mobile data on the page
        box = soup.find("div", attrs={"class": "_1YokD2 _3Mn1Gg"})

        # Fetch Name, Price, Rating, and Description of each mobile
        devices = box.find_all("a", attrs={"class": "_2rpwqI"})
        for i in devices:
            # Scrape product name from website
            name = i.text
            Product_Name.append(name)

        prices = box.find_all("div", attrs={"class": "_30jeq3"})
        for i in prices:
            # Scrape product price from website
            price = i.text.strip()
            Product_Price.append(price)

        # Scrape product rating from website
        ratings = box.find_all("div", attrs={"class": "_3LWZlK"})
        for i in ratings:
            rating = i.text.strip()
            if rating:
                Product_Rating.append(rating)
            else:
                # Append NaN when rating is not available
                Product_Rating.append(math.nan)

        # details = box.find_all("ul", attrs={"class": "_3Djpdu"})
        # for i in details:
        #     # Scrape product details from website
        #     details = i.text.strip()
        #     Product_Desc.append(details)

    else:
        print(f"Failed to fetch the webpage. Status Code: {response.status_code}")

# Check the length of each list
print("Product_Name Length:", len(Product_Name))
print("Product_Price Length:", len(Product_Price))
print("Product_Rating Length:", len(Product_Rating))
# print("Product_Desc Length:", len(Product_Desc))

# Create a DataFrame and save data to CSV if all lists have the same length
if len(Product_Name) == len(Product_Price) == len(Product_Rating) == len(Product_Desc):
    df = pd.DataFrame(
        {
            "Product Name": Product_Name,
            "Product Price": Product_Price,
            "Product Rating": Product_Rating,
            # "Product Details": Product_Desc,
        }
    )
    df.to_csv("Flipkart_Mobiles.csv", index=False)
    print("Data Exported!")
else:
    print("Error: Lists have different lengths. Check your scraping logic.")


# Create a DataFrame and save data to CSV
# df = pd.DataFrame(
#     {
#         "Product Name": Product_Name,
#         "Product Price": Product_Price,
#         "Product Rating": Product_Rating,
#         "Product Details": Product_Desc,
#     }
# )
# df.to_csv("Flipkart_Mobiles.csv", index=False)
df
print("Data Exported!")

Product_Name Length: 120
Product_Price Length: 120
Product_Rating Length: 74
Error: Lists have different lengths. Check your scraping logic.
Data Exported!


In [27]:
import requests
import pandas as pd
from bs4 import BeautifulSoup as bs
from urllib3.exceptions import InsecureRequestWarning
import math  # Import the math module for NaN

# Disable insecure request warnings
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# Lists to store scraped data
Product_Name = []
Product_Price = []
Product_Rating = []
Product_Desc = []

# Scrape data from multiple pages (2 to 4)
for i in range(2, 5):  # Change the number to fetch more pages
    url = (
        "https://www.flipkart.com/search?q=manga+comics&sid=bks&as=on&as-show=on&otracker=AS_QueryStore_OrganicAutoSuggest_2_6_na_na_na&otracker1=AS_QueryStore_OrganicAutoSuggest_2_6_na_na_na&as-pos=2&as-type=RECENT&suggestionId=manga+comics%7CBooks&requestId=ac38a896-1938-483a-8fe8-513a470ac068&as-searchtext=manga+&page="
        + str(i)
    )

    # Send request to the URL and get the response
    response = requests.get(url, verify=False)

    # Check if the response is successful (status code 200)
    if response.status_code == 200:
        soup = bs(response.text, "lxml")

        # Find the link to the next page (NEXT Button link)
        link = soup.find("a", attrs={"class": "_1LKTO3"}).get("href")
        product_list = "https://flipkart.com" + link

        # Find the div containing mobile data on the page
        box = soup.find("div", attrs={"class": "_1YokD2 _3Mn1Gg"})

        # Fetch Name, Price, Rating, and Description of each mobile
        devices = box.find_all("a", attrs={"class": "_2rpwqI"})
        prices = box.find_all("div", attrs={"class": "_30jeq3"})
        ratings = box.find_all("div", attrs={"class": "_3LWZlK"})
        # details = box.find_all("ul", attrs={"class": "_3Djpdu"})

        # Iterate over the lists using zip
        # desc  # details
        for name, price, rating, in zip(devices, prices, ratings):
            # Scrape product name from website
            Product_Name.append(name.text)

            # Scrape product price from website
            Product_Price.append(price.text.strip())

            # Scrape product rating from website or add NaN if not available
            if rating:
                Product_Rating.append(rating.text.strip())
            else:
                Product_Rating.append(math.nan)

            # Scrape product details from website
            # Product_Desc.append(desc.text.strip())

    else:
        print(
            f"Failed to fetch the webpage. Status Code: {response.status_code}")

# Create a DataFrame and save data to CSV if all lists have the same length
if len(Product_Name) == len(Product_Price) == len(Product_Rating) == len(Product_Desc):
    df = pd.DataFrame(
        {
            "Product Name": Product_Name,
            "Product Price": Product_Price,
            "Product Rating": Product_Rating,
            "Product Details": Product_Desc,
        }
    )
    # df.to_csv("Flipkart_Mobiles.csv", index=False)
    print(df)
    # print("Data Exported!")
else:
    print("Error: Lists have different lengths. Check your scraping logic.")

Error: Lists have different lengths. Check your scraping logic.
