In [33]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
from urllib.parse import urljoin


In [34]:
def title(soup):
  """
  Reads the series title from a parsed page.

  Args:
      soup: Parsed page object exposing a BeautifulSoup-style ``find``.

  Returns:
      str: Whitespace-stripped text of the first ``<h1 class="entry-title">``,
      or "" when the lookup raises AttributeError (e.g. ``find`` returned None).
  """
  try:
    return soup.find('h1', attrs={'class': "entry-title"}).text.strip()
  except AttributeError:
    return ""

def rating(soup):
  """
  Reads the rating value from a parsed page.

  Args:
      soup: Parsed page object exposing a BeautifulSoup-style ``find``.

  Returns:
      str: Whitespace-stripped text of the first ``<div class="num">``,
      or "" when the lookup raises AttributeError (e.g. ``find`` returned None).
  """
  try:
    score_box = soup.find('div', attrs={'class': "num"})
    return score_box.text.strip()
  except AttributeError:
    return ""

def status(soup):
  """
  Reads the publication status from a parsed page.

  Args:
      soup: Parsed page object exposing a BeautifulSoup-style ``find``.

  Returns:
      str: Text (not stripped) of the first ``<i>`` inside the first
      ``<div class="imptdt">``, or "" when either lookup raises
      AttributeError (e.g. an element was not found).
  """
  try:
    info_box = soup.find('div', {'class': "imptdt"})
    status_tag = info_box.find('i')
    return status_tag.text
  except AttributeError:
    return ""

def type_comic(soup):
  """
  Reads the comic type from a parsed page.

  Scans every ``<div class="imptdt">`` and keeps the stripped text of the
  ``<a>`` found in the last div that contains one.

  Args:
      soup: Parsed page object exposing a BeautifulSoup-style ``find_all``.

  Returns:
      str: The link text, or "" when no div contains a link or when the
      lookup raises AttributeError.
  """
  # Default up front: without it, a page with no matching link left the
  # name unbound and the return statement raised UnboundLocalError.
  type_text = ""
  try:
    for div in soup.find_all('div', {'class': "imptdt"}):
      a_tag = div.find('a')
      if a_tag:
        type_text = a_tag.text.strip()
  except AttributeError:
    type_text = ""
  return type_text

def chapter(soup):
  """
  Reads the latest chapter label from a parsed page.

  Args:
      soup: Parsed page object exposing a BeautifulSoup-style ``find``.

  Returns:
      str: Stripped text of the first ``<span class="epcur epcurlast">`` with
      a leading "Chapter" word removed, or "" when the lookup raises
      AttributeError (e.g. the span was not found).
  """
  prefix = "Chapter"
  try:
    # Strip first so surrounding whitespace cannot hide the prefix.
    chap = soup.find('span', {'class': "epcur epcurlast"}).text.strip()
  except AttributeError:
    return ""
  # Remove the literal word only; str.lstrip("Chapter") treats its argument
  # as a set of characters and would also eat leading letters of other labels.
  if chap.startswith(prefix):
    chap = chap[len(prefix):].strip()
  return chap

def relese_date(soup):
  """
  Reads the publication date from a parsed page.

  Args:
      soup: Parsed page object exposing a BeautifulSoup-style ``find``.

  Returns:
      str: Text (not stripped) of the first ``<time itemprop="datePublished">``,
      or "" when the lookup raises AttributeError (e.g. ``find`` returned None).
  """
  try:
    published = soup.find('time', {'itemprop': "datePublished"})
    return published.text
  except AttributeError:
    return ""



In [35]:
import requests
from bs4 import BeautifulSoup

def extract_links(url, headers, num_pages=5, series_class="series", timeout=10):
  """
  Extracts links from a series of webpages and removes duplicates.

  Page URLs are built as ``f"{url}page{i}"`` for ``i`` in ``1..num_pages``.
  A page whose request fails is reported with ``print`` and skipped.

  Args:
      url (str): The base URL of the series.
      headers (dict): A dictionary containing user-agent headers.
      num_pages (int, optional): The number of pages to scrape. Defaults to 5.
      series_class (str, optional): The CSS class of the links to extract. Defaults to "series".
      timeout (float, optional): Seconds to wait for each request before
          giving up. Defaults to 10.

  Returns:
      list: Unique extracted links, in the order they were first seen.
  """
  # A dict keeps insertion order, so de-duplication stays deterministic
  # (a set would hand the links back in arbitrary order).
  unique_links = {}

  for page_number in range(1, num_pages + 1):
    page = f"{url}page{page_number}"
    try:
      # Without a timeout a stalled server would block the scrape forever;
      # a timeout surfaces as a RequestException and is handled below.
      page_web = requests.get(page, headers=headers, timeout=timeout)
      page_web.raise_for_status()  # Raise an exception for non-200 status codes
    except requests.exceptions.RequestException as e:
      print(f"Error fetching page {page}: {e}")  # Handle potential errors gracefully
      continue

    page_soup = BeautifulSoup(page_web.content, "html.parser")

    for link in page_soup.find_all('a', attrs={'class': series_class}):
      href = link.get('href')
      # Anchors without an href yield None; skip them so callers only
      # receive usable URLs.
      if href:
        unique_links[href] = None

  return list(unique_links)

# Example usage (replace with your actual URL and headers)
url = "https://asuratoon.com/"
headers = {
  "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
  # The HTTP header name is hyphenated; the underscore spelling is not a
  # header servers recognise.
  "Accept-Language": 'en-US, en;q=0.5'
}

extracted_links = extract_links(url, headers)
print(extracted_links)


['https://asuratoon.com/manga/1908287720-death-is-the-only-ending-for-the-villainess/', 'https://asuratoon.com/manga/1908287720-the-frenzy-of-evolution/', 'https://asuratoon.com/manga/1908287720-maxed-out-leveling/', 'https://asuratoon.com/manga/1908287720-the-tutorial-tower-of-the-advanced-player/', 'https://asuratoon.com/manga/1908287720-standard-of-reincarnation/', 'https://asuratoon.com/manga/1908287720-the-king-of-bug/', 'https://asuratoon.com/manga/1908287720-regressor-of-the-fallen-family/', 'https://asuratoon.com/manga/1908287720-boundless-necromancer/', 'https://asuratoon.com/manga/1908287720-the-hero-returns/', 'https://asuratoon.com/manga/1908287720-the-immortal-emperor-luo-wuji-has-returned/', 'https://asuratoon.com/manga/1908287720-reincarnated-escort-warrior/', 'https://asuratoon.com/manga/1908287720-helmut-the-forsaken-child/', 'https://asuratoon.com/manga/1908287720-return-of-the-disaster-class-hero/', 'https://asuratoon.com/manga/1908287720-the-lords-coins-arent-decrea

In [38]:
if __name__ == '__main__':
  # Headers for requests (the header name is "Accept-Language", hyphenated)
  agent = {
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
    "Accept-Language": 'en-US, en;q=0.5'
  }
  url = "https://asuratoon.com/"

  # Output column -> function that extracts it from a parsed series page
  extractors = {
    "title": title,
    "rating": rating,
    "status": status,
    "type": type_comic,
    "chapter": chapter,
    "release": relese_date,
  }
  data = {column: [] for column in extractors}

  try:
    # Collect the series page links from the listing pages
    extracted_links = extract_links(url, agent)
  except Exception as e:
    print(f"Error scraping {url}: {e}")
    extracted_links = []

  for link in extracted_links:
    # Handle failures per link so one bad page does not abort the rest.
    try:
      page_web = requests.get(link, headers=agent, timeout=10)
      page_web.raise_for_status()
      page_soup = BeautifulSoup(page_web.content, "html.parser")
      # Build the whole row before touching `data`: appending field by field
      # would leave the columns at unequal lengths if an extractor raised
      # midway, and the DataFrame construction below would then fail.
      row = {column: extract(page_soup) for column, extract in extractors.items()}
    except Exception as e:
      print(f"Error scraping {link}: {e}")
      continue

    for column, value in row.items():
      data[column].append(value)

  asura = pd.DataFrame.from_dict(data)
  print(asura)
    

                                           title rating   status    type  \
0    Death Is the Only Ending for the Villainess    9.4  Ongoing  Manhwa   
1                        The Frenzy Of Evolution   9.00  Dropped  Manhua   
2                             Maxed Out Leveling    9.8  Ongoing  Manhwa   
3      The Tutorial Tower of the Advanced Player    8.9  Dropped  Manhwa   
4                      Standard of Reincarnation    9.8  Ongoing  Manhwa   
..                                           ...    ...      ...     ...   
220                        On My Way to Kill God    9.3  Dropped  Manhwa   
221                                 Weapon Maker    9.3  Ongoing  Manhwa   
222               Return of the SSS-Class Ranker    9.6  Ongoing  Manhwa   
223                      The World After The End    9.9  Ongoing  Manhwa   
224     Archmage Transcending Through Regression    9.8  Ongoing  Manhwa   

              chapter             release  
0                 148    October 18, 2020  

In [39]:
# Write the scraped table to disk with a header row and without the index column.
asura.to_csv("asura.csv", index=False)