In [13]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time
import re
import random

In [None]:
# HTTP headers sent with every request. The User-Agent mimics a desktop Chrome
# browser; the literal is split in two only to keep the line short.
headers = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 Chrome/115.0.0.0 Safari/537.36"
    ),
}

In [None]:
# Base URL of the MyAnimeList top-manga ranking page. The scraping loop appends
# a `?limit=<offset>` query string to it to request each page of the ranking.
base_url = "https://myanimelist.net/topmanga.php"

In [None]:
# Column-oriented accumulator for the scraped records: one list per output
# column, all appended to in lockstep so every list keeps the same length.
_MANGA_COLUMNS = (
    "ranking",
    "nombre",
    "score",
    "votos",
    "ranked",
    "popularity",
    "members",
    "status",
    "demografia",
    "autores",
    "volumenes",
    "capitulos",
    "generos",
)
# The comprehension creates a fresh, independent list for every column.
manga_data = {column: [] for column in _MANGA_COLUMNS}

In [None]:
# Scrape the top 500 manga: walk the ranking list (50 entries per page) and,
# for every entry, fetch its detail page. One record per manga is appended to
# the column lists in `manga_data`.
TOTAL_MANGA = 500
PAGE_SIZE = 50
REQUEST_TIMEOUT = 15  # seconds; without a timeout requests.get can block forever
MAX_ATTEMPTS = 3
RETRY_WAIT = 2  # seconds to wait between attempts

DEMOGRAPHIC_LABEL = re.compile("Demographic[s]*:")


def _get_with_retries(url, label):
    """Fetch `url`, retrying on non-200 responses and on network errors.

    Returns the `requests.Response` on success, or None once MAX_ATTEMPTS
    attempts have failed. `label` is only used in the progress messages.
    """
    for intento in range(MAX_ATTEMPTS):
        try:
            resp = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
            if resp.status_code == 200:
                return resp
        except requests.RequestException as exc:
            # Timeouts / connection resets are retried like a bad status code.
            print(f"Error de red al acceder a {label}: {exc}")
        print(f"Reintentando acceso a {label} (intento {intento + 1})")
        time.sleep(RETRY_WAIT)
    print(f"Falló acceso a {label} tras {MAX_ATTEMPTS} intentos. URL: {url}")
    return None


def _normalize_score(score_text):
    """Insert the decimal point into an all-digit score (e.g. "947" -> "9.47").

    Text that already has a dot, or that is not purely numeric (e.g. "N/A"),
    is returned unchanged instead of being mangled.
    """
    if "." in score_text or not score_text.isdigit():
        return score_text
    return score_text[:-2] + "." + score_text[-2:]


def _text_after(soup, label):
    """Return the stripped text of the node right after the <span> matching `label`.

    Returns "" when the span is missing or has no following sibling. Handles
    both plain-text siblings and tag siblings, so an unexpected element does
    not raise and cost the whole record.
    """
    tag = soup.find("span", string=label)
    if tag is None:
        return ""
    sibling = tag.next_sibling
    if sibling is None:
        return ""
    if isinstance(sibling, str):
        return sibling.strip()
    return sibling.get_text(strip=True)


def _links_after(soup, label):
    """Return the comma-joined text of the <a> siblings after the <span> matching `label`."""
    tag = soup.find("span", string=label)
    if tag is None:
        return ""
    return ", ".join(a.text.strip() for a in tag.find_next_siblings("a"))


def _parse_detail(manga_soup):
    """Extract the detail-page fields of one manga as a dict of strings."""
    votos = ""
    # NOTE(review): `string=` only matches a <span> whose sole child is text;
    # confirm the "scored by" span on the live page has no nested tags.
    score_stat_tag = manga_soup.find("span", string=re.compile("scored by"))
    if score_stat_tag:
        match = re.search(r"scored by ([\d,]+) users", score_stat_tag.text)
        votos = match.group(1) if match else ""

    # Demographic is normally a list of links; fall back to bare text.
    demografia = _links_after(manga_soup, DEMOGRAPHIC_LABEL) or _text_after(
        manga_soup, DEMOGRAPHIC_LABEL
    )

    return {
        "votos": votos,
        "ranked": _text_after(manga_soup, "Ranked:").replace("#", ""),
        "popularity": _text_after(manga_soup, "Popularity:").replace("#", ""),
        "members": _text_after(manga_soup, "Members:"),
        "status": _text_after(manga_soup, "Status:"),
        "demografia": demografia,
        "autores": _links_after(manga_soup, "Authors:"),
        "volumenes": _text_after(manga_soup, "Volumes:"),
        "capitulos": _text_after(manga_soup, "Chapters:"),
        "generos": _links_after(manga_soup, "Genres:"),
    }


for i in range(0, TOTAL_MANGA, PAGE_SIZE):
    page_number = i // PAGE_SIZE + 1
    print(f"Scrapeando página {page_number} (offset {i})...")
    url = f"{base_url}?limit={i}"
    response = _get_with_retries(url, f"página {page_number}")
    if response is None:
        # Skip this page instead of silently parsing an error document.
        continue
    soup = BeautifulSoup(response.text, "html.parser")
    rows = soup.find_all("tr", class_="ranking-list")
    if not rows:
        print(f"No se encontraron filas en la página {page_number}.")

    for row in rows:
        # Reset per row so the error handler never reports an unbound name or
        # the values left over from the previous row.
        ranking, title = "?", ""
        try:
            ranking = row.find("td", class_="rank").text.strip()
            title_tag = row.find("h3", class_="manga_h3").find("a")
            title = title_tag.text.strip()
            if not title:
                # Checked before the detail request so no fetch is wasted.
                print(f"Datos incompletos para ranking {ranking}, se omite.")
                continue
            manga_url = title_tag["href"]
            score = _normalize_score(row.find("td", class_="score").text.strip())

            # Throttle before every detail request, regardless of how the
            # previous row ended (success, failure or exception).
            time.sleep(random.uniform(1.0, 1.5))
            manga_resp = _get_with_retries(manga_url, title)
            if manga_resp is None:
                continue

            manga_soup = BeautifulSoup(manga_resp.text, "html.parser")
            record = {"ranking": ranking, "nombre": title, "score": score}
            record.update(_parse_detail(manga_soup))
        except Exception as e:
            # Deliberate best-effort: one malformed row must not abort the run.
            print(f"Error en manga #{ranking} ({title or 'desconocido'}): {e}")
            continue

        # Append only once the record is complete, keeping all columns aligned.
        for column, value in record.items():
            manga_data[column].append(value)

Scrapeando página 1 (offset 0)...
Scrapeando página 2 (offset 50)...
Scrapeando página 3 (offset 100)...
Scrapeando página 4 (offset 150)...
Scrapeando página 5 (offset 200)...
Scrapeando página 6 (offset 250)...
Scrapeando página 7 (offset 300)...
Scrapeando página 8 (offset 350)...
Scrapeando página 9 (offset 400)...
Scrapeando página 10 (offset 450)...


In [18]:
# Build the final table and export it to CSV.
df_manga = pd.DataFrame(manga_data)

# Keep the rank that was actually scraped. Renumbering rows 1..N would silently
# shift every rank after a manga that was skipped during scraping.
scraped_rank = pd.to_numeric(df_manga["ranking"], errors="coerce")
if scraped_rank.notna().all():
    df_manga["ranking"] = scraped_rank.astype(int)
else:
    # At least one scraped rank is not numeric: fall back to positional
    # numbering so the column stays a clean integer sequence.
    print("Ranking no numérico detectado; se renumera secuencialmente.")
    df_manga["ranking"] = range(1, len(df_manga) + 1)

df_manga.to_csv("myanimelist_top500_manga.csv", index=False, encoding="utf-8")