In [2]:
import os
import json
import requests
from bs4 import BeautifulSoup

def scrape_manga(base_url="https://mangapark.io/latest/action", output_file="manga_data.json", max_pages=55, timeout=30):
    """Scrape paginated manga listings and merge new titles into a JSON file.

    Pages ``{base_url}/1`` .. ``{base_url}/{max_pages}`` are fetched in order.
    Scraping stops early on a failed request, a non-200 response, or a page
    with no manga cards. Whatever was collected up to that point is always
    written to ``output_file``.

    Args:
        base_url: Listing URL; the page number is appended as a path segment.
        output_file: JSON file mapping title -> details. Loaded first (if it
            exists) so titles already present are skipped, then rewritten.
        max_pages: Highest page number to request.
        timeout: Seconds to wait for each HTTP response before giving up.

    Returns:
        None. Results are persisted to ``output_file``.
    """
    existing_data = {}

    if os.path.exists(output_file):
        with open(output_file, "r", encoding="utf-8") as file:
            existing_data = json.load(file)

    for page_number in range(1, max_pages + 1):
        url = f"{base_url}/{page_number}"
        print(f"Scraping page: {url}")

        # A timeout keeps a stalled connection from hanging the run forever, and
        # catching the request error lets us still save what was scraped so far.
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException as exc:
            print(f"Request for page {page_number} failed ({exc}). Stopping.")
            break

        if response.status_code != 200:
            print(f"Reached an invalid page or failed request at page {page_number}. Stopping.")
            break

        soup = BeautifulSoup(response.content, "html.parser")

        manga_list = soup.select("div.flex.border-b.border-b-base-200.pb-3")
        if not manga_list:
            print("No more manga found. Ending scraping.")
            break

        for manga in manga_list:
            title_element = manga.select_one("h3.font-bold span")
            link_element = manga.select_one("h3.font-bold a")
            # select_one returns None when the markup differs; skip the card
            # instead of crashing and losing every entry gathered so far.
            if title_element is None or link_element is None or not link_element.get("href"):
                print("Skipping an entry without a title or details link.")
                continue

            title = title_element.text.strip()
            details_url = "https://mangapark.io" + link_element["href"]

            image_element = manga.select_one("div.group.relative img")
            image_url = image_element.get("src", "NA") if image_element else "NA"

            rating_element = manga.select_one("span.flex.items-center.text-yellow-500 span.font-bold")
            rating = rating_element.text.strip() if rating_element else "NA"

            genres = [genre.text.strip() for genre in manga.select("div.flex.flex-wrap.text-xs span span")]

            if title in existing_data:
                print(f"{title} already exists. Skipping.")
            else:
                existing_data[title] = {
                    "image_url": image_url,
                    "details_url": details_url,
                    "genres": genres,
                    "rating": rating
                }
                print(f"Added {title}")

        print(f"Finished scraping page {page_number}")

    with open(output_file, "w", encoding="utf-8") as file:
        json.dump(existing_data, file, indent=4)
    print(f"Data saved to {output_file}")

# Run the manga scrape with the function's default arguments.
scrape_manga()


Scraping page: https://mangapark.io/latest/action/1
Gomi Ika da to Tsuihousareta Shounin, Jitsu wa Zensei Kensha desu already exists. Skipping.
Added Eternal Life (Novel)
Added Beastbox: The Mecha Era
Daxia Swordsman already exists. Skipping.
Apocalyptic Super Farm already exists. Skipping.
Added Silvyre
Virus King already exists. Skipping.
The Reincarnated Assassin Is A Genius Swordsman already exists. Skipping.
Catastrophic Priest already exists. Skipping.
Trait Hoarder already exists. Skipping.
Heavenly Inquisition Sword already exists. Skipping.
Drug Devourer already exists. Skipping.
Freezing The World: I Built A Doomsday Safehouse already exists. Skipping.
I Get Stronger Just By Lying Down While My Apprentice Cultivates already exists. Skipping.
The Best Female Fairy already exists. Skipping.
Global Martial Arts already exists. Skipping.
Catastrophic Priest already exists. Skipping.
I’ll Retire After Saving the World already exists. Skipping.
Finished scraping page 1
Scraping pag

In [14]:
import requests
from bs4 import BeautifulSoup
import json
import time
import os

def _fetch_video_url(episode_link, timeout=30):
    """Return the first iframe ``src`` on the episode page, or a placeholder string explaining why none was found."""
    try:
        episode_response = requests.get(episode_link, timeout=timeout)
        if episode_response.status_code != 200:
            return "Failed to retrieve video URL"
        episode_soup = BeautifulSoup(episode_response.content, 'html.parser')
        iframe_element = episode_soup.find('iframe')
        if iframe_element:
            return iframe_element.get('src')
        return "No video URL found"
    except Exception as e:
        # Best effort: record the failure in the entry rather than aborting the run.
        return f"Error occurred: {str(e)}"


def scrape_and_update_anime_data(base_url, output_file, timeout=30):
    """Scrape paginated anime listings and append unseen titles to a JSON file.

    Pages ``base_url + "1"``, ``base_url + "2"``, ... are fetched until a
    request fails, a non-200 status is returned, or a page has no
    ``<article class="bs">`` entries. For every title not yet known, the
    episode page is fetched to look up the first ``<iframe>`` source.

    Args:
        base_url: URL prefix to which the page number is appended.
        output_file: JSON file holding a list of entries with ``title``,
            ``episode_link``, ``image_url`` and ``video_url`` keys. Read if
            present, then rewritten with the old entries plus the new ones.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        None. Results are persisted to ``output_file``.
    """
    if os.path.exists(output_file):
        with open(output_file, 'r', encoding='utf-8') as file:
            existing_data = json.load(file)
    else:
        existing_data = []

    existing_titles = {anime['title'] for anime in existing_data}

    new_anime_data = []
    page_num = 1
    while True:
        page_url = base_url + str(page_num)
        try:
            response = requests.get(page_url, timeout=timeout)
        except requests.RequestException as e:
            # Stop paging but fall through to the save so new entries are kept.
            print(f"Request for page {page_num} failed: {e}. Stopping.")
            break

        if response.status_code != 200:
            print(f"Failed to retrieve page {page_num}. Status code: {response.status_code}")
            break

        soup = BeautifulSoup(response.content, 'html.parser')
        anime_list = soup.find_all('article', class_='bs')

        if not anime_list:
            print(f"No anime found on page {page_num}. Stopping.")
            break

        for anime in anime_list:
            link = anime.find('a')
            title = link.get('title') if link else None
            episode_link = link.get('href') if link else None
            if not title or not episode_link:
                # Malformed card: nothing usable to store.
                continue

            if title in existing_titles:
                continue
            # Remember the title immediately so the same entry appearing again
            # later in this run (same or another page) is not stored twice.
            existing_titles.add(title)

            image = anime.find('img')
            image_url = image.get('src') if image else None

            new_anime_data.append({
                'title': title,
                'episode_link': episode_link,
                'image_url': image_url,
                'video_url': _fetch_video_url(episode_link, timeout)
            })

            # Throttle episode-page requests.
            time.sleep(1)

        print(f"Successfully scraped page {page_num}")
        page_num += 1

    combined_data = existing_data + new_anime_data

    with open(output_file, 'w', encoding='utf-8') as json_file:
        json.dump(combined_data, json_file, ensure_ascii=False, indent=4)

    print(f"Data successfully written to {output_file}")
    print(f"New entries added: {len(new_anime_data)}")


# Listing URL prefix and output path for the anime scrape, then run it.
base_url, output_file = 'https://chia-anime.su/page/', 'all_anime_data.json'

scrape_and_update_anime_data(base_url=base_url, output_file=output_file)


Successfully scraped page 1
Successfully scraped page 2
Successfully scraped page 3
Successfully scraped page 4
Successfully scraped page 5
Successfully scraped page 6
Successfully scraped page 7
Successfully scraped page 8
Successfully scraped page 9
Successfully scraped page 10
Successfully scraped page 11
Successfully scraped page 12
Successfully scraped page 13
Successfully scraped page 14
Successfully scraped page 15
Successfully scraped page 16
Successfully scraped page 17
Successfully scraped page 18
Successfully scraped page 19
Successfully scraped page 20
Successfully scraped page 21
Successfully scraped page 22
Successfully scraped page 23
Successfully scraped page 24
Successfully scraped page 25
Successfully scraped page 26
Successfully scraped page 27
Successfully scraped page 28
Failed to retrieve page 29. Status code: 404
Data successfully written to all_anime_data.json
New entries added: 0


In [None]:
import os
import json
import requests
from bs4 import BeautifulSoup
from selenium import webdriver
import time

# Build one shared headless Chrome session; `driver` is read as a global by
# update_chapters_and_images_immediate.
options = webdriver.ChromeOptions()
for chrome_flag in ('--headless', '--no-sandbox', '--disable-dev-shm-usage'):
    options.add_argument(chrome_flag)
driver = webdriver.Chrome(options=options)

def _save_manga_data(manga_data, manga_file):
    """Write the full manga mapping back to ``manga_file`` as indented JSON."""
    with open(manga_file, "w", encoding="utf-8") as file:
        json.dump(manga_data, file, indent=4)


def update_chapters_and_images_immediate(manga_file="manga_data.json", timeout=30):
    """Add newly listed chapters and missing chapter images to the manga file.

    For each manga in ``manga_file`` the details page is fetched with
    ``requests`` and any chapter link whose title is not yet stored is
    appended. Every stored chapter without images is then opened in the
    module-level selenium ``driver`` and its image URLs are collected. The
    file is saved right after new chapters are added and after each chapter
    whose images were scraped, so an interrupted run keeps its progress.

    Args:
        manga_file: JSON file mapping title -> details; each details dict
            must contain ``details_url`` and may contain ``chapters``.
        timeout: Seconds to wait for each details-page HTTP response.

    Returns:
        None. Prints a message and returns early if ``manga_file`` is missing.
    """
    # NOTE(review): relies on the global `driver` still being alive; the cell
    # that defines this function quits the driver right after its first call.
    if not os.path.exists(manga_file):
        print(f"File {manga_file} does not exist. Please scrape manga data first.")
        return

    with open(manga_file, "r", encoding="utf-8") as file:
        manga_data = json.load(file)

    for title, details in manga_data.items():
        print(f"Checking updates for {title}...")

        try:
            response = requests.get(details["details_url"], timeout=timeout)
        except requests.RequestException as exc:
            print(f"Failed to fetch details for {title}: {exc}. Skipping.")
            continue
        if response.status_code != 200:
            print(f"Failed to fetch details for {title}. Skipping.")
            continue

        soup = BeautifulSoup(response.content, "html.parser")
        chapter_elements = soup.select("a.link-hover.link-primary.visited\\:text-accent")

        # setdefault guarantees the key exists even when nothing is stored yet
        # and the page lists no chapters (previously a KeyError further down).
        chapters = details.setdefault("chapters", [])
        known_titles = {ch["title"] for ch in chapters}

        new_chapters = []
        for element in chapter_elements:
            chapter_title = element.text.strip()
            href = element.get("href")
            if not href or chapter_title in known_titles:
                continue
            known_titles.add(chapter_title)
            new_chapters.append({"title": chapter_title, "url": "https://mangapark.io" + href})

        if new_chapters:
            print(f"Adding {len(new_chapters)} new chapters for {title}.")
            chapters.extend(new_chapters)
            _save_manga_data(manga_data, manga_file)

        for chapter in chapters:
            if chapter.get("images"):
                # Images already stored: nothing to scrape, nothing to rewrite.
                continue

            print(f"Scraping images for {chapter['title']}...")
            try:
                driver.get(chapter["url"])
                # Fixed pause to give the page's scripts time to insert the images.
                time.sleep(5)
                chapter_soup = BeautifulSoup(driver.page_source, "html.parser")
                image_elements = chapter_soup.select("div[data-name='image-show'] img")
                chapter["images"] = [img["src"] for img in image_elements if img.get("src")]
                print(f"Updated images for {chapter['title']}: {len(chapter['images'])} images found.")
            except Exception as e:
                print(f"An error occurred while scraping images for {chapter['title']}: {e}")
                continue

            _save_manga_data(manga_data, manga_file)
            print(f"Chapter {chapter['title']} updated successfully.")

    print("All manga data updated.")

try:
    update_chapters_and_images_immediate()
finally:
    # Always shut the browser down, even if the update raises, so no headless
    # Chrome process is left running.
    driver.quit()


In [12]:
import schedule
import time
from datetime import datetime, timedelta
from pytz import timezone
from concurrent.futures import ThreadPoolExecutor, wait, ALL_COMPLETED

def run_scrape_anime():
    """Scrape the chia-anime listing pages into ``all_anime_data.json``."""
    scrape_and_update_anime_data(
        base_url='https://chia-anime.su/page/',
        output_file='all_anime_data.json',
    )

def run_scrape_manga():
    """Run ``scrape_manga`` with its default arguments."""
    scrape_manga()

def run_update_chapters_and_images():
    """Run ``update_chapters_and_images_immediate`` with its default arguments."""
    update_chapters_and_images_immediate()

def run_all_tasks():
    """Run both scrapers concurrently, then update chapters and images.

    The anime and manga scrapers run in separate worker threads. Once both
    have finished, any exception raised inside a worker is reported, and the
    chapter/image update then runs in the calling thread regardless.
    """
    with ThreadPoolExecutor() as executor:
        futures = {
            "anime": executor.submit(run_scrape_anime),
            "manga": executor.submit(run_scrape_manga),
        }
        wait(list(futures.values()), return_when=ALL_COMPLETED)

    # wait() never re-raises; without this check a crashed scraper would go
    # completely unnoticed.
    for task_name, future in futures.items():
        error = future.exception()
        if error is not None:
            print(f"Scraping task '{task_name}' failed: {error!r}")

    print("Both scraping tasks completed. Running update_chapters_and_images_immediate...")
    run_update_chapters_and_images()
    print("Update process completed.")

def time_until_12_am_ist():
    """Return the time remaining until the next midnight in Asia/Kolkata.

    Returns:
        A ``timedelta`` up to the start of the next calendar day, or ``None``
        when the current IST time is within the 00:00:00 second.
    """
    current = datetime.now(timezone('Asia/Kolkata'))

    if (current.hour, current.minute, current.second) == (0, 0, 0):
        return None

    tomorrow = current + timedelta(days=1)
    upcoming_midnight = tomorrow.replace(hour=0, minute=0, second=0, microsecond=0)
    return upcoming_midnight - current

if __name__ == "__main__":
    time_left = time_until_12_am_ist()

    if time_left is not None:
        print(
            f"Time left until 12 AM IST: "
            f"{time_left.seconds // 3600} hours, "
            f"{(time_left.seconds % 3600) // 60} minutes, "
            f"{time_left.seconds % 60} seconds, "
            f"{time_left.microseconds // 1000} milliseconds."
        )
    else:
        print("It's 12 AM IST. Running the process...")
        run_all_tasks()

    # Register the daily job unconditionally. It used to be registered only
    # when this cell ran in the exact 00:00:00 second, so in practice the
    # scheduler never started. The explicit timezone makes "00:00" mean
    # midnight IST rather than midnight in the machine's local time.
    schedule.every().day.at("00:00", "Asia/Kolkata").do(run_all_tasks)

    # Blocks until interrupted, polling once per second for due jobs.
    while True:
        schedule.run_pending()
        time.sleep(1)


Time left until 12 AM IST: 0 hours, 47 minutes, 39 seconds, 245 milliseconds.


In [5]:
!pip install schedule

Collecting schedule
  Downloading schedule-1.2.2-py3-none-any.whl.metadata (3.8 kB)
Downloading schedule-1.2.2-py3-none-any.whl (12 kB)
Installing collected packages: schedule
Successfully installed schedule-1.2.2


In [13]:
!pip install selenium

Collecting selenium
  Downloading selenium-4.27.1-py3-none-any.whl.metadata (7.1 kB)
Collecting trio~=0.17 (from selenium)
  Downloading trio-0.27.0-py3-none-any.whl.metadata (8.6 kB)
Collecting trio-websocket~=0.9 (from selenium)
  Downloading trio_websocket-0.11.1-py3-none-any.whl.metadata (4.7 kB)
Collecting sortedcontainers (from trio~=0.17->selenium)
  Downloading sortedcontainers-2.4.0-py2.py3-none-any.whl.metadata (10 kB)
Collecting outcome (from trio~=0.17->selenium)
  Downloading outcome-1.3.0.post0-py2.py3-none-any.whl.metadata (2.6 kB)
Collecting wsproto>=0.14 (from trio-websocket~=0.9->selenium)
  Downloading wsproto-1.2.0-py3-none-any.whl.metadata (5.6 kB)
Downloading selenium-4.27.1-py3-none-any.whl (9.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.7/9.7 MB[0m [31m74.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading trio-0.27.0-py3-none-any.whl (481 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m481.7/481.7 kB[0m [31m33.2 MB/s