In [1]:
import re
import requests
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup as bs

In [34]:
def anime_season(month: str) -> str:
    """
    This function converts a given month (as a string) into its corresponding season.

    Parameters:
    - month (str): A string representing the month in the format 'MM'. The valid values are '01' to '12'.

    Returns:
    - str: A string representing the season. The possible values are 'Winter', 'Spring', 'Summer', 'Fall', or 'Unspecified' if the input month is not within the range of 1 to 12.
    """
    month_num = int(month)
    seasons = ["Winter", "Spring", "Summer", "Fall"]
    return seasons[(month_num - 1) // 3] if 1 <= month_num <= 12 else "Unspecified"

In [48]:
def safe_text(element, default='N/A'):
    """
    This function attempts to extract the text content from a given BeautifulSoup element.
    
    Parameters:
    - element (bs4.element.Tag): A BeautifulSoup element representing the HTML element containing the text to be parsed.
    - default (str): A default value to return if the element is not found or if the text cannot be parsed.
    
    Returns:
    - str: The text content of the element, or the default value if the element is not found or if the text cannot be parsed.
    """
    return element.text.strip() if element else default

def safe_int(element, default='N/A'):
    """
    This function attempts to extract an integer value from the text of an element.
    
    Parameters:
    - element (bs4.element.Tag): A BeautifulSoup element representing the HTML element containing the text to be parsed.
    - default (str): A default value to return if the element is not found or if the text cannot be parsed as an integer.
    
    Returns:
    - int or str: An integer value extracted from the text of the element, or the default value if the text cannot be parsed as an integer.
    """
    try:
        return int(element.text.strip().replace(',', '')) if element else default
    except ValueError:
        return default

def safe_float(element, default='N/A'):
    """
    This function attempts to extract a float value from the text of an element.
    
    Parameters:
    - element (bs4.element.Tag): A BeautifulSoup element representing the HTML element containing the text to be parsed.
    - default (str): A default value to return if the element is not found or if the text cannot be parsed as a float.
    
    Returns:
    - float or str: A float value extracted from the text of the element, or the default value if the text cannot be parsed as a float.
    """
    try:
        return float(element.text.strip()) if element else default
    except ValueError:
        return default

def scrape_anime_data(anime_item) -> dict[str, str]:
    """
    Extract data from the HTML content of an anime item.
    
    Parameters:
    - anime_item (BeautifulSoup): A BeautifulSoup object containing the HTML of an anime item.
    
    Returns:
    - dict: A dictionary with the number of episodes and the release year.
    """
    soup = bs(anime_item, 'html.parser')
    
    start_date_text = soup.find('span', class_='item')
    release_year = start_date_text.text.strip().split(', ')[-1] if start_date_text else 'N/A'
    
    info_div = soup.find('div', class_='info')
    if info_div:
        eps_text = info_div.get_text()
        match = re.search(r'(\d+)\s*eps', eps_text)
        number_of_episodes = match.group(1) if match else 'N/A'
    else:
        number_of_episodes = 'N/A'
    
    status_span = soup.find('span', class_='item finished') or soup.find('span', class_='item airing')
    status = status_span.text.strip() if status_span else 'N/A'
    
    genres_div = soup.find('div', class_='genres-inner js-genre-inner')
    genres = ', '.join([genre.find('a').text.strip() for genre in genres_div.find_all('span', class_='genre')]) if genres_div else 'N/A'
    
    properties_div = soup.find('div', class_='properties')
    
    def extract_property(caption):
        """Helper function to extract property values based on the caption"""
        if not properties_div:
            return 'N/A'
        property_divs = properties_div.find_all('div', class_='property')
        for div in property_divs:
            caption_span = div.find('span', class_='caption')
            if caption_span and caption_span.text.strip() == caption:
                item_spans = div.find_all('span', class_='item')
                return ', '.join(item.get_text(strip=True) for item in item_spans) if item_spans else 'N/A'
        return 'N/A'
    
    studio = extract_property('Studio')
    source = extract_property('Source')
    demographic = extract_property('Demographic')
    
    # Extract themes using regex pattern for URLs
    themes = 'N/A'
    if properties_div:
        themes_div = properties_div.find('div', class_='property')
        if themes_div:
            themes_html = str(themes_div)
            theme_matches = re.findall(
                r'<span class="item"><a href="/anime/genre/\d+/[^"]*" title="[^"]*">([^<]*)</a></span>', themes_html)
            themes = ', '.join(theme_matches) if theme_matches else 'N/A'
    
    # Extract the rating
    rating = safe_float(soup.find('div', class_='scormem-item score score-label score-8'), 'N/A')
    
    # Extract the voter count
    voters = safe_int(soup.find('div', class_='scormem-item member'), 'N/A')
    
    # Extract synopsis
    synopsis = safe_text(soup.find('div', class_='synopsis js-synopsis').find('p', class_='preline'), 'N/A')
    
    return {
        'Episodes': number_of_episodes,
        'Release Year': release_year,
        'Status': status,
        'Genres': genres,
        'Studio': studio,
        'Source': source,
        'Demographic': demographic,
        'Themes': themes,
        'Synopsis': synopsis,
        'Voters': voters,
        'Rating': rating,
    }

In [49]:
def fetch_and_scrape(url: str, page_limit: int) -> list[dict[str, str]]:
    """Fetch and scrape anime data from the given URL."""
    all_data = []
    for page_num in range(1, page_limit + 1):
        page_url = f'{url}?page={page_num}'
        print(f'Scraping {page_url}...')
        
        try:
            response = requests.get(page_url)
            response.raise_for_status()  # Raise an error for bad responses
            soup = bs(response.text, 'html.parser')

            anime_list = soup.find_all('div', class_='anime-item')  # Update with the actual class for anime items
            for anime_item in anime_list:
                anime_data = scrape_anime_data(anime_item)
                all_data.append(anime_data)
                
        except Exception as e:
            print(f'Error fetching {page_url}: {e}')
            break

    return all_data

In [None]:
def modeler(date: str, data: list[dict[str, str]]) -> None:
    """
    Processes and saves anime data to a CSV file.

    -----
    Parameters:
    - date (str): The date string used to name the CSV file.
    - data (List[Dict[str, str]]): A list of dictionaries containing anime data.

    -----
    The function converts the list of dictionaries to a DataFrame, removes duplicate entries,
    and saves the DataFrame to a CSV file in the 'data/processed' directory.
    """
    df = pd.DataFrame(data)
    df.drop_duplicates(inplace=True)
    file_path = 'data/raw'    
    df.to_csv(f'../../{file_path}/AnimeData_{date}.csv', index=False)
    print(f'Data saved to {file_path}/AnimeData_{date}.csv')

In [55]:
date = datetime.now().strftime('%d%m%y')

# EXTRACT AND TRANSFORM
url_list = [   
    'https://myanimelist.net/anime/genre/1/',  # Action
    'https://myanimelist.net/anime/genre/2/',  # Adventure
    'https://myanimelist.net/anime/genre/5/',  # Avant Garde
    'https://myanimelist.net/anime/genre/4/',  # Comedy
    'https://myanimelist.net/anime/genre/8/',  # Drama
    'https://myanimelist.net/anime/genre/10/', # Fantasy
    'https://myanimelist.net/anime/genre/47/', # Gourmet
    'https://myanimelist.net/anime/genre/14/', # Horror
    'https://myanimelist.net/anime/genre/7/',  # Mystery
    'https://myanimelist.net/anime/genre/22/', # Romance
    'https://myanimelist.net/anime/genre/24/', # Sci-fi
    'https://myanimelist.net/anime/genre/36/', # Slice-of-life
    'https://myanimelist.net/anime/genre/30/', # Sport
    'https://myanimelist.net/anime/genre/37/', # Supernatural
    'https://myanimelist.net/anime/genre/41/'  # Suspense
]

if __name__ == '__main__':
    all_data = []
    for url in url_list:
        all_data.extend(fetch_and_scrape_new(url, 1))
        
    modeler(date, all_data)

Scraping https://myanimelist.net/anime/genre/1/?page=1...
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html class="appearance-none" lang="en">
 <head>
  <link crossorigin="anonymous" href="//www.googletagmanager.com/" rel="preconnect"/>
  <link crossorigin="anonymous" href="https://cdn.myanimelist.net" rel="preconnect"/>
  <title>
   Action - Anime - MyAnimeList.net
  </title>
  <meta content="Trying to find Action anime? Discover more Action anime on MyAnimeList, the largest online anime and manga database in the world!" name="description"/>
  <meta content="anime, myanimelist, anime news, manga" name="keywords"/>
  <link href="https://myanimelist.net/anime/genre/1/Action" rel="canonical"/>
  <meta content="en_US" property="og:locale"/>
  <meta content="360769957454434" property="fb:app_id"/>
  <meta content="MyAnimeList.net" property="og:site_name"/>
  <meta content="summary" name="twitter:card"/>
  <meta c

KeyboardInterrupt: 

# Testing

# correct ones
- status_span = soup.find('span', class_='item finished') or soup.find('span', class_='item airing')
status = status_span.text.strip() if status_span else 'N/A'

-  info_div = soup.find('div', class_='info')
    if info_div:
        eps_text = info_div.get_text()
        match = re.search(r'(\d+)\s*eps', eps_text)
        number_of_episodes = match.group(1) if match else 'N/A'
    else:
        number_of_episodes = 'N/A'

-   start_date_text = soup.find('span', class_='item')
    release_year = start_date_text.text.strip().split(', ')[-1] if start_date_text else 'N/A'

In [17]:
html_content = '''
<div class="prodsrc">
      <div class="video"><a href="https://myanimelist.net/anime/16498/Shingeki_no_Kyojin/video" class="ga-click" title="Watch Episode Video"><i class="malicon malicon-movie-episode"></i></a>      </div>
      <div class="info"><span class="item">TV, 2013</span><span class="item finished">Finished</span><span class="item">
          <span>25 eps</span>,
          <span>24 min </span>
        </span>
      </div>
      <div class="broadcast"><a href="javascript:void(0)" onclick="return false;" class="js-broadcast-button ga-click" data-title="Shingeki no Kyojin" data-subtitle="Attack on Titan" data-raw="{&quot;data&quot;:[{&quot;platform&quot;:{&quot;id&quot;:1,&quot;name&quot;:&quot;Crunchyroll&quot;,&quot;icon&quot;:&quot;crunchyroll&quot;,&quot;type&quot;:1},&quot;available&quot;:false,&quot;url&quot;:&quot;http:\/\/www.crunchyroll.com\/series-280312&quot;},{&quot;platform&quot;:{&quot;id&quot;:2,&quot;name&quot;:&quot;Netflix&quot;,&quot;icon&quot;:&quot;netflix&quot;,&quot;type&quot;:1},&quot;available&quot;:false,&quot;url&quot;:&quot;https:\/\/www.netflix.com\/title\/70299043&quot;},{&quot;platform&quot;:{&quot;id&quot;:28,&quot;name&quot;:&quot;Shahid&quot;,&quot;icon&quot;:&quot;shahid&quot;,&quot;type&quot;:2},&quot;available&quot;:false,&quot;url&quot;:&quot;https:\/\/shahid.mbc.net\/en\/series\/Attack-On-Titan\/series-922170&quot;}],&quot;count&quot;:{&quot;available&quot;:0,&quot;typicals&quot;:2,&quot;others&quot;:1,&quot;total&quot;:3}}" data-ga-click-type="broadcast-tile-streaming-icon" data-ga-click-param="aid:16498"><i class="malicon malicon-streaming-slash"></i></a></div>
    </div>
    
<div class="genres-inner js-genre-inner"><span class="genre">
        <a href="/anime/genre/1/Action" title="Action">Action</a>
      </span><span class="genre">
        <a href="/anime/genre/46/Award_Winning" title="Award Winning">Award Winning</a>
      </span><span class="genre">
        <a href="/anime/genre/8/Drama" title="Drama">Drama</a>
      </span><span class="genre">
        <a href="/anime/genre/41/Suspense" title="Suspense">Suspense</a>
      </span></div>
      
<div class="properties">
      <div class="property">
        <span class="caption">Studio</span><span class="item"><a href="/anime/producer/858/Wit_Studio" title="Wit Studio">Wit Studio</a></span></div>
      <div class="property">
        <span class="caption">Source</span><span class="item">Manga</span>
      </div><div class="property">
        <span class="caption">Themes</span><span class="item"><a href="/anime/genre/58/Gore" title="Gore">Gore</a></span><span class="item"><a href="/anime/genre/38/Military" title="Military">Military</a></span><span class="item"><a href="/anime/genre/76/Survival" title="Survival">Survival</a></span></div><div class="property">
        <span class="caption">Demographic</span><span class="item"><a href="/anime/genre/27/Shounen" title="Shounen">Shounen</a></span></div></div>
        
<div class="synopsis js-synopsis">
    <p class="preline">Centuries ago, mankind was slaughtered to near extinction by monstrous humanoid creatures called Titans, forcing humans to hide in fear behind enormous concentric walls. What makes these giants truly terrifying is that their taste for human flesh is not born out of hunger but what appears to be out of pleasure. To ensure their survival, the remnants of humanity began living within defensive barriers, resulting in one hundred years without a single titan encounter. However, that fragile calm is soon shattered when a colossal Titan manages to breach the supposedly impregnable outer wall, reigniting the fight for survival against the man-eating abominations.

After witnessing a horrific personal loss at the hands of the invading creatures, Eren Yeager dedicates his life to their eradication by enlisting into the Survey Corps, an elite military unit that combats the merciless humanoids outside the protection of the walls. Eren, his adopted sister Mikasa Ackerman, and his childhood friend Armin Arlert join the brutal war against the Titans and race to discover a way of defeating them before the last walls are breached.

[Written by MAL Rewrite]</p>
    <button class="js-toggle-text toggle-text" style="display: block; margin: 0 auto; background: none; border: none;">
      <i class="fa-solid fa-angle-down" style="pointer-events: none;"></i>
    </button>

    <div class="properties">
      <div class="property">
        <span class="caption">Studio</span><span class="item"><a href="/anime/producer/858/Wit_Studio" title="Wit Studio">Wit Studio</a></span></div>
      <div class="property">
        <span class="caption">Source</span><span class="item">Manga</span>
      </div><div class="property">
        <span class="caption">Themes</span><span class="item"><a href="/anime/genre/58/Gore" title="Gore">Gore</a></span><span class="item"><a href="/anime/genre/38/Military" title="Military">Military</a></span><span class="item"><a href="/anime/genre/76/Survival" title="Survival">Survival</a></span></div><div class="property">
        <span class="caption">Demographic</span><span class="item"><a href="/anime/genre/27/Shounen" title="Shounen">Shounen</a></span></div></div>
  </div>
'''

In [52]:
scrape_anime_data(html_content)

{'Episodes': '25',
 'Release Year': '2013',
 'Status': 'Finished',
 'Genres': 'Action, Award Winning, Drama, Suspense',
 'Studio': 'Wit Studio',
 'Source': 'Manga',
 'Demographic': 'Shounen',
 'Themes': 'N/A',
 'Synopsis': 'Centuries ago, mankind was slaughtered to near extinction by monstrous humanoid creatures called Titans, forcing humans to hide in fear behind enormous concentric walls. What makes these giants truly terrifying is that their taste for human flesh is not born out of hunger but what appears to be out of pleasure. To ensure their survival, the remnants of humanity began living within defensive barriers, resulting in one hundred years without a single titan encounter. However, that fragile calm is soon shattered when a colossal Titan manages to breach the supposedly impregnable outer wall, reigniting the fight for survival against the man-eating abominations.\n\nAfter witnessing a horrific personal loss at the hands of the invading creatures, Eren Yeager dedicates his l

In [6]:
def fetch_and_scrape_new(url: str, page_limit: int) -> list[dict[str, str]]:
    all_data = []
    for page_num in range(1, page_limit + 1):
        page_url = f'{url}?page={page_num}'
        print(f'Scraping {page_url}...')

        try:
            response = requests.get(page_url)
            response.raise_for_status()
            soup = bs(response.text, 'html.parser')
            
            # Print raw HTML to inspect structure
            print(soup.prettify()[:1000])  # Print first 1000 characters for inspection
            
            anime_list = soup.find_all('div', class_='anime-item')  # Update with the actual class for anime items
            print(f'Found {len(anime_list)} anime items on page {page_num}')  # Debug print

            for anime_item in anime_list:
                anime_data = scrape_anime_data(str(anime_item))  # Convert to string
                all_data.append(anime_data)
                
                # Debug print to check scraped data
                print(anime_data)

        except Exception as e:
            print(f'Error fetching {page_url}: {e}')
            break

    return all_data

In [13]:
import re
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup as bs
import time
from typing import List, Dict

def get_current_date() -> str:
    """
    Get the current date formatted as 'DDMMYY'.

    Returns:
    - str: Current date formatted as 'DDMMYY'.
    """
    return datetime.now().strftime('%d%m%y')

def anime_season(month: str) -> str:
    """
    Converts a given month (as a string) into its corresponding season.

    Parameters:
    - month (str): A string representing the month in the format 'MM'. The valid values are '01' to '12'.

    Returns:
    - str: A string representing the season.
    """
    month_num = int(month)
    seasons = ["Winter", "Spring", "Summer", "Fall"]
    return seasons[(month_num - 1) // 3] if 1 <= month_num <= 12 else "Unspecified"

def safe_text(element, default='N/A') -> str:
    """
    Extract the text content from a given BeautifulSoup element.

    Parameters:
    - element (bs4.element.Tag): A BeautifulSoup element containing the text to be parsed.
    - default (str): A default value to return if the element is not found or the text cannot be parsed.

    Returns:
    - str: The text content of the element, or the default value.
    """
    return element.text.strip() if element else default

def safe_int(element, default='N/A') -> int:
    """
    Extract an integer value from the text of an element.

    Parameters:
    - element (bs4.element.Tag): A BeautifulSoup element containing the text to be parsed.
    - default (str): A default value to return if the element is not found or the text cannot be parsed as an integer.

    Returns:
    - int or str: An integer value extracted from the text of the element, or the default value.
    """
    try:
        return int(element.text.strip().replace(',', '')) if element else default
    except ValueError:
        return default

def safe_float(element, default='N/A') -> float:
    """
    Extract a float value from the text of an element.

    Parameters:
    - element (bs4.element.Tag): A BeautifulSoup element containing the text to be parsed.
    - default (str): A default value to return if the element is not found or the text cannot be parsed as a float.

    Returns:
    - float or str: A float value extracted from the text of the element, or the default value.
    """
    try:
        return float(element.text.strip()) if element else default
    except ValueError:
        return default

def scrape_anime_data(anime_item) -> Dict[str, str]:
    """
    Extract data from the HTML content of an anime item.

    Parameters:
    - anime_item (BeautifulSoup): A BeautifulSoup object containing the HTML of an anime item.

    Returns:
    - dict: A dictionary with the number of episodes and the release year.
    """
    soup = bs(anime_item, 'html.parser')
    
    start_date_text = soup.find('span', class_='item')
    release_year = start_date_text.text.strip().split(', ')[-1] if start_date_text else 'N/A'
    
    info_div = soup.find('div', class_='info')
    if info_div:
        eps_text = info_div.get_text()
        match = re.search(r'(\d+)\s*eps', eps_text)
        number_of_episodes = match.group(1) if match else 'N/A'
    else:
        number_of_episodes = 'N/A'
    
    status_span = soup.find('span', class_='item finished') or soup.find('span', class_='item airing')
    status = status_span.text.strip() if status_span else 'N/A'
    
    genres_div = soup.find('div', class_='genres-inner js-genre-inner')
    genres = ', '.join([genre.find('a').text.strip() for genre in genres_div.find_all('span', class_='genre')]) if genres_div else 'N/A'
    
    properties_div = soup.find('div', class_='properties')
    
    def extract_property(caption):
        """Helper function to extract property values based on the caption"""
        if not properties_div:
            return 'N/A'
        property_divs = properties_div.find_all('div', class_='property')
        for div in property_divs:
            caption_span = div.find('span', class_='caption')
            if caption_span and caption_span.text.strip() == caption:
                item_spans = div.find_all('span', class_='item')
                return ', '.join(item.get_text(strip=True) for item in item_spans) if item_spans else 'N/A'
        return 'N/A'
    
    studio = extract_property('Studio')
    source = extract_property('Source')
    demographic = extract_property('Demographic')
    
    # Extract themes using regex pattern for URLs
    themes = 'N/A'
    if properties_div:
        themes_div = properties_div.find('div', class_='property')
        if themes_div:
            themes_html = str(themes_div)
            theme_matches = re.findall(
                r'<span class="item"><a href="/anime/genre/\d+/[^"]*" title="[^"]*">([^<]*)</a></span>', themes_html)
            themes = ', '.join(theme_matches) if theme_matches else 'N/A'
    
    # Extract the rating
    rating = safe_float(soup.find('div', class_='scormem-item score score-label score-8'), 'N/A')
    
    # Extract the voter count
    voters = safe_int(soup.find('div', class_='scormem-item member'), 'N/A')
    
    # Extract synopsis
    synopsis = safe_text(soup.find('div', class_='synopsis js-synopsis').find('p', class_='preline'), 'N/A')
    
    return {
        'Episodes': number_of_episodes,
        'Release Year': release_year,
        'Status': status,
        'Genres': genres,
        'Studio': studio,
        'Source': source,
        'Demographic': demographic,
        'Themes': themes,
        'Synopsis': synopsis,
        'Voters': voters,
        'Rating': rating,
    }

def fetch_and_scrape(url: str, page_limit: int = 1, retries: int = 3, delay: int = 5):
    """
    Fetch and scrape anime data from a given URL.

    Parameters:
    - url (str): The URL to fetch and scrape.
    - page_limit (int): The number of pages to scrape.
    - retries (int): The number of retries in case of request failure.
    - delay (int): The delay between retries in seconds.

    Returns:
    - list: A list of dictionaries containing scraped anime data.
    """
    print("Scraping started...")
    all_data = []
    for page in range(1, page_limit + 1):
        page_url = f"{url}?page={page}"
        for attempt in range(retries):
            try:
                response = requests.get(page_url)
                response.raise_for_status()
                soup = bs(response.content, 'html.parser')
                anime_list = soup.find_all('div', class_='js-anime-category-producer')
                for anime_item in anime_list:
                    anime_data = scrape_anime_data(str(anime_item))
                    all_data.append(anime_data)
                break
            except requests.RequestException as e:
                print(f"Error fetching {page_url}: {e}. Retrying in {delay} seconds...")
                time.sleep(delay)
    return all_data

def modeler(date: str, data: List[Dict[str, str]]) -> None:
    """
    Processes and saves anime data to a CSV file.

    Parameters:
    - date (str): The date string used to name the CSV file.
    - data (List[Dict[str, str]]): A list of dictionaries containing anime data.

    The function converts the list of dictionaries to a DataFrame, removes duplicate entries,
    and saves the DataFrame to a CSV file in the 'data/raw' directory.
    """
    df = pd.DataFrame(data)
    df.drop_duplicates(inplace=True)
    file_path = 'data/raw'    
    df.to_csv(f'../../{file_path}/AnimeData_{date}.csv', index=False)
    print(f'Data saved to {file_path}/AnimeData_{date}.csv')

def main():
    url_list = [   
        'https://myanimelist.net/anime/genre/1/',  # Action
        # 'https://myanimelist.net/anime/genre/2/',  # Adventure
        # 'https://myanimelist.net/anime/genre/5/',  # Avant Garde
        # 'https://myanimelist.net/anime/genre/4/',  # Comedy
        # 'https://myanimelist.net/anime/genre/8/',  # Drama
        # 'https://myanimelist.net/anime/genre/10/', # Fantasy
        # 'https://myanimelist.net/anime/genre/47/', # Gourmet
        # 'https://myanimelist.net/anime/genre/14/', # Horror
        # 'https://myanimelist.net/anime/genre/7/',  # Mystery
        # 'https://myanimelist.net/anime/genre/22/', # Romance
        # 'https://myanimelist.net/anime/genre/24/', # Sci-Fi
        # 'https://myanimelist.net/anime/genre/36/', # Slice of Life
        # 'https://myanimelist.net/anime/genre/30/', # Sports
        # 'https://myanimelist.net/anime/genre/37/', # Supernatural
        # 'https://myanimelist.net/anime/genre/41/'  # Suspense
    ]
    
    current_date = get_current_date()
    all_anime_data = []

    for url in url_list:
        scraped_data = fetch_and_scrape(url, page_limit=5)
        all_anime_data.extend(scraped_data)
    
    modeler(current_date, all_anime_data)

if __name__ == '__main__':
    main()

Scraping started...
Data saved to data/raw/AnimeData_280724.csv


In [15]:
response = requests.get('https://myanimelist.net/anime/genre/1/')
response.raise_for_status()
soup = bs(response.content, 'html.parser')
anime_list = soup.find_all('div', class_='js-anime-category-producer')

In [20]:
anime_list

[<div class="js-anime-category-producer seasonal-anime js-seasonal-anime js-anime-type-all js-anime-type-1" data-broadcast-available="0" data-genre="1,46,8,41,58,38,76,27">
 <div>
 <div class="title"><div class="title-text">
 <h2 class="h2_anime_title"><a class="link-title" href="https://myanimelist.net/anime/16498/Shingeki_no_Kyojin">Shingeki no Kyojin</a></h2></div>
 <span class="js-members" style="display: none;">3993780</span>
 <span class="js-score" style="display: none;">8.55</span>
 <span class="js-start_date" style="display: none;">20130407</span>
 <span class="js-title" style="display: none;">Shingeki no Kyojin</span>
 </div>
 <div class="prodsrc">
 <div class="video"><a class="ga-click" href="https://myanimelist.net/anime/16498/Shingeki_no_Kyojin/video" title="Watch Episode Video"><i class="malicon malicon-movie-episode"></i></a> </div>
 <div class="info"><span class="item">TV, 2013</span><span class="item finished">Finished</span><span class="item">
 <span>25 eps</span>,
   

In [21]:
import re
import pandas as pd
import requests
from datetime import datetime
from bs4 import BeautifulSoup as bs
import time
from typing import List, Dict

def get_current_date() -> str:
    """
    Get the current date formatted as 'DDMMYY'.

    Returns:
    - str: Current date formatted as 'DDMMYY'.
    """
    return datetime.now().strftime('%d%m%y')

def anime_season(month: str) -> str:
    """
    Converts a given month (as a string) into its corresponding season.

    Parameters:
    - month (str): A string representing the month in the format 'MM'. The valid values are '01' to '12'.

    Returns:
    - str: A string representing the season.
    """
    month_num = int(month)
    seasons = ["Winter", "Spring", "Summer", "Fall"]
    return seasons[(month_num - 1) // 3] if 1 <= month_num <= 12 else "Unspecified"

def safe_text(element, default='N/A') -> str:
    """
    Extract the text content from a given BeautifulSoup element.

    Parameters:
    - element (bs4.element.Tag): A BeautifulSoup element containing the text to be parsed.
    - default (str): A default value to return if the element is not found or the text cannot be parsed.

    Returns:
    - str: The text content of the element, or the default value.
    """
    return element.text.strip() if element else default

def safe_int(element, default='N/A') -> int:
    """
    Extract an integer value from the text of an element.

    Parameters:
    - element (bs4.element.Tag): A BeautifulSoup element containing the text to be parsed.
    - default (str): A default value to return if the element is not found or the text cannot be parsed as an integer.

    Returns:
    - int or str: An integer value extracted from the text of the element, or the default value.
    """
    try:
        return int(element.text.strip().replace(',', '')) if element else default
    except ValueError:
        return default

def safe_float(element, default='N/A') -> float:
    """
    Extract a float value from the text of an element.

    Parameters:
    - element (bs4.element.Tag): A BeautifulSoup element containing the text to be parsed.
    - default (str): A default value to return if the element is not found or the text cannot be parsed as a float.

    Returns:
    - float or str: A float value extracted from the text of the element, or the default value.
    """
    try:
        return float(element.text.strip()) if element else default
    except ValueError:
        return default

def fetch_and_scrape(url: str, page_limit: int = 1, retries: int = 3, delay: int = 5) -> List[Dict[str, str]]:
    """
    Fetch and scrape anime data from a given URL.

    Parameters:
    - url (str): The URL to fetch and scrape.
    - page_limit (int): The number of pages to scrape.
    - retries (int): The number of retries in case of request failure.
    - delay (int): The delay between retries in seconds.

    Returns:
    - list: A list of dictionaries containing scraped anime data.
    """
    def scrape_anime_data(anime_item) -> Dict[str, str]:
        """
        Extract data from the HTML content of an anime item.

        Parameters:
        - anime_item (BeautifulSoup): A BeautifulSoup object containing the HTML of an anime item.

        Returns:
        - dict: A dictionary with the number of episodes and the release year.
        """
        soup = bs(anime_item, 'html.parser')

        title = safe_text(soup.find('h2', class_='h2_anime_title').find('a'))

        start_date_text = safe_text(soup.find('span', class_='js-start_date'), 'N/A')
        release_year = start_date_text[:4] if start_date_text != 'N/A' else 'N/A'

        info_div = soup.find('div', class_='info')
        number_of_episodes = 'N/A'
        if info_div:
            eps_text = info_div.get_text()
            match = re.search(r'(\d+)\s*eps', eps_text)
            number_of_episodes = match.group(1) if match else 'N/A'

        status = safe_text(info_div.find('span', class_='item finished') or info_div.find('span', class_='item airing'))

        genres_div = soup.find('div', class_='genres-inner js-genre-inner')
        genres = ', '.join([genre.find('a').text.strip() for genre in genres_div.find_all('span', class_='genre')]) if genres_div else 'N/A'

        properties_div = soup.find('div', class_='properties')

        def extract_property(caption):
            """Helper function to extract property values based on the caption"""
            if not properties_div:
                return 'N/A'
            property_divs = properties_div.find_all('div', class_='property')
            for div in property_divs:
                caption_span = div.find('span', class_='caption')
                if caption_span and caption_span.text.strip() == caption:
                    item_spans = div.find_all('span', class_='item')
                    return ', '.join(item.get_text(strip=True) for item in item_spans) if item_spans else 'N/A'
            return 'N/A'

        studio = extract_property('Studio')
        source = extract_property('Source')
        demographic = extract_property('Demographic')

        # Extract themes using regex pattern for URLs
        themes = 'N/A'
        if properties_div:
            themes_div = properties_div.find('div', class_='property')
            if themes_div:
                themes_html = str(themes_div)
                theme_matches = re.findall(
                    r'<span class="item"><a href="/anime/genre/\d+/[^"]*" title="[^"]*">([^<]*)</a></span>', themes_html)
                themes = ', '.join(theme_matches) if theme_matches else 'N/A'

        # Extract the rating
        rating = safe_float(soup.find('div', class_='scormem-item score score-label score-8'), 'N/A')

        # Extract the voter count
        voters = safe_int(soup.find('div', class_='scormem-item member'), 'N/A')

        # Extract synopsis
        synopsis = safe_text(soup.find('div', class_='synopsis js-synopsis').find('p', class_='preline'), 'N/A')

        return {
            'Title': title,
            'Episodes': number_of_episodes,
            'Release Year': release_year,
            'Status': status,
            'Genres': genres,
            'Studio': studio,
            'Source': source,
            'Demographic': demographic,
            'Themes': themes,
            'Synopsis': synopsis,
            'Voters': voters,
            'Rating': rating,
        }

    print("Scraping started...")
    all_data = []
    for page in range(1, page_limit + 1):
        page_url = f"{url}?page={page}"
        for attempt in range(retries):
            try:
                response = requests.get(page_url)
                response.raise_for_status()
                soup = bs(response.content, 'html.parser')
                anime_list = soup.find_all('div', class_='js-anime-category-producer')
                for anime_item in anime_list:
                    anime_data = scrape_anime_data(str(anime_item))
                    all_data.append(anime_data)
                break
            except requests.RequestException as e:
                print(f"Error fetching {page_url}: {e}. Retrying in {delay} seconds...")
                time.sleep(delay)
    return all_data

def modeler(date: str, data: List[Dict[str, str]]) -> None:
    """
    Processes and saves anime data to a CSV file.

    Parameters:
    - date (str): The date string used to name the CSV file.
    - data (List[Dict[str, str]]): A list of dictionaries containing anime data.

    The function converts the list of dictionaries to a DataFrame, removes duplicate entries,
    and saves the DataFrame to a CSV file in the 'data/raw' directory.
    """
    df = pd.DataFrame(data)
    df.drop_duplicates(inplace=True)
    file_path = 'data/raw'    
    df.to_csv(f'../../{file_path}/AnimeData_{date}.csv', index=False)
    print(f'Data saved to {file_path}/AnimeData_{date}.csv')

def main():
    url_list = [   
        'https://myanimelist.net/anime/genre/1/',  # Action
        # 'https://myanimelist.net/anime/genre/2/',  # Adventure
        # 'https://myanimelist.net/anime/genre/5/',  # Avant Garde
        # 'https://myanimelist.net/anime/genre/4/',  # Comedy
        # 'https://myanimelist.net/anime/genre/8/',  # Drama
        # 'https://myanimelist.net/anime/genre/10/', # Fantasy
        # 'https://myanimelist.net/anime/genre/47/', # Gourmet
        # 'https://myanimelist.net/anime/genre/14/', # Horror
        # 'https://myanimelist.net/anime/genre/7/',  # Mystery
        # 'https://myanimelist.net/anime/genre/22/', # Romance
        # 'https://myanimelist.net/anime/genre/24/', # Sci-Fi
        # 'https://myanimelist.net/anime/genre/36/', # Slice of Life
        # 'https://myanimelist.net/anime/genre/30/', # Sports
        # 'https://myanimelist.net/anime/genre/37/', # Supernatural
        # 'https://myanimelist.net/anime/genre/41/'  # Suspense
    ]
    
    current_date = get_current_date()
    all_anime_data = []

    for url in url_list:
        scraped_data = fetch_and_scrape(url, page_limit=1)
        all_anime_data.extend(scraped_data)
    
    modeler(current_date, all_anime_data)

if __name__ == '__main__':
    main()

Scraping started...
Data saved to data/raw/AnimeData_280724.csv
