In [5]:
import pandas as pd
import requests
from bs4 import BeautifulSoup

# Load the CSV file
csv_file = '10_anime.csv'
df = pd.read_csv(csv_file)

# Define a function to scrape the image URL from MyAnimeList
def get_anime_image_url(anime_name):
    search_url = f"https://myanimelist.net/search/all?q={anime_name.replace(' ', '%20')}&cat=all"
    response = requests.get(search_url)
    
    if response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        result = soup.find('a', {'class': 'hoverinfo_trigger'})
        if result:
            anime_page_url = result['href']
            anime_response = requests.get(anime_page_url)
            if anime_response.status_code == 200:
                anime_soup = BeautifulSoup(anime_response.text, 'html.parser')
                image_tag = anime_soup.find('img', {'itemprop': 'image'})
                if image_tag:
                    return image_tag['data-src'] if 'data-src' in image_tag.attrs else image_tag['src']
    return None

# Add a new column for image URLs
df['image_url'] = df['name'].apply(get_anime_image_url)

# Save the updated CSV file
df.to_csv('images.csv', index=False)

print("Image URLs added to the CSV file.")

Image URLs added to the CSV file.


In [13]:
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import nest_asyncio
import time
from aiohttp import ClientTimeout

nest_asyncio.apply()

# Load the CSV file
csv_file = '10_anime.csv'
df = pd.read_csv(csv_file)

# Exponential backoff logic and error handling for fetching the image URL
async def get_anime_image_url(session, anime_name, retries=5, base_delay=1):
    search_url = f"https://myanimelist.net/search/all?q={anime_name.replace(' ', '%20')}&cat=all"
    timeout = ClientTimeout(total=15)  # Set a timeout for requests
    
    for attempt in range(retries):
        try:
            async with session.get(search_url, timeout=timeout) as response:
                if response.status == 200:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')
                    result = soup.find('a', {'class': 'hoverinfo_trigger'})
                    if result:
                        anime_page_url = result['href']
                        async with session.get(anime_page_url, timeout=timeout) as anime_response:
                            if anime_response.status == 200:
                                anime_html = await anime_response.text()
                                anime_soup = BeautifulSoup(anime_html, 'html.parser')
                                image_tag = anime_soup.find('img', {'itemprop': 'image'})
                                if image_tag:
                                    return image_tag['data-src'] if 'data-src' in image_tag.attrs else image_tag['src']
        except (aiohttp.ClientOSError, asyncio.TimeoutError, aiohttp.ClientResponseError) as e:
            delay = base_delay * (2 ** attempt)  # Exponential backoff
            print(f"Error fetching {anime_name}: {e}. Retrying in {delay} seconds...")
            await asyncio.sleep(delay)  # Exponential delay before retrying
    return None  # Return None if all retries fail

# Main asynchronous function to fetch image URLs with rate limiting
async def fetch_all_image_urls(anime_names, delay_between_requests=1):
    async with aiohttp.ClientSession() as session:
        tasks = []
        for anime_name in anime_names:
            tasks.append(get_anime_image_url(session, anime_name))
            await asyncio.sleep(delay_between_requests)  # Fixed delay between requests
        return await asyncio.gather(*tasks)

# Get all image URLs asynchronously with rate limiting
anime_names = df['name'].tolist()
image_urls = asyncio.run(fetch_all_image_urls(anime_names))
  
# Add the image URLs to the DataFrame
df['image_url'] = image_urls


# Save the updated CSV file
df.to_csv('fetch_with_images.csv', index=False)

print("Image URLs added to the CSV file.")


Image URLs added to the CSV file.


In [17]:
import pandas as pd
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import nest_asyncio
import time
from aiohttp import ClientTimeout

nest_asyncio.apply()

# Load the CSV file
csv_file = '100_anime.csv'
df = pd.read_csv(csv_file)

# Exponential backoff logic and error handling for fetching the image URL
async def get_anime_image_url(session, anime_name, retries=5, base_delay=10):
    search_url = f"https://myanimelist.net/search/all?q={anime_name.replace(' ', '%20')}&cat=all"
    timeout = ClientTimeout(total=30)  # Increased timeout for slower connections
    
    for attempt in range(retries):
        try:
            async with session.get(search_url, timeout=timeout) as response:
                if response.status == 200:
                    html = await response.text()
                    soup = BeautifulSoup(html, 'html.parser')
                    result = soup.find('a', {'class': 'hoverinfo_trigger'})
                    if result:
                        anime_page_url = result['href']
                        async with session.get(anime_page_url, timeout=timeout) as anime_response:
                            if anime_response.status == 200:
                                anime_html = await anime_response.text()
                                anime_soup = BeautifulSoup(anime_html, 'html.parser')
                                image_tag = anime_soup.find('img', {'itemprop': 'image'})
                                if image_tag:
                                    return image_tag['data-src'] if 'data-src' in image_tag.attrs else image_tag['src']
                else:
                    print(f"Failed to retrieve search page for {anime_name}: Status {response.status}")
        except (aiohttp.ClientOSError, asyncio.TimeoutError, aiohttp.ClientResponseError) as e:
            delay = base_delay * (2 ** attempt)  # Exponential backoff
            print(f"Error fetching {anime_name}: {e}. Retrying in {delay} seconds...")
            await asyncio.sleep(delay)  # Exponential delay before retrying
    return None  # Return None if all retries fail

# Main asynchronous function to fetch image URLs with rate limiting
async def fetch_all_image_urls(anime_names, delay_between_requests=2):  # Increased delay to 2 seconds
    async with aiohttp.ClientSession() as session:
        tasks = []
        for anime_name in anime_names:
            tasks.append(get_anime_image_url(session, anime_name))
            await asyncio.sleep(delay_between_requests)  # Fixed delay between requests
        return await asyncio.gather(*tasks)

# Get all image URLs asynchronously with rate limiting
anime_names = df['name'].tolist()

# Try to fetch the image URLs
start_time = time.time()
image_urls = asyncio.run(fetch_all_image_urls(anime_names))
end_time = time.time()

# Add the image URLs to the DataFrame
df['image_url'] = image_urls

# Save the updated CSV file
df.to_csv('fetch_with_images.csv', index=False)

print(f"Image URLs added to the CSV file in {end_time - start_time:.2f} seconds.")


Error fetching Shin Calimero: . Retrying in 10 seconds...
Error fetching Ashinaga Ojisan: . Retrying in 10 seconds...
Error fetching Kakko Kawaii Sengen! 2: . Retrying in 10 seconds...
Error fetching Suzy&#039;s Zoo: Daisuki! Witzy - Happy Birthday: . Retrying in 10 seconds...
Error fetching Ouritsu Uchuugun: Honneamise no Tsubasa - Pilot Film: . Retrying in 10 seconds...
Error fetching Nodoka Mori no Doubutsu Daisakusen: . Retrying in 10 seconds...
Error fetching Stitch!: Zutto Saikou no Tomodachi Special: . Retrying in 10 seconds...
Error fetching Nessa no Haou Gandalla: . Retrying in 10 seconds...
Error fetching Space☆Dandy Picture Drama: . Retrying in 10 seconds...
Error fetching Mitsu x Mitsu Drops: . Retrying in 10 seconds...
Error fetching Marie &amp; Gali ver. 2.0: . Retrying in 10 seconds...
Error fetching Mahou no Tenshi Creamy Mami: Lovely Serenade: . Retrying in 10 seconds...
Error fetching Utsunomiko: Heaven Chapter: . Retrying in 10 seconds...
Error fetching Hitotsubu ni 