In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import asyncio
import aiohttp
import time

In [2]:
dataset_1 = pd.read_csv("final_movie_dataset.csv")
movie_list = dataset_1['imdb_id'].tolist()  # Use .tolist() directly

In [29]:
async def fetch_movie_data_async(session, movie_id):
    
    # base_url = f"https://www.the-numbers.com/movie/{movie_title.replace(' ', '-')}#tab=summary"
    base_url = f"https://www.imdb.com/title/{movie_id}/?ref_=tt_mlt_i_8"
    try:
        async with session.get(base_url, headers={'User-Agent': 'Mozilla/5.0'}) as response:
            response.raise_for_status()
            html_content = await response.text()
            search_soup = BeautifulSoup(html_content, 'html.parser')

            data = {'Original Id': movie_id}
            # print(data)
            rating_element = search_soup.find('div', class_='sc-bf57f3f2-0 gDRYed')
            # print(rating_element)
            if rating_element:
                # print("work")
                rating_area = rating_element.find('ul', class_='ipc-inline-list ipc-inline-list--show-dividers sc-103e4e3c-2 cMcwpt baseAlt baseAlt')
                if rating_area:
                    # print("work")
                    ratings_list = rating_area.find_all('li')
                    if len(ratings_list) >= 3:
                        data['Rating'] = ratings_list[1].text.strip()
                        print(data['Rating'])
                    else:
                        data['Rating'] = None
                        print(f"Warning: not enough <li> elements found for '{movie_id}.")
                else:
                    data['Rating'] = None
                    print(f"Warning: could not find <ul> element for '{movie_id}.")
            else:
                data['Rating'] = None

            return data
       

    except aiohttp.ClientResponseError as e:
        print(f"Error fetching URL for '{movie_id}': {e.status} - {base_url}")
        return None
    except aiohttp.ClientError as e:
        print(f"Client error fetching URL for '{movie_id}': {e} - {base_url}")
        return None
    except Exception as e:
        print(f"An unexpected error occurred for '{movie_id}': {e} - {base_url}")
        return None


In [34]:
import random
async def scrape_all_movies_async(movie_titles, max_concurrent=5):
    scraped_data = []
    semaphore = asyncio.Semaphore(max_concurrent)
    async with aiohttp.ClientSession() as session:
        tasks = []
        for title in movie_titles:
            async with semaphore:
                # print(f"Scraping data for: {title}")
                task = asyncio.create_task(fetch_movie_data_async(session, title))
                tasks.append(task)
                # await asyncio.sleep(0.1)  # Polite delay
                await asyncio.sleep(random.uniform(0.2, 0.5))

        results = await asyncio.gather(*tasks)
        for result in results:
            if result:
                scraped_data.append(result)
    return scraped_data

In [35]:
start_time = time.time()
max_concurrent = 10
scraped_data = await scrape_all_movies_async(movie_list, max_concurrent=max_concurrent)
end_time = time.time()

df = pd.DataFrame(scraped_data)
print("\nScraped Data:")
print(df.head())
print(f"Number of scraped entries: {len(df)}")

csv_filename = "Ratings.csv"
df.to_csv(csv_filename, index=False)
print(f"\nData saved to {csv_filename}")
print(f"Total scraping time: {end_time - start_time:.2f} seconds")


G
PG
PG-13
R
PG
PG
PG
R
R
PG-13
PG-13
G
PG-13
R
PG-13
R
R
PG
PG-13
R
R
R
PG-13
R
R
PG-13
R
1995
R
R
R
G
G
R
R
R
G
PG
PG-13
PG-13
R
R
PG-13
PG-13
R
R
R
Unrated
R
G
R
PG
R
PG-13
PG
PG
R
PG
R
PG-13
R
PG-13
R
PG-13
R
R
R
R
R
PG
PG
R
R
R
Unrated
R
R
PG
R
PG-13
R
PG
PG-13
R
PG-13
R
R
R
R
R
Not Rated
R
R
R
1995
PG-13
R
PG-13
G
PG-13
R
R
R
PG-13
R
PG
PG-13
R
PG
R
PG-13
R
R
Not Rated
R
R
PG-13
Not Rated
PG-13
R
R
PG
R
R
R
R
PG
R
Not Rated
PG-13
Approved
R
R
PG
R
PG
PG-13
R
R
R
R
R
R
R
PG
PG-13
PG-13
R
R
R
PG-13
NC-17
R
R
R
PG-13
R
PG
R
R
R
PG-13
PG-13
R
R
R
R
R
NC-17
R
R
PG
R
Not Rated
R
PG
R
PG-13
R
PG
R
PG-13
PG-13
R
R
R
R
Not Rated
PG-13
R
PG-13
R
PG-13
PG-13
R
PG-13
R
R
R
R
R
Not Rated
PG-13
R
R
R
PG-13
PG-13
PG
G
PG
R
R
G
PG-13
R
PG-13
R
PG
PG
R
PG
R
PG-13
R
PG-13
R
PG
R
PG
PG
G
R
R
Not Rated
R
PG-13
R
R
PG-13
R
PG-13
PG
R
PG-13
PG-13
PG
R
PG-13
R
R
PG-13
R
R
R
R
PG
R
R
PG
R
R
PG-13
R
PG-13
PG-13
R
PG
R
R
R
R
PG-13
R
PG
G
R
PG-13
R
R
Not Rated
PG
R
R
R
R
Not Rated
R
R
PG
R
R
PG-13
PG-13
R
