In [1]:
import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Start the WebDriver (launches a Chrome session controlled by Selenium)
driver = webdriver.Chrome()

# URL to scrape: IMDb advanced title search for feature films released in 2024,
# filtered to the drama genre
url = 'https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31&genres=drama'

try:
    driver.get(url)

    # Wait (up to 10 seconds) until the results list is present in the DOM
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.XPATH, '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul'))
    )
except Exception:
    # The page never became usable: shut the browser down so the Chrome
    # process is not leaked, then let the original error propagate.
    driver.quit()
    raise

# Scraping limits and progress tracking
max_movies = 1000  # Upper bound on the number of movies to collect
current_movie_count = 0  # Number of movies collected so far

# One accumulator list per output column; entries at the same index belong
# to the same movie
titles, ratings, votings, durations = [], [], [], []

# Titles that have already been recorded, used to skip duplicates
scraped_titles = set()

# Helper used by extract_data for the optional per-movie fields
def _field_text(movie_item, xpath, default="N/A"):
    """Return the text of the element at *xpath* under *movie_item*.

    Falls back to *default* when the lookup fails, so a missing field does
    not discard the whole movie. Only ``Exception`` subclasses are absorbed;
    ``KeyboardInterrupt`` and ``SystemExit`` still propagate so the scrape
    can be interrupted.
    """
    try:
        return movie_item.find_element(By.XPATH, xpath).text
    except Exception:
        return default


# Function to extract data
def extract_data():
    """Record every not-yet-seen movie currently rendered in the results list.

    Walks all ``<li>`` entries of the results list, skips titles already in
    ``scraped_titles``, and appends title, rating, votes and duration to the
    module-level ``titles``, ``ratings``, ``votings`` and ``durations`` lists.
    Rating, votes and duration default to ``"N/A"`` when they cannot be read.
    The title text is stored exactly as the page renders it.

    Returns:
        bool: ``False`` as soon as ``max_movies`` has been reached (checked
        before each list entry), ``True`` when the whole list was walked
        without hitting the limit.
    """
    global current_movie_count  # The counter lives at module scope

    items_xpath = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li'
    title_xpath = './div/div/div/div[1]/div[2]/div[1]/a/h3'
    rating_xpath = './div/div/div/div[1]/div[2]/span/div/span/span[1]'
    voting_xpath = './div/div/div/div[1]/div[2]/span/div/span/span[2]'
    duration_xpath = './div/div/div/div[1]/div[2]/div[2]/span[2]'

    for movie_item in driver.find_elements(By.XPATH, items_xpath):
        if current_movie_count >= max_movies:
            return False  # Stop scraping once the limit has been reached

        # The title is mandatory: without it the entry cannot be recorded or
        # de-duplicated, so report the problem and move on to the next entry.
        try:
            title = movie_item.find_element(By.XPATH, title_xpath).text
        except Exception as e:
            print(f"Error extracting data for a movie: {e}")
            continue

        # Entries from earlier passes are still in the DOM after "load more";
        # skip the ones that were already recorded.
        if title in scraped_titles:
            continue

        # Optional fields fall back to "N/A" when absent
        rating = _field_text(movie_item, rating_xpath)
        voting = _field_text(movie_item, voting_xpath)
        duration = _field_text(movie_item, duration_xpath)

        # Keep the four column lists index-aligned
        titles.append(title)
        ratings.append(rating)
        votings.append(voting)
        durations.append(duration)

        scraped_titles.add(title)
        current_movie_count += 1

    return True

# Extract data from the first page: record the movies that are already
# rendered before any "50 More" click. The boolean returned by
# extract_data() is not used here.
extract_data()

# Function to click the '50 More' button safely
def click_load_more():
    """Click the "50 More" button once and scrape the entries it reveals.

    Uses explicit waits instead of a fixed sleep: first for the button to be
    present and clickable, then for the number of ``<li>`` entries in the
    results list to grow after the click.

    Returns:
        bool: the result of ``extract_data()`` after the new entries loaded,
        or ``False`` when the button could not be found or clicked, or no new
        entries appeared within the timeout (the error is printed).
    """
    button_xpath = "//*[@id='__next']/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/div[2]/div/span/button"
    items_xpath = '//*[@id="__next"]/main/div[2]/div[3]/section/section/div/section/section/div[2]/div/section/div[2]/div[2]/ul/li'

    try:
        # Wait for the button to exist rather than looking it up once; it may
        # not be back in the DOM yet right after the previous batch loaded.
        load_more_button = WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.XPATH, button_xpath))
        )

        # Scroll to make sure the "50 More" button is in view
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)

        # Wait for the button to be clickable
        WebDriverWait(driver, 10).until(EC.element_to_be_clickable(load_more_button))

        # Remember how many entries exist so the load can be detected
        items_before = len(driver.find_elements(By.XPATH, items_xpath))

        # Use JavaScript to click the button (sometimes the default click doesn't work)
        driver.execute_script("arguments[0].click();", load_more_button)

        # Wait until new entries have actually been added to the list
        WebDriverWait(driver, 10).until(
            lambda d: len(d.find_elements(By.XPATH, items_xpath)) > items_before
        )

        # Extract new data
        return extract_data()

    except Exception as e:
        print(f"Error: {e}. No more '50 More' button found or other issue.")
        return False

# Click the "50 More" button until max_movies have been scraped or no further
# batch can be loaded
try:
    while current_movie_count < max_movies:
        if not click_load_more():
            break
finally:
    # Close the driver after scraping; the finally clause also releases the
    # browser when the loop is interrupted (e.g. by KeyboardInterrupt)
    driver.quit()

# Create a DataFrame to store the scraped data (one row per movie)
df = pd.DataFrame({
    'Title': titles,
    'Rating': ratings,
    'Votes': votings,
    'Duration': durations
})

# Save the data to a CSV file, without the DataFrame index column
df.to_csv('drama_movies.csv', index=False)

# Show the first few rows of the DataFrame
print(df.head())


                   Title Rating    Votes Duration
0               1. Anora    7.6   (160K)   2h 19m
1       2. The Substance    7.3   (284K)   2h 21m
2       3. The Brutalist    7.5    (71K)   3h 36m
3            4. Conclave    7.4   (135K)       2h
4  5. A Complete Unknown    7.4    (62K)   2h 21m
