## Install and Import Dependencies

In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time

# Initialize WebDriver
# webdriver-manager supplies the chromedriver path, so the notebook does not
# depend on a machine-specific location on disk (same approach as the
# detail-scraping cell further down).
service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service)

# Initialize WebDriverWait
# Shared explicit wait: each wait.until(...) call polls for up to 10 seconds.
wait = WebDriverWait(driver, 10)


In [6]:
# Open IMDb URL
# The query asks for the top_1000 group, 100 results per page, sorted by user
# rating in ascending order.
url = "https://www.imdb.com/search/title/?groups=top_1000&count=100&sort=user_rating,asc"
driver.get(url)

# Accept cookies if present
# Best effort: a missing or unclickable banner must not stop the run, but the
# reason is reported instead of being swallowed silently.
try:
    cookie_button = wait.until(EC.element_to_be_clickable((By.ID, "onetrust-accept-btn-handler")))
    cookie_button.click()
except Exception as exc:
    print(f"Cookie banner not dismissed ({type(exc).__name__}); continuing.")

# Dictionary to store movie details (ensures uniqueness)
# Keyed by the heading text of each result card; filled by extract_movies().
movies_dict = {}

## Function to scrape the movie list

In [7]:
# Function to extract movie details
def extract_movies():
    """Add every not-yet-seen movie from the result cards currently in the page.

    Reads the module-level ``driver`` and writes one record per movie into the
    module-level ``movies_dict``, keyed by the card's heading text. Cards whose
    heading is already a key are left untouched, so the function can be called
    again after more results have been loaded.

    Year and duration fall back to "N/A" when the metadata spans are missing.
    A card that lacks a heading, rating, plot summary or link raises inside the
    loop and is skipped as a whole.

    Returns:
        None. Results are accumulated in ``movies_dict``.
    """
    movie_containers = driver.find_elements(
        By.XPATH, "//li[contains(@class, 'ipc-metadata-list-summary-item')]"
    )

    for movie in movie_containers:
        try:
            name = movie.find_element(By.XPATH, ".//h3").text.strip()

            # Ensure uniqueness
            if name in movies_dict:
                continue

            # Match on the descriptive class only. Generated, hash-like class
            # names next to it change between site builds, and a stale selector
            # would silently turn every year/duration into "N/A".
            metadata_items = movie.find_elements(
                By.XPATH, ".//span[contains(@class, 'dli-title-metadata-item')]"
            )
            year = metadata_items[0].text if len(metadata_items) > 0 else "N/A"
            movie_time = metadata_items[1].text if len(metadata_items) > 1 else "N/A"
            rating = movie.find_element(
                By.XPATH, ".//span[contains(@class, 'ipc-rating-star--rating')]"
            ).text
            plot_sum = movie.find_element(
                By.XPATH, ".//div[contains(@class, 'ipc-html-content-inner-div')]"
            ).text
            # Despite the "Poster" key, this is the href of the card's overlay
            # link; later cells open it as the movie's detail page.
            poster = movie.find_element(
                By.XPATH, ".//a[contains(@class, 'ipc-lockup-overlay ipc-focusable')]"
            ).get_attribute("href")

            # Store in dictionary
            movies_dict[name] = {
                "Title": name,
                "Year": year,
                "Movie Duration": movie_time,
                "Rating": rating,
                "Poster": poster,
                "Plot Summary": plot_sum
            }
        except Exception:
            continue  # Skip any movie that causes an error
 
 

## Calling the function

In [8]:
# Extract movies from the first page
# (the initial batch of results rendered for the search URL, which requests count=100)
extract_movies()

In [9]:
# Click the "100 more" button to load further batches of results.
# Each click loads one more batch: 2 clicks on top of the first page give up
# to 300 movies; set this to 9 to page through the whole top-1000 list.
MORE_CLICKS = 2
for _ in range(MORE_CLICKS):
    try:
        more_button = wait.until(
            EC.element_to_be_clickable((By.XPATH, "//button[contains(., '100 more')]"))
        )

        # Scroll to the button
        driver.execute_script("arguments[0].scrollIntoView();", more_button)
        time.sleep(1)  # Ensure visibility

        # Click using JavaScript to avoid interception
        driver.execute_script("arguments[0].click();", more_button)

        time.sleep(3)  # Wait for the next set of movies to load

        # Extract new movie data (ensuring uniqueness)
        extract_movies()
    except Exception as e:
        print(f"Could not click '100 more': {e}")
        break  # Stop if the button is not found
 

In [10]:
# Build the DataFrame from the collected records: one row per movie, with the
# record keys as columns.
df = pd.DataFrame(list(movies_dict.values()))
 

In [11]:
# Report how many rows (unique movies) ended up in the DataFrame.
print(f"\n✅ Total Movies Extracted: {len(df)}")


✅ Total Movies Extracted: 300


## Data cleaning

In [12]:

# Preview the first rows to sanity-check the scraped columns.
df.head()

Unnamed: 0,Title,Year,Movie Duration,Rating,Poster,Plot Summary
0,1. Once Upon a Time... in Hollywood,2019,2h 41m,7.6,https://www.imdb.com/title/tt7131622/?ref_=sr_i_1,As Hollywood's Golden Age is winding down duri...
1,2. The Fifth Element,1997,2h 6m,7.6,https://www.imdb.com/title/tt0119116/?ref_=sr_i_2,"In the colorful future, a cab driver unwitting..."
2,3. 300,2006,1h 57m,7.6,https://www.imdb.com/title/tt0416449/?ref_=sr_i_3,"In the ancient battle of Thermopylae, King Leo..."
3,4. John Wick: Chapter 4,2023,2h 49m,7.6,https://www.imdb.com/title/tt10366206/?ref_=sr...,John Wick uncovers a path to defeating The Hig...
4,5. The Whale,2022,1h 57m,7.6,https://www.imdb.com/title/tt13833688/?ref_=sr...,"A reclusive, morbidly obese English teacher at..."


In [13]:
# Save data to CSV
# The path is kept in one variable so the confirmation message always names
# the file that was actually written.
csv_path = "/Users/aikyask/Desktop/imdb_tv_1000_uniquetest.csv"
try:
    df.to_csv(csv_path, index=False)
finally:
    # Close the browser even if writing the file fails.
    driver.quit()
print(f"Scraping completed and saved to '{csv_path}'")

Scraping completed and saved to 'imdb_top_1000_unique.csv'


## Scraping Additional details

In [14]:
# Load dataset
# Re-read the list scraped above; its "Poster" column holds the per-movie URL
# that the detail scraper opens.
file_path = "/Users/aikyask/Desktop/imdb_tv_1000_uniquetest.csv"  # Update with your file path
df = pd.read_csv(file_path)

# Setup Selenium WebDriver
chrome_options = Options()
chrome_options.add_argument("--headless")  # Run in headless mode (no UI)
chrome_options.add_argument("--disable-gpu")
# Chrome expects "width,height" for this switch; an "x" separator is not the
# documented format.
chrome_options.add_argument("--window-size=1920,1080")

service = Service(ChromeDriverManager().install())
driver = webdriver.Chrome(service=service, options=chrome_options)

# Helper: turn a list of located elements into clean display strings
def _unique_texts(elements):
    """Return the elements' texts, stripped, without blanks or repeats (order kept)."""
    texts = []
    for element in elements:
        text = element.text.strip()
        if text and text not in texts:
            texts.append(text)
    return texts


# Function to extract IMDb details using XPath
def scrape_imdb_details(imdb_url):
    """Open a movie page with the module-level ``driver`` and collect extra fields.

    Args:
        imdb_url: URL of the movie page to load.

    Returns:
        A dict with the keys "Genres", "Directors", "Writers", "Actors",
        "Languages", "Release Date", "Release Country", "Box Office" and
        "Production Companies". Multi-valued fields are joined with ", ";
        fields whose elements are not found are empty strings.
        ``None`` if loading or querying the page raises; the error is printed.
    """
    try:
        driver.get(imdb_url)

        def texts(xpath):
            # Blank link texts and repeated names would otherwise end up in
            # the joined output.
            return _unique_texts(driver.find_elements(By.XPATH, xpath))

        def first_text(xpath):
            found = driver.find_elements(By.XPATH, xpath)
            return found[0].text.strip() if found else ""

        credit_row = "//li[contains(@data-testid, 'title-pc-principal-credit')]"

        # Extract Genres
        genres = texts("//a[contains(@href, '/search/title/?genres=')]")

        # Extract Directors
        # Index [1] limits the match to the first credit row; without it the
        # links of every credit row are collected as "directors".
        directors = texts(credit_row + "[1]//a")

        # Extract Writers
        writers = texts(credit_row + "[2]//a")

        # Extract Actors (Top 5)
        # NOTE(review): this matches every person link on the page, so it may
        # return crew names ahead of the cast -- confirm and narrow if needed.
        actors = texts("//a[contains(@href, '/name/nm')]")[:5]

        # Extract Languages
        languages = texts("//li[contains(@data-testid, 'title-details-languages')]//a")

        # Extract Release Date
        release_date = first_text("//li[contains(@data-testid, 'title-details-releasedate')]//a")

        # Extract Release Country
        release_country = first_text("//li[contains(@data-testid, 'title-details-origin')]//a")

        # Extract Box Office Collection (Worldwide Gross)
        box_office = first_text("//li[contains(@data-testid, 'title-boxoffice-cumulativeworldwidegross')]//span")

        # Extract Production Companies
        production_companies = texts("//li[contains(@data-testid, 'title-details-companies')]//a")

        return {
            "Genres": ", ".join(genres),
            "Directors": ", ".join(directors),
            "Writers": ", ".join(writers),
            "Actors": ", ".join(actors),
            "Languages": ", ".join(languages),
            "Release Date": release_date,
            "Release Country": release_country,
            "Box Office": box_office,
            "Production Companies": ", ".join(production_companies),
        }
    except Exception as e:
        print(f"Error scraping {imdb_url}: {e}")
        return None

# Scrape details for each movie
# Column names of one detail record, in output order; also used to build the
# blank placeholder for movies whose page could not be scraped.
DETAIL_COLUMNS = (
    "Genres", "Directors", "Writers", "Actors", "Languages",
    "Release Date", "Release Country", "Box Office", "Production Companies",
)

scraped_data = []
try:
    for index, row in df.iterrows():
        print(f"Scraping {index + 1}/{len(df)}: {row['Title']}")
        details = scrape_imdb_details(row["Poster"])
        # Always append exactly one record per input row so the column-wise
        # concat below lines up row for row with df.
        scraped_data.append(details or dict.fromkeys(DETAIL_COLUMNS, ""))
finally:
    # Close browser -- also when the loop is interrupted or raises, so no
    # headless Chrome process is left running.
    driver.quit()

# Convert scraped data into DataFrame
scraped_df = pd.DataFrame(scraped_data)

# Combine with original data
final_df = pd.concat([df, scraped_df], axis=1)

# Save as new CSV
output_file = "imdb_movie_scraped.csv"
final_df.to_csv(output_file, index=False)

print(f"✅ Scraping completed! Data saved as '{output_file}'.")

Scraping 1/300: 1. Once Upon a Time... in Hollywood
Scraping 2/300: 2. The Fifth Element
Scraping 3/300: 3. 300
Scraping 4/300: 4. John Wick: Chapter 4
Scraping 5/300: 5. The Whale
Scraping 6/300: 6. Hell or High Water
Scraping 7/300: 7. Watchmen
Scraping 8/300: 8. Anatomy of a Fall
Scraping 9/300: 9. Aftersun
Scraping 10/300: 10. The Others
Scraping 11/300: 11. Office Space
Scraping 12/300: 12. Star Wars: Episode III - Revenge of the Sith
Scraping 13/300: 13. RoboCop
Scraping 14/300: 14. True Grit
Scraping 15/300: 15. The Machinist
Scraping 16/300: 16. Deadpool 2
Scraping 17/300: 17. Saw
Scraping 18/300: 18. Minority Report
Scraping 19/300: 19. The Butterfly Effect
Scraping 20/300: 20. Kung Fu Panda
Scraping 21/300: 21. The Last of the Mohicans
Scraping 22/300: 22. What We Do in the Shadows
Scraping 23/300: 23. Dark Waters
Scraping 24/300: 24. The Boondock Saints
Scraping 25/300: 25. Stardust
Scraping 26/300: 26. The Thin Red Line
Scraping 27/300: 27. My Cousin Vinny
Scraping 28/300: 