In [28]:
# Import Library
from selenium import webdriver
import time
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import pandas as pd
import csv

In [29]:
# Setup Chrome options
options = Options()
options.add_argument("--start-maximized")
options.add_argument("--ignore-certificate-errors")
options.add_argument("--ignore-ssl-errors")

In [30]:
# Setup driver service
service = Service(ChromeDriverManager().install())

In [42]:
# Start webdriver
driver = webdriver.Chrome(service=service, options=options)

# load the webpage
driver.get("https://www.imdb.com/interest/all/?ref_=fn_asr_gnr")


In [None]:
# Fetch all category elements on the main page that have class 'ipc-slate-card__title'
movie_categories = driver.find_elements(By.CSS_SELECTOR, 'a.ipc-slate-card__title')

In [None]:
# Get the text of the first element in the movie_categories list
movie_categories[0].text

'Superhero'

In [None]:
# Get the value of the ‘href’ attribute from the first element in the movie_categories list
movie_categories[0].get_attribute('href')

'https://www.imdb.com/interest/in0000008/?ref_=ints_pi_in_t_1'

In [None]:
# Take the title of each movie
titles = driver.find_elements(By.CSS_SELECTOR, ".ipc-poster-card__title.ipc-poster-card__title--clamp-2.ipc-poster-card__title--clickable")

# Check the first movie title
print(titles[0].text)

Thunderbolts*


In [None]:
# Grab the first movie link
movie_link = titles[0].get_attribute('href')
print(movie_link)

https://www.imdb.com/title/tt20969586/?ref_=int_popm_t_1


In [None]:
# Take the ratings value of each movie title
rating = driver.find_elements(By.CSS_SELECTOR, 'span.ipc-rating-star--rating')
print(rating[0].text)

7.7


### Full Code


In [43]:
# Get all category elements from the main page
movie_categories = driver.find_elements(By.CSS_SELECTOR, 'a.ipc-slate-card__title')

# Save the category names and URLs into a list
categories = []
for category in movie_categories:
    # Get the name of the category by selecting the text inside the <div> tag
    name = category.find_element(By.CSS_SELECTOR, 'div.ipc-slate-card__title-text').text
    
    # Get the URL of the category (the link) by using the 'href' attribute
    url = category.get_attribute('href')
    
    # Store each category's name and URL as a dictionary in the 'categories' list
    categories.append({'name': name, 'url': url})

# Create an empty list to store the movie data
movies = []

# Loop through each category (from the 'categories' list we created earlier)
for category in categories:
    # Visit each category's URL by using the 'url' key in the dictionary
    driver.get(category['url'])
    
    # Wait for the page to load (using time.sleep for simplicity)
    WebDriverWait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.ipc-poster-card__title.ipc-poster-card__title--clamp-2.ipc-poster-card__title--clickable')))

    
    # Find all movie title elements within the category page
    titles = driver.find_elements(By.CSS_SELECTOR, '.ipc-poster-card__title.ipc-poster-card__title--clamp-2.ipc-poster-card__title--clickable')

    # Loop through each movie title found on the page
    for title in titles:
        # Get the movie name (text inside the element)
        movie_name = title.text
        
        # Get the movie link by extracting the 'href' attribute
        movie_link = title.get_attribute('href')

        try:
            # Try to get the movie rating by finding the corresponding element using XPath
            rating = title.find_element(By.XPATH, '../following-sibling::div//span[contains(@class, "ipc-rating-star--rating")]').text
        except:
            # If the rating is not found, assign 'No Rating'
            rating = 'No Rating'

        # Append the movie data (name, link, category, and rating) into the 'movies' list
        movies.append({
            'kategori': category['name'],  # Category name
            'movie_name': movie_name,      # Movie title
            'movie_link': movie_link,      # Movie link
            'rating': rating               # Movie rating
        })

# Display the results by printing each movie data
for movie in movies:
    print(movie)


{'kategori': 'Superhero', 'movie_name': 'Thunderbolts*', 'movie_link': 'https://www.imdb.com/title/tt20969586/?ref_=int_popm_t_1', 'rating': '5.8'}
{'kategori': 'Superhero', 'movie_name': 'Captain America: Brave New World', 'movie_link': 'https://www.imdb.com/title/tt14513804/?ref_=int_popm_t_2', 'rating': '6.5'}
{'kategori': 'Superhero', 'movie_name': 'Novocaine', 'movie_link': 'https://www.imdb.com/title/tt29603959/?ref_=int_popm_t_3', 'rating': '5.5'}
{'kategori': 'Superhero', 'movie_name': 'Kraven the Hunter', 'movie_link': 'https://www.imdb.com/title/tt8790086/?ref_=int_popm_t_4', 'rating': '9.0'}
{'kategori': 'Superhero', 'movie_name': 'The Dark Knight', 'movie_link': 'https://www.imdb.com/title/tt0468569/?ref_=int_popm_t_5', 'rating': '7.6'}
{'kategori': 'Superhero', 'movie_name': 'Deadpool & Wolverine', 'movie_link': 'https://www.imdb.com/title/tt6263850/?ref_=int_popm_t_6', 'rating': '6.9'}
{'kategori': 'Superhero', 'movie_name': 'Sonic the Hedgehog 3', 'movie_link': 'https://

In [None]:
# Convert a list of dicts to a DataFrame
df = pd.DataFrame(movies)

# Show DataFrame
print(df)

# Save to CSV file
df.to_csv('scraped_movie.csv', index=False, encoding='utf-8-sig')

print("Data is successfully saved to scraped_movie.csv")

           kategori                        movie_name  \
0         Superhero                     Thunderbolts*   
1         Superhero  Captain America: Brave New World   
2         Superhero                         Novocaine   
3         Superhero                 Kraven the Hunter   
4         Superhero                   The Dark Knight   
...             ...                               ...   
23865  Western Epic                           Godless   
23866  Western Epic                     Into the West   
23867  Western Epic                Hatfields & McCoys   
23868  Western Epic                      Broken Trail   
23869  Western Epic                           Bonanza   

                                              movie_link     rating  
0      https://www.imdb.com/title/tt20969586/?ref_=in...        5.8  
1      https://www.imdb.com/title/tt14513804/?ref_=in...        6.5  
2      https://www.imdb.com/title/tt29603959/?ref_=in...        5.5  
3      https://www.imdb.com/title/t