In [1]:
%pip install selenium

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [4]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time
import csv
import re

# Set up the browser
driver = webdriver.Chrome()
driver.get("https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31")
driver.maximize_window()
time.sleep(3)

# Expand Genre Filter
genre_label = driver.find_element(By.XPATH, "//label[contains(@aria-controls, 'genreAccordion')]")
driver.execute_script("window.scrollBy(0, 400);") 
time.sleep(0.5)
driver.execute_script("arguments[0].click();", genre_label)
time.sleep(2)

# Select Action Genre
action = driver.find_element(By.XPATH, "//span[text()='Action']")
genre_name = re.sub(r'\d+', '', action.text).strip()
driver.execute_script("arguments[0].click();", action)
time.sleep(3)

# Scraping Function
def scrape_movies(existing_movies_count):
    movie_items = driver.find_elements(By.XPATH, '//li[@class="ipc-metadata-list-summary-item"]')

    new_movies = []  
    for index, movie in enumerate(movie_items[existing_movies_count:], start=existing_movies_count):
        try:
            title = movie.find_element(By.XPATH, './/a[@class="ipc-title-link-wrapper"]').text
        except:
            title = "N/A"

        try:
            rating = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--rating"]').text
        except:
            rating = "N/A"

        try:
            views = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--voteCount"]').text
        except:
            views = "N/A"

        try:
            metadata_container = movie.find_element(By.XPATH, './/div[contains(@class, "metadata")]')
            metadata_spans = metadata_container.find_elements(By.XPATH, './/span')
            duration = next((span.text for span in metadata_spans if 'h' in span.text or 'min' in span.text), "N/A")
        except:
            duration = "N/A"

        movie_details = {
            "Index": index + 1,
            "Title": title,
            "Rating": rating,
            "Views": views,
            "Duration": duration,
            "Genre": genre_name
        }

        new_movies.append(movie_details)

    return new_movies

# Initial scrape
movies = scrape_movies(existing_movies_count=0)
print(f"Collected {len(movies)} movies so far...")

# Scroll and load more
click_attempts = 0
max_attempts = 25

while click_attempts < max_attempts:
    try:
        driver.execute_script("window.scrollBy(0, 100);")
        time.sleep(1)

        more_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(), '▼', ''), '50 more')]"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", more_button)
        time.sleep(1)
        more_button.click()
        print(f"Clicked '50 more' button (attempt {click_attempts + 1})")
        time.sleep(3)

        new_movies = scrape_movies(existing_movies_count=len(movies))
        if not new_movies:
            print("No new content loaded - ending.")
            break

        movies.extend(new_movies)
        print(f"Collected {len(movies)} movies so far...")

        click_attempts += 1

    except Exception as e:
        print(f"Stopping due to error: {str(e)}")
        break

# Save to CSV
with open("Actionmovie.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Index", "Title", "Rating", "Views", "Duration", "Genre"])
    writer.writeheader()
    writer.writerows(movies)

print("Movies saved to Actionmovie.csv successfully!")

# Close browser
driver.quit()


Collected 50 movies so far...
Clicked '50 more' button (attempt 1)
Collected 100 movies so far...
Clicked '50 more' button (attempt 2)
Collected 150 movies so far...
Clicked '50 more' button (attempt 3)
Collected 200 movies so far...
Clicked '50 more' button (attempt 4)
Collected 250 movies so far...
Clicked '50 more' button (attempt 5)
Collected 300 movies so far...
Clicked '50 more' button (attempt 6)
Collected 350 movies so far...
Clicked '50 more' button (attempt 7)
Collected 400 movies so far...
Clicked '50 more' button (attempt 8)
Collected 450 movies so far...
Clicked '50 more' button (attempt 9)
Collected 500 movies so far...
Clicked '50 more' button (attempt 10)
Collected 550 movies so far...
Clicked '50 more' button (attempt 11)
Collected 600 movies so far...
Clicked '50 more' button (attempt 12)
Collected 650 movies so far...
Clicked '50 more' button (attempt 13)
Collected 700 movies so far...
Clicked '50 more' button (attempt 14)
Collected 750 movies so far...
Clicked '50 m

In [5]:
import pandas as pd

df = pd.read_csv("Actionmovie.csv")

df.head(10)

Unnamed: 0,Index,Title,Rating,Views,Duration,Genre
0,1,1. Kraven the Hunter,5.5,(52K),2h 7m,Action.K
1,2,2. Gladiator II,6.5,(225K),2h 28m,Action.K
2,3,3. Twisters,6.5,(165K),2h 2m,Action.K
3,4,4. Sonic the Hedgehog 3,6.9,(57K),1h 50m,Action.K
4,5,5. Venom: The Last Dance,6.0,(115K),1h 50m,Action.K
5,6,6. Deadpool & Wolverine,7.6,(488K),2h 8m,Action.K
6,7,7. Pushpa: The Rule - Part 2,6.1,(56K),3h 21m,Action.K
7,8,8. Dune: Part Two,8.5,(621K),2h 46m,Action.K
8,9,9. The Ministry of Ungentlemanly Warfare,6.8,(134K),2h 2m,Action.K
9,10,10. Freaky Tales,7.2,(960),1h 47m,Action.K


In [None]:

import re

# Set up the browser
driver = webdriver.Chrome()
driver.get("https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31")
driver.maximize_window()
time.sleep(3)

# Expand Genre Filter
genre_label = driver.find_element(By.XPATH, "//label[contains(@aria-controls, 'genreAccordion')]")
driver.execute_script("window.scrollBy(0, 400);") 
time.sleep(0.5)
driver.execute_script("arguments[0].click();", genre_label)
time.sleep(2)

# Select Action Genre
Adventure = driver.find_element(By.XPATH, "//span[text()='Adventure']")
genre_name = re.sub(r'\d+', '', Adventure.text).strip()
driver.execute_script("arguments[0].click();", Adventure)
time.sleep(3)

# Scraping Function
def scrape_movies(existing_movies_count):
    movie_items = driver.find_elements(By.XPATH, '//li[@class="ipc-metadata-list-summary-item"]')

    new_movies = []  
    for index, movie in enumerate(movie_items[existing_movies_count:], start=existing_movies_count):
        try:
            title = movie.find_element(By.XPATH, './/a[@class="ipc-title-link-wrapper"]').text
        except:
            title = "N/A"

        try:
            rating = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--rating"]').text
        except:
            rating = "N/A"

        try:
            views = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--voteCount"]').text
        except:
            views = "N/A"

        try:
            metadata_container = movie.find_element(By.XPATH, './/div[contains(@class, "metadata")]')
            metadata_spans = metadata_container.find_elements(By.XPATH, './/span')
            duration = next((span.text for span in metadata_spans if 'h' in span.text or 'min' in span.text), "N/A")
        except:
            duration = "N/A"

        movie_details = {
            "Index": index + 1,
            "Title": title,
            "Rating": rating,
            "Views": views,
            "Duration": duration,
            "Genre": genre_name
        }

        new_movies.append(movie_details)

    return new_movies

# Initial scrape
movies = scrape_movies(existing_movies_count=0)
print(f"Collected {len(movies)} movies so far...")

# Scroll and load more
click_attempts = 0
max_attempts = 25

while click_attempts < max_attempts:
    try:
        driver.execute_script("window.scrollBy(0, 100);")
        time.sleep(1)

        more_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(), '▼', ''), '50 more')]"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", more_button)
        time.sleep(1)
        more_button.click()
        print(f"Clicked '50 more' button (attempt {click_attempts + 1})")
        time.sleep(3)

        new_movies = scrape_movies(existing_movies_count=len(movies))
        if not new_movies:
            print("No new content loaded - ending.")
            break

        movies.extend(new_movies)
        print(f"Collected {len(movies)} movies so far...")

        click_attempts += 1

    except Exception as e:
        print(f"Stopping due to error: {str(e)}")
        break

# Save to CSV
with open("Adventuremovie.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Index", "Title", "Rating", "Views", "Duration", "Genre"])
    writer.writeheader()
    writer.writerows(movies)

print("Movies saved to Adventuremovie.csv successfully!")

# Close browser
driver.quit()




Collected 50 movies so far...
Clicked '50 more' button (attempt 1)
Collected 100 movies so far...
Clicked '50 more' button (attempt 2)
Collected 150 movies so far...
Clicked '50 more' button (attempt 3)
Collected 200 movies so far...
Clicked '50 more' button (attempt 4)
Collected 250 movies so far...
Clicked '50 more' button (attempt 5)
Collected 300 movies so far...
Clicked '50 more' button (attempt 6)
Collected 350 movies so far...
Clicked '50 more' button (attempt 7)
Collected 400 movies so far...
Clicked '50 more' button (attempt 8)
Collected 450 movies so far...
Clicked '50 more' button (attempt 9)
Collected 500 movies so far...
Clicked '50 more' button (attempt 10)
Collected 550 movies so far...
Clicked '50 more' button (attempt 11)
Collected 600 movies so far...
Stopping due to error: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7D7EA5335+78597]
	GetHandleVerifier [0x00007FF7D7EA5390+78688]
	(No symbol) [0x00007FF7D7C591AA]
	(No symbol) [0x00007FF7D7CAF149]
	(No symbol) [0

In [7]:
import pandas as pd

# Load the CSV file
df = pd.read_csv("Adventuremovie.csv")

# Display the first 10 rows
df.head(10)

Unnamed: 0,Index,Title,Rating,Views,Duration,Genre
0,1,1. Mufasa: The Lion King,6.6,(59K),1h 58m,Adventure
1,2,2. Moana 2,6.6,(99K),1h 40m,Adventure
2,3,3. Kraven the Hunter,5.5,(52K),2h 7m,Adventure
3,4,4. Flow,7.9,(70K),1h 25m,Adventure
4,5,5. Gladiator II,6.5,(225K),2h 28m,Adventure
5,6,6. Twisters,6.5,(165K),2h 2m,Adventure
6,7,7. Sonic the Hedgehog 3,6.9,(57K),1h 50m,Adventure
7,8,8. Venom: The Last Dance,6.0,(115K),1h 50m,Adventure
8,9,9. Deadpool & Wolverine,7.6,(488K),2h 8m,Adventure
9,10,10. Dune: Part Two,8.5,(621K),2h 46m,Adventure


In [8]:

import re

# Set up the browser
driver = webdriver.Chrome()
driver.get("https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31")
driver.maximize_window()
time.sleep(3)

# Expand Genre Filter
genre_label = driver.find_element(By.XPATH, "//label[contains(@aria-controls, 'genreAccordion')]")
driver.execute_script("window.scrollBy(0, 400);") 
time.sleep(0.5)
driver.execute_script("arguments[0].click();", genre_label)
time.sleep(2)

# Select Action Genre
Comedy = driver.find_element(By.XPATH, "//span[text()='Comedy']")
genre_name = re.sub(r'\d+', '', Comedy.text).strip()
driver.execute_script("arguments[0].click();",Comedy)
time.sleep(3)

# Scraping Function
def scrape_movies(existing_movies_count):
    movie_items = driver.find_elements(By.XPATH, '//li[@class="ipc-metadata-list-summary-item"]')

    new_movies = []  
    for index, movie in enumerate(movie_items[existing_movies_count:], start=existing_movies_count):
        try:
            title = movie.find_element(By.XPATH, './/a[@class="ipc-title-link-wrapper"]').text
        except:
            title = "N/A"

        try:
            rating = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--rating"]').text
        except:
            rating = "N/A"

        try:
            views = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--voteCount"]').text
        except:
            views = "N/A"

        try:
            metadata_container = movie.find_element(By.XPATH, './/div[contains(@class, "metadata")]')
            metadata_spans = metadata_container.find_elements(By.XPATH, './/span')
            duration = next((span.text for span in metadata_spans if 'h' in span.text or 'min' in span.text), "N/A")
        except:
            duration = "N/A"

        movie_details = {
            "Index": index + 1,
            "Title": title,
            "Rating": rating,
            "Views": views,
            "Duration": duration,
            "Genre": genre_name
        }

        new_movies.append(movie_details)

    return new_movies

# Initial scrape
movies = scrape_movies(existing_movies_count=0)
print(f"Collected {len(movies)} movies so far...")

# Scroll and load more
click_attempts = 0
max_attempts = 25

while click_attempts < max_attempts:
    try:
        driver.execute_script("window.scrollBy(0, 100);")
        time.sleep(1)

        more_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(), '▼', ''), '50 more')]"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", more_button)
        time.sleep(1)
        more_button.click()
        print(f"Clicked '50 more' button (attempt {click_attempts + 1})")
        time.sleep(3)

        new_movies = scrape_movies(existing_movies_count=len(movies))
        if not new_movies:
            print("No new content loaded - ending.")
            break

        movies.extend(new_movies)
        print(f"Collected {len(movies)} movies so far...")

        click_attempts += 1

    except Exception as e:
        print(f"Stopping due to error: {str(e)}")
        break

# Save to CSV
with open("Comedymovie.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Index", "Title", "Rating", "Views", "Duration", "Genre"])
    writer.writeheader()
    writer.writerows(movies)

print("Movies saved to Comedydmovie.csv successfully!")

# Close browser
driver.quit()

Collected 50 movies so far...
Clicked '50 more' button (attempt 1)
Collected 100 movies so far...
Clicked '50 more' button (attempt 2)
Collected 150 movies so far...
Clicked '50 more' button (attempt 3)
Collected 200 movies so far...
Clicked '50 more' button (attempt 4)
Collected 250 movies so far...
Clicked '50 more' button (attempt 5)
Collected 300 movies so far...
Clicked '50 more' button (attempt 6)
Collected 350 movies so far...
Clicked '50 more' button (attempt 7)
Collected 400 movies so far...
Clicked '50 more' button (attempt 8)
Collected 450 movies so far...
Clicked '50 more' button (attempt 9)
Collected 500 movies so far...
Clicked '50 more' button (attempt 10)
Collected 550 movies so far...
Clicked '50 more' button (attempt 11)
Collected 600 movies so far...
Clicked '50 more' button (attempt 12)
Collected 650 movies so far...
Clicked '50 more' button (attempt 13)
Collected 700 movies so far...
Clicked '50 more' button (attempt 14)
Collected 750 movies so far...
Clicked '50 m

In [9]:
import pandas as pd

df = pd.read_csv("Comedymovie.csv")

df.head(10)

Unnamed: 0,Index,Title,Rating,Views,Duration,Genre
0,1,1. Anora,7.5,(177K),2h 19m,ComedyK
1,2,2. Moana 2,6.6,(99K),1h 40m,ComedyK
2,3,3. A Real Pain,7.1,(86K),1h 30m,ComedyK
3,4,4. Sonic the Hedgehog 3,6.9,(57K),1h 50m,ComedyK
4,5,5. Y2K,4.8,(13K),1h 31m,ComedyK
5,6,6. Deadpool & Wolverine,7.6,(488K),2h 8m,ComedyK
6,7,7. The Ministry of Ungentlemanly Warfare,6.8,(134K),2h 2m,ComedyK
7,8,8. Freaky Tales,7.2,(960),1h 47m,ComedyK
8,9,9. The Day the Earth Blew Up: A Looney Tunes M...,7.0,(3.7K),1h 31m,ComedyK
9,10,10. Riff Raff,5.7,(2.8K),1h 43m,ComedyK


In [10]:

import re

# Set up the browser
driver = webdriver.Chrome()
driver.get("https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31")
driver.maximize_window()
time.sleep(3)

# Expand Genre Filter
genre_label = driver.find_element(By.XPATH, "//label[contains(@aria-controls, 'genreAccordion')]")
driver.execute_script("window.scrollBy(0, 400);") 
time.sleep(0.5)
driver.execute_script("arguments[0].click();", genre_label)
time.sleep(2)

# Select Action Genre
Crime = driver.find_element(By.XPATH, "//span[text()='Crime']")
genre_name = re.sub(r'\d+', '', Crime.text).strip()
driver.execute_script("arguments[0].click();",Crime)
time.sleep(3)

# Scraping Function
def scrape_movies(existing_movies_count):
    movie_items = driver.find_elements(By.XPATH, '//li[@class="ipc-metadata-list-summary-item"]')

    new_movies = []  
    for index, movie in enumerate(movie_items[existing_movies_count:], start=existing_movies_count):
        try:
            title = movie.find_element(By.XPATH, './/a[@class="ipc-title-link-wrapper"]').text
        except:
            title = "N/A"

        try:
            rating = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--rating"]').text
        except:
            rating = "N/A"

        try:
            views = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--voteCount"]').text
        except:
            views = "N/A"

        try:
            metadata_container = movie.find_element(By.XPATH, './/div[contains(@class, "metadata")]')
            metadata_spans = metadata_container.find_elements(By.XPATH, './/span')
            duration = next((span.text for span in metadata_spans if 'h' in span.text or 'min' in span.text), "N/A")
        except:
            duration = "N/A"

        movie_details = {
            "Index": index + 1,
            "Title": title,
            "Rating": rating,
            "Views": views,
            "Duration": duration,
            "Genre": genre_name
        }

        new_movies.append(movie_details)

    return new_movies

# Initial scrape
movies = scrape_movies(existing_movies_count=0)
print(f"Collected {len(movies)} movies so far...")

# Scroll and load more
click_attempts = 0
max_attempts = 25

while click_attempts < max_attempts:
    try:
        driver.execute_script("window.scrollBy(0, 100);")
        time.sleep(1)

        more_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(), '▼', ''), '50 more')]"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", more_button)
        time.sleep(1)
        more_button.click()
        print(f"Clicked '50 more' button (attempt {click_attempts + 1})")
        time.sleep(3)

        new_movies = scrape_movies(existing_movies_count=len(movies))
        if not new_movies:
            print("No new content loaded - ending.")
            break

        movies.extend(new_movies)
        print(f"Collected {len(movies)} movies so far...")

        click_attempts += 1

    except Exception as e:
        print(f"Stopping due to error: {str(e)}")
        break

# Save to CSV
with open("Crimemovie.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Index", "Title", "Rating", "Views", "Duration", "Genre"])
    writer.writeheader()
    writer.writerows(movies)

print("Movies saved to Crimedmovie.csv successfully!")

# Close browser
driver.quit()



Collected 50 movies so far...
Clicked '50 more' button (attempt 1)
Collected 100 movies so far...
Clicked '50 more' button (attempt 2)
Collected 150 movies so far...
Clicked '50 more' button (attempt 3)
Collected 200 movies so far...
Clicked '50 more' button (attempt 4)
Collected 250 movies so far...
Clicked '50 more' button (attempt 5)
Collected 300 movies so far...
Clicked '50 more' button (attempt 6)
Collected 350 movies so far...
Clicked '50 more' button (attempt 7)
Collected 400 movies so far...
Clicked '50 more' button (attempt 8)
Collected 450 movies so far...
Clicked '50 more' button (attempt 9)
Collected 500 movies so far...
Clicked '50 more' button (attempt 10)
Collected 550 movies so far...
Clicked '50 more' button (attempt 11)
Collected 600 movies so far...
Clicked '50 more' button (attempt 12)
Collected 650 movies so far...
Clicked '50 more' button (attempt 13)
Collected 700 movies so far...
Clicked '50 more' button (attempt 14)
Collected 750 movies so far...
Clicked '50 m

In [11]:
import pandas as pd
df = pd.read_csv("Crimemovie.csv")

df.head(10)

Unnamed: 0,Index,Title,Rating,Views,Duration,Genre
0,1,1. Trap,5.8,(136K),1h 45m,Crime
1,2,2. Pushpa: The Rule - Part 2,6.1,(56K),3h 21m,Crime
2,3,3. Longlegs,6.6,(184K),1h 41m,Crime
3,4,4. Freaky Tales,7.2,(960),1h 47m,Crime
4,5,5. Juror #2,7.0,(96K),1h 54m,Crime
5,6,6. The Beekeeper,6.3,(155K),1h 45m,Crime
6,7,7. Carry-On,6.5,(163K),1h 59m,Crime
7,8,8. Riff Raff,5.7,(2.8K),1h 43m,Crime
8,9,9. The Order,6.8,(40K),1h 56m,Crime
9,10,10. Emilia Pérez,5.4,(87K),2h 12m,Crime


In [12]:

import re

# Set up the browser
driver = webdriver.Chrome()
driver.get("https://www.imdb.com/search/title/?title_type=feature&release_date=2024-01-01,2024-12-31")
driver.maximize_window()
time.sleep(3)

# Expand Genre Filter
genre_label = driver.find_element(By.XPATH, "//label[contains(@aria-controls, 'genreAccordion')]")
driver.execute_script("window.scrollBy(0, 400);") 
time.sleep(0.5)
driver.execute_script("arguments[0].click();", genre_label)
time.sleep(2)

# Select Action Genre
Animation = driver.find_element(By.XPATH, "//span[text()='Animation']")
genre_name = re.sub(r'\d+', '', Animation.text).strip()
driver.execute_script("arguments[0].click();",Animation)
time.sleep(3)

# Scraping Function
def scrape_movies(existing_movies_count):
    movie_items = driver.find_elements(By.XPATH, '//li[@class="ipc-metadata-list-summary-item"]')

    new_movies = []  
    for index, movie in enumerate(movie_items[existing_movies_count:], start=existing_movies_count):
        try:
            title = movie.find_element(By.XPATH, './/a[@class="ipc-title-link-wrapper"]').text
        except:
            title = "N/A"

        try:
            rating = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--rating"]').text
        except:
            rating = "N/A"

        try:
            views = movie.find_element(By.XPATH, './/span[@class="ipc-rating-star--voteCount"]').text
        except:
            views = "N/A"

        try:
            metadata_container = movie.find_element(By.XPATH, './/div[contains(@class, "metadata")]')
            metadata_spans = metadata_container.find_elements(By.XPATH, './/span')
            duration = next((span.text for span in metadata_spans if 'h' in span.text or 'min' in span.text), "N/A")
        except:
            duration = "N/A"

        movie_details = {
            "Index": index + 1,
            "Title": title,
            "Rating": rating,
            "Views": views,
            "Duration": duration,
            "Genre": genre_name
        }

        new_movies.append(movie_details)

    return new_movies

# Initial scrape
movies = scrape_movies(existing_movies_count=0)
print(f"Collected {len(movies)} movies so far...")

# Scroll and load more
click_attempts = 0
max_attempts = 25

while click_attempts < max_attempts:
    try:
        driver.execute_script("window.scrollBy(0, 100);")
        time.sleep(1)

        more_button = WebDriverWait(driver, 20).until(
            EC.element_to_be_clickable((By.XPATH, "//*[contains(translate(text(), '▼', ''), '50 more')]"))
        )
        driver.execute_script("arguments[0].scrollIntoView({block: 'center'});", more_button)
        time.sleep(1)
        more_button.click()
        print(f"Clicked '50 more' button (attempt {click_attempts + 1})")
        time.sleep(3)

        new_movies = scrape_movies(existing_movies_count=len(movies))
        if not new_movies:
            print("No new content loaded - ending.")
            break

        movies.extend(new_movies)
        print(f"Collected {len(movies)} movies so far...")

        click_attempts += 1

    except Exception as e:
        print(f"Stopping due to error: {str(e)}")
        break

# Save to CSV
with open("Animationmovie.csv", "w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=["Index", "Title", "Rating", "Views", "Duration", "Genre"])
    writer.writeheader()
    writer.writerows(movies)

print("Movies saved to Animationmovie.csv successfully!")

# Close browser
driver.quit()





Collected 50 movies so far...
Clicked '50 more' button (attempt 1)
Collected 100 movies so far...
Clicked '50 more' button (attempt 2)
Collected 150 movies so far...
Clicked '50 more' button (attempt 3)
Collected 200 movies so far...
Clicked '50 more' button (attempt 4)
Collected 250 movies so far...
Clicked '50 more' button (attempt 5)
Collected 300 movies so far...
Clicked '50 more' button (attempt 6)
Collected 350 movies so far...
Clicked '50 more' button (attempt 7)
Collected 400 movies so far...
Stopping due to error: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF7D7EA5335+78597]
	GetHandleVerifier [0x00007FF7D7EA5390+78688]
	(No symbol) [0x00007FF7D7C591AA]
	(No symbol) [0x00007FF7D7CAF149]
	(No symbol) [0x00007FF7D7CAF3FC]
	(No symbol) [0x00007FF7D7D02467]
	(No symbol) [0x00007FF7D7CD712F]
	(No symbol) [0x00007FF7D7CFF2BB]
	(No symbol) [0x00007FF7D7CD6EC3]
	(No symbol) [0x00007FF7D7CA03F8]
	(No symbol) [0x00007FF7D7CA1163]
	GetHandleVerifier [0x00007FF7D814EEED+2870973]
	Ge

In [13]:
import pandas as pd

df = pd.read_csv("Animationmovie.csv")

df.head(10)

Unnamed: 0,Index,Title,Rating,Views,Duration,Genre
0,1,1. Mufasa: The Lion King,6.6,(59K),1h 58m,Animation
1,2,2. Moana 2,6.6,(99K),1h 40m,Animation
2,3,3. Flow,7.9,(70K),1h 25m,Animation
3,4,4. The Wild Robot,8.2,(159K),1h 42m,Animation
4,5,5. The Day the Earth Blew Up: A Looney Tunes M...,7.0,(3.7K),1h 31m,Animation
5,6,6. Solo Leveling: ReAwakening,8.8,(11K),2h 1m,Animation
6,7,7. Despicable Me 4,6.2,(65K),1h 34m,Animation
7,8,8. Transformers One,7.6,(49K),1h 44m,Animation
8,9,9. The Lord of the Rings: The War of the Rohirrim,6.3,(30K),2h 14m,Animation
9,10,10. Inside Out 2,7.5,(215K),1h 36m,Animation


In [17]:
import pandas as pd
import glob

# Define the path to your CSV files
path = r"C:\Users\Nivetha s\Desktop\webscrapping\*.csv"  
csv_files = glob.glob(path)

if not csv_files:
    print("No CSV files found in the specified directory.")
else:
    # Read all CSVs into a list of DataFrames
    df_list = [pd.read_csv(file) for file in csv_files]

    # Concatenate all DataFrames
    merged_df = pd.concat(df_list, ignore_index=True)

    # Fill all blank (NaN) values with 0
    merged_df.fillna(0, inplace=True)

    # Save merged output without index
    output_path = r"C:\Users\Nivetha s\Desktop\webscrapping\merged_output.csv"
    merged_df.to_csv(output_path, index=False)

    print(f"CSV files successfully merged into: {output_path}")


CSV files successfully merged into: C:\Users\Nivetha s\Desktop\webscrapping\merged_output.csv


In [15]:
import mysql.connector

In [16]:

mydb = mysql.connector.connect(
  host="localhost",
  user="root",
  password="Nive#1029@"
)

print(mydb)

<mysql.connector.connection_cext.CMySQLConnection object at 0x00000137AAEE4EC0>


In [None]:
# Step 1-create the database named "Scrapping" in Mysql 8.0 Command line client
# Step 2-create table named "Imdbmovies" with the columns given bellow:
# CREATE TABLE Imdbmovies(Title VARCHAR(255),Rating FLOAT,Views INT,Duration INT,Genre VARCHAR(255));

In [17]:
mycursor = mydb.cursor()

mycursor.execute("SHOW DATABASES")

for x in mycursor:
  print(x)

("b'stockdata'",)
('information_schema',)
('mysql',)
('performance_schema',)
('sakila',)
('scrapping',)
('stockdata',)
('sys',)
('world',)


In [18]:
pip install pymysql pandas sqlalchemy

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [20]:
import os

csv_file_path = r"C:\Users\Nivetha s\Desktop\webscrapping\merged_output.csv"

if os.path.exists(csv_file_path):
    print("File exists!")
else:
    print("File not found. Check the path.")


File exists!


In [21]:
import pymysql

try:
    connection = pymysql.connect(
        host="127.0.0.1",  
        user="root",
        password="Nive#1029@", 
        database="scrapping"
    )
    print("Successfully connected to MySQL!")
    connection.close()
except pymysql.MySQLError as e:
    print(f"Connection failed: {e}")


Successfully connected to MySQL!


In [22]:
pip install cryptography

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [25]:
from sqlalchemy import create_engine
from urllib.parse import quote_plus

DB_USER = "root"
DB_PASSWORD = quote_plus("Nive#1029@")  # Encodes special characters
DB_NAME = "scrapping"

engine = create_engine(f"mysql+pymysql://{DB_USER}:{DB_PASSWORD}@127.0.0.1:3306/{DB_NAME}")

try:
    with engine.connect() as conn:
        print("Connection successful!")
except Exception as e:
    print("Connection failed:", e)


Connection successful!


In [None]:
import pandas as pd

# Load the CSV file
csv_file_path = r"C:\Users\Nivetha s\Desktop\webscrapping\merged_output.csv"
df = pd.read_csv(csv_file_path)

# Remove duplicates
df.drop_duplicates(inplace=True)

# Fill missing values
df.fillna({
    "Title": "Unknown",
    "Rating": 0.0,
    "Views": 0,
    "Duration": 0,
    "Genre": "Unknown"
}, inplace=True)

# Convert 'Views' from format like '52K' to integer (52000)
df["Views"] = df["Views"].astype(str).str.replace("K", "000", regex=True)
df["Views"] = df["Views"].str.replace("[^0-9]", "", regex=True).replace("", "0").astype(int)

# Convert Duration to minutes from format like "1h 30m"
duration_extracted = df["Duration"].astype(str).str.extract(r"(?:(\d+)\s*h)?\s*(?:(\d+)\s*m)?")
duration_extracted = duration_extracted.fillna(0).astype(int)
df["Duration"] = duration_extracted[0] * 60 + duration_extracted[1]

# Remove numbering from start of title (e.g., "1. Gladiator" -> "Gladiator")
df["Title"] = df["Title"].str.replace(r"^\d+\.\s*", "", regex=True)

# Save cleaned data as SQL INSERT statements
sql_file_path = r"C:\Users\Nivetha s\Desktop\webscrapping\output.sql"
with open(sql_file_path, "w", encoding="utf-8") as f:
    for _, row in df.iterrows():
        title = row['Title'].replace('"', r'\"')  # escape quotes
        genre = row['Genre'].replace('"', r'\"')  # escape quotes
        sql_statement = (
            f'INSERT INTO Imdbmovies (Title, Rating, Views, Duration, Genre) '
            f'VALUES ("{title}", {row["Rating"]}, {row["Views"]}, {row["Duration"]}, "{genre}");\n'
        )
        f.write(sql_statement)

print(f"SQL file saved at: {sql_file_path}")


SQL file saved at: C:\Users\Nivetha s\Desktop\webscrapping\output.sql
