In [14]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import time
from selenium.webdriver.chrome.service import Service
from tqdm import tqdm

In [15]:
driver_path = r"C:\Users\user\Documents\Python stuff\IMDB Text Analytics\chromedriver.exe"
service = Service(driver_path)
driver = webdriver.Chrome(service=service)

# IMDb reviews URL
url = "https://www.imdb.com/title/tt11315808/reviews/"
driver.get(url)

# Wait for the "Load More" button to appear
wait = WebDriverWait(driver, 12)

# Simulate loading reviews dynamically
scroll_count = 0
try:
    pbar = tqdm(desc="Loading reviews", unit="scroll")  # Progress bar for loading reviews
    while True:
        # Find the button
        load_more_button = wait.until(EC.element_to_be_clickable((By.CLASS_NAME, "ipc-see-more__button")))

        # Scroll to the button to ensure it is visible
        driver.execute_script("arguments[0].scrollIntoView(true);", load_more_button)

        # Handle overlapping elements (e.g., popups or modals)
        try:
            load_more_button.click()
        except Exception as e:
            print(f"Error clicking 'Load More' button: {e}")
            # Attempt to dismiss any modal/popups that might block the button
            overlays = driver.find_elements(By.CLASS_NAME, "sc-kDvujY")
            for overlay in overlays:
                try:
                    driver.execute_script("arguments[0].style.display = 'none';", overlay)
                except Exception as dismiss_error:
                    print(f"Failed to dismiss overlay: {dismiss_error}")
            continue

        time.sleep(5)  # Allow time for reviews to load
        scroll_count += 1
        pbar.update(1)  # Update the progress bar
except Exception as e:
    print("No more reviews to load or an error occurred:", e)
finally:
    pbar.close()  # Close the progress bar

# Get the full page source after dynamic loading
html_content = driver.page_source

# Close the WebDriver
driver.quit()

# Parse the loaded page with BeautifulSoup
soup = BeautifulSoup(html_content, 'html.parser')

# Extract reviews
review_containers = soup.find_all('div', class_='ipc-html-content-inner-div', role='presentation')

# Progress bar for extracting reviews
pbar = tqdm(total=len(review_containers), desc="Extracting reviews", unit="review")
extracted_reviews = []
for container in review_containers:
    try:
        # Extract review text
        review_text = container.text.strip()

        # Locate the parent or surrounding context to extract rating and summary
        parent_block = container.find_parent('article') if container else None

        # Extract rating
        rating_tag = parent_block.find('span', class_='ipc-rating-star--rating') if parent_block else None
        rating = rating_tag.text.strip() if rating_tag else "No Rating"

        # Extract review summary
        summary_tag = parent_block.find('h3', class_='ipc-title__text') if parent_block else None
        summary = summary_tag.text.strip() if summary_tag else "No Summary"

        # Append extracted data
        extracted_reviews.append({
            'Rating': rating,
            'Summary': summary,
            'Review': review_text
        })
    except Exception as e:
        print(f"Error extracting review: {e}")
    pbar.update(1)  # Update the progress bar
pbar.close()  # Close the progress bar

# Save to DataFrame and CSV
final_reviews_df = pd.DataFrame(extracted_reviews)
final_csv_path = 'joker2_reviews.csv'
final_reviews_df.to_csv(final_csv_path, index=False)

print(f"Reviews saved to {final_csv_path}")

Loading reviews: 91scroll [08:01,  5.29s/scroll]

No more reviews to load or an error occurred: Message: 
Stacktrace:
	GetHandleVerifier [0x00007FF63EA06CC5+28821]
	(No symbol) [0x00007FF63E973850]
	(No symbol) [0x00007FF63E81578A]
	(No symbol) [0x00007FF63E8691BE]
	(No symbol) [0x00007FF63E8694AC]
	(No symbol) [0x00007FF63E8B2647]
	(No symbol) [0x00007FF63E88F33F]
	(No symbol) [0x00007FF63E8AF412]
	(No symbol) [0x00007FF63E88F0A3]
	(No symbol) [0x00007FF63E85A778]
	(No symbol) [0x00007FF63E85B8E1]
	GetHandleVerifier [0x00007FF63ED3FCCD+3408029]
	GetHandleVerifier [0x00007FF63ED5743F+3504143]
	GetHandleVerifier [0x00007FF63ED4B61D+3455469]
	GetHandleVerifier [0x00007FF63EACBDCB+835995]
	(No symbol) [0x00007FF63E97EB6F]
	(No symbol) [0x00007FF63E97A824]
	(No symbol) [0x00007FF63E97A9BD]
	(No symbol) [0x00007FF63E96A1A9]
	BaseThreadInitThunk [0x00007FFB4D797374+20]
	RtlUserThreadStart [0x00007FFB4F33CC91+33]




Extracting reviews: 100%|███████████████████████████████████████████████████| 1675/1675 [00:00<00:00, 20939.20review/s]

Reviews saved to joker2_reviews.csv



