# 1. Setting Up the Environment

## 1.1. Install dependencies:
This command updates the system's package list and installs various libraries required for running Chrome and Selenium.

In [None]:
# Refresh the package index before installing anything.
!apt-get update -y
# Install the system packages listed below (one package per continuation line).
!apt-get install -y \
libglib2.0-0 \
libnss3 \
libdbus-glib-1-2 \
libgconf-2-4 \
libfontconfig1 \
libvulkan1 \
gconf2-common \
libwayland-server0 \
libgbm1 \
udev \
libu2f-udev 
# Ask apt to repair any broken or unmet dependencies left by the install above.
!apt --fix-broken install -y  

## 1.2. Download and extract Chrome:

To use Selenium, you will need to download and install Chrome and Chromedriver.

* **Chrome**: Chrome is a popular web browser that is known for its speed and security.
* **Chromedriver**: Chromedriver is a tool that allows Selenium to interact with Chrome.

The cell below downloads a pinned Chrome for Testing build (version 116.0.5845.96, not necessarily the latest release) for 64-bit Linux and extracts it to the /usr/bin directory.

In [None]:
# Fetch the Chrome for Testing 116.0.5845.96 linux64 archive into /tmp.
!wget -P /tmp https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5845.96/linux64/chrome-linux64.zip
# Unpack the archive under /usr/bin/.
!unzip /tmp/chrome-linux64.zip -d /usr/bin/

## 1.3. Download and extract Chromedriver:

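As with Chrome in the previous step, the cell below downloads the Chromedriver build with the same version number (116.0.5845.96) and extracts it to the /usr/bin directory.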

In [None]:
# Fetch the Chromedriver 116.0.5845.96 linux64 archive into /tmp (same version as the Chrome download).
!wget -P /tmp https://edgedl.me.gvt1.com/edgedl/chrome/chrome-for-testing/116.0.5845.96/linux64/chromedriver-linux64.zip
# Unpack the archive under /usr/bin/.
!unzip /tmp/chromedriver-linux64.zip -d /usr/bin/

## 1.4. Install Python libraries

In [None]:
# Install the distribution's Selenium package, then pin the pip package to 3.141.0.
# NOTE(review): initialize_driver passes executable_path= to webdriver.Chrome;
# presumably the 3.141.0 pin is what keeps that call style working - confirm before upgrading.
!apt install -y python3-selenium
!pip install selenium==3.141.0

# 2. Importing Libraries

This notebook relies on the following Python libraries, which must be installed before the imports below will work:

* **selenium**: The Selenium library provides the API for interacting with web pages.
* **webdriver**: The `webdriver` module of Selenium provides a way to interact with web drivers, such as Chromedriver.
* **BeautifulSoup**: The BeautifulSoup library is used for parsing HTML content.

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

In [None]:
from retrying import retry
import time
import traceback

# 3. Configuring Chrome Driver

The constants `CHROME_BINARY_LOCATION` and `CHROMEDRIVER_BINARY_LOCATION` define the locations of the Chrome and Chromedriver executables. Additionally, `initialize_driver` creates a Chrome webdriver instance with specific options:

* *--headless*: Runs Chrome in headless mode, making it invisible.
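* *--no-sandbox*: Disables Chrome's security sandbox. This is typically required when Chrome runs as root inside a container or notebook environment; it is not a performance option.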
* *--start-fullscreen*: Starts Chrome in fullscreen mode.
* *--allow-insecure-localhost*: Allows access to insecure local websites (if needed).
* *--disable-dev-shm-usage*: Makes Chrome write its shared-memory files to /tmp instead of /dev/shm, which avoids crashes in environments where /dev/shm is small.
* *user-agent*: Sets the user agent string to mimic a regular browser.

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.action_chains import ActionChains

# Filesystem paths handed to Selenium by initialize_driver: the Chrome binary
# (set as the options' binary_location) and the Chromedriver executable.
# Presumably these match where the archives were unzipped under /usr/bin/ - confirm
# if the download cells change.
CHROME_BINARY_LOCATION = "/usr/bin/chrome-linux64/chrome"
CHROMEDRIVER_BINARY_LOCATION = "/usr/bin/chromedriver-linux64/chromedriver"

def add_driver_options(options):
    """
    Build a Chrome ``Options`` object from a collection of arguments.

    Args:
        options: Iterable of command-line argument strings; each one is
            passed to ``Options.add_argument`` in the order given.

    Returns:
        Options: A new options object carrying every supplied argument.
    """
    configured = Options()
    for argument in options:
        configured.add_argument(argument)
    return configured

def initialize_driver():
    """
    Create the Chrome web driver used for scraping.

    The browser is launched from CHROME_BINARY_LOCATION through the driver at
    CHROMEDRIVER_BINARY_LOCATION, with a fixed set of command-line arguments.

    Returns:
        webdriver.Chrome: The newly started driver.
    """
    launch_arguments = [
        "--headless",
        "--no-sandbox",
        "--start-fullscreen",
        "--allow-insecure-localhost",
        "--disable-dev-shm-usage",
        "user-agent=Chrome/116.0.5845.96",
    ]

    chrome_options = add_driver_options(launch_arguments)
    chrome_options.binary_location = CHROME_BINARY_LOCATION

    return webdriver.Chrome(
        executable_path=CHROMEDRIVER_BINARY_LOCATION,
        options=chrome_options,
    )


# 4. Using Selenium to scrape movie_urls and reviews

In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from tqdm import tqdm
import pandas as pd

# Define the WebDriver initialization function
def initialize_driver():
    """
    Start a Chrome web driver with the arguments this notebook needs.

    Returns:
        webdriver.Chrome: The newly started driver.
    """
    chrome_binary = "/usr/bin/chrome-linux64/chrome"
    chromedriver_binary = "/usr/bin/chromedriver-linux64/chromedriver"

    # Arguments are added in this order.
    launch_arguments = (
        "--headless",
        "--no-sandbox",
        "--start-fullscreen",
        "--allow-insecure-localhost",
        "--disable-dev-shm-usage",
        "user-agent=Chrome/116.0.5845.96",
    )

    opts = Options()
    opts.binary_location = chrome_binary
    for argument in launch_arguments:
        opts.add_argument(argument)

    return webdriver.Chrome(executable_path=chromedriver_binary, options=opts)

# Define the function to scrape movie URLs
def scrape_movie_urls(driver, num_movies):
    """
    Scrape a number of popular movie URLs from Letterboxd.

    Pages of https://letterboxd.com/films/popular/ are visited in order until
    ``num_movies`` URLs have been collected. If an error interrupts the scrape
    (for example the wait for ``a.frame`` elements times out), the URLs
    gathered up to that point are still returned rather than discarded.

    The driver is always quit before this function returns, so the caller
    cannot reuse it afterwards.

    Args:
        driver (webdriver.Chrome): The initialized Chrome driver.
        num_movies (int): Number of movie URLs to scrape.

    Returns:
        pd.DataFrame: DataFrame with a single 'url' column. It holds fewer
        than ``num_movies`` rows when an error cut the scrape short.
    """
    print("...Connected to selenium service!")
    movie_urls = []
    page = 1

    try:
        while len(movie_urls) < num_movies:
            driver.get(f"https://letterboxd.com/films/popular/page/{page}/")

            # Wait for the movie elements to be present
            WebDriverWait(driver, 10).until(
                EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.frame'))
            )

            elements = driver.find_elements(By.CSS_SELECTOR, 'a.frame')
            for element in tqdm(elements, desc=f"Processing Page {page}", leave=False):
                if len(movie_urls) >= num_movies:
                    break
                movie_urls.append(element.get_attribute('href'))

            print(f"Page {page} processed, {len(movie_urls)} URLs found.")
            page += 1

        # Convert the list to a DataFrame once the scraping loop is done
        df = pd.DataFrame(movie_urls, columns=['url'])
        print("Movie URLs successfully scraped!\nMovie URLs preview:")
        print(df.head().to_string())

    except Exception as e:
        print(f"An error occurred: {e}")
        # Keep whatever was collected before the failure instead of
        # returning an empty frame and throwing the earlier pages away.
        df = pd.DataFrame(movie_urls, columns=['url'])
        print(f"Returning the {len(df)} URLs collected before the error.")

    finally:
        print("\nClosing connection to selenium webdriver...")
        driver.quit()
        print("Connection closed successfully!\n\n")

    return df




In [None]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from tqdm import tqdm
import pandas as pd
import os

# Initialize the WebDriver
def initialize_driver():
    """
    Start a Chrome web driver for the review scraper.

    Returns:
        webdriver.Chrome: The newly started driver.
    """
    browser_path = "/usr/bin/chrome-linux64/chrome"
    driver_path = "/usr/bin/chromedriver-linux64/chromedriver"

    browser_options = Options()
    browser_options.binary_location = browser_path

    # Each argument is registered on the options object, in this order.
    for argument in (
        "--headless",
        "--no-sandbox",
        "--start-fullscreen",
        "--allow-insecure-localhost",
        "--disable-dev-shm-usage",
        "user-agent=Chrome/116.0.5845.96",
    ):
        browser_options.add_argument(argument)

    return webdriver.Chrome(executable_path=driver_path, options=browser_options)

# Normalize the star rating
def normalize_rating(star_rating):
    """
    Convert a star-rating string into a numeric score.

    Each '★' contributes 0.2 and each '½' contributes 0.1; every other
    character is ignored.

    Args:
        star_rating: String of rating symbols, e.g. '★★★½'.

    Returns:
        The sum of the symbol weights (0 when no rating symbol is present).
    """
    symbol_weights = {'★': 0.2, '½': 0.1}
    score = 0
    for symbol in star_rating:
        weight = symbol_weights.get(symbol)
        if weight is not None:
            score += weight
    return score

# Extract review details from a review element
def extract_movie_review_details(element):
    """
    Read the fields of one review out of its container element.

    Args:
        element: Selenium element wrapping a single review.

    Returns:
        dict: Keys 'user_url', 'review_text', 'rating' and 'review_date'
        (in that order). A field is None when its sub-element is not found.
    """
    # (result key, CSS selector, how to read the value from the matched node)
    field_specs = (
        ('user_url', 'a.avatar.-a40',
         lambda node: node.get_attribute('href')),
        ('review_text', 'div.body-text.-prose.collapsible-text',
         lambda node: node.text),
        ('rating', 'span.rating',
         lambda node: normalize_rating(node.text)),
        ('review_date', 'span.date > span._nobr',
         lambda node: node.text),
    )

    details = {}
    for key, selector, read_value in field_specs:
        try:
            details[key] = read_value(element.find_element(By.CSS_SELECTOR, selector))
        except NoSuchElementException:
            details[key] = None
    return details

# Scrape reviews for a single movie
def scrape_reviews_for_movie(driver, movie_url, num_reviews_per_movie):
    """
    Scrape up to ``num_reviews_per_movie`` reviews for a single movie.

    Review pages are visited in order, starting at ``<movie_url>/reviews/``.
    Only reviews that have both text and a rating are kept. Scraping stops
    when enough reviews have been collected or when a page contains no
    review elements at all, so a movie with too few reviews cannot keep the
    loop running forever.

    Args:
        driver: Selenium driver used to load the pages.
        movie_url: URL of the movie page; a trailing '/' is tolerated.
        num_reviews_per_movie (int): Maximum number of reviews to collect.

    Returns:
        list[dict]: One dict per review with the fields produced by
        extract_movie_review_details plus 'movie_title', 'movie_year' and
        'movie_url'. Reviews gathered before a TimeoutException are kept.
    """
    movie_reviews = []
    page = 1
    # A movie URL ending in '/' would otherwise produce '.../film//reviews/'.
    base_url = str(movie_url).rstrip('/')

    try:
        driver.get(f"{base_url}/reviews/")

        # Scrape movie title and release year
        try:
            movie_title_element = driver.find_element(
                By.CSS_SELECTOR, 'div.contextual-title h1.headline-2 a')
            movie_title = movie_title_element.text
        except NoSuchElementException:
            movie_title = None

        try:
            movie_year_element = driver.find_element(
                By.CSS_SELECTOR, 'div.contextual-title h1.headline-2 small.metadata a')
            movie_year = movie_year_element.text
        except NoSuchElementException:
            movie_year = None

        while len(movie_reviews) < num_reviews_per_movie:
            review_elements = driver.find_elements(By.CSS_SELECTOR, 'li.film-detail')

            if not review_elements:
                # No reviews on this page: there is nothing further to
                # paginate through, so stop rather than request pages forever.
                break

            for element in review_elements:
                if len(movie_reviews) >= num_reviews_per_movie:
                    break

                review = extract_movie_review_details(element)
                review['movie_title'] = movie_title
                review['movie_year'] = movie_year
                review['movie_url'] = movie_url

                # Keep only reviews that carry both text and a rating.
                if review['review_text'] and review['rating'] is not None:
                    movie_reviews.append(review)

            if len(movie_reviews) >= num_reviews_per_movie:
                break

            page += 1
            driver.get(f"{base_url}/reviews/page/{page}/")

    except TimeoutException:
        print(f"Timeout while loading reviews for movie {movie_url}")

    return movie_reviews

# Save the DataFrame as a checkpoint
def save_checkpoint(df, checkpoint_filename):
    """
    Write ``df`` to ``checkpoint_filename`` as CSV and report the save.

    Args:
        df: DataFrame to persist; the index is not written.
        checkpoint_filename: Destination path of the CSV file.
    """
    df.to_csv(path_or_buf=checkpoint_filename, index=False)
    print(f"Checkpoint saved to {checkpoint_filename}")

# Scrape reviews from a list of movie URLs
def scrape_reviews_from_movies(driver, movie_urls, num_reviews_per_movie, checkpoint_filename="movie_reviews_checkpoint.csv"):
    """
    Scrape reviews for every movie URL and checkpoint the results to CSV.

    A checkpoint is written after every 10 movies and once more when the
    loop ends. If scraping a movie raises, the reviews collected so far are
    still written to the checkpoint file and the driver is still quit
    before the exception propagates to the caller.

    Args:
        driver: Selenium driver used for scraping; it is quit before this
            function returns or raises.
        movie_urls: Iterable of movie page URLs.
        num_reviews_per_movie (int): Maximum number of reviews per movie.
        checkpoint_filename (str): CSV path used for the checkpoints.

    Returns:
        pd.DataFrame: One row per collected review.
    """
    print("Successfully connected to WebDriver!")
    all_reviews = []
    checkpoint_interval = 10  # Save every 10 movies

    try:
        try:
            for idx, movie_url in enumerate(tqdm(movie_urls, desc="Processing Movies"), start=1):
                print(f"Scraping reviews for movie: {movie_url}")
                movie_reviews = scrape_reviews_for_movie(driver, movie_url, num_reviews_per_movie)
                all_reviews.extend(movie_reviews)

                # Save checkpoint at intervals
                if idx % checkpoint_interval == 0:
                    df = pd.DataFrame(all_reviews)
                    save_checkpoint(df, checkpoint_filename)
        finally:
            # Save the final data. This also runs when a movie raised, so
            # reviews gathered since the last checkpoint are not lost.
            df = pd.DataFrame(all_reviews)
            save_checkpoint(df, checkpoint_filename)

        print("Reviews successfully scraped from movie URLs!\nReviews preview:")
        print(df.head().to_string())
    finally:
        # Release the browser on both the success and the failure path.
        print("\nClosing connection to selenium WebDriver...")
        driver.quit()
        print("Connection closed successfully!\n\n")

    return df




In [None]:
# Entry point for the URL-scraping step: start a driver, collect the popular
# movie URLs and write them to /kaggle/working/movie_urls.csv.
if __name__ == "__main__":
    driver = initialize_driver()
    try:
        num_movies_to_scrape = 200  # Adjust this number as needed
        movie_urls_df = scrape_movie_urls(driver, num_movies_to_scrape)
        # to_csv is called without index=False, so the row index is written
        # as an extra unnamed first column next to 'url'.
        movie_urls_df.to_csv('/kaggle/working/movie_urls.csv')
    finally:
        # scrape_movie_urls already quits the driver in its own finally block;
        # this second quit is a safety net for failures outside that function.
        driver.quit()  # Ensure the driver is closed after usage

In [None]:
# Entry point for the review-scraping step: load the movie URLs saved by the
# URL-scraping step, scrape their reviews and write them to CSV.
if __name__ == "__main__":
    driver = initialize_driver()
    try:
        num_reviews_per_movie = 100  # Adjust as needed
        # Read from the same absolute path the URL-scraping step writes to,
        # so this cell does not depend on the current working directory.
        movie_urls_df = pd.read_csv('/kaggle/working/movie_urls.csv')  # Assuming you have a CSV file of movie URLs
        reviews_df = scrape_reviews_from_movies(driver, movie_urls_df['url'], num_reviews_per_movie)
        # Save to file if needed
        reviews_df.to_csv('/kaggle/working/movie_reviews.csv')
    finally:
        # scrape_reviews_from_movies quits the driver itself on success; this
        # covers failures that happen before or outside that call.
        driver.quit()  # Ensure the driver is closed after usage