In [None]:
import pandas as pd 
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time

## Scrapping 2024movies data

In [13]:
# Initialize the WebDriver
driver = webdriver.Chrome()
driver.get("https://www.imdb.com/search/title/?title_type=feature&release_date=2002-01-01,2023-12-31&count=250&sort=num_votes,desc")

# Initialize a dictionary to store the movie data
Movie_data = {
    "Movie Name": [],
    "Year": [],
    "Description": [],
    "Duration": [],
    "Language": [],
    "Genres": [],
    "Rating": [],
    "Rating_count": [],
    "Content_rating": [],
    "MetaScore": [],
    "Popularity": [],
    "Director": [],
    "Writer": [],
    "Budget": [],
    "Gross World Wide": [],
    "Oscar": [],
    "Awards": [],
    "Country of Origin": [],
    "release_date": []
}

# Function to collect movie links from the current page
def collect_movie_links():
    movie_links = driver.find_elements(By.XPATH, "//a[@class='ipc-lockup-overlay ipc-focusable']")
    hrefs = [link.get_attribute("href") for link in movie_links[:5000]]
    return hrefs

# Function to load more pages until 5000 movies are loaded
def load_more_pages():
    total_movies_loaded = 0
    movies_per_page = 250  # IMDb loads 250 movies per page
    max_movies = 5000     # Limit to 5000 movies

    while total_movies_loaded < max_movies:
        try:
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[.//span[contains(text(), '250 more')]]"))
            )
            actions = ActionChains(driver)
            actions.move_to_element(load_more_button).perform()
            load_more_button.click()
            time.sleep(5)  # Wait for new content to load
            total_movies_loaded += movies_per_page
            print(f"Loaded {total_movies_loaded} movies...")
        except Exception as e:
            print("No more 'Load More' button found or max movies reached, stopping.")
            break

    print(f"Total movies loaded: {min(total_movies_loaded, max_movies)}")

# Function to collect data for each movie from the list of movie links
def collect_movie_data(hrefs):
    for href in hrefs:
        driver.get(href)
        time.sleep(5)

        # Movie Name
        try:
            title = driver.find_element(By.XPATH, "//span[@class='hero__primary-text']").text
            Movie_data["Movie Name"].append(title)
        except:
            Movie_data["Movie Name"].append(None)

        # Year
        try:
            Year = driver.find_element(By.XPATH, "//*[@id='__next']/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul/li[1]/a").text
            Movie_data["Year"].append(Year)
        except:
            Movie_data["Year"].append(None)

        # Description
        try:
            Description = driver.find_element(By.XPATH, "//p[@class='sc-3ac15c8d-3 bMUzwm']").text
            Movie_data["Description"].append(Description)
        except:
            Movie_data["Description"].append(None)

        # Duration
        try:
            Duration = driver.find_element(By.XPATH, "//*[@id='__next']/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul/li[3]").text
            Movie_data["Duration"].append(Duration)
        except:
            Movie_data["Duration"].append(None)

        # Language
        try:
            Languages = driver.find_elements(By.XPATH, '//li[@data-testid="title-details-languages"]//a')
            languages_text = ", ".join([lang.text for lang in Languages])
            Movie_data["Language"].append(languages_text)
        except:
            Movie_data["Language"].append(None)

        # Genres
        try:
            Genres = driver.find_element(By.XPATH, "//*[@id='__next']/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[1]/section/div[1]/div[2]/a[1]").text
            Movie_data["Genres"].append(Genres)
        except:
            Movie_data["Genres"].append(None)

        # Rating
        try:
            rating = driver.find_element(By.XPATH, "//div[contains(@class, 'sc-d541859f-2')]").text
            Movie_data["Rating"].append(rating)
        except:
            Movie_data["Rating"].append(None)

        # Rating_count
        try:
            Rating_count = driver.find_element(By.XPATH, "//div[@class='sc-d541859f-3 dwhNqC']").text
            Movie_data["Rating_count"].append(Rating_count)
        except:
            Movie_data["Rating_count"].append(None)

        # Content_rating
        try:
            Content_rating = driver.find_element(By.XPATH, "//*[@id='__next']/main/div/section[1]/section/div[3]/section/section/div[2]/div[1]/ul/li[2]/a").text
            Movie_data["Content_rating"].append(Content_rating)
        except:
            Movie_data["Content_rating"].append(None)

        # MetaScore
        try:
            MetaScore = driver.find_element(By.XPATH, "//*[@id='__next']/main/div/section[1]/section/div[3]/section/section/div[3]/div[2]/div[2]/ul/li[3]/a/span/span[1]/span").text
            Movie_data["MetaScore"].append(MetaScore)
        except:
            Movie_data["MetaScore"].append(None)

        # Popularity
        try:
            Popularity = driver.find_element(By.XPATH, "//div[@class='sc-39d285cf-1 dxqvqi']").text
            Movie_data["Popularity"].append(Popularity)
        except:
            Movie_data["Popularity"].append(None)

        # Director
        try:
            Director = driver.find_element(By.XPATH, "//*[@id='__next']/main/div/section[1]/div/section/div/div[1]/section[4]/ul/li[1]/div/ul/li/a").text
            Movie_data["Director"].append(Director)
        except:
            Movie_data["Director"].append(None)

        # Writer
        try:
            Writer = driver.find_element(By.XPATH, "//*[@id='__next']/main/div/section[1]/div/section/div/div[1]/section[4]/ul/li[2]/div/ul/li[1]/a").text
            Movie_data["Writer"].append(Writer)
        except:
            Movie_data["Writer"].append(None)

        # Budget
        try:
            Budget = driver.find_element(By.XPATH, "//li[@data-testid='title-boxoffice-budget']//span[@class='ipc-metadata-list-item__list-content-item']").text
            Movie_data["Budget"].append(Budget)
        except:
            Movie_data["Budget"].append(None)

        # Gross World Wide
        try:
            Gross_World_Wide = driver.find_element(By.XPATH, "//li[@data-testid='title-boxoffice-cumulativeworldwidegross']//span[@class='ipc-metadata-list-item__list-content-item']").text
            Movie_data["Gross World Wide"].append(Gross_World_Wide)
        except:
            Movie_data["Gross World Wide"].append(None)

        # Awards
        try:
            Awards = driver.find_element(By.XPATH, "//ul[@class='ipc-inline-list ipc-inline-list--show-dividers ipc-inline-list--inline ipc-metadata-list-item__list-content base']").text
            Movie_data["Awards"].append(Awards)
        except:
            Movie_data["Awards"].append(None)

        # Oscar
        try:
            Oscar = driver.find_element(By.XPATH, "/html/body/div[2]/main/div/section[1]/div/section/div/div[1]/section[1]/div/ul/li/a[1]").text
            Movie_data["Oscar"].append(Oscar)
        except:
            Movie_data["Oscar"].append(None)

        # Release Date
        try:
            release_date = driver.find_element(By.XPATH, "//li[contains(@data-testid, 'title-details-releasedate')]//div/ul/li/a[1]").text
            Movie_data["release_date"].append(release_date)
        except:
            Movie_data["release_date"].append(None)

        # Country of Origin
        try:
            country_of_origin = driver.find_element(By.XPATH, "//li[@data-testid='title-details-origin']//ul//li//a").text
            Movie_data["Country of Origin"].append(country_of_origin)
        except:
            Movie_data["Country of Origin"].append(None)

# Call the load_more_pages function to load all pages
load_more_pages()

# Collect movie links from the loaded pages
hrefs = collect_movie_links()

# Collect movie data from the list of hrefs
collect_movie_data(hrefs)

# Close the driver when done
driver.quit()


Loaded 250 movies...
Loaded 500 movies...
Loaded 750 movies...
Loaded 1000 movies...
Loaded 1250 movies...
Loaded 1500 movies...
Loaded 1750 movies...
Loaded 2000 movies...
Loaded 2250 movies...
Loaded 2500 movies...
Loaded 2750 movies...
Loaded 3000 movies...
Loaded 3250 movies...
Loaded 3500 movies...
Loaded 3750 movies...
Loaded 4000 movies...
Loaded 4250 movies...
Loaded 4500 movies...
Loaded 4750 movies...
Loaded 5000 movies...
Total movies loaded: 5000


In [None]:
import pandas as pd

In [18]:
df = pd.DataFrame.from_dict(Movie_data, orient="index").transpose()
df

Unnamed: 0,Movie Name,Year,Description,Duration,Language,Genres,Rating,Rating_count,Content_rating,MetaScore,Popularity,Director,Writer,Budget,Gross World Wide,Oscar,Awards,Country of Origin,release_date
0,The Dark Knight,2008,When a menace known as the Joker wreaks havoc ...,2h 32m,"English, Mandarin",Action Epic,9.0\n/10,2.9M,PG-13,84,89,Christopher Nolan,Jonathan Nolan,"$185,000,000 (estimated)","$1,009,053,678",Won 2 Oscars,164 wins & 164 nominations total,United States,"July 18, 2008 (United States)"
1,Inception,2010,A thief who steals corporate secrets through t...,2h 28m,"English, Japanese, French",Action Epic,8.8\n/10,2.6M,PG-13,74,119,Christopher Nolan,Christopher Nolan,"$160,000,000 (estimated)","$839,030,630",Won 4 Oscars,159 wins & 220 nominations total,United States,"July 16, 2010 (United States)"
2,Interstellar,2014,When Earth becomes uninhabitable in the future...,2h 49m,English,Adventure Epic,8.7\n/10,2.2M,PG-13,74,37,Christopher Nolan,Jonathan Nolan,"$165,000,000 (estimated)","$730,999,801",Won 1 Oscar,44 wins & 148 nominations total,United States,"November 7, 2014 (United States)"
3,The Lord of the Rings: The Return of the King,2003,Gandalf and Aragorn lead the World of Men agai...,3h 21m,"English, Quenya, Old English, Sindarin",Adventure Epic,9.0\n/10,2M,PG-13,94,253,Peter Jackson,J.R.R. Tolkien,"$94,000,000 (estimated)","$1,137,996,691",Won 11 Oscars,215 wins & 124 nominations total,New Zealand,"December 17, 2003 (United States)"
4,The Dark Knight Rises,2012,"Bane, an imposing terrorist, attacks Gotham Ci...",2h 44m,"English, Arabic",Action Epic,8.4\n/10,1.9M,PG-13,78,260,Christopher Nolan,Jonathan Nolan,"$250,000,000 (estimated)","$1,114,975,066",Nominated for 1 BAFTA Award,45 wins & 103 nominations total,United States,"July 20, 2012 (United States)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,Not Okay,2022,An ambitious young woman finds followers and f...,1h 40m,English,Dark Comedy,6.1\n/10,20K,R,62,,Quinn Shephard,Quinn Shephard,,,Awards,2 nominations,United States,"July 29, 2022 (United States)"
4996,Black Christmas,2019,A group of female students is stalked by a str...,1h 32m,English,Conspiracy Thriller,3.5\n/10,20K,PG-13,49,,,,"$5,000,000 (estimated)","$18,529,730",,Sophia Takal,United States,"December 13, 2019 (United States)"
4997,The Lazarus Project,2008,A paroled former criminal is drawn into an end...,1h 40m,English,Drama,6.0\n/10,20K,PG-13,,,,,"$9,750,000 (estimated)","$9,162",,John Glenn,United States,2008 (Canada)
4998,Tom at the Farm,2013,"A grieving man meets his lover's family, who w...",1h 42m,French,Psychological Thriller,6.9\n/10,20K,Not Rated,67,,Xavier Dolan,Xavier Dolan,,"$687,505",Awards,9 wins & 30 nominations,France,"August 14, 2015 (United States)"


In [19]:
df.to_csv("final_5000_movie.csv")

In [105]:
df=pd.read_csv(r"C:\Users\Bhagy\Downloads\final_5000_movie.xls")
df

Unnamed: 0.1,Unnamed: 0,Movie Name,Year,Description,Duration,Language,Genres,Rating,Rating_count,Content_rating,MetaScore,Popularity,Director,Writer,Budget,Gross World Wide,Oscar,Awards,Country of Origin,release_date
0,0,The Dark Knight,2008,When a menace known as the Joker wreaks havoc ...,2h 32m,"English, Mandarin",Action Epic,9.0\n/10,2.9M,PG-13,84.0,89,Christopher Nolan,Jonathan Nolan,"$185,000,000 (estimated)","$1,009,053,678",Won 2 Oscars,164 wins & 164 nominations total,United States,"July 18, 2008 (United States)"
1,1,Inception,2010,A thief who steals corporate secrets through t...,2h 28m,"English, Japanese, French",Action Epic,8.8\n/10,2.6M,PG-13,74.0,119,Christopher Nolan,Christopher Nolan,"$160,000,000 (estimated)","$839,030,630",Won 4 Oscars,159 wins & 220 nominations total,United States,"July 16, 2010 (United States)"
2,2,Interstellar,2014,When Earth becomes uninhabitable in the future...,2h 49m,English,Adventure Epic,8.7\n/10,2.2M,PG-13,74.0,37,Christopher Nolan,Jonathan Nolan,"$165,000,000 (estimated)","$730,999,801",Won 1 Oscar,44 wins & 148 nominations total,United States,"November 7, 2014 (United States)"
3,3,The Lord of the Rings: The Return of the King,2003,Gandalf and Aragorn lead the World of Men agai...,3h 21m,"English, Quenya, Old English, Sindarin",Adventure Epic,9.0\n/10,2M,PG-13,94.0,253,Peter Jackson,J.R.R. Tolkien,"$94,000,000 (estimated)","$1,137,996,691",Won 11 Oscars,215 wins & 124 nominations total,New Zealand,"December 17, 2003 (United States)"
4,4,The Dark Knight Rises,2012,"Bane, an imposing terrorist, attacks Gotham Ci...",2h 44m,"English, Arabic",Action Epic,8.4\n/10,1.9M,PG-13,78.0,260,Christopher Nolan,Jonathan Nolan,"$250,000,000 (estimated)","$1,114,975,066",Nominated for 1 BAFTA Award,45 wins & 103 nominations total,United States,"July 20, 2012 (United States)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,4995,Not Okay,2022,An ambitious young woman finds followers and f...,1h 40m,English,Dark Comedy,6.1\n/10,20K,R,62.0,,Quinn Shephard,Quinn Shephard,,,Awards,2 nominations,United States,"July 29, 2022 (United States)"
4996,4996,Black Christmas,2019,A group of female students is stalked by a str...,1h 32m,English,Conspiracy Thriller,3.5\n/10,20K,PG-13,49.0,,,,"$5,000,000 (estimated)","$18,529,730",,Sophia Takal,United States,"December 13, 2019 (United States)"
4997,4997,The Lazarus Project,2008,A paroled former criminal is drawn into an end...,1h 40m,English,Drama,6.0\n/10,20K,PG-13,,,,,"$9,750,000 (estimated)","$9,162",,John Glenn,United States,2008 (Canada)
4998,4998,Tom at the Farm,2013,"A grieving man meets his lover's family, who w...",1h 42m,French,Psychological Thriller,6.9\n/10,20K,Not Rated,67.0,,Xavier Dolan,Xavier Dolan,,"$687,505",Awards,9 wins & 30 nominations,France,"August 14, 2015 (United States)"


## Reviews data scrapping 

In [None]:

# Initialize the WebDriver
driver = webdriver.Chrome()
driver.get("https://www.imdb.com/search/title/?title_type=feature&release_date=2002-01-01,2023-12-31&count=250&sort=num_votes,desc")

# Initialize a dictionary to store the movie data
Movie_data = {
    "Movie Name": [],
    "Reviews": []
}

# Function to collect movie links from the current page
def collect_movie_links():
    movie_links = driver.find_elements(By.XPATH, "//a[@class='ipc-lockup-overlay ipc-focusable']")
    hrefs = [link.get_attribute("href") for link in movie_links]
    return hrefs[:1]

# Function to load more pages until the "Load More" button is no longer present
def load_more_pages():
    while True:
        try:
            # Look for the "Load More" button and wait until it is clickable
            load_more_button = WebDriverWait(driver, 10).until(
                EC.element_to_be_clickable((By.XPATH, "//button[.//span[contains(text(), '250 more')]]"))
            )
            
            # Scroll to the "Load More" button and ensure it is in view
            actions = ActionChains(driver)
            actions.move_to_element(load_more_button).perform()
            time.sleep(1)  # Short wait before clicking
            
            # Click the "Load More" button
            load_more_button.click()
            time.sleep(10)  # Wait for new content to load

        except Exception as e:
            # If no more "Load More" button is found, break the loop
            print("No more 'Load More' button found, moving forward.")
            break

# Function to collect data for each movie from the list of movie links
def collect_movie_data(hrefs):
    for href in hrefs:
        driver.get(href)
        time.sleep(10)

        # Movie Name
        try:
            title = driver.find_element(By.XPATH, "//span[@class='hero__primary-text']").text
            Movie_data["Movie Name"].append(title)
        except Exception as e:
            print(f"Error retrieving title for {href}: {e}")
            Movie_data["Movie Name"].append(None)

        # Create the review URL
        review_href = href.split('?')[0] + 'reviews/?ref_=tt_urv_sm'

        # Collect reviews by visiting the review page
        try:
            driver.get(review_href)
            time.sleep(5)

            # Collect reviews from the review page
            reviews_data = []
            reviews = driver.find_elements(By.XPATH, "/html/body/div[2]/main/div/section/div/section/div/div[1]/section[1]/article[1]/div[1]/div[1]/div[3]/div/div/div")
            Movie_data["Reviews"].append(reviews[0].text)

        except Exception as e:
            print(f"Could not retrieve reviews for {href}. Error: {e}")
            Movie_data["Reviews"].append([])

load_more_pages()

hrefs = collect_movie_links()

print(hrefs)

collect_movie_data(hrefs)
print(Movie_data)

# driver.quit()


In [1]:
import pandas as pd 

In [None]:
df_1= pd.DataFrame.from_dict(Movie_data, orient="index").transpose()
df_1

In [4]:
df_1=pd.read_csv(r"C:\Users\Bhagy\OneDrive\Desktop\REVIEW_MOVIE.csv")

In [5]:
df_1

Unnamed: 0.1,Unnamed: 0,Movie Name,Reviews
0,0,The Dark Knight,Best movie ever. Heath ledger's work is phenom...
1,1,Inception,[None]
2,2,Interstellar,Sometimes I just need to see the start. Or end...
3,3,The Lord of the Rings: The Return of the King,"This movie, and trilogy in general, is a cinem..."
4,4,The Dark Knight Rises,"After eight years in seclusion, Batman resurfa..."
...,...,...,...
4995,4995,Not Okay,Not sure about why there's so much hate here. ...
4996,4996,Black Christmas,[None]
4997,4997,The Lazarus Project,a psychological thriller that will make you th...
4998,4998,Tom at the Farm,I was one of the lucky people to see this movi...
