In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import csv
import json
import os
import ast
import warnings
warnings.filterwarnings('ignore')

# Set up Splinter
browser = Browser('chrome')

# Create an empty list to store the movie data
movies = []

# Loop over all the URLs
for i in range(10):
    # Construct the URL for the current page
    url = f'https://www.imdb.com/search/title/?groups=top_1000&sort=user_rating,desc&count=100&start={i*100+1}&ref_=adv_nxt'

    # Visit the URL
    browser.visit(url)

    # Parse the website
    html = browser.html
    html_soup = soup(html, 'html.parser')

    # Find the list of movies
    movie_list = html_soup.find('div', {'class': 'lister-list'})

    # Find all the rows in the list
    movie_rows = movie_list.find_all('div', {'class': 'lister-item'})

    # Loop over the rows and extract the movie details
    for row in movie_rows:
        # Get the movie title
        title = row.find('h3', {'class': 'lister-item-header'}).find('a').text

        # Get the movie year
        year = row.find('span', {'class': 'lister-item-year'}).text.strip('()')

        # Get the movie rating
        rating = row.find('div', {'class': 'ratings-bar'}).find('strong').text

        # Get the movie director, stars, votes, and gross
        details = row.find('div', {'class': 'lister-item-content'})
        director = details.find_all('p')[2].find_all('a')[0].text
        stars = [star.text for star in details.find_all('p')[2].find_all('a')[1:]]
        votes = details.find('p', {'class': 'sort-num_votes-visible'}).find_all('span')[1].text.replace(',', '')
        gross = details.find('p', {'class': 'sort-num_votes-visible'}).find_all('span')[-1].text.replace('$', '').replace(',', '').strip()
        genres = details.find('span', {'class': 'genre'}).text.strip().split(', ')
        
        # Check if gross value is in millions or thousands and convert accordingly
        if gross.endswith('M'):
            gross = float(gross.strip('M')) * 1000000
        elif gross.endswith('K'):
            gross = float(gross.strip('K')) * 1000
            
        # Add the movie data to the list
        movies.append({'Title': title, 'Year': year, 'Rating': rating, 'Director': director, 'Stars': stars, 'Votes': votes, 'Gross': gross, 'Genres': genres})

# Create a DataFrame from the movie data
df = pd.DataFrame(movies)

print(df)

# Export the DataFrame to a CSV file
df.to_csv('imdb_top_1000_movies_final.csv', index=False)

                                             Title  Year Rating  \
0                         The Shawshank Redemption  1994    9.3   
1                                    The Godfather  1972    9.2   
2                                  The Dark Knight  2008    9.0   
3    The Lord of the Rings: The Return of the King  2003    9.0   
4                                 Schindler's List  1993    9.0   
..                                             ...   ...    ...   
995                                          Shine  1996    7.6   
996                            Eyes Without a Face  1960    7.6   
997                                 The Odd Couple  1968    7.6   
998                              The Invisible Man  1933    7.6   
999                                      Celda 211  2009    7.6   

                 Director                                              Stars  \
0          Frank Darabont  [Tim Robbins, Morgan Freeman, Bob Gunton, Will...   
1    Francis Ford Coppola  [Marlon 

In [2]:
# Read CSV
df = pd.read_csv('imdb_top_1000_movies_final.csv')

In [3]:
# Show CSV
df.head()

Unnamed: 0,Title,Year,Rating,Director,Stars,Votes,Gross,Genres
0,The Shawshank Redemption,1994,9.3,Frank Darabont,"['Tim Robbins', 'Morgan Freeman', 'Bob Gunton'...",2718055,#1,['Drama']
1,The Godfather,1972,9.2,Francis Ford Coppola,"['Marlon Brando', 'Al Pacino', 'James Caan', '...",1888630,#2,"['Crime', 'Drama']"
2,The Dark Knight,2008,9.0,Christopher Nolan,"['Christian Bale', 'Heath Ledger', 'Aaron Eckh...",2690794,#3,"['Action', 'Crime', 'Drama']"
3,The Lord of the Rings: The Return of the King,2003,9.0,Peter Jackson,"['Elijah Wood', 'Viggo Mortensen', 'Ian McKell...",1870260,#7,"['Action', 'Adventure', 'Drama']"
4,Schindler's List,1993,9.0,Steven Spielberg,"['Liam Neeson', 'Ralph Fiennes', 'Ben Kingsley...",1372956,#6,"['Biography', 'Drama', 'History']"


In [4]:
# Convert the string representation of the list to an actual list
df['Stars'] = df['Stars'].apply(lambda x: ast.literal_eval(x))
df['Genres'] = df['Genres'].apply(lambda x: ast.literal_eval(x))

In [5]:
# Clean up the 'Year' column by removing parentheses and Roman numerals I, II, and III
df['Year'] = df['Year'].str.replace('(', '').str.replace(')', '').str.replace('I', '').str.replace('II', '').str.replace('III', '')

In [6]:
# Show the dataframe
df

Unnamed: 0,Title,Year,Rating,Director,Stars,Votes,Gross,Genres
0,The Shawshank Redemption,1994,9.3,Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",2718055,#1,[Drama]
1,The Godfather,1972,9.2,Francis Ford Coppola,"[Marlon Brando, Al Pacino, James Caan, Diane K...",1888630,#2,"[Crime, Drama]"
2,The Dark Knight,2008,9.0,Christopher Nolan,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",2690794,#3,"[Action, Crime, Drama]"
3,The Lord of the Rings: The Return of the King,2003,9.0,Peter Jackson,"[Elijah Wood, Viggo Mortensen, Ian McKellen, O...",1870260,#7,"[Action, Adventure, Drama]"
4,Schindler's List,1993,9.0,Steven Spielberg,"[Liam Neeson, Ralph Fiennes, Ben Kingsley, Car...",1372956,#6,"[Biography, Drama, History]"
...,...,...,...,...,...,...,...,...
995,Shine,1996,7.6,Scott Hicks,"[Geoffrey Rush, Armin Mueller-Stahl, Justin Br...",55052,35810000.0,"[Biography, Drama, Music]"
996,Eyes Without a Face,1960,7.6,Georges Franju,"[Pierre Brasseur, Alida Valli, Juliette Maynie...",32898,50000.0,"[Drama, Horror]"
997,The Odd Couple,1968,7.6,Gene Saks,"[Jack Lemmon, Walter Matthau, John Fiedler, He...",35913,44530000.0,[Comedy]
998,The Invisible Man,1933,7.6,James Whale,"[Claude Rains, Gloria Stuart, William Harrigan...",37063,37063,"[Horror, Sci-Fi]"


In [7]:
# Drop the 'Gross' column
df = df.drop('Gross', axis=1)

In [8]:
# Show the dataframe
df

Unnamed: 0,Title,Year,Rating,Director,Stars,Votes,Genres
0,The Shawshank Redemption,1994,9.3,Frank Darabont,"[Tim Robbins, Morgan Freeman, Bob Gunton, Will...",2718055,[Drama]
1,The Godfather,1972,9.2,Francis Ford Coppola,"[Marlon Brando, Al Pacino, James Caan, Diane K...",1888630,"[Crime, Drama]"
2,The Dark Knight,2008,9.0,Christopher Nolan,"[Christian Bale, Heath Ledger, Aaron Eckhart, ...",2690794,"[Action, Crime, Drama]"
3,The Lord of the Rings: The Return of the King,2003,9.0,Peter Jackson,"[Elijah Wood, Viggo Mortensen, Ian McKellen, O...",1870260,"[Action, Adventure, Drama]"
4,Schindler's List,1993,9.0,Steven Spielberg,"[Liam Neeson, Ralph Fiennes, Ben Kingsley, Car...",1372956,"[Biography, Drama, History]"
...,...,...,...,...,...,...,...
995,Shine,1996,7.6,Scott Hicks,"[Geoffrey Rush, Armin Mueller-Stahl, Justin Br...",55052,"[Biography, Drama, Music]"
996,Eyes Without a Face,1960,7.6,Georges Franju,"[Pierre Brasseur, Alida Valli, Juliette Maynie...",32898,"[Drama, Horror]"
997,The Odd Couple,1968,7.6,Gene Saks,"[Jack Lemmon, Walter Matthau, John Fiedler, He...",35913,[Comedy]
998,The Invisible Man,1933,7.6,James Whale,"[Claude Rains, Gloria Stuart, William Harrigan...",37063,"[Horror, Sci-Fi]"


In [9]:
# Export dataframe to JSON file
df.to_json('imdb_top_1000_movies.json', orient='records')