In [3]:
# set-up environment
import time

from selenium import webdriver
from bs4 import BeautifulSoup as bs
from dotenv import load_dotenv
from pathlib import Path
import os

In [4]:
# movies_dict used to store scraped data
movies_dict = {"title": [], # movie title
               "release_date": [], #  release year
               "movie_tv_show": [], # indicate whether it's a movie or tv show
               "duration":[], # duration in minutes
               "genre": [], # movie principal genre
               "other_genre": [], # other genre
               "imdb_rating": [], # rating
               "director":[], # director
               "stars": [], # star actors
               "summary": [], #brief summary about the movies
               "votes":[], # number of votes
               }

In [5]:

# driver Config
env_path = Path("file.env")
load_dotenv(env_path) # load env file
chrome_driver = os.getenv("CHROME_DRIVER") # get driver path
driver = webdriver.Chrome(executable_path=chrome_driver)
driver.set_window_size(1920, 1080)
# open imdb website
url = "https://www.imdb.com/"
driver.get(url=url)
time.sleep(5)
# click "Menu" link
menu = driver.find_element_by_xpath('//*[@id="imdbHeader-navDrawerOpen"]/span')
menu.click()
time.sleep(5)
# go to "Browse movies by genre" link
genre = driver.find_element_by_link_text('Browse Movies by Genre')
genre.click()
time.sleep(5)
# store popular genre in a list
movies_genre = driver.find_elements_by_css_selector('.bgxwho .ipc-chip-list__scroller a')
genre_list = [elem.get_attribute('href') for elem in movies_genre]
popular_genre = genre_list[:12] # get the first 12 popular genre

for link in popular_genre:
    i = 0
    driver.get(link)
    time.sleep(10)
    while i < 2:  # we choose 2 to get the 100 favorites movies from each genre (to get more just increase the number of page  )
        content = driver.find_elements_by_css_selector(".mode-advanced .lister-item-content")
        # data scraping
        for elem in content:
            soup = bs(elem.get_attribute('innerHTML'), "html.parser")

            title = ""
            try:
                title = soup.select_one("h3 a").getText()
            except AttributeError:
                pass
            finally:
                movies_dict["title"].append(title)

            release = ""
            try:
                release = soup.select_one("h3 .lister-item-year").getText()
            except AttributeError:
                pass
            finally:
                movies_dict["release_date"].append(release.strip("()"))

            mtv = ""
            try:
                mtv = soup.select_one("p .certificate").getText()
            except AttributeError:
                pass
            finally:
                movies_dict["movie_tv_show"].append(mtv)

            dur = ""
            try:
                dur = soup.select_one("p .runtime").getText()
            except AttributeError:
                pass
            finally:
                movies_dict["duration"].append(dur.strip(" min"))

            g = ","
            try:
                g = soup.select_one("p .genre").getText()
            except AttributeError:
                pass
            finally:
                gen = g.split(",")[0]
                tmp = g.split(",")[1:]
                oth_g = ""
                for elem in tmp:
                    oth_g = oth_g + f"{elem.strip()}/"
                movies_dict["genre"].append(gen.strip())
                movies_dict["other_genre"].append(oth_g)

            rate = ""
            try:
                rate = soup.select_one(".ratings-bar div strong").getText()
            except AttributeError:
                pass
            finally:
                movies_dict["imdb_rating"].append(rate)

            l = soup.select("p a")
            direc = ""
            stars = []
            try:
                direc = l[0].getText()
                for elem in l[1:]:
                    stars.append(elem.getText())
            except AttributeError:
                pass
            finally:
                movies_dict["director"].append(direc.strip())
                tmp = ""
                for elem in stars:
                    tmp = tmp + f"{elem.strip()} /"
                movies_dict["stars"].append(tmp)

            sum = ""
            try:
                s = soup.find_all(name="p", class_="text-muted")
                sum = s[1].getText()
            except AttributeError:
                pass
            finally:
                movies_dict["summary"].append(sum.strip())

            vote = ""
            try:
                vote = soup.find(attrs={"name": "nv"}).get_text()
            except AttributeError:
                pass
            finally:
                movies_dict["votes"].append(vote)

        next_page = driver.find_element_by_link_text("Next »")
        next_page.click()
        time.sleep(5)
        i += 1
driver.close()


In [6]:
# convert the scraped data to a pandas dataframe
import pandas as pd
df = pd.DataFrame(movies_dict)
df.shape

(1200, 11)

In [7]:
# display 10 random rows 
df.sample(n=10)

Unnamed: 0,title,release_date,movie_tv_show,duration,genre,other_genre,imdb_rating,director,stars,summary,votes
960,The Notebook,2004,PG-13,123.0,Drama,Romance/,7.8,Nick Cassavetes,Gena Rowlands /James Garner /Rachel McAdams /R...,A poor yet passionate young man falls in love ...,596450.0
411,Peaky Blinders,2013–2022,TV-MA,60.0,Crime,Drama/,8.8,Cillian Murphy,Paul Anderson /Sophie Rundle /Helen McCrory /,"A gangster family epic set in 1900s England, c...",602068.0
320,Bottoms,2023,R,92.0,Comedy,,6.9,Emma Seligman,Rachel Sennott /Ayo Edebiri /Ruby Cruz /Havana...,Two unpopular queer high school students start...,898.0
416,A Haunting in Venice,2023,PG-13,103.0,Crime,Drama/Horror/,,Kenneth Branagh,Kelly Reilly /Jamie Dornan /Kenneth Branagh /M...,"In post-World War II Venice, Poirot, now retir...",
1071,Blade Runner 2049,2017,R,164.0,Action,Drama/Mystery/,8.0,Denis Villeneuve,Harrison Ford /Ryan Gosling /Ana de Armas /Dav...,Young Blade Runner K's discovery of a long-bur...,620455.0
1164,Killers of the Flower Moon,2023,R,206.0,Crime,Drama/History/,,Martin Scorsese,Leonardo DiCaprio /Robert De Niro /Lily Gladst...,Members of the Osage tribe in the United State...,
502,Rebel Moon,2023,,,Action,Adventure/Drama/,,Zack Snyder,Sofia Boutella /Cleopatra Coleman /Ed Skrein /...,When a peaceful colony on the edge of a galaxy...,
735,Fear the Walking Dead,2015–2023,TV-MA,44.0,Drama,Horror/Sci-Fi/,6.8,Kim Dickens,Frank Dillane /Cliff Curtis /Rubén Blades /,"A Walking Dead spinoff set in Los Angeles, Cal...",137600.0
281,Gravity Falls,2012–2016,TV-Y7,23.0,Animation,Adventure/Comedy/,8.9,Kristen Schaal,Jason Ritter /Alex Hirsch /Linda Cardellini /,Twin siblings Dipper and Mabel Pines spend the...,124012.0
1124,X,II) (2022,R,105.0,Horror,Mystery/Thriller/,6.6,Ti West,Mia Goth /Jenna Ortega /Brittany Snow /Kid Cudi /,"In 1979, a group of young filmmakers set out t...",141762.0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1200 entries, 0 to 1199
Data columns (total 11 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   title          1200 non-null   object
 1   release_date   1200 non-null   object
 2   movie_tv_show  1200 non-null   object
 3   duration       1200 non-null   object
 4   genre          1200 non-null   object
 5   other_genre    1200 non-null   object
 6   imdb_rating    1200 non-null   object
 7   director       1200 non-null   object
 8   stars          1200 non-null   object
 9   summary        1200 non-null   object
 10  votes          1200 non-null   object
dtypes: object(11)
memory usage: 103.3+ KB


In [8]:
# save scraped data in a csv file
df.to_csv("scraped_data.csv", index=False)