# Web Scraping - Get titles, ratings, and release years from IMDb

**Course:** Python - Data Science Bootcamp

In [1]:
## install gapazpacho

!pip install gazpacho

Collecting gazpacho
  Downloading gazpacho-1.1.tar.gz (7.9 kB)
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: gazpacho
  Building wheel for gazpacho (pyproject.toml) ... [?25l[?25hdone
  Created wheel for gazpacho: filename=gazpacho-1.1-py3-none-any.whl size=7461 sha256=ec8df46db24b0a6f57cc7e3f535fbf15c5edf8e682b25329a75e68cb0ffc8886
  Stored in directory: /root/.cache/pip/wheels/9b/bf/9f/8c8849499462415fa5cdf0d9edb1103c189bdbece90c51488e
Successfully built gazpacho
Installing collected packages: gazpacho
Successfully installed gazpacho-1.1


In [2]:
## list installed packages and keep only the lines containing "gaz" to confirm the gazpacho install
!pip list | grep "gaz"

gazpacho                         1.1


In [4]:
from gazpacho import Soup
import requests

## IMDb search page for the "top_100" group, sorted by user rating (descending)
url = "https://www.imdb.com/search/title/?groups=top_100&sort=user_rating,desc"

## the timeout stops this cell from hanging forever if the server never answers
html = requests.get(url, timeout=30)

## fail right here on a 4xx/5xx reply so later cells never parse an error page
html.raise_for_status()

## `html` is the requests Response (the page source is in `html.text`);
## displaying it shows the HTTP status
html

<Response [200]>

In [10]:
## parse the downloaded page source into a gazpacho Soup object
imdb = Soup(html.text)

In [6]:
## get title data
## mode="all" makes find() always return a list (possibly empty); the default
## automatic mode returns a bare Soup for a single match and None for no match,
## and either of those would break the comprehension below
titles = imdb.find("h3", {"class": "lister-item-header"}, mode="all")

## strip() removes the markup and keeps only the text of each header,
## e.g. '1. The Shawshank Redemption (1994)'
clean_titles = [title.strip() for title in titles]

clean_titles

['1. The Shawshank Redemption (1994)',
 '2. The Godfather (1972)',
 '3. The Dark Knight (2008)',
 "4. Schindler's List (1993)",
 '5. The Lord of the Rings: The Return of the King (2003)',
 '6. The Godfather Part II (1974)',
 '7. 12 Angry Men (1957)',
 '8. Pulp Fiction (1994)',
 '9. Fight Club (1999)',
 '10. The Lord of the Rings: The Fellowship of the Ring (2001)',
 '11. Inception (2010)',
 '12. Forrest Gump (1994)',
 '13. The Lord of the Rings: The Two Towers (2002)',
 '14. The Good, the Bad and the Ugly (1966)',
 '15. Spider-Man: Across the Spider-Verse (2023)',
 '16. Interstellar (2014)',
 '17. Goodfellas (1990)',
 '18. The Matrix (1999)',
 "19. One Flew Over the Cuckoo's Nest (1975)",
 '20. Star Wars: Episode V - The Empire Strikes Back (1980)',
 '21. The Silence of the Lambs (1991)',
 '22. Se7en (1995)',
 '23. Spirited Away (2001)',
 '24. Saving Private Ryan (1998)',
 '25. The Green Mile (1999)',
 '26. Star Wars: Episode IV - A New Hope (1977)',
 '27. City of God (2002)',
 '28. Te

In [7]:
## get rating data
## mode="all" makes find() always return a list (possibly empty); the default
## automatic mode returns a bare Soup for a single match and None for no match,
## and either of those would break the comprehension below
ratings = imdb.find("div", {"class": "ratings-imdb-rating"}, mode="all")

## strip() removes the markup and keeps only the text, e.g. '9.3' (still a string)
clean_ratings = [rating.strip() for rating in ratings]

clean_ratings

['9.3',
 '9.2',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '9.0',
 '8.9',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.8',
 '8.7',
 '8.7',
 '8.7',
 '8.7',
 '8.7',
 '8.7',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.6',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5',
 '8.5']

In [8]:
## get launched year data
## mode="all" makes find() always return a list (possibly empty); the default
## automatic mode returns a bare Soup for a single match and None for no match,
## and either of those would break the comprehension below
years = imdb.find("span", {"class": "lister-item-year"}, mode="all")

## strip() removes the markup and keeps only the text; the parentheses are part
## of that text, and some entries carry an extra marker, e.g. '(I) (2023)'
clean_years = [year.strip() for year in years]

clean_years

['(1994)',
 '(1972)',
 '(2008)',
 '(1993)',
 '(2003)',
 '(1974)',
 '(1957)',
 '(1994)',
 '(1999)',
 '(2001)',
 '(2010)',
 '(1994)',
 '(2002)',
 '(1966)',
 '(2023)',
 '(2014)',
 '(1990)',
 '(1999)',
 '(1975)',
 '(1980)',
 '(1991)',
 '(1995)',
 '(2001)',
 '(1998)',
 '(1999)',
 '(1977)',
 '(2002)',
 '(1991)',
 '(1997)',
 '(1946)',
 '(1954)',
 '(1962)',
 '(I) (2023)',
 '(2014)',
 '(1979)',
 '(2000)',
 '(1960)',
 '(1985)',
 '(2006)',
 '(2019)',
 '(2006)',
 '(1994)',
 '(2012)',
 '(1998)',
 '(1994)',
 '(1995)',
 '(2002)',
 '(2011)',
 '(1942)',
 '(1954)']

In [9]:
## create dataframe
import pandas as pd

## one row per movie, one column per scraped list; pandas raises a ValueError
## if the three lists do not all have the same length
movie_data = (
    pd.DataFrame(data = {
        "title": clean_titles,
        "rating": clean_ratings,
        "launched year": clean_years
    })
    # ratings are scraped as text (e.g. '9.3'); store them as floats so that
    # sorting and aggregation are numeric rather than lexicographic
    .astype({"rating": "float64"})
)

movie_data.head()

Unnamed: 0,title,rating,launched year
0,1. The Shawshank Redemption (1994),9.3,(1994)
1,2. The Godfather (1972),9.2,(1972)
2,3. The Dark Knight (2008),9.0,(2008)
3,4. Schindler's List (1993),9.0,(1993)
4,5. The Lord of the Rings: The Return of the Ki...,9.0,(2003)
