In [33]:
import requests
from requests import get
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np

In [34]:
# Ensure only English-translated titles from movies
headers = {"Accept-Language": "en-US, en;q=0.5"}

In [35]:
url = 'https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv'
results = requests.get(url, headers=headers)

##### url : is the variable we create and assign the URL to
##### results : is the variable we create to store our request.get action
##### requests.get(url, headers=headers) : is the method we use to grab the contents of the URL. The headers part tells our scraper to bring us English, based on our previous line of code.

In [36]:
soup = BeautifulSoup(results.text, 'html.parser')
#print(soup.prettify())

##### soup : is the variable we create to assign the method BeatifulSoup to, which specifies a desired format of results using the HTML parser — this allows Python to read the components of the page rather than treating it as one long string
##### print(soup.prettify()) : will print what we’ve grabbed in a more structured tree format, making it easier to read

In [37]:
#initialize empty lists where you'll store your data
titles = []
years = []
time =[]
imdb_ratings = []
metascores = []
votes = []
us_gross = []

In [38]:
movie_div = soup.find_all('div',class_='lister-item-mode-advanced')

##### the find_all() method extracts all the div containers that have a class attribute of lister-item mode-advanced from what we have stored in our variable soup.

In [39]:
#initiate the for loop 
#this tells your scraper to iterate through 
#every div container we stored in move_div

In [40]:
#Compile everything into one cell

url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"
headers = {"Accept-Language": "en-US, en;q=0.5"}
results = requests.get(url, headers=headers)

soup = BeautifulSoup(results.text, "html.parser")

titles = []
years = []
time =[]
imdb_ratings = []
metascores = []
votes = []
us_gross = []

movie_div = soup.find_all('div', class_='lister-item mode-advanced')

for container in movie_div:
  
  #Name
  name = container.h3.a.text
  titles.append(name)
        
  #year
  year = container.h3.find('span', class_='lister-item-year').text
  years.append(year)

  #time
  runtime = container.p.find('span', class_='runtime').text if container.p.find('span', class_='runtime').text else '-'
  time.append(runtime)

  #IMDb rating
  imdb = float(container.strong.text)
  imdb_ratings.append(imdb)

  #metascore
  m_score = container.find('span', class_='metascore').text if container.find('span', class_='metascore') else '-'
  metascores.append(m_score)

  #here are two NV containers, grab both of them as they hold both the votes and the grosses
  nv = container.find_all('span', attrs={'name': 'nv'})
        
  #filter nv for votes
  vote = nv[0].text
  votes.append(vote)
        
  #filter nv for gross
  grosses = nv[1].text if len(nv) > 1 else '-'
  us_gross.append(grosses)

In [41]:
print(titles)
print(years)
print(time)
print(imdb_ratings)
print(metascores)
print(votes)
print(us_gross)

['Dune', 'Shang-Chi and the Legend of the Ten Rings', 'The Matrix', 'Shershaah', 'Avengers: Endgame', 'The Shawshank Redemption', 'Once Upon a Time... In Hollywood', 'Knives Out', 'Once Upon a Time in America', 'Wind River', "Harry Potter and the Sorcerer's Stone", 'The Wolf of Wall Street', "Zack Snyder's Justice League", 'The Father', 'Interstellar', 'The Dark Knight', 'Blade Runner 2049', 'Parasite', 'The Godfather', 'Bohemian Rhapsody', 'Inception', '1917', 'Pulp Fiction', 'Mad Max: Fury Road', 'The Lord of the Rings: The Fellowship of the Ring', 'Gone Girl', 'Joker', 'Goodfellas', 'Avengers: Infinity War', 'Gladiator', 'The Goonies', 'Forrest Gump', 'Jurassic Park', 'Man on Fire', 'Tombstone', 'Guardians of the Galaxy', 'Titanic', 'Prisoners', 'Another Round', 'American Psycho', 'Harry Potter and the Deathly Hallows: Part 2', 'Back to the Future', 'Deadpool', 'Casino Royale', 'Sicario', 'Django Unchained', 'Blade Runner', 'Inglourious Basterds', 'The Gentlemen', 'Thor: Ragnarok']


In [42]:
#Pandas DataFrame
movies = pd.DataFrame({
    'movie': titles,
    'year': years,
    'timeMin': time,
    'imdb': imdb_ratings,
    'metascore': metascores,
    'votes': votes,
    'us_grossMillions': us_gross,
})

In [43]:
movies.head()

Unnamed: 0,movie,year,timeMin,imdb,metascore,votes,us_grossMillions
0,Dune,(2021),155 min,8.5,75,36088,-
1,Shang-Chi and the Legend of the Ten Rings,(2021),132 min,7.9,71,85955,-
2,The Matrix,(1999),136 min,8.7,73,1757491,$171.48M
3,Shershaah,(2021),135 min,8.8,-,101531,-
4,Avengers: Endgame,(2019),181 min,8.4,78,938583,$858.37M


In [44]:
movies.dtypes

movie                object
year                 object
timeMin              object
imdb                float64
metascore            object
votes                object
us_grossMillions     object
dtype: object

In [45]:
movies['year'] = movies['year'].str.extract('(\d+)').astype(int)
#('(\d+)') to extract all digits in the string

In [46]:
movies['year'].dtype

dtype('int32')

In [47]:
movies['timeMin'] = movies['timeMin'].str.extract('(\d+)').astype(int)

In [48]:
#movies['metascore'] = movies['metascore'].astype(int)
#couldn't convert int

In [49]:
movies['votes'] = movies['votes'].str.replace(',', '').astype(int)

In [50]:
movies.dtypes

movie                object
year                  int32
timeMin               int32
imdb                float64
metascore            object
votes                 int32
us_grossMillions     object
dtype: object

In [51]:
movies['us_grossMillions'] = movies['us_grossMillions'].map(lambda x:x.lstrip('$').rstrip('M'))
movies['us_grossMillions'] = pd.to_numeric(movies['us_grossMillions'],errors='coerce')

In [52]:
movies['metascore'] = movies['metascore'].str.extract('(\d+)')
movies['metascore'] = pd.to_numeric(movies['metascore'], errors = 'coerce')

In [53]:
# turns out some movies have no metascore, giving an error.
# decided to turn into float instead
# movies['metascore'] = movies['metascore'].astype(float) gave an error

In [54]:
movies.dtypes

movie                object
year                  int32
timeMin               int32
imdb                float64
metascore           float64
votes                 int32
us_grossMillions    float64
dtype: object

In [55]:
print(movies.head())
print(movies.dtypes)

                                       movie  year  timeMin  imdb  metascore  \
0                                       Dune  2021      155   8.5       75.0   
1  Shang-Chi and the Legend of the Ten Rings  2021      132   7.9       71.0   
2                                 The Matrix  1999      136   8.7       73.0   
3                                  Shershaah  2021      135   8.8        NaN   
4                          Avengers: Endgame  2019      181   8.4       78.0   

     votes  us_grossMillions  
0    36088               NaN  
1    85955               NaN  
2  1757491            171.48  
3   101531               NaN  
4   938583            858.37  
movie                object
year                  int32
timeMin               int32
imdb                float64
metascore           float64
votes                 int32
us_grossMillions    float64
dtype: object


In [56]:
movies.to_csv('movies.csv')