# Web Scraper With Python

[Reference](https://medium.com/better-programming/the-only-step-by-step-guide-youll-need-to-build-a-web-scraper-with-python-e79066bd895a)

## Import libraries

In [1]:
import requests # Requests will allow us to send HTTP requests to get HTML files
from requests import get
from bs4 import BeautifulSoup # BeautifulSoup will help us parse the HTML files
import pandas as pd # pandas will help us assemble the data into a DataFrame to clean and analyze it
import numpy as np # NumPy will add support for mathematical functions and tools for working with arrays

## Request movie data in English

In [2]:
# Ask the server for US English first, then any English variant (quality weight 0.5).
headers = {
    "Accept-Language": "en-US, en;q=0.5",
}

## Request contents of the URL

In [3]:
# IMDb "Top 1000" title search page that the rest of the notebook scrapes.
url = "https://www.imdb.com/search/title/?groups=top_1000&ref_=adv_prv"

# requests never times out by default; an explicit timeout keeps the cell from
# hanging forever on a stalled connection.
results = requests.get(url, headers=headers, timeout=30)

# Stop here on a 4xx/5xx response instead of silently parsing an error page.
results.raise_for_status()

## Using BeautifulSoup

In [4]:
# Parse the downloaded HTML with Python's built-in "html.parser" backend.
soup = BeautifulSoup(results.text, "html.parser")

# Preview only the beginning of the document: printing the whole page floods
# the notebook output and bloats the saved file without adding information.
print(soup.prettify()[:2000])

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
 <head>
  <meta charset="utf-8"/>
  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>
  <meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
  <script type="text/javascript">
   var IMDbTimer={starttime: new Date().getTime(),pt:'java'};
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <title>
   IMDb "Top 1000"
(Sorted by Popularity Ascending) - IMDb
  </title>
  <script>
   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);
  </script>
  <script>
   if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
    }
  </script>
  <script>
   if (typeof uex == 'function') {
      uex("ld", "Loa

## Initialize your storage

In [5]:
# One accumulator list per scraped field. The generator produces a fresh list
# for every name, so the seven lists are independent objects.
(
    titles,
    years,
    time,
    imdb_ratings,
    metascores,
    votes,
    us_gross,
) = ([] for _ in range(7))

## Find all ```lister-item mode-advanced``` divs

In [6]:
# Collect every <div> whose class attribute is 'lister-item mode-advanced';
# the loop that follows treats each match as one movie entry.
movie_div = soup.find_all(
    name='div',
    class_='lister-item mode-advanced',
)

## Getting into each ```lister-item mode-advanced``` div

In [7]:
def _text_or_dash(tag):
    """Return the text of a BeautifulSoup tag, or '-' when the tag is missing or has no text."""
    if tag is None:
        return '-'
    return tag.text or '-'


# Walk every movie container and append exactly one value per field to each
# list, so the lists stay the same length and row-aligned.
# NOTE: the lists are created in an earlier cell; re-run that cell before
# re-running this one, otherwise every movie is appended a second time.
for container in movie_div:

    # Name
    name = container.h3.a.text
    titles.append(name)

    # Year, kept as the raw text (e.g. '(2019)')
    year = container.h3.find('span', class_='lister-item-year').text
    years.append(year)

    # Runtime: check the tag itself before reading .text, because find() returns
    # None when the span is absent and None.text raises AttributeError.
    first_paragraph = container.p
    runtime_tag = (
        first_paragraph.find('span', class_='runtime')
        if first_paragraph is not None
        else None
    )
    runtime = _text_or_dash(runtime_tag)
    time.append(runtime)

    # IMDb rating; NaN keeps the list numeric when the rating tag is missing
    rating_tag = container.strong
    imdb = float(rating_tag.text) if rating_tag is not None else float('nan')
    imdb_ratings.append(imdb)

    # Metascore ('-' when the movie has none)
    m_score = _text_or_dash(container.find('span', class_='metascore'))
    metascores.append(m_score)

    # There can be two "nv" spans: the first holds the votes, the second the gross
    nv = container.find_all('span', attrs={'name': 'nv'})

    # Votes: first nv span, if any
    vote = _text_or_dash(nv[0]) if nv else '-'
    votes.append(vote)

    # Gross: second nv span, if present
    grosses = _text_or_dash(nv[1]) if len(nv) > 1 else '-'
    us_gross.append(grosses)

In [8]:
# Dump each scraped list on its own line to eyeball the raw values.
print(
    titles,
    years,
    time,
    imdb_ratings,
    metascores,
    votes,
    us_gross,
    sep="\n",
)

['Knives Out', 'Ford v Ferrari', 'Once Upon a Time... in Hollywood', 'Parasite', 'Joker', 'The Gentlemen', '1917', 'Avengers: Endgame', 'Into the Wild', 'The Shawshank Redemption', 'Inception', 'Jojo Rabbit', 'The Godfather', 'The Lord of the Rings: The Fellowship of the Ring', 'Back to the Future', 'Little Women', 'Interstellar', 'The Dark Knight', 'Watchmen', "Harry Potter and the Sorcerer's Stone", 'The Silence of the Lambs', 'The Wolf of Wall Street', 'Call Me by Your Name', 'Thor: Ragnarok', 'Jaws', 'Alien', 'The Help', 'The Matrix', 'Jurassic Park', 'Pulp Fiction', 'Django Unchained', 'Inglourious Basterds', 'The Lighthouse', 'V for Vendetta', 'The Princess Bride', 'Gladiator', 'Spider-Man: Into the Spider-Verse', 'Avengers: Infinity War', 'Mad Max: Fury Road', 'Boogie Nights', 'The Prestige', 'Titanic', 'Forrest Gump', 'Gone Girl', 'Kingsman: The Secret Service', 'The Green Mile', 'Zodiac', 'Se7en', 'Fight Club', 'Blade Runner 2049']
['(2019)', '(2019)', '(2019)', '(2019)', '(20

## Building a DataFrame With pandas

In [9]:
# Pair each output column name with the list scraped for it; the order of the
# names below is the column order of the resulting DataFrame.
movies = pd.DataFrame(
    dict(
        zip(
            ('movie', 'year', 'timeMin', 'imdb', 'metascore', 'votes', 'us_grossMillions'),
            (titles, years, time, imdb_ratings, metascores, votes, us_gross),
        )
    )
)

In [10]:
# Plain-text view of the freshly assembled (still uncleaned) table.
print(str(movies))

                                                movie  ... us_grossMillions
0                                          Knives Out  ...         $165.36M
1                                      Ford v Ferrari  ...         $117.62M
2                    Once Upon a Time... in Hollywood  ...         $142.50M
3                                            Parasite  ...          $53.37M
4                                               Joker  ...         $335.45M
5                                       The Gentlemen  ...                -
6                                                1917  ...         $159.23M
7                                   Avengers: Endgame  ...         $858.37M
8                                       Into the Wild  ...          $18.35M
9                            The Shawshank Redemption  ...          $28.34M
10                                          Inception  ...         $292.58M
11                                        Jojo Rabbit  ...           $0.35M
12          

## Data Cleaning

### Checking Data Types

In [11]:
# List the dtype of every column to see which ones still need converting.
print(f"{movies.dtypes}")

movie                object
year                 object
timeMin              object
imdb                float64
metascore            object
votes                object
us_grossMillions     object
dtype: object


In [12]:
def _to_nullable_int(text_column):
    """Convert a Series of digit strings to nullable integers ('Int64').

    Entries that are not valid numbers (such as the '-' placeholder stored
    for missing values) become <NA> instead of raising ValueError, which is
    what a plain ``.astype(int)`` does.
    """
    return pd.to_numeric(text_column, errors='coerce').astype('Int64')


# Every column goes through .astype(str) first so this cell can be re-run
# safely: after the first run the columns are numeric and the .str accessor
# would otherwise raise AttributeError.

# '(2019)' -> 2019. The raw string avoids the invalid '\d' escape warning, and
# expand=False makes extract() return a Series rather than a one-column DataFrame.
movies['year'] = _to_nullable_int(
    movies['year'].astype(str).str.extract(r'(\d+)', expand=False)
)

# Keep the first run of digits from the runtime text; the '-' placeholder becomes <NA>.
movies['timeMin'] = _to_nullable_int(
    movies['timeMin'].astype(str).str.extract(r'(\d+)', expand=False)
)

# Metascore text may carry surrounding whitespace; the '-' placeholder becomes <NA>.
movies['metascore'] = _to_nullable_int(movies['metascore'].astype(str).str.strip())

# Drop thousands separators before parsing the vote counts.
movies['votes'] = _to_nullable_int(
    movies['votes'].astype(str).str.replace(',', '', regex=False)
)

# '$165.36M' -> 165.36; anything non-numeric (the '-' placeholder) becomes NaN.
movies['us_grossMillions'] = pd.to_numeric(
    movies['us_grossMillions'].astype(str).str.lstrip('$').str.rstrip('M'),
    errors='coerce',
)

## Review the Cleaned and Converted Data

In [13]:
# Show the table followed by its column dtypes to confirm the conversions.
print("\n".join([str(movies), str(movies.dtypes)]))

                                                movie  ...  us_grossMillions
0                                          Knives Out  ...            165.36
1                                      Ford v Ferrari  ...            117.62
2                    Once Upon a Time... in Hollywood  ...            142.50
3                                            Parasite  ...             53.37
4                                               Joker  ...            335.45
5                                       The Gentlemen  ...               NaN
6                                                1917  ...            159.23
7                                   Avengers: Endgame  ...            858.37
8                                       Into the Wild  ...             18.35
9                            The Shawshank Redemption  ...             28.34
10                                          Inception  ...            292.58
11                                        Jojo Rabbit  ...              0.35

## Saving Your Data to a CSV

In [14]:
# The export is left disabled so that running the notebook does not write any file.
# Uncomment the line below to save the table as 'movies.csv' in the current
# working directory (pass index=False to leave out the row-index column).
# movies.to_csv('movies.csv')