## Web Scraping

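This notebook scrapes content from IMDb with `requests` and Beautiful Soup: first the browse-by-genre index page (section titles, genre lists and links), then a genre listing page (titles, years, ratings, votes, runtimes, genres and summaries).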



In [1]:
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
# IMDb's browse-by-genre index page.
url = 'https://www.imdb.com/feature/genre/?ref_=nv_ch_gr'
# A timeout stops the cell from hanging forever on a stalled connection, and
# raise_for_status() fails loudly instead of letting an error page be parsed.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.content

In [2]:
# requests exposes the raw response body via .content; check what type that is.
type(html)

bytes

The `BeautifulSoup` constructor accepts the markup either as a string/bytes object (as here, the raw response body) or as an open file-like object. Parsing the downloaded content of an online web page is the most common use of Beautiful Soup.

In [4]:
# Name the parser explicitly: Beautiful Soup recommends this so that the same
# results are produced across platforms and virtual environments.
soup = BeautifulSoup(html, "lxml")

soup

<!DOCTYPE html>
<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">
<head>
<meta charset="utf-8"/>
<meta content="IE=edge" http-equiv="X-UA-Compatible"/>
<meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>
<style>
                body#styleguide-v2 {
                    background: no-repeat fixed center top #000;
                }
            </style>
<script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:'java'};</script>
<script>
    if (typeof uet == 'function') {
      uet("bb", "LoadTitle", {wb: 1});
    }
</script>
<script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>
<title>Browse Movies and TV by Genre - IMDb</title>
<script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>
<script>
    if (typeof uet == 'function') {
      uet("be", "LoadTitle", {wb: 1});
 

In [8]:
# prettify() returns the parse tree as one indented string. As a bare last
# expression the notebook shows the string's repr, so line breaks appear as
# literal \n; wrap the call in print() to see the actual indentation.
soup.prettify()

'<!DOCTYPE html>\n<html xmlns:fb="http://www.facebook.com/2008/fbml" xmlns:og="http://ogp.me/ns#">\n <head>\n  <meta charset="utf-8"/>\n  <meta content="IE=edge" http-equiv="X-UA-Compatible"/>\n  <meta content="app-id=342792525, app-argument=imdb:///?src=mdot" name="apple-itunes-app"/>\n  <style>\n   body#styleguide-v2 {\n                    background: no-repeat fixed center top #000;\n                }\n  </style>\n  <script type="text/javascript">\n   var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};\n  </script>\n  <script>\n   if (typeof uet == \'function\') {\n      uet("bb", "LoadTitle", {wb: 1});\n    }\n  </script>\n  <script>\n   (function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);\n  </script>\n  <title>\n   Browse Movies and TV by Genre - IMDb\n  </title>\n  <script>\n   (function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);\n  </script>\n  <script>\n   if (typeof u

In [16]:
# First <h3> in the document, returned as a Tag; append .text for just its string.
soup.find('h3')

<h3> Popular TV Show and Movie Genres</h3>

## Filtering

### Search for Strings

Passing a tag name as a string (here `'h3'`) to `find_all` returns every tag with that name.

In [39]:
# Text of every <h3> heading on the page, in document order.
# Chain .strip().replace(' ', '_') onto heading.text to get underscore-joined labels.
new_list = [heading.text for heading in soup.find_all('h3')]

new_list

['Popular_TV_Show_and_Movie_Genres',
 'Popular_Movies_by_Genre',
 'Popular_TV_Series_by_Genre',
 "Prime_Video_(You_Haven't_Rated)_by_Genre",
 'Video_Games_by_Genre',
 'Movie_and_TV_Series_Themes',
 'Movie_Charts',
 'TV_Charts',
 'Top_Rated_English_Movies_by_Genre',
 'Advanced_Search',
 'Recently_Viewed']

In [None]:
# Trimmed text of every 'widget_header' span found inside an
# 'aux-content-widget-2' div, in document order.
title_list = [
    header.text.strip()
    for widget in soup.find_all('div', attrs={'class': 'aux-content-widget-2'})
    for header in widget.find_all('span', attrs={'class': 'widget_header'})
]

title_list

In [None]:
# One string per widget content block: the block's text fragments, each
# stripped and joined with '_' so they can be split apart again afterwards.
genre_list = [
    content.get_text('_', strip=True)
    for widget in soup.find_all('div', attrs={'class': 'aux-content-widget-2'})
    for content in widget.find_all('div', attrs={'class': 'widget_content no_inline_blurb'})
]

genre_list

In [75]:
# Split each '_'-joined string back into its individual entries,
# giving one list of entries per string in genre_list.
new_genre_list = [joined.split('_') for joined in genre_list]

new_genre_list

[['Anime',
  'Avant-Garde',
  'B-Movie',
  'Chick Flick',
  'Coming of Age',
  'Cult Film',
  'Dystopia',
  'Epic',
  'Espionage',
  'Femme Fatale',
  'High School',
  'Kung Fu',
  'Mockumentary',
  'Post-Apocalypse',
  'Spoof',
  'Supernatural',
  'Time Travel',
  'Vampire',
  'Zombie',
  'Browse/Search by keyword'],
 ['Most Popular Movies',
  'Top Rated Movies',
  'Top Rated Indian Movies',
  'Most Popular Future Movies'],
 ['Most Popular TV', 'Top Rated TV', 'Most Popular Future TV'],
 ['Action',
  'Adventure',
  'Animation',
  'Biography',
  'Comedy',
  'Crime',
  'Documentary',
  'Drama',
  'Family',
  'Fantasy',
  'Film Noir',
  'History',
  'Horror',
  'Music',
  'Musical',
  'Mystery',
  'Romance',
  'Sci-Fi',
  'Short',
  'Sport',
  'Superhero',
  'Thriller',
  'War',
  'Western'],
 ['']]

In [86]:
# Rebuild genre_list as a list of lists: for every 'aux-content-widget-2' div,
# the trimmed text of each 'table-row' div it contains. A widget with no
# table rows contributes an empty list.
genre_list = [
    [row.text.strip() for row in widget.find_all('div', attrs={'class': 'table-row'})]
    for widget in soup.find_all('div', attrs={'class': 'aux-content-widget-2'})
]

genre_list

[['Anime',
  'Avant-Garde',
  'B-Movie',
  'Chick Flick',
  'Coming of Age',
  'Cult Film',
  'Dystopia',
  'Epic',
  'Espionage',
  'Femme Fatale',
  'High School',
  'Kung Fu',
  'Mockumentary',
  'Post-Apocalypse',
  'Spoof',
  'Supernatural',
  'Time Travel',
  'Vampire',
  'Zombie',
  'Browse/Search by keyword'],
 ['Most Popular Movies',
  'Top Rated Movies',
  'Top Rated Indian Movies',
  'Most Popular Future Movies'],
 ['Most Popular TV', 'Top Rated TV', 'Most Popular Future TV'],
 ['Action',
  'Adventure',
  'Animation',
  'Biography',
  'Comedy',
  'Crime',
  'Documentary',
  'Drama',
  'Family',
  'Fantasy',
  'Film Noir',
  'History',
  'Horror',
  'Music',
  'Musical',
  'Mystery',
  'Romance',
  'Sci-Fi',
  'Short',
  'Sport',
  'Superhero',
  'Thriller',
  'War',
  'Western'],
 []]

In [87]:
# Map each widget title to its list of entries.
# zip() pairs the two lists that are actually used, so the result no longer
# depends on the length of an unrelated third list (new_genre_list), and it
# stops at the shorter input instead of raising IndexError.
genres_dict = dict(zip(title_list, genre_list))

genres_dict

{'Movie and TV Series Themes': ['Anime',
  'Avant-Garde',
  'B-Movie',
  'Chick Flick',
  'Coming of Age',
  'Cult Film',
  'Dystopia',
  'Epic',
  'Espionage',
  'Femme Fatale',
  'High School',
  'Kung Fu',
  'Mockumentary',
  'Post-Apocalypse',
  'Spoof',
  'Supernatural',
  'Time Travel',
  'Vampire',
  'Zombie',
  'Browse/Search by keyword'],
 'Movie Charts': ['Most Popular Movies',
  'Top Rated Movies',
  'Top Rated Indian Movies',
  'Most Popular Future Movies'],
 'TV Charts': ['Most Popular TV', 'Top Rated TV', 'Most Popular Future TV'],
 'Top Rated English Movies by Genre': ['Action',
  'Adventure',
  'Animation',
  'Biography',
  'Comedy',
  'Crime',
  'Documentary',
  'Drama',
  'Family',
  'Fantasy',
  'Film Noir',
  'History',
  'Horror',
  'Music',
  'Musical',
  'Mystery',
  'Romance',
  'Sci-Fi',
  'Short',
  'Sport',
  'Superhero',
  'Thriller',
  'War',
  'Western'],
 'Advanced Search': []}

In [2]:
# href of every <a> nested inside a div with class 'image', in document order.
# Anchors without an href attribute contribute None.
all_links = [
    anchor.get('href')
    for image_div in soup.find_all('div', attrs={'class': 'image'})
    for anchor in image_div.find_all('a')
]

NameError: name 'soup' is not defined

In [None]:
# Accumulates the fields scraped from every genre listing page, one list per column.
movie_dict = {'title':[], 'year':[], 'genre':[], 'rating':[], 'votes':[], 'length':[], 'age':[], 'summary':[]} 

for link in all_links:
    if not link:
        # Anchors without an href come back as None from .get('href').
        continue
    # hrefs may be site-relative; urljoin leaves already-absolute URLs untouched.
    page_url = urljoin('https://www.imdb.com', link)
    # Timeout so one stalled request cannot hang the whole loop.
    page_html = requests.get(page_url, timeout=10).content
    # Page-local names: the notebook-level `html`/`soup` are left untouched.
    page_soup = BeautifulSoup(page_html, features="lxml")

    title = [
        anchor.text
        for header in page_soup.find_all('h3', attrs={'class': 'lister-item-header'})
        for anchor in header.find_all('a')
    ]
    year = [span.text for span in page_soup.find_all('span', attrs={'class': 'lister-item-year text-muted unbold'})]
    rating = [div.get_text(strip=True) for div in page_soup.find_all('div', {'class': 'inline-block ratings-imdb-rating'})]
    votes = [span.get_text(strip=True) for span in page_soup.find_all('span', {'name': 'nv'}) if '$' not in span.text]

    # Every <p class="text-muted"> split on '|'; the odd-indexed entries are kept as summaries.
    txt_lst = [p.get_text(strip=True).split('|') for p in page_soup.find_all('p', {'class': 'text-muted'})]
    summary = txt_lst[1::2]

    # Store this page's results; previously they were discarded on every iteration.
    movie_dict['title'].extend(title)
    movie_dict['year'].extend(year)
    movie_dict['rating'].extend(rating)
    movie_dict['votes'].extend(votes)
    movie_dict['summary'].extend(summary)
    # TODO: 'genre', 'length' and 'age' are not filled yet; they have to be
    # parsed out of the even-indexed txt_lst rows.
    # NOTE(review): the columns are only row-aligned if every title on a page
    # has a rating and a vote count - confirm before building a table from this.
    
    
    

In [99]:
movie_dict = {'title':[], 'year':[], 'genre':[], 'rating':[], 'length':[], 'age':[], 'summary':[]} 

# Work on a single genre listing page (comedy). This rebinds `url`, `html`
# and `soup`, so the cells that follow parse this page.
url = 'https://www.imdb.com/search/title?genres=comedy&explore=title_type,genres'
# Timeout plus status check: fail fast rather than hang or parse an error page.
response = requests.get(url, timeout=10)
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, features="lxml")

# Text of every link inside each 'lister-item-header' <h3>.
title = [
    anchor.text
    for header in soup.find_all('h3', attrs={'class': 'lister-item-header'})
    for anchor in header.find_all('a')
]

title, len(title)

(['Never Have I Ever',
  'After Life',
  'Community',
  'Bad Education',
  'O Escritório',
  'The Gentlemen - Senhores do Crime',
  'Brooklyn Nine-Nine',
  'Era Uma Vez em... Hollywood',
  'The Willoughbys',
  'Parasitas',
  'Upload',
  'The Midnight Gospel',
  'Parks and Recreation',
  'Friends',
  "Schitt's Creek",
  'Knives Out: Todos São Suspeitos',
  'Bad Boys Para Sempre',
  'Trolls: Tour Mundial',
  'Os Goonies',
  'A Teoria do Big Bang',
  'What We Do in the Shadows',
  'The Rookie',
  'Rick e Morty',
  'Ousadas e Golpistas',
  'Psych - Agentes Especiais',
  'No Limite',
  'Defesa à Medida',
  "Zoey's Extraordinary Playlist",
  'Jojo Rabbit',
  'Ask 101',
  "'Bora Lá",
  'Foi Assim que Aconteceu',
  'The Boys',
  'Os Simpsons',
  'Jumanji: O Nível Seguinte',
  'Sex Education',
  'Uma Família Muito Moderna',
  'Sonic: O Filme',
  'Dave',
  'Run',
  'Good Girls',
  'Isto Somos Nós',
  'Se Tu Soubesses...',
  'Ossos',
  'As Aventuras do Dr. Dolittle',
  'Médicos e Estagiários',
  

In [None]:
# Raw text of each title's year label on the listing page.
year = [
    span.text
    for span in soup.find_all('span', attrs={'class': 'lister-item-year text-muted unbold'})
]

# One list of digit runs per label; a label such as '(2009–2015)' would give
# ['2009', '2015'] and a label without digits gives [].
# The earlier version rebound `years` to the findall() result and then appended
# to that same list while iterating over it, so the loop never finished.
# r'\d+' (raw string, one or more digits) also avoids the empty matches that
# '\d*' produces.
years = [re.findall(r'\d+', label) for label in year]

years

In [None]:
# Stripped text of every 'inline-block ratings-imdb-rating' div on the page.
[
    rating_div.get_text(strip=True)
    for rating_div in soup.find_all('div', attrs={'class': 'inline-block ratings-imdb-rating'})
]

In [134]:
# Stripped text of each <span name="nv">, leaving out any whose text contains '$'.
# NOTE(review): presumably the '$' entries are gross figures that share this
# markup - confirm. The saved output (50) looks like it came from wrapping
# this expression in len().
[
    nv_span.get_text(strip=True)
    for nv_span in soup.find_all('span', attrs={'name': 'nv'})
    if '$' not in nv_span.text
]

50

In [120]:
# Every <p class="text-muted"> on the page, stripped and split on '|'.
# Each paragraph becomes a list of its '|'-separated fields.
txt_lst = [
    paragraph.get_text(strip=True).split('|')
    for paragraph in soup.find_all('p', {'class': 'text-muted'})
]

txt_lst

[['30 min', 'Comedy'],
 ["The complicated life of a modern-day first generation Indian American teenage girl, inspired by Mindy Kaling's own childhood."],
 ['30 min', 'Comedy, Drama'],
 ["After Tony's wife dies unexpectedly, his nice-guy persona is altered into an impulsive, devil-may-care attitude; taking his old world by storm."],
 ['22 min', 'Comedy'],
 ['A suspended lawyer is forced to enroll in a community college with an eclectic staff and student body.'],
 ['108 min', 'Biography, Comedy, Crime'],
 ["The beloved superintendent of New York's Roslyn school district and his staff, friends and relatives become the prime suspects in the unfolding of the single largest public school embezzlement scandal in American history."],
 ['22 min', 'Comedy'],
 ['A mockumentary on a group of typical office workers, where the workday consists of ego clashes, inappropriate behavior, and tedium.'],
 ['M/16', '113 min', 'Action, Comedy, Crime'],
 ['An American expat tries to sell off his highly profi

In [119]:
# Keep the odd-indexed rows of txt_lst (positions 1, 3, 5, ...), which hold
# the summary text.
summary = txt_lst[1::2]

len(summary)

50

In [None]:
# Even-indexed rows of txt_lst hold each title's '|'-separated metadata line.
age_time_genre = txt_lst[::2]

# Rows come in different widths, e.g. ['30 min', 'Comedy'] or
# ['M/16', '113 min', 'Action, Comedy, Crime'], so fields are identified by
# content rather than by position alone. Each output list gets exactly one
# entry per row (None when the field is missing) so the three stay aligned.
age, time, genre = [], [], []
for row in age_time_genre:
    fields = [field.strip() for field in row if field.strip()]
    # The runtime is the field that ends in 'min'.
    runtime = next((field for field in fields if field.endswith('min')), None)
    others = [field for field in fields if field != runtime]
    # assumes genres are listed last and an age rating, when present, first - TODO confirm on more rows
    genre.append(others[-1] if others else None)
    age.append(others[0] if len(others) > 1 else None)
    time.append(runtime)
