# Beautiful Soup - IMDb Scraping
For this project I'm using urllib.request and BeautifulSoup to scrape the Internet Movie Database adavanced search results (approx 5600 titles). Worth noting I have only used the Requests and Selenium libraries in the past so I'm looking forward to ading BeautifulSoup to the toolbelt.

Advanced search details
* TV Series, Released between 2000-01-01 and 2020-12-31, Rating Count at least 500, United States, English (Sorted by Popularity Ascending)

In [2]:
# Load libraries 
from bs4 import BeautifulSoup
import matplotlib.pyplot as plt
import pandas as pd
from urllib.request import urlopen
import numpy as np
import re


In [89]:
# Creating an array with incriment of 250 to capture the 250 results per page each for each of the 5600+ titles 
nums = np.arange(1, 3250, 250)
nums = nums.astype(str)


In [97]:
# Instantiate the lists to capture the div, span, and classes
IMDB = []
Name = []
Cert = []
Years = []
Rating = []
Genre = []
Votes = []
Description = []
Stars = []
Runtime = []


# Looping through the numbers of the array which is concatenated to the link
for num in nums:
    url = "https://www.imdb.com/search/title/?title_type=tv_series&release_date=2000-01-01,2020-12-31&num_votes=100,&countries=us&languages=en&count=250&start="+num+"&ref_=adv_nxt"
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    movies = soup.find_all('div', {'class':'lister-item mode-advanced'})

# Looping through the movies contained in the movies lister-item for each page in the num loop
    for i in movies:
        try:
            IMDB.append(i.find('img', {'class':'loadlate'})['data-tconst'][2:])
        except:
            IMDB.append(np.nan)
        try:
            Name.append(i.h3.a.text)
        except:
            Name.append('None')
        try:
            Runtime.append(i.find("span",{'class':'runtime'}).text[:-3])
        except:
            # Add np.nan for later conversion to int and fill with mean/median
            Runtime.append(np.nan)
        try:
            # Split the text into release/end years 
            year = i.find("span", {"class": "lister-item-year text-muted unbold"}).text[1:-1]
            Years.append(year)
        except:
            Release_year.append(np.nan)
            End_year.append(np.nan)
        try:
            Genre.append(i.find("span", {"class": "genre"}).text[1:])
        except:
            Genre.append('None')
        try:
            Rating.append(i.find("div", {"class": "inline-block ratings-imdb-rating"})['data-value'])
        except:
            Rating.append(np.nan)
        try:
            # Replacing the comma for easier dytpe conversion later
            Votes.append(i.find("span", {"name": "nv"}).text.replace(",",""))
        except:
            Votes.append(np.nan)
        try:
            Description.append(i.find_all('p', {'class':'text-muted'})[1].text[1:])
        except:
            Description.append('None')
        try:
            star = i.find_all("a", href=True)[-4:]
            Stars.append(star[0].text+", "+star[1].text+", "+star[2].text+", "+star[3].text)
        except:
            Stars.append('None')


In [111]:
# Zipping the lists together 
data = list(zip(IMDB, Name, Runtime, Years, Genre, Rating, Votes, Description, Stars))

In [112]:
# Creating the dataframe from the results
df = pd.DataFrame(data, columns = ['IMDB','Name', 'Runtime (mins)', "Years", 'Genre', 'Rating', 'Votes' , 'Description', 'Stars'])
print(df.shape)
df.head()

(3250, 9)


Unnamed: 0,IMDB,Name,Runtime (mins),Years,Genre,Rating,Votes,Description,Stars
0,8740790,Bridgerton,60,2020–,"Drama, Romance",7.3,50357,"Wealth, lust, and betrayal set against the...","Phoebe Dynevor, Regé-Jean Page, Nicola Coughla..."
1,7221388,Cobra Kai,30,2018–,"Action, Comedy, Drama",8.6,104807,Decades after their 1984 All Valley Karate...,"Ralph Macchio, William Zabka, Xolo Maridueña, ..."
2,8111088,The Mandalorian,40,2019–,"Action, Adventure, Sci-Fi",8.8,281712,The travels of a lone bounty hunter in the...,"Pedro Pascal, Gina Carano, Giancarlo Esposito,..."
3,4477976,Superstore,22,2015–,Comedy,7.8,23192,A look at the lives of employees at a big ...,"Ben Feldman, Lauren Ash, Colton Dunn, Nico Santos"
4,944947,Game of Thrones,57,2011–2019,"Action, Adventure, Drama",9.3,1763644,Nine noble families fight for control over...,"Emilia Clarke, Peter Dinklage, Kit Harington, ..."


In [116]:
RE_years = df['Years'].str.split("–", expand=True)
df['Release_year'] = RE_years[0]
df['End_year'] = RE_years[1]
df.drop('Years', axis=1, inplace=True)

In [119]:
# Can see a little short coming in the Stars col where there were less than 4 stars e.g. 1 of 3 stars produces
# "10, X, See full summary, " in its place.
df['Stars'] = df['Stars'].str.replace("10,", "").str.replace("X,", "").str.replace("See full summary,", "")
# Many End years and a few release years where not present in the table and thus not scraped
df['Release_year'] = df['Release_year'].str.replace('(', "").str.replace("II\) ", "", regex=True).str.replace("I\)", "", regex=True)
# Cpuple of shows have the entire seasons runtime i.e 3,900 is 84 x 46mins
df['Runtime (mins)'] = df['Runtime (mins)'].str.replace("1,248 " ,"22").str.replace("3,900 ", "46")

In [120]:
df.head()

Unnamed: 0,IMDB,Name,Runtime (mins),Genre,Rating,Votes,Description,Stars,Release_year,End_year
0,8740790,Bridgerton,60,"Drama, Romance",7.3,50357,"Wealth, lust, and betrayal set against the...","Phoebe Dynevor, Regé-Jean Page, Nicola Coughla...",2020,
1,7221388,Cobra Kai,30,"Action, Comedy, Drama",8.6,104807,Decades after their 1984 All Valley Karate...,"Ralph Macchio, William Zabka, Xolo Maridueña, ...",2018,
2,8111088,The Mandalorian,40,"Action, Adventure, Sci-Fi",8.8,281712,The travels of a lone bounty hunter in the...,"Pedro Pascal, Gina Carano, Giancarlo Esposito,...",2019,
3,4477976,Superstore,22,Comedy,7.8,23192,A look at the lives of employees at a big ...,"Ben Feldman, Lauren Ash, Colton Dunn, Nico Santos",2015,
4,944947,Game of Thrones,57,"Action, Adventure, Drama",9.3,1763644,Nine noble families fight for control over...,"Emilia Clarke, Peter Dinklage, Kit Harington, ...",2011,2019.0


In [122]:
df.to_csv("Saved CSVs/tv_series_2000_2020.csv", index=False)

# Part 2

### Scraping additional data from the main page of each title and the associated company credits page
* Missing titles end/most recent season years
* No. of episodes
* No. of seasons
* Distribution company
* Full list of genres (not limited to 3)

In [34]:
# Loading cols as string for concaaination with loops below
df = pd.read_csv('tv_series_2000_2020.csv', dtype={'Release_year': "object", 'IMDB': 'object'})

In [35]:
df.head()

Unnamed: 0,IMDB,Name,Runtime (mins),Genre,Rating,Votes,Description,Stars,Release_year,End_year,Episodes
0,8740790,Bridgerton,60.0,"Drama, Romance",7.3,50357,"Wealth, lust, and betrayal set against the...","Phoebe Dynevor, Regé-Jean Page, Nicola Coughla...",2020,,9
1,7221388,Cobra Kai,30.0,"Action, Comedy, Drama",8.6,104807,Decades after their 1984 All Valley Karate...,"Ralph Macchio, William Zabka, Xolo Maridueña, ...",2018,,33
2,8111088,The Mandalorian,40.0,"Action, Adventure, Sci-Fi",8.8,281712,The travels of a lone bounty hunter in the...,"Pedro Pascal, Gina Carano, Giancarlo Esposito,...",2019,,24
3,4477976,Superstore,22.0,Comedy,7.8,23192,A look at the lives of employees at a big ...,"Ben Feldman, Lauren Ash, Colton Dunn, Nico Santos",2015,,113
4,944947,Game of Thrones,57.0,"Action, Adventure, Drama",9.3,1763644,Nine noble families fight for control over...,"Emilia Clarke, Peter Dinklage, Kit Harington, ...",2011,2019.0,73


In [128]:
df.dtypes

IMDB               object
Name               object
Runtime (mins)    float64
Genre              object
Rating            float64
Votes               int64
Description        object
Stars              object
Release_year       object
End_year           object
dtype: object

In [82]:
# Lists that contin the imdb id for most scapres and release for to assist in pulling correct distributors 
imdbid = df['IMDB'].tolist()
ry = df['Release_year'].tolist()

# Test
# short_list = imdb_year[:5]

# Starting with episode count

In [141]:
# Load first page of results into soup 

Episodes = []
 
for imdb in imdbid:
    url = "https://www.imdb.com/title/tt"+imdb
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    #soup.title.text
    try:
        eps = soup.find_all('span',{'class':'bp_sub_heading'})
        for x in eps:
            if "episodes" in x.text:
                Episodes.append(x.text.replace(' episodes', ""))
            elif "episode" in x.text:
                Episodes.append(x.text.replace(' episode', ""))
            else:
                pass
    except:
        Episodes.append("None")

In [145]:
df['Episodes'] = Episodes

Unnamed: 0,IMDB,Name,Runtime (mins),Genre,Rating,Votes,Description,Stars,Release_year,End_year,Episodes
0,8740790,Bridgerton,60.0,"Drama, Romance",7.3,50357,"Wealth, lust, and betrayal set against the...","Phoebe Dynevor, Regé-Jean Page, Nicola Coughla...",2020,,9
1,7221388,Cobra Kai,30.0,"Action, Comedy, Drama",8.6,104807,Decades after their 1984 All Valley Karate...,"Ralph Macchio, William Zabka, Xolo Maridueña, ...",2018,,33
2,8111088,The Mandalorian,40.0,"Action, Adventure, Sci-Fi",8.8,281712,The travels of a lone bounty hunter in the...,"Pedro Pascal, Gina Carano, Giancarlo Esposito,...",2019,,24
3,4477976,Superstore,22.0,Comedy,7.8,23192,A look at the lives of employees at a big ...,"Ben Feldman, Lauren Ash, Colton Dunn, Nico Santos",2015,,113
4,944947,Game of Thrones,57.0,"Action, Adventure, Drama",9.3,1763644,Nine noble families fight for control over...,"Emilia Clarke, Peter Dinklage, Kit Harington, ...",2011,2019.0,73


In [148]:
# Saving verified results to csv
df.to_csv("Saved CSVs/tv_series_2000_2020_episodes.csv", index=False)

# Moving onto seasons

In [2]:
df = pd.read_csv('tv_series_2000_2020_episodes.csv', dtype={'Release_year': "object", 'IMDB': 'object'})

In [433]:
# Find_all list of div containers that hold each set of series data on each page
Seasons = []
 
for imdb in imdbid[742:]:
    url = "https://www.imdb.com/title/tt"+str(imdb)
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    #soup.title.text  
   
    try:
        titles = soup.find_all('div', {'class':'seasons-and-year-nav'})

    for x in titles:
        x.text
        # Number seasons and start year
        if x.text != None:
            season_count = x.find_all("a", href=True)
            Seasons.append(season_count[0].text)
    

In [496]:
# Creating column
df['Seasons'] = Seasons

In [497]:
# Saving verified results
df.to_csv("Saved CSVs/tv_series_2000_2020_seasons.csv", index=False)

# Moving to Distributors

In [None]:
df = pd.read_csv('tv_series_2000_2020_seasons.csv', dtype={'Release_year': "object", 'IMDB': 'object'})

In [259]:
# Test
# short_list = imdbid[:5]

In [648]:
# Creating a dictionary pair for IMDb id and Release Year
imdb_ry = dict(zip(imdbid, ry))

In [649]:
%%time
dist_list =[]
# Replaced imdb_year.items() with second_half in a new cell with same code after HTTP error killed the loop 
for imdb, ry in imdb_ry.items():    
    url = "https://www.imdb.com/title/tt"+imdb+"/companycredits?ref_=tt_ql_dt_4"
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    # soup.title.text
    try:
        dist_table = soup.find_all('ul', {'class':'simpleList'})
        x = dist_table[1].text.split('\n')
        
        # Conditions ensure the original US distributor is returned while more obscure will still produce a result
        foo_indexes = [i for i, s in enumerate(x) if ry and "USA" in s or ry and "World-wide" in s or "TV" and "USA" in s]

        dist_list.append(x[foo_indexes[0]].split("(", 1)[0].strip())
    except:
        dist_list.append('None')

CPU times: user 3min 49s, sys: 12.7 s, total: 4min 2s
Wall time: 1h 25s


In [665]:
# Creating col
df['Distributor'] = dist_list

In [667]:
# Saving results
df.to_csv("Saved CSVs/tv_series_2000_2020_distributor.csv", index=False)

# Second pass at distributors
Capturing some distribors lost by using dist_table[1] this index is correct when there is a production company and there are two 'class':'simpleList' tables. In the case where no production company is listed the dist_table[0] is the distributor list

In [39]:
df = pd.read_csv('tv_series_2000_2020_distributor.csv', dtype={'Release_year': "object", 'IMDB': 'object'})

In [40]:
second_pass = df[df['Distributor'] == 'None']

In [44]:
len(second_pass)

NameError: name 'second_pass' is not defined

In [59]:
imdb_ry2 = dict(zip(third_pass['IMDB'], third_pass['Release_year']))

In [60]:
pass_list ={}
# Replaced imdb_year.items() with second_half in a new cell with same code after HTTP error killed the loop 
for imdb, ry in imdb_ry2.items():    
    url = "https://www.imdb.com/title/tt"+imdb+"/companycredits?ref_=tt_ql_dt_4"
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    # soup.title.text
    try:
        dist_table = soup.find_all('ul', {'class':'simpleList'})
        x = dist_table[0].text.split('\n')
        
        # Conditions ensure the original US distributor is returned while more obscure will still produce a result
        foo_indexes = [i for i, s in enumerate(x) if "USA" in s]

        pass_list[imdb] = x[foo_indexes[0]].split("(", 1)[0].strip()
    except:
        pass_list[imdb] = ('None')

In [62]:
# Mapping the distributors based on the IMDb key without overwriting the first pass results
for key, value in pass_list.items():
    df.loc[df['IMDB'] == key, ['Distributor']] = value

In [45]:
len(df[df['Distributor']=='None'])

218

In [65]:
# Saving verified results
df.to_csv("Saved CSVs/tv_series_2000_2020_distributor.csv", index=False)

# Moving to certificate

In [417]:
df = pd.read_csv('tv_series_2000_2020_distributor.csv', dtype={'Release_year': "object", 'IMDB': 'object'})
imdbid = df['IMDB'].tolist()

In [28]:
Certs = {}
for imdb in imdbid:
    url = "https://www.imdb.com/title/tt"+str(imdb)+"/parentalguide"
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    certificates = soup.find_all("ul", {"class":"ipl-inline-list"})
    for cert in certificates:
        try:
            x = ''.join(re.findall(r"United States:TV.{1,3}", cert.text))
            y = x.replace("United States:", "")
            if y != "":
                Certs[imdb] = y[:5]
        except:
            Certs[imdb] = 'None'

In [41]:
df['Certificate'] = df['IMDB'].map(Certs)

In [4]:
df.to_csv("Saved CSVs/tv_series_2000_2020_certs.csv", index=False)

# Moving onto story desc and full genre list

In [8]:
story = {}
genre = {}

for imdb in imdbid:
    url = "https://www.imdb.com/title/tt"+imdb
    html = urlopen(url)
    soup = BeautifulSoup(html, "html.parser")
    title = soup.find_all("div", {"id":"titleStoryLine"})
    try:
        for x in title:
            story[imdb] = x.find("p").text.strip().split('\n', 1)[0]
    except:
        story[imdb] = 'None'
    try:
        for x in title:
            genre[imdb] = x.find_all("div", {"class":"see-more inline canwrap"})[1].text.replace("Genres:", "").replace("|", ",")
    except:
        genre[imdb] = 'None'


In [10]:
len(genre)

3250

In [40]:
df['Genres'] = df['IMDB'].map(genre)
df['Story'] = df['IMDB'].map(story)
df['Genres'] = df['Genres'].str.replace(' ',"").str.replace('\n',"")
df.head(2)

Unnamed: 0,IMDB,Name,Runtime (mins),Genre,Rating,Votes,Description,Stars,Release_year,End_year,Episodes,Seasons,Distributor,Genres,Story
0,8740790,Bridgerton,60.0,"Drama, Romance",7.3,50357,"Wealth, lust, and betrayal set against the...","Phoebe Dynevor, Regé-Jean Page, Nicola Coughla...",2020,,9,2,Netflix,"Drama ,Romance","Wealth, lust, and betrayal set against the bac..."
1,7221388,Cobra Kai,30.0,"Action, Comedy, Drama",8.6,104807,Decades after their 1984 All Valley Karate...,"Ralph Macchio, William Zabka, Xolo Maridueña, ...",2018,,33,4,Netflix,"Action ,Comedy ,Drama ,Sport",Thirty years after their final confrontation a...


In [5]:
df.to_csv("Saved CSVs/tv_series_2000_2020_FINAL_DIRTY.csv", index=False)

In [6]:
df = pd.read_csv('Saved CSVs/tv_series_2000_2020_FINAL_DIRTY.csv')
df.head()

Unnamed: 0,IMDB,Name,Runtime (mins),Genre,Rating,Votes,Description,Stars,Release_year,End_year,Episodes,Seasons,Distributor,Genres,Story,Certificate
0,8740790,Bridgerton,60.0,"Drama, Romance",7.3,50357,"Wealth, lust, and betrayal set against the...","Phoebe Dynevor, Regé-Jean Page, Nicola Coughla...",2020,,9,2,Netflix,"Drama ,Romance","Wealth, lust, and betrayal set against the bac...",TV-MA
1,7221388,Cobra Kai,30.0,"Action, Comedy, Drama",8.6,104807,Decades after their 1984 All Valley Karate...,"Ralph Macchio, William Zabka, Xolo Maridueña, ...",2018,,33,4,Netflix,"Action ,Comedy ,Drama ,Sport",Thirty years after their final confrontation a...,TV-14
2,8111088,The Mandalorian,40.0,"Action, Adventure, Sci-Fi",8.8,281712,The travels of a lone bounty hunter in the...,"Pedro Pascal, Gina Carano, Giancarlo Esposito,...",2019,,24,3,Disney+,"Action ,Adventure ,Sci-Fi","After the stories of Jango and Boba Fett, anot...",TV-14
3,4477976,Superstore,22.0,Comedy,7.8,23192,A look at the lives of employees at a big ...,"Ben Feldman, Lauren Ash, Colton Dunn, Nico Santos",2015,,113,6,National Broadcasting Company,Comedy,A look at the lives of employees at a big box ...,TV-14
4,944947,Game of Thrones,57.0,"Action, Adventure, Drama",9.3,1763644,Nine noble families fight for control over...,"Emilia Clarke, Peter Dinklage, Kit Harington, ...",2011,2019.0,73,8,Home Box Office,"Action ,Adventure ,Drama ,Fantasy","In the mythical continent of Westeros, several...",TV-MA
