### Scraping the TV series from IMDB which has a rating of 6.0 or higher and 10k or more votes.

In [12]:
import pandas as pd
import numpy as np
import requests
from bs4 import BeautifulSoup
from random import randint
from time import sleep

In [13]:
# Declare the variables about imdb tv shows
show_title = []
year = []
time = []
description = []
genres = []
creators = []
stars = []
rating = []
votes = []
imdb_link = []

# Create an array of values and pass it to the url for dynamic webpages
pages = np.arange(1, 1500, 100)


# Create a for loop to iterate through the pages
for page in pages:
    page = requests.get("https://www.imdb.com/search/title/?title_type=tv_series,tv_miniseries&release_date=1970-01-01,2022-02-12&user_rating=6.0,10.0&num_votes=10000,&languages=en&sort=user_rating,desc&count=100&start="+str(page)+"&ref_=adv_nxt")
    # Create a soup object
    soup = BeautifulSoup(page.text, 'html.parser')
    tv_data = soup.find_all('div', attrs = {'class': 'lister-item mode-advanced'})
    sleep(randint(3,6))
    # Create a for loop to iterate through the tv_data
    for data in tv_data:
        title = data.h3.a.text
        show_title.append(title)

        # Get movie link
        link = data.h3.a['href']
        imdb_link.append("https://www.imdb.com"+link)

        life_span = data.h3.find('span', class_ = "lister-item-year text-muted unbold").text
        year.append(life_span)

        runtime = data.p.find("span", class_ = 'runtime')
        time.append(runtime)

        genre = data.find('span', class_ = 'genre').text
        genres.append(genre)

        rate = data.find("div", class_ = "inline-block ratings-imdb-rating").text.replace('\n', '')
        rating.append(rate)

        # Find the stars and append them to the stars array
        star = data.find_all('p', class_ = '")')
        star_ = star[1].text.replace('\n', '') if len(star) > 1 else '*****'
        stars.append(star_)

        # Find the creators and append them to the creators array
        creator = data.find_all('p', class_ = '")')
        creator_ = creator[2].text.replace('\n', '') if len(creator) > 2 else '*****'
        creators.append(creator_)

        # Find the votes and append them to the votes array
        value = data.find_all('span', attrs = {'name': "nv"})
        vote = value[0].text
        votes.append(vote)


# Create a dataframe with the scraped data
tv_df = pd.DataFrame({"Title": show_title, "Year": year, "Runtime": time, "Genre": genres, "Creator": creators, "Stars": stars, "Rating": rating, "Votes": votes, "IMDB Link": imdb_link})

In [14]:
tv_df.head()

Unnamed: 0,Title,Year,Runtime,Genre,Creator,Stars,Rating,Votes,IMDB Link
0,Sugarland,(2019– ),,"\nAnimation, Comedy, Family",*****,*****,9.9,33515,https://www.imdb.com/title/tt4057372/
1,The Chosen,(2017– ),[54 min],"\nDrama, History",*****,*****,9.5,23997,https://www.imdb.com/title/tt9471404/
2,Planet Earth II,(2016),[298 min],\nDocumentary,*****,*****,9.5,110299,https://www.imdb.com/title/tt5491994/
3,Chernobyl,(2019),[330 min],"\nDrama, History, Thriller",*****,*****,9.4,654062,https://www.imdb.com/title/tt7366338/
4,The Filthy Frank Show,(2011–2017),[12 min],"\nComedy, Fantasy, Music",*****,*****,9.4,32908,https://www.imdb.com/title/tt4202274/


In [15]:
#drop columns creator, stars
tv_df = tv_df.drop(columns = ['Creator', 'Stars'])
tv_df.head()

Unnamed: 0,Title,Year,Runtime,Genre,Rating,Votes,IMDB Link
0,Sugarland,(2019– ),,"\nAnimation, Comedy, Family",9.9,33515,https://www.imdb.com/title/tt4057372/
1,The Chosen,(2017– ),[54 min],"\nDrama, History",9.5,23997,https://www.imdb.com/title/tt9471404/
2,Planet Earth II,(2016),[298 min],\nDocumentary,9.5,110299,https://www.imdb.com/title/tt5491994/
3,Chernobyl,(2019),[330 min],"\nDrama, History, Thriller",9.4,654062,https://www.imdb.com/title/tt7366338/
4,The Filthy Frank Show,(2011–2017),[12 min],"\nComedy, Fantasy, Music",9.4,32908,https://www.imdb.com/title/tt4202274/


In [16]:
tv_df.shape

(1419, 7)

In [17]:
# strip the \n from the genre column
tv_df["Genre"] = tv_df["Genre"].str.strip('\n')

tv_df.head()

Unnamed: 0,Title,Year,Runtime,Genre,Rating,Votes,IMDB Link
0,Sugarland,(2019– ),,"Animation, Comedy, Family",9.9,33515,https://www.imdb.com/title/tt4057372/
1,The Chosen,(2017– ),[54 min],"Drama, History",9.5,23997,https://www.imdb.com/title/tt9471404/
2,Planet Earth II,(2016),[298 min],Documentary,9.5,110299,https://www.imdb.com/title/tt5491994/
3,Chernobyl,(2019),[330 min],"Drama, History, Thriller",9.4,654062,https://www.imdb.com/title/tt7366338/
4,The Filthy Frank Show,(2011–2017),[12 min],"Comedy, Fantasy, Music",9.4,32908,https://www.imdb.com/title/tt4202274/


Switch this line to a code line if needed.

spec_chars = ["s", "p", "a", "n", "c", "l", '"', "=", "<", ">", "/", "r", "u", "n", "t", "i", "m", "e"]

for char in spec_chars:
    tv_df["Runtime"] = tv_df["Runtime"].str.replace(char, "")

tv_df.head()

In [18]:
tv_df.rename(columns={"Runtime": "Runtime_minutes"}, inplace=True)
tv_df.head()

Unnamed: 0,Title,Year,Runtime_minutes,Genre,Rating,Votes,IMDB Link
0,Sugarland,(2019– ),,"Animation, Comedy, Family",9.9,33515,https://www.imdb.com/title/tt4057372/
1,The Chosen,(2017– ),[54 min],"Drama, History",9.5,23997,https://www.imdb.com/title/tt9471404/
2,Planet Earth II,(2016),[298 min],Documentary,9.5,110299,https://www.imdb.com/title/tt5491994/
3,Chernobyl,(2019),[330 min],"Drama, History, Thriller",9.4,654062,https://www.imdb.com/title/tt7366338/
4,The Filthy Frank Show,(2011–2017),[12 min],"Comedy, Fantasy, Music",9.4,32908,https://www.imdb.com/title/tt4202274/


In [19]:
# Save the dataframe to a csv file
tv_df.to_csv("imdb_tv_series_with_link.csv", index = False)