In [1]:
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd
import numpy as np
import ast
import warnings
warnings.filterwarnings('ignore')

# Set up Splinter
browser = Browser('chrome')

# Create an empty list to store the TV show data
TVshows = []

# Loop over all the URLs
for i in range(10):
    # Construct the URL for the current page
    url = f'https://www.imdb.com/search/title/?title_type=tv_series&num_votes=1000,&languages=en&sort=num_votes,desc&count=100&start={i*100+1}&ref_=adv_nxt'

    # Visit the URL
    browser.visit(url)

    # Parse the website
    html = browser.html
    html_soup = soup(html, 'html.parser')

    # Find the list of TV shows
    TV_list = html_soup.find('div', {'class': 'lister-list'})

    # Find all the rows in the list
    TV_rows = TV_list.find_all('div', {'class': 'lister-item'})

    # Loop over the rows and extract the TV show details
    for row in TV_rows:
        # Get the TV show title
        title = row.find('h3', {'class': 'lister-item-header'}).find('a').text

        # Get the TV show year
        year = row.find('span', {'class': 'lister-item-year'}).text.strip('()')

        # Get the TV show rating
        rating = row.find('div', {'class': 'ratings-bar'}).find('strong').text

        # Get the votes and stars
        details = row.find('div', {'class': 'lister-item-content'})
        stars = [star.text for star in details.find_all('p')[2].find_all('a')[1:]]
        votes = details.find('p', {'class': 'sort-num_votes-visible'}).find_all('span')[1].text.replace(',', '')

        # Get the TV show genres
        genres = [genre.strip() for genre in details.find('span', {'class': 'genre'}).text.split(',')]
        
        # Add the TV show data to the list
        TVshows.append({'Title': title, 'Year': year, 'Rating': rating, 'Votes': votes, 'Stars': stars, 'Genres': genres})


# Create a DataFrame from the TV show data
df_TV = pd.DataFrame(TVshows)

df_TV.head()

Unnamed: 0,Title,Year,Rating,Votes,Stars,Genres
0,Game of Thrones,2011–2019,9.2,2141065,"[Peter Dinklage, Kit Harington, Lena Headey]","[Action, Adventure, Drama]"
1,Breaking Bad,2008–2013,9.5,1945469,"[Aaron Paul, Anna Gunn, Betsy Brandt]","[Crime, Drama, Thriller]"
2,Stranger Things,2016–2024,8.7,1225425,"[Finn Wolfhard, Winona Ryder, David Harbour]","[Drama, Fantasy, Horror]"
3,The Walking Dead,2010–2022,8.1,1016458,"[Norman Reedus, Melissa McBride, Lauren Cohan]","[Drama, Horror, Thriller]"
4,Friends,1994–2004,8.9,1015775,"[Courteney Cox, Lisa Kudrow, Matt LeBlanc]","[Comedy, Romance]"


In [2]:
# Split the year range into two columns
df_TV[['Start_Year','End_Year']] = df_TV['Year'].str.split('–',expand=True)

df_TV.head()

Unnamed: 0,Title,Year,Rating,Votes,Stars,Genres,Start_Year,End_Year
0,Game of Thrones,2011–2019,9.2,2141065,"[Peter Dinklage, Kit Harington, Lena Headey]","[Action, Adventure, Drama]",2011,2019
1,Breaking Bad,2008–2013,9.5,1945469,"[Aaron Paul, Anna Gunn, Betsy Brandt]","[Crime, Drama, Thriller]",2008,2013
2,Stranger Things,2016–2024,8.7,1225425,"[Finn Wolfhard, Winona Ryder, David Harbour]","[Drama, Fantasy, Horror]",2016,2024
3,The Walking Dead,2010–2022,8.1,1016458,"[Norman Reedus, Melissa McBride, Lauren Cohan]","[Drama, Horror, Thriller]",2010,2022
4,Friends,1994–2004,8.9,1015775,"[Courteney Cox, Lisa Kudrow, Matt LeBlanc]","[Comedy, Romance]",1994,2004


In [3]:
# Drop the year range column
df_TV.drop(['Year'], axis=1,inplace=True)

df_TV.head()

Unnamed: 0,Title,Rating,Votes,Stars,Genres,Start_Year,End_Year
0,Game of Thrones,9.2,2141065,"[Peter Dinklage, Kit Harington, Lena Headey]","[Action, Adventure, Drama]",2011,2019
1,Breaking Bad,9.5,1945469,"[Aaron Paul, Anna Gunn, Betsy Brandt]","[Crime, Drama, Thriller]",2008,2013
2,Stranger Things,8.7,1225425,"[Finn Wolfhard, Winona Ryder, David Harbour]","[Drama, Fantasy, Horror]",2016,2024
3,The Walking Dead,8.1,1016458,"[Norman Reedus, Melissa McBride, Lauren Cohan]","[Drama, Horror, Thriller]",2010,2022
4,Friends,8.9,1015775,"[Courteney Cox, Lisa Kudrow, Matt LeBlanc]","[Comedy, Romance]",1994,2004


In [4]:
# Remove extra characters from the data
df_TV['Start_Year'] = df_TV['Start_Year'].str.replace('(', '').str.replace(')', '').str.replace('I', '').str.replace('II', '')

df_TV.head()

Unnamed: 0,Title,Rating,Votes,Stars,Genres,Start_Year,End_Year
0,Game of Thrones,9.2,2141065,"[Peter Dinklage, Kit Harington, Lena Headey]","[Action, Adventure, Drama]",2011,2019
1,Breaking Bad,9.5,1945469,"[Aaron Paul, Anna Gunn, Betsy Brandt]","[Crime, Drama, Thriller]",2008,2013
2,Stranger Things,8.7,1225425,"[Finn Wolfhard, Winona Ryder, David Harbour]","[Drama, Fantasy, Horror]",2016,2024
3,The Walking Dead,8.1,1016458,"[Norman Reedus, Melissa McBride, Lauren Cohan]","[Drama, Horror, Thriller]",2010,2022
4,Friends,8.9,1015775,"[Courteney Cox, Lisa Kudrow, Matt LeBlanc]","[Comedy, Romance]",1994,2004


In [5]:
# Review data types for formatting
df_TV.dtypes

Title         object
Rating        object
Votes         object
Stars         object
Genres        object
Start_Year    object
End_Year      object
dtype: object

In [6]:
# Convert Start_Year column to int
df_TV['Start_Year'] = pd.to_numeric(df_TV['Start_Year'])

df_TV.dtypes

Title         object
Rating        object
Votes         object
Stars         object
Genres        object
Start_Year     int64
End_Year      object
dtype: object

In [7]:
# Convert End_Year column to int and populate empty rows with NaN; then replace NaN with the current year
df_TV['End_Year'] = pd.to_numeric(df_TV['End_Year'],errors='coerce')
df_TV = df_TV.replace(np.nan, 2023, regex=True)
df_TV['End_Year'] = df_TV['End_Year'].astype(int)
df_TV.dtypes

Title         object
Rating        object
Votes         object
Stars         object
Genres        object
Start_Year     int64
End_Year       int64
dtype: object

In [8]:
# Export the DataFrame to a CSV file
df_TV.to_csv('imdb_top_1000_TV_final.csv', index=False)

In [9]:
# Read the CSV file into a DataFrame
clean_df_TV = pd.read_csv('imdb_top_1000_TV_final.csv')

# Convert the string representation of the list to an actual list
clean_df_TV['Stars'] = clean_df_TV['Stars'].apply(lambda x: ast.literal_eval(x))
clean_df_TV['Genres'] = clean_df_TV['Genres'].apply(lambda x: ast.literal_eval(x))

# Convert the DataFrame to JSON and write it to a file
clean_df_TV.to_json('imdb_top_1000_TV_final.json', orient='records')

In [10]:
# Close the browser
browser.quit()