## TV URLs (series overview)

In [1]:
# Imports
import requests
from bs4 import BeautifulSoup
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
import pandas as pd

# Request headers passed as `headers=headers` to every `requests.get` call below.
# Use your own User-Agent: replace the string with the one your browser sends.
headers = {'User-Agent':'Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36 RuxitSynthetic/1.0 v316542848 t18859'}

In [None]:
# Get the tv urls for the overview details page
# List to store the scraped data in (one entry per show, in first-seen order)
tv_urls = []
# Set mirror of tv_urls so the duplicate check does not rescan the whole list
seen_urls = set()

# Preparing the monitoring of the loop
start_time = time()
req = 0
# 1 page of new series + 7 pages of returning series + 1 page of special
# events = 9 requests; going above this limit means something is wrong
max_requests = 10

# Build the list of listing pages up front, so that one flat loop (and one
# `break`) covers every category and every page
base_url = 'https://www.metacritic.com/browse/tv/release-date/'
categories = ['new-series', 'returning-series', 'special-event']
page_urls = []
for category in categories:
    # Only the returning series listing is requested over several pages
    pages = range(7) if category == 'returning-series' else [0]
    for page in pages:
        page_url = base_url + category + '/date'
        # The first page has no `page` query parameter
        if page != 0:
            page_url += '?page=' + str(page)
        page_urls.append(page_url)

# The loop
for page_url in page_urls:
    response = requests.get(page_url, headers=headers)

    # Pause the loop
    sleep(randint(8, 15))

    # Monitor the requests
    req += 1
    elapsed_time = time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(req, req/elapsed_time))
    clear_output(wait = True)

    # Throw a warning for non-200 status codes
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(req, response.status_code))

    # Break the loop if the number of requests is greater than expected
    if req > max_requests:
        warn('Number of requests was greater than expected.')
        break

    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the tv show containers from a single page.
    # The first and last items are looked up separately because they are
    # searched for with a longer class string than the middle items.
    first_show = page_html.find('li', class_='product season_product first_product')
    show_containers = page_html.find_all('li', class_='product season_product')
    last_show = page_html.find('li', class_='product season_product last_product')

    # Extract the show url from every container: first, middle ones, last
    for container in [first_show, *show_containers, last_show]:
        # `find` gives None when the page has no such container (for
        # example on an error page); skip instead of crashing the scrape
        if container is None or container.a is None:
            continue
        href = container.a.get('href')
        if not href:
            continue
        # Keep only the show part of the url, dropping the '/season-N' suffix
        show_url = href.split('/season-', 1)[0]
        if show_url not in seen_urls:
            seen_urls.add(show_url)
            tv_urls.append(show_url)

In [None]:
# Write the collected show urls to disk so the listing pages do not have to
# be requested again
tv_urls_df = pd.DataFrame(tv_urls, columns=['tv_url'])
tv_urls_df.to_csv('tv_urls.csv', index=False)

## TV series overview ratings & reviews

In [71]:
# Load the show urls saved in `tv_urls.csv`
tv_urls = pd.read_csv('tv_urls.csv')

# Turn the `tv_url` column into a plain Python list
tv_urls_list = tv_urls.loc[:, 'tv_url'].tolist()
# Preview the first two urls
tv_urls_list[0:2]

['/tv/love-fraud', '/tv/in-my-skin']

In [55]:
# Loop through all tv_urls and get title, date, user names, user ratings, and user reviews
# This takes the information for the overview of the tv series

# One list per output column; every scraped review adds one item to each list
titles, dates, user_names, ratings, reviews, review_dates = ([] for _ in range(6))


# Functions to make the code more efficient
def monitor_loop(resp=None, min_pause=10, max_pause=30):
    """
    Function to pause the loop and monitor the requests

    Sleeps for a random whole number of seconds between `min_pause` and
    `max_pause` (both included), then warns if the checked response does
    not have status code 200.

    Parameters
    ----------
    resp : object with a `status_code` attribute, optional
        The response to check. When omitted, the module-level `response`
        variable is used, so existing `monitor_loop()` calls keep working.
    min_pause, max_pause : int, optional
        Bounds of the random pause in seconds.

    Returns
    -------
    str or None
        The warning message when the status code is not 200, else None.
    """
    # Compare with None explicitly rather than by truthiness: a requests
    # Response evaluates as false for error status codes, which are
    # exactly the responses this function has to report
    checked = response if resp is None else resp

    # Pause the loop
    sleep(randint(min_pause, max_pause))

    # Throw a warning for non-200 status codes.
    # `warn` itself returns None, so the message is built first in order to
    # have something meaningful to return.
    if checked.status_code != 200:
        message = 'Status code: {}'.format(checked.status_code)
        warn(message)
        return message
    return None

def get_title_date(soup=None):
    """
    Gets the title and the release date

    Parameters
    ----------
    soup : parsed page, optional
        The page to read from. When omitted, the module-level `page_html`
        variable is used, so existing `get_title_date()` calls keep working.

    Returns
    -------
    tuple
        `(title, release_date)`. Each value is the text of the matching
        element, or None when that element is not present on the page.
    """
    page = page_html if soup is None else soup

    # The title: text of the first <h1>, if there is one
    heading = page.h1
    title = heading.text if heading is not None else None

    # The release date: the class-less <span> nested inside the
    # <span class="release_date">. Both lookups can come back empty, so each
    # one is checked before going a level deeper.
    release_date = None
    date_wrapper = page.find('span', class_='release_date')
    if date_wrapper is not None:
        date_value = date_wrapper.find('span', class_=None)
        if date_value is not None:
            release_date = date_value.text

    return title, release_date

def get_user_rating_review():
    """
    Gets the user name, rating, review, and review_date
    Adds the scraped data to the lists

    Reads the module-level `container` (the review being processed),
    `title` and `release_date`, and appends one item to each of the
    module-level lists `titles`, `dates`, `user_names`, `ratings`,
    `reviews` and `review_dates`.

    A field whose element is missing from the review is stored as None.
    All fields are extracted before anything is appended, so the six lists
    always grow together and keep the same length.

    Returns
    -------
    tuple
        The six lists, in the order
        `(titles, dates, user_names, ratings, reviews, review_dates)`.
    """
    def _text(node, strip_newlines=False):
        # Text of an element, or None when the element was not found
        if node is None:
            return None
        text = node.text
        return text.replace('\n', '') if strip_newlines else text

    # Extract everything first ...
    user_name = _text(container.a)
    rating = _text(container.find('div', class_='left fl'), strip_newlines=True)
    review = _text(container.find('div', class_='review_body'), strip_newlines=True)
    review_date = _text(container.find('span', class_='date'))

    # ... then add one full row, so a failure above cannot leave some lists
    # one item longer than the others
    titles.append(title)
    dates.append(release_date)
    user_names.append(user_name)
    ratings.append(rating)
    reviews.append(review)
    review_dates.append(review_date)
    return titles, dates, user_names, ratings, reviews, review_dates
    
# The loop
# For each url of the series, walk through every page of user reviews.
# The names `response`, `page_html`, `container`, `title` and `release_date`
# are read as globals by the helper functions, so they must keep these names.
for url in tv_urls_list:
    # Monitor the url scraped
    print(url)
    clear_output(wait = True)

    # Start on the first page of user reviews and follow the `next` links
    next_page_url = url + '/user-reviews'
    is_first_page = True
    while next_page_url:
        # Get the html page
        response = requests.get('https://www.metacritic.com'+next_page_url, headers=headers)
        # Monitor the loop
        monitor_loop()
        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # Get the reviews containers
        review_containers = page_html.find_all('div', class_='review pad_top1')

        if is_first_page:
            # A series without any review on its first page is skipped
            if not review_containers:
                break
            # The title and release date are taken once, from the first
            # page, and reused for the reviews of all following pages
            title, release_date = get_title_date()
            is_first_page = False

        # Get the data for each review
        for container in review_containers:
            get_user_rating_review()

        # Go to the next page (if any).
        # The loop stops when there is no `next` button, when the button
        # holds no link, or when the link has no target. The button is
        # checked for None first, because asking a missing button for its
        # link would raise an AttributeError.
        flipper = page_html.find('span', class_='flipper next')
        next_page = flipper.a if flipper is not None else None
        next_page_url = next_page.get('href') if next_page is not None else None

In [69]:
# Put the scraped lists side by side, one column per list, and write them to
# disk so the reviews do not have to be requested again
column_names = ('title', 'release_date', 'user_name', 'rating', 'review', 'review_date')
column_values = (titles, dates, user_names, ratings, reviews, review_dates)
series_ratings = pd.DataFrame(dict(zip(column_names, column_values)))

series_ratings.to_csv('series_ratings.csv', index=False)

## Season URLs

In [90]:
# Collect the season urls linked from the overview page of every show
season_urls = []

for url in tv_urls_list:
    # Download the overview page of the show
    response = requests.get('https://www.metacritic.com'+url, headers=headers)
    # Pause between requests and warn on a non-200 status code
    monitor_loop()
    # Parse the page and take the link of every `li` with class `ep_guide_season`
    page_html = BeautifulSoup(response.text, 'html.parser')
    for container in page_html.find_all('li', class_='ep_guide_season'):
        season_urls.append(container.a['href'])

In [None]:
# Write the season urls to disk so the overview pages do not have to be
# requested again
season_urls_df = pd.DataFrame(season_urls, columns=['season_url'])
season_urls_df.to_csv('season_urls.csv', index=False)