## TV urls (series overview)

In [2]:
# Imports
import requests
from bs4 import BeautifulSoup
from time import sleep
from time import time
from random import randint
from IPython.core.display import clear_output
from warnings import warn
import pandas as pd

# Use your own User-Agent
headers = {'User-Agent':'...'}

In [None]:
# Get the tv urls for the overview details page
# Lists to store the scraped data in
tv_urls = []

# Get series urls - the overview of the series 
# Preparing the monitoring of the loop
start_time = time()
req = 0

# The loop
categories = ['new-series', 'returning-series', 'special-event']
# For every category
for category in categories:
    if category=='returning-series':
    pages = range(7)
    else:
    pages = [0]

    # For every page
    for page in pages:
    if page==0:
        response = requests.get('https://www.metacritic.com/browse/tv/release-date/'+category+'/date', headers=headers)
    else:
        response = requests.get('https://www.metacritic.com/browse/tv/release-date/'+category+'/date?page='+str(page), headers=headers)

    # Pause the loop
    sleep(randint(8,15))

    # Monitor the requests
    req += 1
    elapsed_time = time() - start_time
    print('Request:{}; Frequency: {} requests/s'.format(req, req/elapsed_time))
    clear_output(wait = True)

    # Throw a warning for non-200 status codes
    if response.status_code != 200:
        warn('Request: {}; Status code: {}'.format(req, response.status_code))

    # Break the loop if the number of requests is greater than expected
    if req > 10:
        warn('Number of requests was greater than expected.')

    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Select all the tvshow containters from a single page
    first_show = page_html.find('li', class_='product season_product first_product')
    show_containers = page_html.find_all('li', class_='product season_product')
    last_show = page_html.find('li', class_='product season_product last_product')

    # Extract the page of tvshow from individual tvshow container
    # First container
    # TV Show page url
    show_url = first_show.a['href'].split('/season-', 1)[0]
    if show_url not in tv_urls:
        tv_urls.append(show_url)

    # Middle containers
    for container in show_containers:
        show_url = container.a['href'].split('/season-', 1)[0]
        if show_url not in tv_urls:
        tv_urls.append(show_url)

    # Last container
    # TV Show page url
    show_url = last_show.a['href'].split('/season-', 1)[0]
    if show_url not in tv_urls:
        tv_urls.append(show_url)

In [None]:
# Save tv_urls into csv, so that we don't need to make the future requests
tv_urls_df = pd.DataFrame({'tv_url': tv_urls})
tv_urls_df.to_csv('tv_urls.csv', index=False)

## TV series overview ratings & reviews

In [71]:
# Import the `tv_urls.csv` file
tv_urls = pd.read_csv('tv_urls.csv')

# Save tv_url in a list
tv_urls_list = tv_urls['tv_url'].tolist()
tv_urls_list[:2]

['/tv/love-fraud', '/tv/in-my-skin']

In [55]:
# Loop through all tv_urls and get title, date, user names, user ratings, and user reviews
# This takes the information for the overview of the tv series

# Lists to store the scraped data
titles = []
dates = []
user_names = []
ratings = []
reviews = []
review_dates = []


# Functions to make the code more efficient
def monitor_loop():
    """
    Function to pause the loop and monitor the requests
    """
    # Pause the loop
    sleep(randint(10,30))
    
    # Throw a warning for non-200 status codes
    if response.status_code != 200:
        message = warn('Status code: {}'.format(response.status_code))
        return message

def get_title_date():
    """
    Gets the title and the release date
    """
    # The title
    title = page_html.h1.text
    # The release date
    release_date = page_html.find('span', class_='release_date').find('span', class_=None).text
    return title, release_date

def get_user_rating_review():
    """
    Gets the user name, rating, review, and review_date
    Adds the scraped data to the lists
    """
    # Add the title to the list
    titles.append(title)
    # Add the release date to the list
    dates.append(release_date)
    # Get the user name and add it to the list
    user_name = container.a.text
    user_names.append(user_name)
    # Get the rating and add it to the list
    rating = container.find('div', class_='left fl').text.replace('\n','')
    rating = int(rating)/2
    ratings.append(rating)
    # Get the review and add it to the list
    review = container.find('div', class_='review_body').text.replace('\n', '')
    reviews.append(review)
    # Get the review date and add it to the list
    review_date = container.find('span', class_='date').text
    review_dates.append(review_date)
    return titles, dates, user_names, ratings, reviews, review_dates
    
# The loop
# For each url of the series
for url in tv_urls_list:
    # Monitor the url scraped
    print(url)
    clear_output(wait = True)
    
    # Get the html page
    response = requests.get('https://www.metacritic.com'+url+'/user-reviews', headers=headers)
    # Monitor the loop
    monitor_loop()
    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')
        
    # Get the reviews containers
    review_containers = page_html.find_all('div', class_='review pad_top1')
    
    # If any review_container, get the data
    if review_containers:
        # The title and release date
        title, release_date = get_title_date()
        # Get the data for each review
        for container in review_containers:
            titles, dates, user_names, ratings, reviews, review_dates = get_user_rating_review()

        # Go to the next page (if any)
        # Runs only if there is a `next` button
        if page_html.find('span', class_='flipper next'):
            # Get the page url
            next_page = page_html.find('span', class_='flipper next').a
            # The while loop runs only if next_page!=None
            while next_page:
                next_page_url = next_page['href']
                # Monitor the page scraped
                print(next_page_url)
                clear_output(wait = True)
                # Download the next page
                response = requests.get('https://www.metacritic.com'+next_page_url, headers=headers)
                # Monitor the loop
                monitor_loop()
                # Parse the content of the request with BeautifulSoup
                page_html = BeautifulSoup(response.text, 'html.parser')
                # The reviews containers
                review_containers = page_html.find_all('div', class_='review pad_top1')
                # The data for each review
                for container in review_containers:
                    get_user_rating_review()
                # Get the page url
                # The while loop stops if next_page==None
                next_page = page_html.find('span', class_='flipper next').a

In [69]:
# Save series ratings into csv, so that we don't need to make future requests
series_ratings = pd.DataFrame({
    'title' : titles,
    'release_date' : dates,
    'user_name' : user_names,
    'rating' : ratings,
    'review' : reviews,
    'review_date' : review_dates
})

series_ratings.to_csv('series_ratings.csv', index=False)

## Season URLs

In [94]:
# Import the `tv_urls.csv` file
tv_urls = pd.read_csv('tv_urls.csv')

# Save tv_url in a list
tv_urls_list = tv_urls['tv_url'].tolist()
tv_urls_list[:2]

['/tv/love-fraud', '/tv/in-my-skin']

In [None]:
def monitor_loop():
    """
    Function to pause the loop and monitor the requests
    """
    # Pause the loop
    sleep(randint(10,30))
    
    # Throw a warning for non-200 status codes
    if response.status_code != 200:
        message = warn('Status code: {}'.format(response.status_code))
        return message

In [95]:
# Get the season urls
season_urls = []

for url in tv_urls_list:
    response = requests.get('https://www.metacritic.com'+url, headers=headers)
    # Monitor the loop
    monitor_loop()
    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')
    # Get the url containers
    season_urls_containers = page_html.find_all('li', class_='ep_guide_season')
    # Get the season url
    for container in season_urls_containers:
        season_url = container.a['href']
        season_urls.append(season_url)

In [96]:
# Save the seasons urls into csv, so that we don't need to make future requests
season_urls_df = pd.DataFrame({'season_url': season_urls})
season_urls_df.to_csv('season_urls.csv', index=False)

In [97]:
len(season_urls)

4139

## Season ratings & reviews

In [13]:
# Import the `season_urls.csv` file
season_urls = pd.read_csv('season_urls.csv')

# Save season_urls in a list
season_urls_list = season_urls['season_url'].tolist()
season_urls_list[:2]

['/tv/love-fraud/season-1', '/tv/in-my-skin/season-1']

In [14]:
# Functions to make the code more efficient
def monitor_loop():
    """
    Function to pause the loop and monitor the requests
    """
    # Pause the loop
    sleep(randint(10,15))

    # Throw a warning for non-200 status codes
    if response.status_code != 200:
        message = warn('Status code: {}'.format(response.status_code))
        return message

def get_title_date():
    """
    Gets the title and the release date
    """
    # The title
    title = page_html.h1.text + ': ' + page_html.h2.text
    # The release date
    release_date = page_html.find('ul', class_='summary_details').find_all('span', class_='data')[1].text
    return title, release_date

def get_user_rating_review():
    """
    Gets the user name, rating, review, and review_date
    Adds the scraped data to the lists
    """
    # Get the user names
    all_users = all_reviews.find_all('div', class_='name')
    for i in range(len(all_users)):        
        # Add the title to the list
        titles.append(title)
        # Add the release date to the list
        dates.append(release_date)
        user_name = all_users[i].text.replace('\n', '')
        user_names.append(user_name)
    # Get the ratings
    all_ratings = all_reviews.find_all('div', class_='review_grade')
    for i in range(len(all_ratings)):
        rating = all_ratings[i].text.replace('\n', '')
        rating = int(rating)/2
        ratings.append(rating)
    # Get the reviews
    all_user_reviews = all_reviews.find_all('div', class_='review_body')
    for i in range(len(all_user_reviews)):
        if all_user_reviews[i].span:
            review = all_user_reviews[i].span.text
            reviews.append(review)
        else:
            reviews.append(' ')
    # Get the review dates
    all_dates = all_reviews.find_all('div', class_='date')
    for i in range(len(all_dates)):
        review_date = all_dates[i].text
        review_dates.append(review_date)
    return titles, dates, user_names, ratings, reviews, review_dates

In [18]:
# Loop through all season_urls and get title, date, user names, user ratings, and user reviews
# This takes the information for the overview of the seasons

# Lists to store the scraped data
titles = []
dates = []
user_names = []
ratings = []
reviews = []
review_dates = []

# The loop
# For each url of the series
for url in season_urls_list:
    # Monitor the url scraped
    print(url)
    clear_output(wait = True)
    # Get the html page
    response = requests.get('https://www.metacritic.com'+url+'/user-reviews', headers=headers)
    # Monitor the loop
    monitor_loop()
    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')
    # Get all user reviews
    all_reviews = page_html.find('ol', class_='reviews user_reviews')
    # If any all_reviews, get the data
    if all_reviews:
        # The title and release date
        title, release_date = get_title_date()
        # Get the data for each review
        titles, dates, user_names, ratings, reviews, review_dates = get_user_rating_review()
        
        # Go to the next page (if any)
        # Runs only if there is a `next` button
        if page_html.find('span', class_='flipper next'):
            # Get the page url
            next_page = page_html.find('span', class_='flipper next').a
            # The while loop runs only if next_page!=None
            while next_page:
                next_page_url = next_page['href']
                # Monitor the page scraped
                print(next_page_url)
                clear_output(wait = True)
                # Download the next page
                response = requests.get('https://www.metacritic.com'+next_page_url, headers=headers)
                # Monitor the loop
                monitor_loop()
                # Parse the content of the request with BeautifulSoup
                page_html = BeautifulSoup(response.text, 'html.parser')
                # Get all user reviews
                all_reviews = page_html.find('ol', class_='reviews user_reviews')
                # If any all_reviews, get the data
                if all_reviews:
                    # The title and release date
                    title, release_date = get_title_date()
                    # Get the data for each review
                    titles, dates, user_names, ratings, reviews, review_dates = get_user_rating_review()
                # Get the reference for the next page
                # The while loop stops if next_page==None
                next_page = page_html.find('span', class_='flipper next').a

/tv/strike-back/season-1


In [19]:
# Save season ratings into csv, so that we don't need to make future requests
season_ratings_2 = pd.DataFrame({
    'title' : titles,
    'release_date' : dates,
    'user_name' : user_names,
    'rating' : ratings,
    'review' : reviews,
    'review_date' : review_dates
})

season_ratings_2.to_csv('season_ratings.csv', index=False)

## Episode URLs

In [124]:
# Import the `season_urls.csv` file
season_urls = pd.read_csv('season_urls.csv')

# Save season_urls in a list
season_urls_list = season_urls['season_url'].tolist()
season_urls_list[:2]

['/tv/love-fraud/season-1', '/tv/in-my-skin/season-1']

In [126]:
def monitor_loop():
    """
    Function to pause the loop and monitor the requests
    """
    # Pause the loop
    sleep(randint(10,15))
    
    # Throw a warning for non-200 status codes
    if response.status_code != 200:
        message = warn('Status code: {}'.format(response.status_code))
        return message

In [127]:
# Get the episode urls
episode_urls = []

for url in season_urls_list:
    response = requests.get('https://www.metacritic.com'+url, headers=headers)
    # Monitor the loop
    monitor_loop()
    print(url)
    clear_output(wait = True)
    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')
    # Get the url containers
    episode_urls_containers = page_html.find_all('li', class_='ep_guide_item')
    # Get the episode url
    for container in episode_urls_containers:
        if container.text != 'No episode information is currently available for this season':
            episode_url = container.a['href']
            if episode_url not in episode_urls:
                episode_urls.append(episode_url)

/tv/strike-back/season-1


In [129]:
# Save the episode urls into csv, so that we don't need to make future requests
episode_urls_df = pd.DataFrame({'episode_url': episode_urls})
episode_urls_df.to_csv('episode_urls.csv', index=False)

## Episode ratings and reviews

*To run on EC2 instance*

In [20]:
# Import the `episode_urls.csv` file
episode_urls = pd.read_csv('episode_urls.csv')

# Save episode_urls in a list
episode_urls_list = episode_urls['episode_url'].tolist()
episode_urls_list[:2]

['/tv/love-fraud/season-1/episode-4-episode-4-1048046',
 '/tv/love-fraud/season-1/episode-3-episode-3-1048045']

In [21]:
# Functions to make the code more efficient
def monitor_loop():
    """
    Function to pause the loop and monitor the requests
    """
    # Pause the loop
    sleep(randint(10,15))
    # Throw a warning for non-200 status codes
    if response.status_code != 200:
        message = warn('Status code: {}'.format(response.status_code))
        return message

def get_title_date():
    """
    Gets the title and the release date
    """
    # The title
    title = url.replace('/tv/', '').replace('/', ': ').replace('-', ' ')
    # The release date
    release_date = page_html.find('span', class_='release_date').find('span', class_=None).text
    return title, release_date

def get_user_rating_review():
    """
    Gets the user name, rating, review, and review_date
    Adds the scraped data to the lists
    """
    # Add the title to the list
    titles.append(title)
    # Add the release date to the list
    dates.append(release_date)
    # Get the user name and add it to the list
    user_name = container.a.text
    user_names.append(user_name)
    # Get the rating and add it to the list
    rating = container.find('div', class_='left fl').text.replace('\n','')
    rating = int(rating)/2
    ratings.append(rating)
    # Get the review and add it to the list
    if container.find('div', class_='review_body'):
        review = container.find('div', class_='review_body').text.replace('\n', '')
        reviews.append(review)
    else:
        reviews.append(' ')
    # Get the review date and add it to the list
    review_date = container.find('span', class_='date').text
    review_dates.append(review_date)
    return titles, dates, user_names, ratings, reviews, review_dates

In [23]:
# Loop through all episode_urls and get title, date, user names, user ratings, and user reviews
# This takes the information for the overview of the episodes

# Lists to store the scraped data
titles = []
dates = []
user_names = []
ratings = []
reviews = []
review_dates = []

# The loop
# For each url of the series
for url in episode_urls_list:
    # Monitor the url scraped
    print(url)
    clear_output(wait = True)

    # Get the html page
    response = requests.get('https://www.metacritic.com'+url+'/user-reviews', headers=headers)
    # Monitor the loop
    monitor_loop()
    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # Get the reviews containers
    review_containers = page_html.find_all('div', class_='review pad_top1')

    # If any review_container, get the data
    if review_containers:
        # The title and release date
        title, release_date = get_title_date()
        # Get the data for each review
        for container in review_containers:
            titles, dates, user_names, ratings, reviews, review_dates = get_user_rating_review()

        # Go to the next page (if any)
        # Runs only if there is a `next` button
        if page_html.find('span', class_='flipper next'):
            # Get the page url
            next_page = page_html.find('span', class_='flipper next').a
            # The while loop runs only if next_page!=None
            while next_page:
                next_page_url = next_page['href']
                # Monitor the page scraped
                print(next_page_url)
                clear_output(wait = True)
                # Download the next page
                response = requests.get('https://www.metacritic.com'+next_page_url, headers=headers)
                # Monitor the loop
                monitor_loop()
                # Parse the content of the request with BeautifulSoup
                page_html = BeautifulSoup(response.text, 'html.parser')
                # The reviews containers
                review_containers = page_html.find_all('div', class_='review pad_top1')
                # The data for each review
                for container in review_containers:
                    titles, dates, user_names, ratings, reviews, review_dates = get_user_rating_review()
                # Get the page url
                # The while loop stops if next_page==None
                next_page = page_html.find('span', class_='flipper next').a

In [None]:
# Save episode ratings into csv, so that we don't need to make future requests
episode_ratings = pd.DataFrame({
    'title' : titles,
    'release_date' : dates,
    'user_name' : user_names,
    'rating' : ratings,
    'review' : reviews,
    'review_date' : review_dates
})

episode_ratings.to_csv('episode_ratings.csv', index=False)

## Data cleaning before inserting into the DB

In [53]:
import pandas as pd
# Read the season_ratings and series_ratings csv files
season = pd.read_csv('season_ratings.csv')
series = pd.read_csv('series_ratings.csv')
season.shape, series.shape

((37701, 6), (1891, 6))

In [54]:
# Concatente the dataframes
frames = [season, series]
df = pd.concat(frames)
df.shape

(39592, 6)

In [55]:
# I scraped the ratings on the scale 1-10
# Divide rating by 2
df['rating'] /= 2

In [56]:
# Check the data types
df.dtypes

title            object
release_date     object
user_name        object
rating          float64
review           object
review_date      object
dtype: object

In [57]:
# Release date and review_date should be date.
# Inspect release_date
df['release_date'].value_counts()

Apr 17, 2011                                                                                                                                   1671
\n\n                                                                CW\n                                                            \n         1228
Oct 31, 2010                                                                                                                                   1216
\n\n                                                                Netflix\n                                                            \n    1058
\n\n                                                                BBC\n                                                            \n         796
                                                                                                                                               ... 
June 5, 2011                                                                                                    

In [58]:
# Clean the data and save only the release year when possible
years = []
for d in df['release_date'].tolist():
    try:
        year = d.split(', ')[1]
    except:
        year = None
    years.append(year)
df['release_date'] = years

In [59]:
df['release_date'].value_counts()

2011    4024
2016    2751
2017    2379
2013    2232
2014    2125
2005    2033
2015    1884
2010    1836
2018    1466
2009    1256
2007    1253
2006     859
2012     701
2008     693
1999     575
2002     506
2020     493
2001     427
2004     417
2019     341
1989     267
1997     262
2000     181
1993     166
1994     159
2003     148
1998      64
1988      34
1990      26
1991      24
1996      20
1975      15
1995       5
1944       5
1982       3
1981       1
1969       1
Name: release_date, dtype: int64

In [61]:
# Inspect review_date
df['review_date'].value_counts().sort_index()

Apr  1, 2006     3
Apr  1, 2007     3
Apr  1, 2008     2
Apr  1, 2009     3
Apr  1, 2010     1
                ..
Sep 30, 2015     7
Sep 30, 2016     9
Sep 30, 2017    11
Sep 30, 2018    10
Sep 30, 2019     6
Name: review_date, Length: 5008, dtype: int64

In [67]:
# Save the cleaned dataframe
df.to_csv('clean_ratings.csv', sep='|', index=False)

In [71]:
df.head()

Unnamed: 0,title,release_date,user_name,rating,review,review_date
0,Unbreakable Kimmy Schmidt: Kimmy Vs. The Rever...,2020,remyzerofan,5.0,I haven't smiled a lot lately because the quar...,"May 12, 2020"
1,Solar Opposites: Season 1,2020,ManvsCar,4.0,"Thought it was pretty good, not as good as ric...","May 9, 2020"
2,Solar Opposites: Season 1,2020,Chunklite,5.0,"Really funny and crazy, it only gets better as...","May 8, 2020"
3,Solar Opposites: Season 1,2020,kirkender,2.5,"Rick and Morty comparisons aside, Solar Opposi...","May 9, 2020"
4,Solar Opposites: Season 1,2020,Jakepuwawa,4.0,It's really dumb but at the same time interest...,"May 8, 2020"


In [79]:
tv_reviews_lst = []
titles = df['title'].tolist()
release_dates = df['release_date'].tolist()
user_names = df['user_name'].tolist()
ratings = df['rating'].tolist()
reviews = df['review'].tolist()
review_dates = df['review_date'].tolist()

for i in range(len(df)):
    tv_reviews_lst.append([titles[i], release_dates[i], user_names[i], ratings[i], reviews[i], review_dates[i]])

len(tv_reviews_lst)

39592

In [82]:
import psycopg2
from psycopg2.extras import execute_batch
from getpass import getpass

In [84]:
connection = psycopg2.connect(
    database  = "postgres",
    user      = "postgres",
    password  = getpass(), # secure password entry. Enter DB password in the prompt and press Enter.
    host      = "groa.cbayt2opbptw.us-east-1.rds.amazonaws.com",
    port      = '5432'
)

 ················


In [86]:
cursor = connection.cursor()
step = 1000

for ix in range(0, len(df), step):
    print(f"doing step: {ix}")
    
    batch = tv_reviews_lst[ix:ix+step]
    
    execute_batch(cursor, """
        INSERT INTO tvshow_reviews (title, release_date, user_name, rating, review_text, review_date)
        VALUES (%s, %s, %s, %s, %s, %s);
        """, batch
    )

doing step: 0
doing step: 1000
doing step: 2000
doing step: 3000
doing step: 4000
doing step: 5000
doing step: 6000
doing step: 7000
doing step: 8000
doing step: 9000
doing step: 10000
doing step: 11000
doing step: 12000
doing step: 13000
doing step: 14000
doing step: 15000
doing step: 16000
doing step: 17000
doing step: 18000
doing step: 19000
doing step: 20000
doing step: 21000
doing step: 22000
doing step: 23000
doing step: 24000
doing step: 25000
doing step: 26000
doing step: 27000
doing step: 28000
doing step: 29000
doing step: 30000
doing step: 31000
doing step: 32000
doing step: 33000
doing step: 34000
doing step: 35000
doing step: 36000
doing step: 37000
doing step: 38000
doing step: 39000


In [87]:
cursor.close()
connection.commit()

## TVShow Details

In [None]:
# Loop through all the tv urls
tv_urls = pd.read_csv('tv_urls.csv')
tv_urls_list = tv_urls['tv_url'].tolist()

# Lists to store the scraped data in
names = names[]
dates = dates[]
genres = genres[]
descriptions = descriptions[]
poster_urls = poster_urls[]

# For each url
for url in tv_urls_list:
    # Get the page
    response = requests.get('https://www.metacritic.com'+url, headers=headers)

    # Pause the loop
    sleep(randint(8,15))
    # Monitor the loop
    print(url)
    clear_output(wait = True)

    # Parse the content of the request with BeautifulSoup
    page_html = BeautifulSoup(response.text, 'html.parser')

    # The titles
    title = page_html.h1.text
    names.append(title)

    # The release date
    date_container = page_html.find('span', class_='release_date')
    release_date = date_container.find('span', class_=None).text
    dates.append(release_date)

    # The genres
    genres_section = page_html.find('div', class_='genres')
    if genres_section:
        spans = genres_section.find_all('span', class_=None)
        genres_list = []
        for i in range(1, len(spans)):
            genre = spans[i].text.replace('\n', '')
            genres_list.append(genre)
        genres.append(genres_list)
    else:
        genres.append(' ')

    # The description
    summary_section = page_html.find('div', class_='summary_deck details_section')
    description = summary_section.find('span', class_=None).text.replace('\n', '')
    descriptions.append(description)

    # The poster url
    poster_url = page_html.find('img', class_='summary_img')['src']
    poster_urls.append(poster_url)

In [None]:
# Convert into dataframe
tvshows = pd.DataFrame({
    'url' : urls,
    'title': names,
    'release_date' : dates,
    'genres' : genres,
    'description' : descriptions,
    'poster_url' : poster_urls
})

# Save into csv if we need for future processing
tvshows.to_csv('tvshows.csv', index=False)

In [None]:
# Insert into the database

# Prepare the list to be inserted
tv_shows_lst = []
for i in range(len(tvshows)):
    tv_shows_lst.append([names[i], dates[i], descriptions[i], poster_urls[i], genres[i]])

In [None]:
# Connect
connection = psycopg2.connect(
    database  = "postgres",
    user      = "postgres",
    password  = getpass(), # secure password entry. Enter DB password in the prompt and press Enter.
    host      = "groa.cbayt2opbptw.us-east-1.rds.amazonaws.com",
    port      = '5432'
)

In [None]:
# Insert
cursor = connection.cursor()
step = 100

for ix in range(0, len(tv_shows_lst), step):
    print(f"doing step: {ix}")
    
    batch = tv_shows_lst[ix:ix+step]
    
    execute_batch(cursor, """
        INSERT INTO tvshows (title, release_date, description, poster_url, genres)
        VALUES (%s, %s, %s, %s, %s);
        """, batch
    )
cursor.close()
connection.commit()