## Imports and script params/constants

In [1]:
import json
from datetime import datetime
from pathlib import Path

import requests
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd

# Credentials are loaded from `api_keys.json`, which is kept in .gitignore so
# the key is never shared.  The file name is generic on purpose, leaving room
# for keys of any other APIs added later.
with open('api_keys.json', 'r') as f:
    API_KEYS = json.load(f)

# OMDB API settings
OMDB_REQUEST_URL = 'http://www.omdbapi.com'
OMDB_API_KEY = API_KEYS['omdb_key']

# Flixable scraping settings
FLIXABLE_REQUEST_URL = 'http://www.flixable.com'
MOVIE_TITLE_SELECTOR = '.title'
N_PAGES = 30

# Output file name carries the time this cell was run
timestamp = f'{datetime.now():%Y_%m_%d_%H_%M_%S}'
OUTPUT_CSV = f'data/movies_{timestamp}.csv'

## Helper functions

In [2]:
def get_flixable_movie_urls(params, timeout=10):
    """Given a page of flixable styled like the home page; scrape movie page urls

    :param params: dictionary of parameters to filter movie pages.
                   example parameters:
                     * page
                     * min-rating
                     * min-year
                     * max-year
    :param timeout: seconds to wait on flixable before giving up, so that a
                    stalled connection cannot hang the scrape indefinitely
    :return: dict mapping each title link's text to its movie page url.
             Links sharing the same text overwrite one another.
             Empty dict if flixable answered with an error status.
    :raises requests.RequestException: if the request itself cannot complete
                                       (connection error, timeout, ...)
    """
    # Get home page html
    response = requests.get(FLIXABLE_REQUEST_URL, params=params, timeout=timeout)
    if not response.ok:
        # An error page holds no movie links worth parsing
        return {}

    # Name the parser explicitly: letting bs4 guess emits a warning and may
    # parse the same html differently depending on which parsers are installed
    page_soup = BeautifulSoup(response.text, 'html.parser')

    # Build movie page urls from the title links
    movie_page_urls = {}
    for movie_title in page_soup.select(MOVIE_TITLE_SELECTOR):
        href = movie_title.get('href')
        if not href:
            # The selector matched an element that is not a link; skip it
            continue
        movie_page_urls[movie_title.text] = FLIXABLE_REQUEST_URL + href

    return movie_page_urls


def get_extra_info(title, year, timeout=10):
    """Make API request to OMDB API

    :param title: name of movie/series to request
    :param year: year of the movie/series
    :param timeout: seconds to wait on OMDB before treating the request as failed
    :return: dict with OMDB info; blank dict if API request failed
             (network error, error status, non-JSON body, or an OMDB error reply)
    """
    params = {'t': title, 'y': year, 'apikey': OMDB_API_KEY}

    try:
        response = requests.get(OMDB_REQUEST_URL, params=params, timeout=timeout)
        if not response.ok:
            return {}
        # Parse the body once; a body that is not valid JSON raises ValueError
        omdb_info = response.json()
    except (requests.RequestException, ValueError):
        return {}

    # When movie not found response.json() is:
    # {'Response': 'False', 'Error': 'Movie not found!'}
    # Treat that (and any non-dict payload) as a failed request
    if not isinstance(omdb_info, dict) or 'Error' in omdb_info:
        return {}

    return omdb_info


def _parse_flixable_page(html):
    """Extract the flixable fields of a single Netflix item page.

    :param html: html text of a flixable item page
    :return: dict with keys 'Title', 'Year', 'mpaa_rating', 'added_to_netflix';
             None if any of the expected elements is missing
    """
    # Name the parser explicitly: letting bs4 guess emits a warning and may
    # parse the same html differently depending on which parsers are installed
    soup = BeautifulSoup(html, 'html.parser')

    # This might fail in case of bad url
    # Instead of giving a 404, flixable redirects bad urls.
    # example: https://flixable.com/title/AdamWuzHere/
    try:
        title = soup.find('h1', {'class': 'mb-3'}).text.strip()
        year = soup.find('span', {'class': 'font-weight-bold mr-2'}).text
        mpaa_rating = soup.find('span', {'class': 'border border-dark rounded font-weight-bold px-1 mr-2'}).text

        added_to_netflix = soup.find("div", {"class": "mb-4"}).text
        added_to_netflix = added_to_netflix.split(':')[1].strip()
    except (AttributeError, IndexError):
        # AttributeError: soup.find returned None for a missing element
        # IndexError: the "mb-4" text had no ':' to split on
        return None

    return {'Title': title,
            'Year': year,
            'mpaa_rating': mpaa_rating,
            'added_to_netflix': added_to_netflix}


def get_info(flixable_urls, timeout=10):
    """Get Netflix item's info from flixable and OMDB

    :param flixable_urls: list of flixable urls (example: ['https://flixable.com/title/81116576/'])
    :param timeout: seconds to wait on each flixable page before treating it as failed
    :return: pd.DataFrame with a record for each url.
             Failed records will only have the url column be non nan.
    """
    info_dicts = []
    for url in flixable_urls:
        default_info_dict = {'flixable_url': url}

        # A network failure on one url must not abort the whole scrape
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            info_dicts.append(default_info_dict)
            continue

        # If failed, add default record and go to next url
        if not response.ok:
            info_dicts.append(default_info_dict)
            continue

        # Get info from flixable page
        flixable_info = _parse_flixable_page(response.text)
        if flixable_info is None:
            # If failed, add default record and go to next url
            info_dicts.append(default_info_dict)
            continue

        # Get info from OMDB
        info_dict = get_extra_info(flixable_info['Title'], flixable_info['Year'])

        # Flixable values win over OMDB's for Title/Year (covers OMDB failure)
        # and the remaining flixable fields are added to the OMDB data
        info_dict.update(flixable_info)
        info_dict['flixable_url'] = url

        # Store what will be a row in the end dataframe
        info_dicts.append(info_dict)

    return pd.DataFrame(info_dicts)

## Test cases for functions

In [3]:
# # Create test cases to cover the expected exception (not on OMDB)
# # Also included an unexpected exception (bad flixable url)
# movie_on_omdb = 'https://flixable.com/title/81116576/'
# movie_not_on_omdb = 'https://flixable.com/title/81226955/'
# bad_flixable_url = 'https://flixable.com/title/AdamWuzHere/'

# urls = [movie_on_omdb, movie_not_on_omdb, bad_flixable_url]

# # Test
# get_info(urls)

## Perform Scrape

In [12]:
# Filters wide enough to include every item, newest first
flixable_params = {'min-rating': 0,
                   'min-year': 0,
                   'max-year': 3000,
                   'order': 'date',
                   'page': 1}

# Collect the movie page urls from each listing page
movie_urls = []
for page in range(1, N_PAGES + 1):
    flixable_params['page'] = page
    movie_url_dict = get_flixable_movie_urls(flixable_params)
    movie_urls.extend(movie_url_dict.values())

movie_info_df = get_info(movie_urls)

# to_csv does not create missing directories; make sure the output folder
# exists so a long scrape is not lost at the final write
Path(OUTPUT_CSV).parent.mkdir(parents=True, exist_ok=True)
movie_info_df.to_csv(OUTPUT_CSV, index=False)