In [None]:
def scrape_imdb():
    """Scrape IMDb's most-voted titles per release year into CSV files.

    For each release year from 1935 through 2019 the first ``page_count``
    search-result pages (50 titles per page, sorted by vote count, descending)
    are requested.  The rows of every page are appended to
    ``output/movie_ratings.csv`` right away, so an interrupted run still leaves
    the data collected so far.  The raw file is recreated on every call.

    After the loop the raw file is read back, the IMDb rating is rescaled from
    the 0-10 scale to an integer 0-100 scale, and the result is written to
    ``output/norm_movie_ratings.csv`` and displayed.

    Side effects: live HTTP requests to imdb.com, sleeps between requests,
    files written under ``output/``.
    """
    import csv
    import os
    import re
    from random import gauss, randint
    from time import sleep, time
    from warnings import warn

    import pandas as pd
    from bs4 import BeautifulSoup
    from IPython.display import clear_output, display
    from requests import get

    page_count = 10
    max_requests = 2000        # safety valve: abort if the loop misbehaves
    max_request_rate = 0.1     # requests per second averaged over the run
    output_dir = 'output'
    raw_path = 'output/movie_ratings.csv'
    norm_path = 'output/norm_movie_ratings.csv'
    csv_headers = ['imdb_ids', 'movie', 'year', 'imdb', 'metascore', 'votes']

    year_pages = [(year, page)
                  for year in range(1935, 2020)
                  for page in range(page_count)]

    headers = {"Accept-Language": "en-US, en;q=0.5"}

    # Write the header to file before looping
    os.makedirs(output_dir, exist_ok=True)
    with open(raw_path, 'w', newline='') as output_file:
        csv.DictWriter(output_file, fieldnames=csv_headers).writeheader()

    # Preparing the monitoring of the loop
    start_time = time()
    request_count = 0

    for year, page in year_pages:
        # Make a get request
        start = str((page * 50) + 1)
        url = 'http://www.imdb.com/search/title?release_date=' + str(year) + '&sort=num_votes,desc&start=' + start

        # Back off while the average request rate is above the cap.  The
        # elapsed time is measured before the first comparison so the check
        # never reads an unset value or divides by zero.
        elapsed_time = time() - start_time
        while elapsed_time > 0 and request_count / elapsed_time > max_request_rate:
            sleep(randint(1, 10))
            elapsed_time = time() - start_time
        response = get(url, headers=headers)

        # Pause the loop (gauss can return a negative number, hence abs)
        sleep(abs(gauss(10, 4)))

        # Monitor the requests
        request_count += 1
        elapsed_time = max(time() - start_time, 1e-9)
        print('Request:{}; Frequency: {} requests/s\nLast URL: {}'.format(request_count, request_count / elapsed_time, url))
        clear_output(wait=True)

        # Throw a warning for non-200 status codes
        if response.status_code != 200:
            warn('Request: {}; Status code: {}'.format(request_count, response.status_code))
            continue

        # Break the loop if the number of requests is greater than expected
        if request_count > max_requests:
            warn('Number of requests was greater than expected.')
            break

        # Parse the content of the request with BeautifulSoup
        page_html = BeautifulSoup(response.text, 'html.parser')

        # One record per movie container; building whole records keeps the
        # columns aligned even when a listing has to be skipped.
        rows = []
        for container in page_html.find_all('div', class_='lister-item mode-advanced'):
            title_link = container.h3.a if container.h3 is not None else None
            id_match = None
            if title_link is not None:
                id_match = re.search(r'/title/tt(\d+)/', title_link.get('href', ''))
            rating_tag = container.strong
            votes_tag = container.find('span', attrs={'name': 'nv'})

            # A listing without an id, rating or vote count cannot fill the
            # mandatory columns; skip it instead of aborting the whole scrape.
            if id_match is None or rating_tag is None or votes_tag is None:
                warn('Skipping an incomplete listing on {}'.format(url))
                continue

            # The Metascore is optional on IMDb
            metascore = None
            if container.find('div', class_='ratings-metascore') is not None:
                metascore = int(container.find('span', class_='metascore').text)

            rows.append({
                'imdb_ids': id_match.group(1),
                'movie': title_link.text,
                # The year comes from the query, not from the listing text
                'year': year,
                'imdb': float(rating_tag.text),
                'metascore': metascore,
                'votes': int(votes_tag['data-value']),
            })

        # Checkpoint data as you go
        movie_ratings_df = pd.DataFrame(rows, columns=csv_headers)
        with open(raw_path, 'a', newline='') as output_file:
            movie_ratings_df.to_csv(output_file, header=False, index=False)

    sleep(1)
    clear_output(wait=True)

    normalized_movie_ratings_df = pd.read_csv(raw_path, encoding='ISO-8859-1')
    normalized_movie_ratings_df = normalized_movie_ratings_df[csv_headers]
    # Round before casting so binary float error (e.g. 56.99999) cannot
    # truncate a rating to the wrong integer.
    normalized_movie_ratings_df['imdb'] = (normalized_movie_ratings_df['imdb'] * 10).round().astype(int)
    normalized_movie_ratings_df.to_csv(norm_path)

    display(normalized_movie_ratings_df)
# Run the IMDb scraper defined above (issues live HTTP requests when executed).
scrape_imdb()

In [None]:
def scrape_thenumbers():
    """Scrape the-numbers.com budget listings plus per-movie details to JSON.

    Walks the paginated "all budgets" listing (``PAGE_COUNT`` pages), and for
    every row also fetches the movie's own page to collect the fields found in
    its "Movie Details" table.  All entries are written to
    ``output/the_numbers.json``; the file is also rewritten every 100 entries
    as a checkpoint.  Progress is echoed to stdout and to ``log.txt``.

    Nothing is scraped when the output file already exists, because a full
    scrape takes a long time and an existing file should not be overwritten.

    Side effects: live HTTP requests, files written to ``output/`` and
    ``log.txt``.
    """
    import bs4
    import requests
    import json
    import os
    import re
    import time
    # There are 5860 movies at the time this program was written
    PAGE_COUNT = 59
    BASE_URL = 'https://www.the-numbers.com'
    # Character class shared by every slug embedded in the-numbers.com URLs
    SLUG = r'([\w\d\(\)-]+)'


    class SimpleLogger(object):
        """Echo messages to stdout and to a log file."""

        def __init__(self, filename):
            self.file = open(filename, 'w')

        def close(self):
            """Close the log file; safe to call more than once."""
            handle = getattr(self, 'file', None)
            if handle is not None and not handle.closed:
                handle.close()

        def __del__(self):
            self.close()

        def log(self, msg):
            print(msg)
            print(msg, file=self.file)
            # Flush so the log survives an interrupted kernel
            self.file.flush()

    def parse_dollars(dollar_str):
        """Convert a string such as ``'$1,234'`` to the int ``1234``."""
        return int(dollar_str.replace('$', '').replace(',', ''))


    def clean_input(input_str):
        """Drop every non-ASCII character from ``input_str``."""
        return input_str.encode("ascii", errors="ignore").decode()


    def slug_and_label(anchor, pattern):
        """Return ``{'slug', 'pretty'}`` for one ``<a>`` tag.

        ``slug`` is group 1 of ``pattern`` matched against the href (``None``
        when it does not match); ``pretty`` is the link text.
        """
        result = re.search(pattern, anchor.get('href', ''))
        return {
            'slug': result.group(1) if result else None,
            'pretty': anchor.getText()
        }


    def scrape_movie(url_ext):
        """Fetch one movie page and return its "Movie Details" fields.

        Returns a dict with the keys ``mpaa-rating``, ``running-time``,
        ``franchises``, ``keywords``, ``source``, ``genre``,
        ``production_method``, ``creative_type``, ``production_companies``,
        ``production_countries`` and ``languages``.  A field that is absent on
        the page is ``None``.  Returns ``None`` when the request fails or the
        details table cannot be located.
        """
        page_str = BASE_URL + url_ext
        movie_rsp = requests.get(page_str)

        try:
            movie_rsp.raise_for_status()
        except requests.HTTPError:
            logger.log('Request for ' + page_str + ' failed!')
            return None
        parsed_movie_rsp = bs4.BeautifulSoup(clean_input(movie_rsp.text), features='html.parser')
        details_heading = parsed_movie_rsp.find(text='Movie Details')
        table = details_heading.parent.find_next('table') if details_heading else None
        if table is None:
            logger.log('No "Movie Details" table found on ' + page_str)
            return None

        def single_link(label, pattern):
            """Field whose value is the first link in the label's table row."""
            label_node = table.find(text=label)
            if not label_node:
                return None
            anchors = label_node.parent.parent.parent.select('a')
            return slug_and_label(anchors[0], pattern) if anchors else None

        def link_list(label, pattern):
            """Field whose value is every link in the cell after the label."""
            label_node = table.find(text=label)
            if not label_node:
                return None
            value_cell = label_node.parent.find_next('td')
            if value_cell is None:
                return None
            return [slug_and_label(anchor, pattern) for anchor in value_cell.find_all('a')]

        running_time = None
        label_node = table.find(text='Running Time:')
        if label_node:
            # The cell text starts with the number of minutes
            minutes_text = label_node.parent.parent.find_next('td').getText()
            try:
                running_time = int(minutes_text.split(' ')[0])
            except ValueError:
                running_time = None

        return {
            'mpaa-rating': single_link('MPAA Rating:', '/market/mpaa-rating/' + SLUG),
            'running-time': running_time,
            'franchises': link_list('Franchise:', '/movies/franchise/' + SLUG),
            'keywords': link_list('Keywords:', '/movies/keywords/' + SLUG),
            'source': single_link('Source:', '/market/source/' + SLUG),
            'genre': single_link('Genre:', '/market/genre/' + SLUG),
            'production_method': single_link('Production Method:', '/market/production-method/' + SLUG),
            'creative_type': single_link('Creative Type:', '/market/creative-type/' + SLUG),
            'production_companies': link_list('Production Companies:', '/movies/production-company/' + SLUG),
            'production_countries': link_list('Production Countries:', '/' + SLUG + '/movies'),
            'languages': link_list('Languages:', '/language/' + SLUG + '/movies')
        }


    def scrape_list_page(url_ext):
        """Yield one dict per movie row of a budget listing page.

        Each dict holds the release date, title, budget and gross figures from
        the listing, merged with the result of ``scrape_movie`` for that title
        when the movie page could be scraped.  Yields nothing when the listing
        request fails.
        """
        count = 0
        page_str = BASE_URL + '/movie/budgets/all' + url_ext
        movie_lst_rsp = requests.get(page_str)
        # Halt if there was an issue with the request
        try:
            movie_lst_rsp.raise_for_status()
        except requests.HTTPError:
            logger.log('Request for ' + page_str + ' failed!')
            return
        logger.log('Request for ' + page_str + ' succeeded')
        parsed_movie_lst_rsp = bs4.BeautifulSoup(clean_input(movie_lst_rsp.text), features="html.parser")
        # The first <tr> is skipped, as in the listing's header row
        for table_row in parsed_movie_lst_rsp.find_all('tr')[1:]:
            table_cells = table_row.find_all('td')
            # Cells 1-5 are read below; ignore rows that are too short
            if len(table_cells) < 6:
                continue
            date_cell, title_cell = table_cells[1], table_cells[2]

            title_anchors = title_cell.select('a')
            if not title_anchors:
                logger.log('Skipping a row without a title link on ' + page_str)
                continue
            bold = title_cell.b
            title_link = bold.a if bold is not None and bold.a is not None else title_anchors[0]

            movie_dict = dict()
            date_link = date_cell.a
            result = None
            if date_link is not None:
                result = re.search(r'/box-office-chart/daily/(\d{4}/\d{1,2}/\d{1,2})', date_link.get('href', ''))
            movie_dict['release_date'] = {
                'slug': result.group(1) if result else None,
                'pretty': date_link.text if date_link is not None else date_cell.getText()
            }
            result = re.search('/movie/' + SLUG + '#tab=summary', title_link.get('href', ''))
            movie_dict['title'] = {
                'slug': result.group(1) if result else None,
                'pretty': title_link.text
            }
            movie_dict['production_budget'] = parse_dollars(table_cells[3].getText())
            movie_dict['domestic_gross'] = parse_dollars(table_cells[4].getText())
            movie_dict['worldwide_gross'] = parse_dollars(table_cells[5].getText())
            # Scrape the movie page; keep the listing data even if that fails
            details = scrape_movie(title_anchors[0]['href'])
            if details is not None:
                movie_dict.update(details)
            yield movie_dict
            count += 1
        logger.log('Read ' + str(count) + ' entries from page ' + page_str)


    def scrape_the_numbers_generator():
        """Yield every movie dict from all listing pages, in page order."""
        # Page 1 has no suffix; later pages are addressed by first row: /101, /201, ...
        url_ext_lst = [''] + ['/' + str(x) + '01' for x in range(1, PAGE_COUNT)]
        for url_ext in url_ext_lst:
            yield from scrape_list_page(url_ext)


    def store_the_numbers(filename):
        """Scrape everything and dump the entries to ``filename`` as JSON."""
        def dump(entries):
            with open(filename, 'w', newline='') as outfile:
                json.dump(entries, outfile)

        start_total = time.time()
        start_row = time.time()
        entries = list()
        for i, entry in enumerate(scrape_the_numbers_generator(), start=1):
            entries.append(entry)
            exec_time_row = time.time() - start_row
            logger.log('In {:.3f} seconds - {:0>4} {}'.format(exec_time_row, i, entry['title']['pretty']))
            start_row = time.time()
            # Checkpoint periodically so a crash does not lose everything
            if i % 100 == 0:
                dump(entries)
        dump(entries)
        exec_time_total = time.time() - start_total
        logger.log('{} total entries in {:.3f} seconds'.format(len(entries), exec_time_total))


    # Only scrape when executed as the main program (true inside a notebook)
    if __name__ != '__main__':
        return
    filename = 'output/the_numbers.json'
    # We really don't want to overwrite any existing good DB file, it takes a long time to scrape all of the data
    if os.path.isfile(filename):
        # Plain return: exit() would shut down the notebook kernel
        print(filename + ' already exists; skipping the scrape')
        return
    os.makedirs(os.path.dirname(filename), exist_ok=True)
    logger = SimpleLogger('log.txt')
    try:
        store_the_numbers(filename)
    finally:
        logger.close()

# Run the the-numbers.com scraper defined above.
scrape_thenumbers()

In [None]:
def normalize_movies():
    """Attach an IMDb id to every scraped the-numbers.com movie via TMDb.

    Loads the scraped movie list, looks each title up with the TMDb search
    API (first by URL slug, then by display title), stores the numeric part of
    the IMDb id under ``imdb_id`` and writes the augmented list to
    ``output/the_numbers_norm.json``.  Progress is echoed to stdout and to
    ``normalize_log.txt``.

    The TMDb API key is read from the ``TMDB_API_KEY`` environment variable so
    that no credential is stored in the notebook.

    Raises:
        RuntimeError: if ``TMDB_API_KEY`` is not set.
    """
    import json
    import os
    import re
    import time
    import tmdbsimple as tmdb
    from requests import HTTPError

    api_key = os.environ.get('TMDB_API_KEY')
    if not api_key:
        raise RuntimeError('Set the TMDB_API_KEY environment variable to a TMDb API key '
                           'before calling normalize_movies().')
    tmdb.API_KEY = api_key

    class SimpleLogger(object):
        """Echo messages to stdout and to a log file."""

        def __init__(self, filename):
            self.file = open(filename, 'w')

        def close(self):
            """Close the log file; safe to call more than once."""
            handle = getattr(self, 'file', None)
            if handle is not None and not handle.closed:
                handle.close()

        def __del__(self):
            self.close()

        def log(self, msg):
            print(msg)
            print(msg, file=self.file)
            # Flush so the log survives an interrupted kernel
            self.file.flush()

    def clean_string(in_str):
        """Keep only ASCII letters, digits, spaces and hyphens."""
        return re.sub(r'[^A-Za-z0-9 \-]+', '', in_str)

    def normalize_movie_title(in_str):
        """Clean a title and expand a standalone ' Ep ' to ' Episode '."""
        in_str = clean_string(in_str)
        in_str = re.sub(r' ([Ee]p) ', r' \g<1>isode ', in_str)
        return in_str

    def search_imdb_id(title, year):
        """Return the digits of the IMDb id of TMDb's first hit, or ``None``.

        Makes up to two TMDb API calls: the search and the external-id lookup.
        """
        search = tmdb.Search()
        query_dict = {'query': title}
        if year is not None:
            query_dict['year'] = str(year)
        results = search.movie(**query_dict)
        if ('results' not in results) or (not results['results']) or ('id' not in results['results'][0]):
            return None
        movie = tmdb.Movies(results['results'][0]['id'])
        results = movie.external_ids()
        if 'imdb_id' not in results or results['imdb_id'] is None:
            return None
        result = re.search(r'tt(\d+)', results['imdb_id'])
        return result.group(1) if result else None

    def release_year(movie):
        """Return the release year as a string, or ``None`` if unusable."""
        slug = (movie.get('release_date') or {}).get('slug')
        if slug is None:
            return None
        result = re.search(r'^(\d{4}).*', slug)
        if result is None:
            return None
        year = int(result.group(1))
        return str(year) if 1900 < year < 2030 else None

    def normalize_the_numbers():
        """Look up every movie and write the augmented list to disk."""
        # Prefer the curated copy; fall back to where the scraper cell writes
        input_path = 'data/the_numbers/the_numbers.json'
        if not os.path.isfile(input_path):
            input_path = 'output/the_numbers.json'
        with open(input_path, 'r') as input_file:
            the_numbers = json.load(input_file)
        for i, movie in enumerate(the_numbers):
            year = release_year(movie)
            title = movie['title']
            # Reset per movie so a failed lookup never logs the previous id
            imdb_id = None
            try:
                if title.get('slug'):
                    imdb_id = search_imdb_id(normalize_movie_title(title['slug']), year)
                if imdb_id is None:
                    time.sleep(0.26)
                    imdb_id = search_imdb_id(normalize_movie_title(title['pretty']), year)
                the_numbers[i]['imdb_id'] = imdb_id
            except HTTPError:
                logger.log('HTTPError: {} ({})'.format(title['pretty'], year))
            # API Limited to 40 calls every 10 seconds, search_imdb_id calls API twice for each movie
            logger.log('{:0>4} tt{: <10} {} ({})'.format(i, imdb_id if imdb_id is not None else '', title['pretty'], year))
            time.sleep(0.51)
        os.makedirs('output', exist_ok=True)
        with open('output/the_numbers_norm.json', 'w', newline='') as outfile:
            json.dump(the_numbers, outfile)

    logger = SimpleLogger('normalize_log.txt')
    try:
        normalize_the_numbers()
    finally:
        logger.close()

# Run the TMDb-based IMDb id lookup defined above.
normalize_movies()

In [None]:
def rectify_the_numbers():
    """Flatten ``output/the_numbers_norm.json`` into tabular form.

    Two things are produced:

    * ``output/the_numbers_rectified.csv`` - one row per movie holding the
      scalar fields.  Every ``{'slug', 'pretty'}`` value is split into a
      ``<field>`` column (pretty text) and a ``<field>_norm`` column (slug).
      This is the file the merge cell reads.
    * One two-column DataFrame (``imdb_id``, value) per list-valued field and
      per representation (pretty / slug).  The ``franchises`` frame is
      displayed.

    Relies on the notebook-provided ``display`` function.
    """
    import csv
    import json
    import os
    import pandas as pd
    import re

    def rectify_single(the_numbers, output_dir):
        """Write the scalar fields of every movie to a CSV in ``output_dir``."""
        single_columns = \
        ['release_date', 'release_date_norm', 'title', 'title_norm', 'mpaa-rating',
         'mpaa-rating_norm', 'running-time', 'source', 'source_norm', 'genre',
         'genre_norm', 'production_method', 'production_method_norm',
         'creative_type', 'creative_type_norm', 'production_budget', 'domestic_gross',
         'worldwide_gross', 'imdb_id']

        out_path = os.path.join(output_dir, 'the_numbers_rectified.csv')
        # One handle for header and rows, so both land in the same file
        with open(out_path, 'w', newline='') as outfile:
            # List-valued fields (franchises, keywords, ...) are not columns
            # of this table; ignore them instead of raising ValueError.
            row_writer = csv.DictWriter(outfile, single_columns, extrasaction='ignore')
            row_writer.writeheader()
            for i, entry in enumerate(the_numbers):
                rectified_entry = dict()
                for key, value in entry.items():
                    if isinstance(value, dict) and 'slug' in value and 'pretty' in value:
                        rectified_entry[key] = value['pretty']
                        rectified_entry[key + '_norm'] = value['slug']
                    else:
                        rectified_entry[key] = value
                row_writer.writerow(rectified_entry)

                # Progress line; the year is reset per movie so a missing date
                # never reuses the previous movie's year.
                year = None
                release_slug = rectified_entry.get('release_date_norm')
                if release_slug is not None:
                    result = re.search(r'^(\d{4}).*', release_slug)
                    year = result.group(1) if result is not None else None
                log = '{:0>4} {}'.format(i, rectified_entry.get('title'))
                if year is not None:
                    log += ' ({})'.format(year)
                print(log)

    def rectify_lsts(the_numbers, output_dir):
        """Build (imdb_id, value) frames for every list-valued field.

        Returns a dict mapping each destination column name to its DataFrame.
        NOTE(review): ``output_dir`` is accepted but nothing is written to it;
        confirm whether the frames are meant to be saved there.
        """
        lst_columns = ['franchises', 'keywords', 'production_companies', 'production_countries', 'languages']
        pretty_columns = [(col, 'pretty', col) for col in lst_columns]
        slug_columns = [(col, 'slug', col + '_norm') for col in lst_columns]
        all_columns = pretty_columns + slug_columns

        # Collect plain records first and build each frame once; appending to
        # a DataFrame inside the loop is quadratic, and DataFrame.append
        # returned a new frame that was being thrown away.
        records = {dest_col: [] for _, _, dest_col in all_columns}
        for entry in the_numbers:
            if 'imdb_id' not in entry:
                continue
            imdb_id = entry['imdb_id']
            for source_col, specifier, dest_col in all_columns:
                # A field may be missing, None or an empty list
                for el in entry.get(source_col) or []:
                    records[dest_col].append((imdb_id, el[specifier]))

        dataframe_dict = {dest_col: pd.DataFrame(rows, columns=['imdb_id', dest_col])
                          for dest_col, rows in records.items()}
        display(dataframe_dict['franchises'])
        return dataframe_dict

    def rectify():
        """Load the normalized JSON and produce both outputs."""
        with open('output/the_numbers_norm.json', 'r') as infile:
            the_numbers = json.load(infile)
        output_dir = os.path.join('output', 'rectified')
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # The scalar table goes to 'output/', which is where the merge cell
        # looks for 'the_numbers_rectified.csv'.
        rectify_single(the_numbers, 'output')
        rectify_lsts(the_numbers, output_dir)

    rectify()
    
# Run the flattening step defined above.
rectify_the_numbers()

In [None]:
def merge_data():
    """Join the IMDb ratings with the the-numbers.com table on IMDb id.

    Reads ``output/norm_movie_ratings.csv`` and
    ``output/the_numbers_rectified.csv``, shows the first rows of each, inner
    joins them (``imdb_ids`` on the left, ``imdb_id`` on the right) and writes
    the result to ``output/merged.csv`` without the index.
    """
    import pandas as pd

    source_encoding = "ISO-8859-1"
    ratings = pd.read_csv('output/norm_movie_ratings.csv', encoding=source_encoding)
    the_numbers = pd.read_csv('output/the_numbers_rectified.csv', encoding=source_encoding)

    # Preview both inputs before joining them
    for frame in (ratings, the_numbers):
        display(frame.head())

    merged = ratings.merge(the_numbers, left_on='imdb_ids', right_on='imdb_id')
    merged.to_csv('output/merged.csv', index=False)
    
# Run the merge step defined above.
merge_data()