In [1]:
import warnings
warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
import datetime
import sys
import time
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import pickle


# IMDb Demographic Ratings Scraper
### This notebook loops over IMDb ids and scrapes the content of each title's demographic ratings tables, which include user ratings and the number of votes by age group and gender, as well as the overall rating distribution.

In [2]:
def _split_rating_votes(cell):
    '''Returns a (rating, votes) pair of strings parsed from one table cell.

    The first whitespace-separated token is the rating and the second is the
    vote count. Whichever part is absent is reported as '-'.
    '''
    if isinstance(cell, str):
        tokens = cell.split()
    elif cell is None or (isinstance(cell, float) and cell != cell):
        # None / NaN: the cell was empty, so there is nothing to split.
        tokens = []
    else:
        # Any other scalar (e.g. an already-parsed number) is split via its text form.
        tokens = str(cell).split()

    rating = tokens[0] if tokens else '-'
    votes = tokens[1] if len(tokens) > 1 else '-'
    return rating, votes


def split_cols(t_in):
    '''Splits columns into ratings and votes

    Every column except 'Gender' is replaced by two string columns,
    '<col>_rating' (first whitespace-separated token of each cell) and
    '<col>_votes' (second token). A missing token, a blank cell or a
    None/NaN cell yields '-'.

    The frame is modified in place and the same object is returned.
    '''
    # Snapshot the column labels first: the loop adds and deletes columns.
    for icol in list(t_in.columns):
        if icol == 'Gender':
            continue
        pairs = [_split_rating_votes(cell) for cell in t_in[icol]]
        t_in[icol + '_rating'] = [rating for rating, _ in pairs]
        t_in[icol + '_votes'] = [votes for _, votes in pairs]
        del t_in[icol]

    return t_in
# t2 = split_cols(t2)

In [3]:
def _to_number(value):
    '''Converts one scraped cell to a float, or None when the cell is missing.

    Strings have thousands separators (',') and percent signs ('%') removed
    before conversion. None, blank strings and '-' count as missing.
    A non-empty string that is still not numeric raises ValueError.
    '''
    if value is None:
        return None
    if isinstance(value, str):
        cleaned = value.replace(',', '').replace('%', '').strip()
        if cleaned in ('', '-'):
            return None
        return float(cleaned)
    return float(value)


def str_to_int(df):
    '''
    Some of the columns have commas and percent signs. This function takes care of this.

    Every column except 'Gender' is converted cell by cell to float (despite
    the function's name, the results are floats, not ints). Missing markers
    (None, blank strings, '-') become None.

    The frame is modified in place and the same object is returned.
    '''
    for icol in list(df.columns):
        if icol != 'Gender':
            df[icol] = df[icol].map(_to_number)
    return df
# t3 = str_to_int(t3)


In [4]:
def _table_to_frame(table_tag, columns):
    '''Parses one scraped <table> tag into a DataFrame with the given column names.'''
    # Local import keeps this block self-contained; the notebook's import cell
    # does not bring StringIO into scope.
    from io import StringIO

    # The markup handed to pandas is unchanged; it is only wrapped in StringIO,
    # because passing literal HTML strings to read_html is deprecated.
    html = '<table> ' + str(table_tag) + ' </table>'
    parsed = pd.read_html(StringIO(html))

    # Drop the first parsed row, then copy so that later column assignments
    # operate on an independent frame rather than on a slice.
    # NOTE(review): the saved all_t1 output in this notebook starts at index 1
    # with rating 9.0, which suggests this slice may be discarding a data row
    # rather than a header row -- confirm against the live HTML.
    frame = parsed[0][1:].copy()
    frame.columns = columns
    return frame


def get_tables(imdb_id, timeout=30):
    '''
    The code that does the scraping. Pretty simple here: pandas converts the
    html tables into DataFrames.

    Parameters
    ----------
    imdb_id : str
        Title id interpolated into the ratings-page URL.
    timeout : float, optional
        Seconds to wait for the HTTP response, so that one stalled request
        cannot hang the whole scrape.

    Returns
    -------
    (t1, t2, t3) : tuple of DataFrame
        t1 -- first table on the page: 'rating', 'percentage', 'totals', as floats.
        t2 -- second table: 'Gender' plus '<group>_rating' / '<group>_votes'
              columns for 'All Ages', '<18', '18-29', '30-44', '45+'.
        t3 -- third table: '<group>_rating' / '<group>_votes' columns for
              'top_1000_voters', 'US_users', 'nonUS_users'.

    Raises
    ------
    requests.RequestException
        On network failure, timeout or a non-2xx HTTP status.
    IndexError
        If the page contains fewer than three tables.
    '''
    url = f'https://www.imdb.com/title/{imdb_id}/ratings?ref_=tt_ov_rt'
    response = requests.get(url, timeout=timeout)
    # Fail on the HTTP error itself instead of later, when an error page has no tables.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'html.parser')
    tables = soup.find_all('table')
    if len(tables) < 3:
        raise IndexError(
            f'expected at least 3 tables on {url}, found {len(tables)}'
        )

    t1 = _table_to_frame(tables[0], ['rating', 'percentage', 'totals'])
    t2 = _table_to_frame(tables[1], ['Gender', 'All Ages', '<18', '18-29', '30-44', '45+'])
    t3 = _table_to_frame(tables[2], ['top_1000_voters', 'US_users', 'nonUS_users'])

    t1 = str_to_int(t1)

    # t2 / t3 cells hold "rating votes" pairs: split them first, then convert.
    t2 = str_to_int(split_cols(t2))
    t3 = str_to_int(split_cols(t3))

    return t1, t2, t3

In [6]:
# Load the list of titles to scrape; later cells read its 'imdb_id' and 'title' columns.
all_movies = pd.read_csv(
    filepath_or_buffer='../../../../new_files2/all_imdb_ids.csv',
    lineterminator='\n',
)

## Loop through imdb ids

In [7]:
# Do all ids? Truncate the list of scrapes here:
# The first 50 are done here
all_movies = all_movies[0:50]


all_t1, all_t2, all_t3 = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
for i in range(len(all_movies)):

    imdb_id = all_movies['imdb_id'].iloc[i]
    title = all_movies['title'].iloc[i]
    print(i, title)

    try:
        t1, t2, t3 = get_tables(imdb_id)
        # Tag every row with the title it came from before accumulating.
        for scraped in (t1, t2, t3):
            scraped['imdb_id'] = imdb_id
            scraped['title'] = title
        all_t1 = pd.concat([all_t1, t1])
        all_t2 = pd.concat([all_t2, t2])
        all_t3 = pd.concat([all_t3, t3])
    except Exception as err:
        # Catch Exception rather than using a bare except, so that a keyboard /
        # kernel interrupt still stops the run. Report the cause so that failed
        # titles can be diagnosed afterwards.
        print('FAILED!!!!!!!!!!!', i, title, imdb_id, repr(err))

    # Checkpoint after every title so an interrupted run keeps what was scraped.
    all_t1.to_csv('all_t1_3.csv')
    all_t2.to_csv('all_t2_3.csv')
    all_t3.to_csv('all_t3_3.csv')

    # Pause between requests; there is nothing to wait for after the last title.
    if i + 1 < len(all_movies):
        time.sleep(15)

0 Inception
1 Deadpool
2 Interstellar
3 The Avengers
4 The Dark Knight
5 Avatar
6 Guardians of the Galaxy
7 Fight Club
8 Django Unchained
9 Pulp Fiction
10 Avengers: Infinity War
11 Iron Man
12 Forrest Gump
13 The Matrix
14 Titanic
15 The Lord of the Rings: The Fellowship of the Ring
16 The Hunger Games
17 Mad Max: Fury Road
18 Harry Potter and the Philosopher's Stone
19 Captain America: Civil War
20 Iron Man 3
21 The Dark Knight Rises
22 Avengers: Age of Ultron
23 Jurassic World
24 The Shawshank Redemption
25 Suicide Squad
26 The Lord of the Rings: The Return of the King
27 Black Panther
28 Doctor Strange
29 Shutter Island
30 The Wolf of Wall Street
31 Guardians of the Galaxy Vol. 2
32 Inside Out
33 Wonder Woman
34 Captain America: The First Avenger
35 Thor
36 Star Wars: The Force Awakens
37 The Lord of the Rings: The Two Towers
38 Inglourious Basterds
39 Logan
40 Spider-Man: Homecoming
41 Iron Man 2
42 Pirates of the Caribbean: The Curse of the Black Pearl
43 Up
44 The Martian
45 Bat

In [9]:
# Print out results: all_t1 accumulates the rows of each scraped title's first
# table (rating / percentage / totals), tagged with imdb_id and title.
all_t1

Unnamed: 0,rating,percentage,totals,imdb_id,title
1,9.0,30.3,603107.0,tt1375666,Inception
2,8.0,19.7,391390.0,tt1375666,Inception
3,7.0,8.4,167398.0,tt1375666,Inception
4,6.0,2.9,58537.0,tt1375666,Inception
5,5.0,1.3,26061.0,tt1375666,Inception
...,...,...,...,...,...
5,5.0,4.0,21253.0,tt0295297,Harry Potter and the Chamber of Secrets
6,4.0,1.4,7643.0,tt0295297,Harry Potter and the Chamber of Secrets
7,3.0,0.7,3691.0,tt0295297,Harry Potter and the Chamber of Secrets
8,2.0,0.4,2301.0,tt0295297,Harry Potter and the Chamber of Secrets


In [10]:
# all_t2 accumulates the rows of each scraped title's second table, split into
# '<group>_rating' / '<group>_votes' columns and tagged with imdb_id and title.
all_t2

Unnamed: 0,Gender,All Ages_rating,All Ages_votes,<18_rating,<18_votes,18-29_rating,18-29_votes,30-44_rating,30-44_votes,45+_rating,45+_votes,imdb_id,title
1,Males,8.8,1198939.0,9.1,1203.0,9.0,386173.0,8.8,601245.0,8.1,107891.0,tt1375666,Inception
2,Females,8.6,278009.0,8.7,251.0,8.8,108382.0,8.6,126703.0,8.1,20251.0,tt1375666,Inception
1,Males,8.0,498167.0,8.1,844.0,8.1,168342.0,7.9,221888.0,7.8,46330.0,tt1431045,Deadpool
2,Females,8.0,105797.0,8.1,146.0,8.1,43863.0,7.9,40883.0,7.9,7777.0,tt1431045,Deadpool
1,Males,8.6,843796.0,8.9,1068.0,8.9,294511.0,8.5,376742.0,8.0,77103.0,tt0816692,Interstellar
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,Females,7.4,59481.0,7.6,127.0,7.4,23458.0,7.3,24297.0,7.5,5142.0,tt0478970,Ant-Man
1,Males,7.8,315037.0,8.1,998.0,7.9,102408.0,7.8,140424.0,7.7,30953.0,tt3501632,Thor: Ragnarok
2,Females,8.1,64132.0,8.3,196.0,8.1,24106.0,8.0,25717.0,8.1,5744.0,tt3501632,Thor: Ragnarok
1,Males,7.3,294781.0,7.5,453.0,7.5,103212.0,7.1,141509.0,7.2,32165.0,tt0295297,Harry Potter and the Chamber of Secrets
