In [8]:
# import BeautifulSoup and requests

from bs4 import BeautifulSoup as bs
import requests

In [9]:
# make soup from list of Disney films

start_URL = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"
res = requests.get(start_URL)
soup = bs(res.content)

In [10]:
# set up info collection

def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()
        
def get_info_box(url):

    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
            
    return movie_info

In [None]:
# scrape wikipedia. Start from the list of Disney movies page, then use that information to navigate to appropriate page for each movie

r = requests.get(start_URL)
soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index, movie in enumerate(movies):
#     if index == 10:
#         break
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'


In [30]:
# create json file saving and loading functions, avoid repeated scraping 

import json

def save_json(name, data) :
    with open(name, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii = False, indent = 2)

In [3]:
import json

def load_json(name) :
    with open(name, "r", encoding="utf-8") as f:
        return json.load(f)

In [None]:
save_json("disney_data.json", movie_info_list)

In [66]:
# from now on, just load in data from json file

movie_info_list = load_json("disney_data.json")

In [6]:
len(movie_info_list)

437

## Cleaning

In [None]:
# five tasks, listed below


print("EXAMPLES\n\n")

print(f"{movie_info_list[1]}\n")

# 1 references
print(f"1: {movie_info_list[1]['Budget']}, get rid of [1]\n")

# 2 dates into datetimes
print(f"2: {movie_info_list[1]['Release date']}\n")

# 3 numbers into ints
print(f"3: {movie_info_list[1]['Running time']}, but type: {type(movie_info_list[1]['Running time'])}\n")

# 4 convert long strings into arrays
print(f"4: {movie_info_list[21]['Starring']}\n")

# 5 budget and box office
print(f"5: {movie_info_list[1]['Budget']}, but type: {type(movie_info_list[1]['Budget'])}")

In [11]:
# fixed references (added remove_sups_spans() above)

cinderella = get_info_box("https://en.wikipedia.org/wiki/Cinderella_(1950_film)")

cinderella

{'title': 'Cinderella',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wilfred Jackson'],
 'Produced by': 'Walt Disney',
 'Story by': ['Ken Anderson',
  'Perce Pearce',
  'Homer Brightman',
  'Winston Hibler',
  'Bill Peet',
  'Erdman Penner',
  'Harry Reeves',
  'Joe Rinaldi',
  'Ted Sears',
  'Maurice Rapf (uncredited)'],
 'Based on': ['Cinderella', 'by', 'Charles Perrault'],
 'Starring': ['Ilene Woods',
  'Eleanor Audley',
  'Verna Felton',
  'Rhoda Williams',
  'James MacDonald',
  'Luis van Rooten',
  'Don Barclay',
  'Mike Douglas',
  'William Phipps',
  'Lucille Bliss'],
 'Narrated by': 'Betty Lou Gerson',
 'Music by': ['Oliver Wallace', 'Paul J. Smith'],
 'Edited by': 'Donald Halliday',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['February 15, 1950 (Boston)',
  'March 4, 1950 (United States)'],
 'Running time': '74 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$2.9 million',
 'B

In [174]:
# create new column for running time as an int

movie_info_list = load_json("disney_data_wiki.json")

for i, j in enumerate(movie_info_list):
    # account for absence of key, i.e movie_info_list[323]
    if "Running time" not in j.keys():
        continue
    elif type(j['Running time']) == list:
        # take first listed value, skip second+ value(s)
        str_run_time = (j["Running time"][0]).split(' ')[0]
    else:
        str_run_time = (j["Running time"]).split(" ")[0]
    if i % 50 == 0:
        print(i)
    j["Running time (int)"] = int(str_run_time)

0
50
100
150
200
250
300
350
400


In [167]:
# observe different formats of collected budget info

count = 0
for i, j in enumerate(movie_info_list):
    if "Budget" not in j.keys() :
        continue
    else:
        print(j["Budget"])

$1.49 million
$2.6 million
$2.28 million
$600,000
$950,000
$858,000
$788,000
$1.35 million
$2.125 million
$1.5 million
$1.5 million
$2.9 million
$1,800,000
$3 million
$4 million
$2 million
$300,000
$1.8 million
$5 million
$4 million
$700,000
$6 million
under $1 million or $1,250,000
$2 million
$2.5 million
$4 million
$3.6 million
$3 million
$3 million
$3 million
$4.4–6 million
$4 million
$5 million
$5 million
$4 million
$6.3 million
$5 million
$8 million
AU$1 million
$5 million
$7.5 million
$10 million
$3.5 to 4 million
$5.25 million
$20 million
$9 million
$6-8 million
$20 million
$18 million
$12 million
$14 million

$5 million
unknown
$19 million (estimated)
$11 million
$28 million
$25–44 million
$14 million
$9 million
A$8.7 million
$31 million
$18 million
$5 million
$40 million
$20 million
$14 million
60 million Norwegian Kroner (around $8.7 million in 1989)
$35-40 million
$25 million
$15 million
$40 million
$14 million
$28 million
$12 million
$6.5 million
$28 million
$17 million
$30

In [169]:
# observe different formats of collected box office info

for j in movie_info_list:
    if "Box office" not in j.keys() :
        continue
    else:
        print(j["Box office"])

$45.472
$418 million
$164 million
$76.4–$83.3 million
$960,000 (worldwide rentals)
$1.3 million (est. United States/Canada rentals, 1941)
$267.4 million
$1,135,000 (worldwide rentals)
$799,000
$3,355,000 (worldwide rentals)
$3.275 million (worldwide rentals)
$65 million
$3,165,000 (worldwide rentals)
$2,560,000 (worldwide rentals)
$3.7 million (U.S. rental) $575,000 (foreign rental)
$1,625,000 (worldwide rentals)
$263.6 million
$4,100,000 (worldwide rentals)
$5.6 million (US, 1951)
$2.1 million (US rentals)
$87.4 million
$1 million (US)
$2.6 million (US)
$1.75 million (US and Canadian rentals)
$28.2 million
$2,150,000 (US)
$187 million
$2.1 million (US)
$1.6 million (US)
$1.7 million (US)
$2.75 million (US)
$1.75 million (US rentals)
$6,250,000 (US/Canada rentals)
$1.8 million (est. US/ Canada rentals)
$2.5 million (est. US/ Canada rentals)
$51.6 million (United States/Canada)
$12.3 million (US and Canada rentals)
['Original release', ':', '$2.6 million (est. US/ Canada rentals)', '196

In [107]:
# define regular expression to selectively grab numerical values

import re

rgx = r"(?<=\$)\d+(([\.?|\,?]?(\d+))+)?"

In [176]:
# function to collect, convert, and modify values appropriately

def create_int_key (key_string):
    
    for movie in movie_info_list:
        if key_string not in movie.keys() or isinstance((movie[key_string]), list):
            movie[f"{key_string} (int)"] = None
            continue
        else:
            to_search = movie[key_string]
            try:
                check = re.search(rgx, to_search)
                extracted = check.group()
            except:
                continue
            if len(extracted.split(",")) > 1:
                final_int = int(extracted.replace(",", ""))
            elif "million" in to_search.split(" "): # searching for million rather than billion due to USD conversion
                final_int = int(float(extracted) * 1000000)
            else:
                final_int = int(float(extracted) * 1000000000)
            movie[f"{key_string} (int)"] = final_int


In [177]:
create_int_key("Budget")

In [178]:
create_int_key("Box office")

In [192]:
# checking progress

movie_info_list[-47]

{'title': 'Finding Dory',
 'Directed by': 'Andrew Stanton',
 'Produced by': 'Lindsey Collins',
 'Screenplay by': ['Andrew Stanton', 'Victoria Strouse'],
 'Story by': 'Andrew Stanton',
 'Starring': ['Ellen DeGeneres',
  'Albert Brooks',
  'Hayden Rolence',
  "Ed O'Neill",
  'Kaitlin Olson',
  'Ty Burrell',
  'Diane Keaton',
  'Eugene Levy'],
 'Music by': 'Thomas Newman',
 'Cinematography': ['Jeremy Lasky (camera)', 'Ian Megibben (lighting)'],
 'Edited by': 'Axel Geddes',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release date': ['June 8, 2016 ( El Capitan Theatre )',
  'June 17, 2016 (United States)'],
 'Running time': '97 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$175–200 million',
 'Box office': '$1.029 billion',
 'Running time (int)': 97,
 'Budget (int)': 175000000,
 'Box office (int)': 1028999999}

In [225]:
# collect dates as variably-formatted strings and lists of strings

from datetime import datetime

dates = [date.get("Release date") for date in movie_info_list]

In [194]:
dates[0:5]

[['May 19, 1937'],
 ['December 21, 1937 ( Carthay Circle Theatre , Los Angeles , CA )',
  'February 4, 1938 (United States)'],
 ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'],
 ['November 13, 1940'],
 ['June 20, 1941']]

In [288]:
# convert lists and long strings into only relevant date info

def clean_dates (date):
    return date.split("(")[0].strip()

def get_dates (date):
    if isinstance(date, list):
        date = date[0]
    
    if date == "N/A":
        return None

    date = clean_dates(date)
    return date


In [286]:
# convert strings into datetimes

def make_datetime(date):
    date_fmts = ["%B %d, %Y", "%d %B %Y", "%Y", "%B %Y"]
    
    date = get_dates(date)
    
    for fmt in date_fmts:
        try:
            date_clean = datetime.strptime(date, fmt)
            return date_clean
        except:
            continue
    return None

In [292]:
# apply above function to each movie

for movie in movie_info_list:
    movie["Release date (datetime)"] = make_datetime(movie.get("Release date", "N/A"))

None


In [5]:
# save_json doesn't work with datetime formats, so we make pickle functions for bitwise file saving and loading

import pickle

def save_pickle (name, dict_) :
    with open(name, 'wb') as f:
        pickle.dump(dict_, f, pickle.HIGHEST_PROTOCOL)

def load_pickle (name) :
    with open(name, "rb") as f:
        return pickle.load(f)

In [299]:
save_pickle("disney_data_clean.pickle", movie_info_list)

In [6]:
movie_info_list = load_pickle("disney_data_clean.pickle")

In [301]:
# checking object from pickle file

movie_info_list[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (int)': 3600000,
 'Box office (int)': 303000000,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0)}

In [17]:
# supplementing wikipedia information with rating information from the OMDB API

import requests
import os

k = os.environ["OMDB_KEY"]

def generate_URL(movie):
    # example request: "http://www.omdbapi.com/?apikey=[API_KEY]&t=Cinderella&y=1950"
    
    base_URL = "http://www.omdbapi.com/"
    t = movie["title"]
    
    y_datetime = movie["Release date (datetime)"]
    
    try:
        y = y_datetime.strftime("%Y")
    except:
        return None
    
    return "%s?apikey=%s&t=%s&y=%s" % (base_URL, k, t, y)

def get_ratings(movie):
    # necessary try/except for movies without year information
    try:
        r = requests.get(generate_URL(movie))
        res = r.json()
    except:
        rotten_tomatoes = None
        imdb = None
    
    try:
        rotten_tomatoes_str = res["Ratings"][1]["Value"]
        rotten_tomatoes = int(rotten_tomatoes_str.replace("%",""))
    except:
        rotten_tomatoes = None
        
    try:
        imdb_str = res["imdbRating"]
        imdb = float(imdb_str)
    except:
        imdb = None
        
    return rotten_tomatoes, imdb

In [18]:
# apply above functions. NOTE: free API version allows 1000 requests/day

for movie in movie_info_list:
    ratings = get_ratings(movie)
    movie["Rotten Tomatoes"] = ratings[0]
    movie["Imdb"] = ratings[1]

## Saving

In [22]:
save_pickle("disney_data_final.pickle", movie_info_list)

In [26]:
# copy movie_info_list

movie_info_copy = movie_info_list[:]
movie_info_copy[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (int)': 3600000,
 'Box office (int)': 303000000,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0),
 'Rotten Tomatoes': 98,
 'Imdb': 7.2}

In [27]:
# convert datetime object into string in order to enable creation of json file

for movie in movie_info_copy:
    orig_date = movie["Release date (datetime)"]
    if orig_date:
        movie["Release date (datetime)"] = orig_date.strftime("%B %d, %Y")
    else:
        movie["Release date (datetime)"] = None

In [36]:
# check for successful conversion

movie_info_copy[50]['Release date (datetime)']

'January 25, 1961'

In [38]:
save_json("disney_data_final.json", movie_info_copy)

In [32]:
# importing pandas to save as csv

import pandas as pd

df = pd.DataFrame(movie_info_list)
df.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (int),Box office (int),...,Narrated by,Cinematography,Edited by,Screenplay by,Production companies,Japanese,Hepburn,Adaptation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45472000000.0,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre , ...",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million,126.0,2280000.0,76400000.0,...,Deems Taylor,James Wong Howe,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 20, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,...,,Bert Giennon,Paul Weatherwax,,,,,,,


In [37]:
df.to_csv("disney_data_final.csv")