## INFO BOX for Toy Story

In [1]:
from bs4 import BeautifulSoup as bs
import requests

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")
soup = bs(r.content)
contents = soup.prettify()

In [3]:
info_box = soup.find(class_="infobox vevent")

In [4]:
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace('\xa0',' ') for li in row_data.find_all('li')]
    else:
        return row_data.get_text(" ", strip=True)

movie_info = {}
info_rows = info_box.find_all("tr")
for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find("th").get_text(" ", strip=True)
    elif index == 1:
        continue
    else:
        movie_info[row.th.get_text(" ", strip=True)] = get_content_value(row.td)

## INFO BOX for all the movies

In [5]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)

In [6]:
def clean_tags(soup):
    for tag in soup.find_all(['sup', "span"]):
        tag.decompose()

def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)
    contents = soup.prettify()
    clean_tags(soup)
    info_box = soup.find(class_="infobox vevent")
    movie_info = {}
    info_rows = info_box.find_all("tr")
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        elif index == 1:
            continue
        else:
            if row.th:
                movie_info[row.th.get_text(" ", strip=True)] = get_content_value(row.td)
    return movie_info

movies = soup.select('.wikitable.sortable i a')
movie_info_list = []
for movie in movies:
    try:
        url = 'https://en.wikipedia.org'+movie['href']
        movie_info_list.append(get_info_box(url))
    except Exception as e:
        print(movie.get_text())
        print(e)


Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
True-Life Adventures
'NoneType' object has no attribute 'find_all'
The London Connection
'NoneType' object has no attribute 'find'
Sister Act 3
'NoneType' object has no attribute 'find'
Tower of Terror
'NoneType' object has no attribute 'find_all'
Tron: Ares
'NoneType' object has no attribute 'find'
61
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
FC Barcelona
'NoneType' object has no attribute 'find_all'


## Save/Reload Movie Data

In [9]:
movie_info_list

[{'title': 'Academy Award Review of',
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472'},
 {'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': 'Snow White by The Brothers Grimm',
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Roy Atwell',
   'Pinto Colvig',
   'Otis Harlan',
   'Scotty Mattraw',
   'Billy Gilbert',
   'Eddie Collins',
   'Moroni Olsen',
   'Stuart Buchanan'],
  'Music by': ['F

In [10]:
import json

def save_data(title, data):
    with open(title,'w',encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [11]:
save_data(title='disney_data.json', data=movie_info_list)

## Data Cleaning
- clean up referneces [1] (code before)
- convert running time into integer (after)
- convert dates to datetime (after)
- splitl ong strings (before)
- convert budget and box office to numbers (after)


In [12]:
disney_data = load_data('disney_data.json')

def get_running_time(time):
    if isinstance(time, list):
        time= time[0]
    elif time=="N/A":
        return None
    return int(time.split()[0])

# [get_running_time(movie.get('Running time','N/A')) for movie in disney_data]
for i,movie in enumerate(disney_data):
    disney_data[i]['Running time (int)'] = get_running_time(movie.get('Running time','N/A'))


In [13]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

'''
money_conversion("$12.2 million") --> 12200000 ## Word syntax
money_conversion("$790,000") --> 790000        ## Value syntax
'''
def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
        
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None

In [14]:
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget','N/A'))
    movie['Box office (float)'] = money_conversion(movie.get('Box office','N/A'))

In [15]:
from datetime import datetime

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if type(date) == list:
        date = date[0]
    if date == 'N/A':
        return None
    date_str = clean_date(date)
    for fmt in ["%B %d, %Y", "%d %B %Y"]:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None


for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))



In [16]:
import pickle

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

## Attach scores

In [21]:
import urllib
import os

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": os.environ['OMDB_API_KEY'], 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [22]:
# for movie in movie_info_list:
#     movie['imdb'] = float(movie['imdb'])
#     movie['metascore'] = float(movie['metascore'])
#     movie['rotten_tomatoes'] = float(movie['rotten_tomatoes'].strip('%'))

movie_info_list

[{'title': 'Academy Award Review of',
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472',
  'Budget (float)': None,
  'Box office (float)': 45.472,
  'Release date (datetime)': datetime.datetime(1937, 5, 19, 0, 0),
  'imdb': '7.1',
  'metascore': 'N/A',
  'rotten_tomatoes': None},
 {'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': 'Snow White by The Brothers Grimm',
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'H

In [23]:
len(movie_info_list)

506

In [30]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime('%B %d %Y')
    else:
        movie['Release date (datetime)'] = None

movie_info_copy

[{'title': 'Academy Award Review of',
  'Production company': 'Walt Disney Productions',
  'Distributed by': 'RKO Radio Pictures',
  'Release date': ['May 19, 1937'],
  'Running time': '41 minutes (74 minutes 1966 release)',
  'Country': 'United States',
  'Language': 'English',
  'Box office': '$45.472',
  'Budget (float)': None,
  'Box office (float)': 45.472,
  'Release date (datetime)': 'May 19 1937',
  'imdb': '7.1',
  'metascore': 'N/A',
  'rotten_tomatoes': None},
 {'title': 'Snow White and the Seven Dwarfs',
  'Directed by': ['David Hand',
   'William Cottrell',
   'Wilfred Jackson',
   'Larry Morey',
   'Perce Pearce',
   'Ben Sharpsteen'],
  'Written by': ['Ted Sears',
   'Richard Creedon',
   'Otto Englander',
   'Dick Rickard',
   'Earl Hurd',
   'Merrill De Maris',
   'Dorothy Ann Blank',
   'Webb Smith'],
  'Based on': 'Snow White by The Brothers Grimm',
  'Produced by': 'Walt Disney',
  'Starring': ['Adriana Caselotti',
   'Lucille La Verne',
   'Harry Stockwell',
   'Ro

In [32]:
save_data('disney_data_final.json', movie_info_copy)

In [35]:
import pandas as pd

df = pd.DataFrame(movie_info_list)
df.to_csv('disney_data_final.csv')