# Disney Dataset Creation (w/ BeautifulSoup)

Scrape & clean a list of Disney Wikipedia pages to create a dataset.

#### Import Neccessary Libraries

In [1]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import re


#### Load the webpage

In [2]:
# Load webpage content
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object
soup = bs(r.content, 'html.parser')


#### Grab the info box table

In [3]:
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")


#### Write some functions to make this a lot more neater

In [4]:
def get_title(rows):
    return rows.get_text()


def get_role(rows):
    return rows.find("th").get_text(" ", strip=True)


def get_people(rows):
    return rows.find(class_="infobox-data").get_text(" ", strip=True).replace("\n", " ").replace("\xa0", " ").replace("[1]", "").strip()


def if_list(row):
    return [ls.get_text(" ", strip=True).replace("\n", " ").replace("\xa0", " ").replace("[1]", "").strip() for ls in row.find_all("li")]


#### Store the table in movie_info dictionary

In [5]:
movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.get_text()
    elif index == 1:
        continue
    elif row.find("li"):
        movie_info[get_role(row)] = if_list(row)
    else:
        movie_info[get_role(row)] = get_people(row)


In [6]:
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million',
 'Box office': '$1.067 billion'}

## Walt Disney Pictures Films Scraping

Now we want to run the same idea but on a collection of different Disney Movies.
We'll now scrape the wikipedia page here: https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films and obtain all the information like the above format.

In [7]:
# Load webpage content
d = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_" +
                 "Pictures_films")

# Create beautifulsoup object
soupe = bs(d.content, 'html.parser')


### Grab links of all movies on the webpage

In [8]:
table_1 = soupe.find_all(class_="wikitable sortable")

In [9]:
# Grab html links links and names

names = []
links = []
names_links = {}

for table in table_1:
    for table_2 in table.find_all("tr"):
        for table_3 in table_2.find_all("i"):
            for table_4 in table_3.find_all("a", href=True):
                names.append(table_4.get_text())
                links.append('https://en.wikipedia.org/' + table_4['href'])
                names_links[table_4.get_text(
                )] = "https://en.wikipedia.org/" + table_4['href']


In [10]:
links

['https://en.wikipedia.org//wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)',
 'https://en.wikipedia.org//wiki/Pinocchio_(1940_film)',
 'https://en.wikipedia.org//wiki/Fantasia_(1940_film)',
 'https://en.wikipedia.org//wiki/The_Reluctant_Dragon_(1941_film)',
 'https://en.wikipedia.org//wiki/Dumbo',
 'https://en.wikipedia.org//wiki/Bambi',
 'https://en.wikipedia.org//wiki/Saludos_Amigos',
 'https://en.wikipedia.org//wiki/Victory_Through_Air_Power_(film)',
 'https://en.wikipedia.org//wiki/The_Three_Caballeros',
 'https://en.wikipedia.org//wiki/Make_Mine_Music',
 'https://en.wikipedia.org//wiki/Song_of_the_South',
 'https://en.wikipedia.org//wiki/Fun_and_Fancy_Free',
 'https://en.wikipedia.org//wiki/Melody_Time',
 'https://en.wikipedia.org//wiki/So_Dear_to_My_Heart',
 'https://en.wikipedia.org//wiki/The_Adventures_of_Ichabod_and_Mr._Toad',
 'https://en.wikipedia.org//wiki/Cinderella_(1950_film)',
 'https://en.wikipedia.org//wiki/Treasure_Island_(1950_film)',
 'https://en.wikipedia.org//w

Taking the earlier functions, and updating them to be a lot more universal for the task we're working on.

In [11]:
def Requests(url):
    # Load webpage content
    b = requests.get(url)
    # Convert to a beautiful soup object
    soupa = bs(b.content, 'html.parser')
    return soupa


def get_info_box(soup):

    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")

    for row in info_rows:
        if row.find("sup"):
            row.find("sup").decompose()
        if row.find("span"):
            row.find("span").decompose()
    return (info_rows)

def get_title(rows):
    return rows.find(class_="infobox-above summary").get_text()


def get_role(rows):
    return rows.find(class_="infobox-label").get_text(" ", strip=True)


def get_people(rows):
    return rows.find(class_="infobox-data").get_text(" ", strip=True).replace("\n", " ").replace("\xa0", " ").strip()


def to_number(rows):
    return ''.join(c for c in rows.get_text() if c.isdigit())


def release_date(rows):
    return re.sub("[\(\[].*?[\)\]]", "", rows.get_text()).strip()


def running_time (rows):
    return re.sub("[\(\[].*?[\)\]]", "", rows.get_text()).strip()


def break_sep(rows):
    br = rows.find(class_="infobox-data").get_text(separator=",", strip=True).split(",")
    if len(br) == 1:
        return br[0]
    else:
        return br


def if_list(rows):
    lst = [ls.get_text(" ", strip=True).replace("\n", " ").replace("\xa0", " ").strip() for ls in rows.find_all("li")]
    if len(lst) == 1:
        return lst[0]
    else:
        return lst


def magic(enfo):

    movie_info = {}
    numerals = ['Budget', 'Box Office', 'Box office']
    for row in enfo:
        if row.find(class_="infobox-above summary"):
            movie_info["title"] = get_title(row)
        elif row.find(class_="infobox-label"):
            if row.find_all("li"):
                movie_info[get_role(row)] = if_list(row)
            elif row.find_all("br"):
                movie_info[get_role(row)] = break_sep(row)
            elif get_role(row) == 'Running time':
                movie_info[get_role(row) + ' (in hours)'] = get_people(row)
            elif get_role(row) == 'Release date':
                movie_info[get_role(row)] = release_date(row)
            else:
                movie_info[get_role(row)] = get_people(row)
        else:
            continue
    return movie_info


def Master(links):
    master_list = []

    for ln in links:
        try:
            req = Requests(ln)
            info = get_info_box(req)
            master_list.append(magic(info))
        except Exception as e:
            print(ln)
            print(e)
    return (master_list)


Run links through the Master() function to obtain information from each Wikipedia article's information box. If the information box does not exist, it will not output. The error and the link that was unable to be scraped will be output just so we can ensure these errors are not the fault of the code.

In [12]:
master = Master(links)

https://en.wikipedia.org//wiki/Wish_(2023_film)
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/Elio_(film)
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/Chris_Paul
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/Big_Thunder_Mountain_Railroad
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/Keeper_of_the_Lost_Cities#Film_adaptation
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/Jim_Henson#Legacy
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/One_Thousand_and_One_Nights
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/Space_Mountain#Film
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/The_Graveyard_Book#Possible_film_adaptation
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/The_Thief_(Turner_novel)
'NoneType' object has no attr

### Save/Load Movie Data

In [13]:
import json


def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


In [14]:
def load_title(title):
    with open(title, encoding='utf-8') as f:
        json.load(f)


In [15]:
save_data("./datasets/disney-data.json", master)


In [16]:
req = Requests(links[0])
info = get_info_box(req)

In [17]:
links[1]

'https://en.wikipedia.org//wiki/Pinocchio_(1940_film)'