# Disney Dataset Creation

Webscraping solution using beautifulsoup
Following along Keith Galli's video: https://www.youtube.com/watch?v=Ewgy-G9cmbg

In [60]:
import pandas as pd
from bs4 import BeautifulSoup as bs
import requests
import json

In [25]:
# getting the page
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# creating the soup
soup = bs(r.content)

# making it readable
contents = soup.prettify()

In [28]:
# getting only the infobox with the main info
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")


In [30]:
movie_info = {}


def get_content(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ").replace(' [1]', '')

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.find('th').get_text()
    elif index == 1:
        continue
    else:
        content_key = row.find("th").get_text(" ", strip=True)
        content_value = get_content(row.find('td'))
        movie_info[content_key] = content_value

                
movie_info
            

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million',
 'Box office': '$1.067 billion'}

In [37]:
# getting the page
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# creating the soup
soup = bs(r.content)

# making it readable
contents = soup.prettify()

In [44]:
movies = soup.select(".wikitable.sortable i")



TypeError: list indices must be integers or slices, not Tag

In [45]:
def get_content(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ").replace(' [1]', '')
    
def get_info_box(url):
    r = requests.get(url)
    soup = bs(r.content)
    
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find('th').get_text()
        elif index == 1:
            continue
        else:
            content_key = row.find("th").get_text(" ", strip=True)
            content_value = get_content(row.find('td'))
            movie_info[content_key] = content_value
    return movie_info

In [55]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)
movies = soup.select('.wikitable.sortable i a')
base_path = 'https://en.wikipedia.org'
movie_info_list = []

for index, movie in enumerate(movies):
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)

Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
True-Life Adventures
'NoneType' object has no attribute 'find_all'
The London Connection
'NoneType' object has no attribute 'find'
Spirited Away
'NoneType' object has no attribute 'get_text'
Howl's Moving Castle
'NoneType' object has no attribute 'get_text'
The Nightmare Before Christmas 3D
'NoneType' object has no attribute 'get_text'
The Secret of the Magic Gourd
'NoneType' object has no attribute 'get_text'
Ponyo
'NoneType' object has no attribute 'get_text'
Tales from Earthsea
'NoneType' object has no attribute 'get_text'
Anaganaga O Dheerudu
'NoneType' object has no attribute 'get_text'
The Secret World of Arrietty
'NoneType' object has no attribute 'get_text'
The Good Dinosaur
'NoneType' object has no attribute 'get_text'
Tini: The Movie
'NoneType' object has no attribute 'get_text'
Born in China
'NoneType' object has no attribute 'get_text'
The Beatles: Get Bac

In [61]:
def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [64]:
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [65]:
save_data("disney_data.json", movie_info_list)