## Disney Dataset Creation (w/ BeautifulSoup)

Scrape & clean a list of Disney Wikipedia pages to create a dataset to further analyze

#### Import Neccessary Libraries

In [20]:
from bs4 import BeautifulSoup as bs
import pandas as pd
import requests
import re


#### Load the webpage

In [4]:
# Load webpage content
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to a beautiful soup object
soup = bs(r.content, 'html.parser')


#### Grab the info box table

In [5]:
info_box = soup.find(class_="infobox vevent")
info_rows = info_box.find_all("tr")


#### Write some functions to make this a lot more neater

In [21]:
def get_title(rows):
    return rows.get_text()


def get_role(rows):
    return rows.find("th").get_text(" ", strip=True)


def get_people(rows):
    return rows.find(class_="infobox-data").get_text(" ", strip=True).replace("\n", " ").replace("\xa0", " ").replace("[1]", "").strip()


def if_list(row):
    return [ls.get_text(" ", strip=True).replace("\n", " ").replace("\xa0", " ").replace("[1]", "").strip() for ls in row.find_all("li")]


SyntaxError: invalid syntax (<ipython-input-21-c8b4db6055ff>, line 10)

#### Store the table in movie_info dictionary

In [7]:
movie_info = {}

for index, row in enumerate(info_rows):
    if index == 0:
        movie_info['title'] = row.get_text()
    elif index == 1:
        continue
    elif row.find("li"):
        movie_info[get_role(row)] = if_list(row)
    else:
        movie_info[get_role(row)] = get_people(row)


## Walt Disney Pictures Films Scraping

We'll now scrape the wikipedia page here: https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films

Specifically, we will get information of all the tables.

In [8]:
# Load webpage content
d = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_" +
                 "Pictures_films")

# Create beautifulsoup object
soupe = bs(d.content, 'html.parser')


#### Grab all information from the tableboxes, and then sort them in a neat way

In [9]:
table_1 = soupe.find_all(class_="wikitable sortable")


In [10]:
movie_dict = {}
movie_list = []
movie_title = ""
for tables in table_1:
    for index, tables_2 in enumerate(tables.find_all("tr")):
        if index == 0:
            continue
        for index, tables_3 in enumerate(tables_2.find_all("td")):
            movie_list.append(tables_3.get_text().replace(
                "\n", " ").replace("\xa0", " ").strip())
        if movie_list == []:
            continue
        else:
            movie_dict[movie_list.pop(1)] = movie_list
            movie_list = []


#### Now we're going to try something different.

This time we will get all the names and links of the movies. Then, by accessing the links, we'll grab similar information from the task, similar to the elements we obtained from the first task.

In [11]:
# Grab html links links and names

names = []
links = []
names_links = {}

for table in table_1:
    for table_2 in table.find_all("tr"):
        for table_3 in table_2.find_all("i"):
            for table_4 in table_3.find_all("a", href=True):
                names.append(table_4.get_text())
                links.append('https://en.wikipedia.org/' + table_4['href'])
                names_links[table_4.get_text(
                )] = "https://en.wikipedia.org/" + table_4['href']


Taking the earlier functions, and updating them to be a lot more universal for the task we're working on.

In [12]:
def Requests(url):
    # Load webpage content
    b = requests.get(url)
    # Convert to a beautiful soup object
    soupa = bs(b.content, 'html.parser')
    return soupa


def get_info_box(soup):

    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")

    for row in info_rows:
        if row.find("sup"):
            row.find("sup").decompose()
        elif row.find(class_="bday dtstart published updated"):
            row.find(class_="bday dtstart published updated").decompose()
        elif row.find("span"):
            row.find("span").decompose()
    return info_rows


def get_title(rows):
    return rows.find(class_="infobox-above summary").get_text()


def get_role(rows):
    return rows.find(class_="infobox-label").get_text(" ", strip=True)


def get_people(rows):
    return rows.find(class_="infobox-data").get_text(" ", strip=True).replace("\n", " ").replace("\xa0", " ").strip()


def to_number(rows):

    # This regex characters from a string and leaves only numerical values
    return re.sub('\D', '', rows.find(class_="infobox-data").get_text())


def break_sep(rows):
    br = rows.find(class_="infobox-data").get_text(separator=",", strip=True).split(",")
    if len(br) == 1:
        return br[0]
    else:
        return br


def if_list(rows):
    lst = [ls.get_text(" ", strip=True).replace("\n", " ").replace("\xa0", " ").strip() for ls in rows.find_all("li")]
    if len(lst) == 1:
        return lst[0]
    else:
        return lst


def magic(enfo):

    movie_info = {}
    numerals = ['Budget', 'Box Office', 'Box office']
    for row in enfo:
        if row.find(class_="infobox-above summary"):
            movie_info["title"] = get_title(row)
        elif row.find(class_="infobox-label"):
            if row.find_all("li"):
                movie_info[get_role(row)] = if_list(row)
            elif row.find_all("br"):
                movie_info[get_role(row)] = break_sep(row)
            elif get_role(row) in numerals:
                movie_info[get_role(row) + ' (in millions)'] = to_number(row)
            elif get_role(row) == 'Running time':
                movie_info[get_role(row) + ' (in hours)'] = to_number(row)
            else:
                movie_info[get_role(row)] = get_people(row)
        else:
            continue
    return movie_info


def Master(links):
    master_list = []

    for ln in links:
        try:
            req = Requests(ln)
            info = get_info_box(req)
            master_list.append(magic(info))
        except Exception as e:
            print(ln)
            print(e)
    return (master_list)


Run links through the Master() function to obtain information from each Wikipedia article's information box. If the information box does not exist, it will not output. The error and the link that was unable to be scraped will be output just so we can ensure these errors are not the fault of the code.

In [13]:
master = Master(links)


https://en.wikipedia.org//wiki/True-Life_Adventures
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/Giannis_Antetokounmpo
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/The_Twilight_Zone_Tower_of_Terror#Film_adaptation
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/Chris_Paul
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/Jim_Henson#Legacy
'NoneType' object has no attribute 'find_all'
https://en.wikipedia.org//wiki/FC_Barcelona
'NoneType' object has no attribute 'find_all'


### Save/Load Movie Data

In [15]:
import json


def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


In [16]:
import json


def load_title(title):
    with open(title, encoding='utf-8') as f:
        json.load(f)


In [19]:
save_data("./datasets/disney-data.json", master)
