## Creating a Disney Dataset with Python BeautifulSoup

Web scraping project: scrape and clean a list of Disney movies from Wikipedia to create a dataset.

### Get Info Box & store as a dictionary

In [2]:
# import libraries

import requests
from bs4 import BeautifulSoup as bs

In [3]:
# loading wiki page

url = 'https://en.wikipedia.org/wiki/Mary_Poppins_(film)'
# timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=30)
# fail loudly on a 4xx/5xx answer instead of silently parsing an error page
r.raise_for_status()

# beautiful soup object
# (parser named explicitly: same parse tree on every machine, no GuessedAtParserWarning)
soup = bs(r.content, 'html.parser')

print(soup.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   Mary Poppins (film) - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"9e8cff27-96c4-444f-b5a0-f631e9f6c738","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"Mary_Poppins_(film)","wgTitle":"Mary Poppins (film)","wgCurRevisionId":1102248539,"wgRevisionId":1102248539,"wgArticleId":77856,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","Articles with short description","Short description matches Wikidata","Use mdy date

In [4]:
# get info table
# A CSS selector matches any element carrying both classes, whatever their order
# and whatever extra classes are present; find(class_='infobox vevent') only
# matches that exact attribute string.
info_box = soup.select_one('.infobox.vevent')
if info_box is None:
    raise ValueError("no element with classes 'infobox' and 'vevent' found on the page")

info_rows = info_box.find_all('tr')

# print the markup of every row to see how the infobox is structured
for row in info_rows:
    print(row.prettify())

<tr>
 <th class="infobox-above summary" colspan="2" style="font-size: 125%; font-style: italic;">
  Mary Poppins
 </th>
</tr>

<tr>
 <td class="infobox-image" colspan="2">
  <a class="image" href="/wiki/File:Marypoppins.jpg">
   <img alt="Marypoppins.jpg" class="thumbborder" data-file-height="350" data-file-width="236" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/thumb/7/78/Marypoppins.jpg/220px-Marypoppins.jpg" srcset="//upload.wikimedia.org/wikipedia/en/7/78/Marypoppins.jpg 1.5x" width="220"/>
  </a>
  <div class="infobox-caption">
   Theatrical release poster
   <br/>
   Art by
   <a href="/wiki/Paul_Wenzel" title="Paul Wenzel">
    Paul Wenzel
   </a>
   <sup class="reference" id="cite_ref-auto_1-0">
    <a href="#cite_note-auto-1">
     [1]
    </a>
   </sup>
   <sup class="reference" id="cite_ref-Art_of_the_Stamp_2-0">
    <a href="#cite_note-Art_of_the_Stamp-2">
     [2]
    </a>
   </sup>
  </div>
 </td>
</tr>

<tr>
 <th class="infobox-label" scope="ro

In [5]:
# creating the dictionary that the loop below fills with the infobox fields
# (row label -> row value, plus a 'Title' entry)
movie_info = {}

# function for retrieving row value(s)
def g_content_value(row_data):
    """Return the text held by one infobox cell.

    If the cell contains <li> items, a list with the text of each item is
    returned; otherwise the text of the whole cell is returned as one string.
    Text fragments are joined with a single space, stripped, and non-breaking
    spaces are replaced by ordinary spaces.
    """
    def _clean(node):
        return node.get_text(" ", strip=True).replace("\xa0", " ")

    list_items = row_data.find_all('li')
    if list_items:
        return [_clean(item) for item in list_items]
    return _clean(row_data)

for index, row in enumerate(info_rows):
    header = row.find('th')
    cell = row.find('td')
    if header is None:
        # rows without a label (e.g. the poster image row) carry no field
        continue
    if index == 0:
        # the first infobox row is the title banner: a <th> with no <td>
        movie_info['Title'] = header.get_text(" ", strip=True)
    elif cell is not None:
        # regular "label | value" row; a labelled row without a <td> is skipped
        content_key = header.get_text(" ", strip=True)
        content_value = g_content_value(cell)
        movie_info[content_key] = content_value


movie_info

{'Title': 'Mary Poppins',
 'Directed by': 'Robert Stevenson',
 'Screenplay by': ['Bill Walsh', 'Don DaGradi'],
 'Based on': 'Mary Poppins by P. L. Travers',
 'Produced by': 'Walt Disney',
 'Starring': ['Julie Andrews',
  'Dick Van Dyke',
  'David Tomlinson',
  'Glynis Johns',
  'Hermione Baddeley',
  'Karen Dotrice',
  'Matthew Garber',
  'Elsa Lanchester',
  'Arthur Treacher',
  'Reginald Owen',
  'Ed Wynn'],
 'Cinematography': 'Edward Colman',
 'Edited by': 'Cotton Warburton',
 'Music by': 'Richard M. Sherman Robert B. Sherman',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution Company, Inc.',
 'Release dates': ['August 27, 1964 ( 1964-08-27 ) (LA)',
  'September 24, 1964 ( 1964-09-24 ) (NY)'],
 'Running time': '139 minutes [4]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$4.4–6 million [5]',
 'Box office': '$103.1 million [6]'}

### Info boxes for all movies 

##### According to robots.txt, we are allowed to scrape Wikipedia pages.

In [7]:
# loading wiki page

url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
# timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=30)
# fail loudly on a 4xx/5xx answer instead of silently parsing an error page
r.raise_for_status()

# beautiful soup object
# (parser named explicitly: same parse tree on every machine, no GuessedAtParserWarning)
soup = bs(r.content, 'html.parser')

contents = soup.prettify()
print(contents)

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of Walt Disney Pictures films - Wikipedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":false,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"6c9ce396-e6bc-41ec-935a-aade0a333603","wgCSPNonce":false,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":false,"wgNamespaceNumber":0,"wgPageName":"List_of_Walt_Disney_Pictures_films","wgTitle":"List of Walt Disney Pictures films","wgCurRevisionId":1102572068,"wgRevisionId":1102572068,"wgArticleId":1970335,"wgIsArticle":true,"wgIsRedirect":false,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["Webarchive template wayback links","CS1 maint: url-status","Articles wit

In [13]:
# select every <i> element nested inside an element that carries both the
# "wikitable" and "sortable" classes; as the output shows, these wrap the
# movie titles (usually together with a link to the film's own page)
movie_tables = soup.select('.wikitable.sortable i')

movie_tables

[<i><a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a></i>,
 <i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>,
 <i><a href="/wiki/Dumbo" title="Dumbo">Dumbo</a></i>,
 <i><a href="/wiki/Bambi" title="Bambi">Bambi</a></i>,
 <i><a href="/wiki/Saludos_Amigos" title="Saludos Amigos">Saludos Amigos</a></i>,
 <i><a href="/wiki/Victory_Through_Air_Power_(film)" title="Victory Through Air Power (film)">Victory Through Air Power</a></i>,
 <i><a href="/wiki/The_Three_Caballeros" title=

In [15]:
# function for retrieving row value(s)
# NOTE: same behaviour as the g_content_value defined in the single-movie
# section earlier in this notebook; this later definition shadows it.
def g_content_value(row_data):
    """Extract the text of an infobox cell.

    Returns a list of strings (one per <li>) when the cell contains list
    items, otherwise a single string with the cell's text. Pieces of text are
    joined with one space, stripped, and non-breaking spaces become ordinary
    spaces.
    """
    first_item = row_data.find('li')
    if not first_item:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

    values = []
    for li in row_data.find_all('li'):
        values.append(li.get_text(" ", strip=True).replace("\xa0", " "))
    return values

    
def get_info_box(url):
    """Download a Wikipedia film page and return its infobox as a dictionary.

    Parameters
    ----------
    url : str
        Address of the film's page.

    Returns
    -------
    dict
        'Title' maps to the text of the first infobox row. Every later row
        that has both a <th> label and a <td> value adds ``label -> value``,
        where the value is whatever ``g_content_value`` returns for the cell.
        Rows lacking a label or a value (poster image, section banners) are
        skipped.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    AttributeError
        If the page contains no infobox.
    """
    # timeout: do not let one stalled page block a whole scraping run
    r = requests.get(url, timeout=30)
    # fail on an error status instead of parsing the error page
    r.raise_for_status()

    # beautiful soup object (explicit parser: identical parse on every machine)
    soup = bs(r.content, 'html.parser')

    # CSS selector matches both classes in any order / with extra classes,
    # unlike find(class_='infobox vevent'), which needs that exact string
    info_box = soup.select_one('.infobox.vevent')
    if info_box is None:
        # AttributeError is kept (the original failed with it on a missing
        # infobox) so existing callers keep working; only the message is new
        raise AttributeError(f"no infobox found on page: {url}")
    info_rows = info_box.find_all('tr')

    movie_info = {}
    for index, row in enumerate(info_rows):
        header = row.find('th')
        cell = row.find('td')
        if header is None:
            # rows without a label (e.g. the poster image row) carry no field
            continue
        if index == 0:
            # the first infobox row is the title banner: a <th> with no <td>
            movie_info['Title'] = header.get_text(" ", strip=True)
        elif cell is not None:
            content_key = header.get_text(" ", strip=True)
            content_value = g_content_value(cell)
            movie_info[content_key] = content_value

    return movie_info

In [21]:
url = 'https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films'
# timeout keeps the cell from hanging forever on a stalled connection
r = requests.get(url, timeout=30)
# fail loudly on a 4xx/5xx answer instead of silently parsing an error page
r.raise_for_status()
soup = bs(r.content, 'html.parser')

movies = soup.select('.wikitable.sortable i')

# report every title element from which a link path/title cannot be read
for movie in movies:
    try:
        path = movie.a['href']
        title = movie.a['title']
    except (TypeError, KeyError) as e:
        # TypeError: the <i> has no <a> child, so movie.a is None
        # KeyError:  the <a> exists but lacks an href/title attribute
        print(e)
        print(movie.get_text())

'NoneType' object is not subscriptable
Escape from the Dark
'NoneType' object is not subscriptable
Trail of the Panda
'NoneType' object is not subscriptable
Growing Up Wild
'NoneType' object is not subscriptable
Expedition China
'NoneType' object is not subscriptable
The Lion King 2
'NoneType' object is not subscriptable
29 Dates
'NoneType' object is not subscriptable
Aloha Rodeo
'NoneType' object is not subscriptable
Flight of the Navigator
'NoneType' object is not subscriptable
Knights
'NoneType' object is not subscriptable
Merlin
'NoneType' object is not subscriptable
Penelope
'NoneType' object is not subscriptable
Sadé
'NoneType' object is not subscriptable
Song for a Whale
'NoneType' object is not subscriptable
Space Mountain
'NoneType' object is not subscriptable
Spooked
'NoneType' object is not subscriptable
The Parent Trap
'NoneType' object is not subscriptable
Aladdin
'NoneType' object is not subscriptable
Cruella
'NoneType' object is not subscriptable
Jungle Cruise
'NoneType'