## <center> DISNEY'S MOVIES: WEB SCRAPING </center>



#### IMPORT LIBRARIES

In [None]:
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

## TASK 1: GET DATA FROM AN EXAMPLE MOVIE 
### 1.1. Loading the webpage

In [None]:
source = requests.get('https://en.wikipedia.org/wiki/Born_in_China')
# Stop here with a clear HTTP error instead of parsing an error page
source.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed in the environment
webpage = BeautifulSoup(source.content, 'html.parser')

# print(webpage.prettify)

#### 1.2. Movie Info Table

In [None]:
# Every <tr> of the movie infobox; the first row holds the title and the
# second one the poster image
info_table = webpage.find('table', class_ = 'infobox vevent').find_all('tr')

movie = {}

for index, row in enumerate(info_table):

  if index == 0:  # Movie title
    movie['Title'] = row.find('th').get_text()
  elif index == 1:  # Movie image
    continue
  else:  # Other information
    header_cell = row.find('th')
    data_cell = row.find('td')
    # A row lacking a header or a data cell cannot form a key/value pair.
    # Checking for None explicitly (instead of a bare except) keeps real
    # errors and Ctrl-C visible.
    if header_cell is None or data_cell is None:
      continue
    movie_key = header_cell.get_text()
    movie_data = data_cell.get_text()
    movie[movie_key] = movie_data

movie

{'Box office': '$25.1 million[4]',
 'Budget': '$5–10 million[3]',
 'Countries': 'United StatesChinaFrance',
 'Directed by': 'Lu Chuan',
 'Distributed by': 'Walt Disney Studios Motion PicturesShanghai Media Group (China)[2]',
 'Edited by': 'Matthew Meech',
 'Languages': 'EnglishMandarinFrench',
 'Music by': 'Barnaby Taylor',
 'Narrated by': 'John Krasinski (American release)Zhou Xun (Chinese release)Claire Keim (French release)',
 'Produced by': 'Roy ConliBrian LeithPhil Chapman',
 'Productioncompanies': '\nDisneynature\nShanghai Media Group\nChuan Films\nBrian Leith Productions[1]\n',
 'Release dates': '\nAugust\xa012,\xa02016\xa0(2016-08-12) (China)\nApril\xa021,\xa02017\xa0(2017-04-21) (United States)\nAugust\xa023,\xa02017\xa0(2017-08-23) (France)\n',
 'Running time': '76 minutes[2]',
 'Screenplay by': 'David FowlerBrian LeithPhil ChapmanLu Chuan',
 'Simplified': '我们诞生在中国',
 'Title': 'Born in China',
 'Traditional': '我們誕生在中國'}

In [None]:
def clean_tag(row):
  '''
  Strip references and the hidden datetime format (2010-06-12) from a row, in place.

  When the row contains at least one <sup> tag, or a <span> whose class is
  'bday dtstart published updated', every <sup> and every <span> found in
  the row is decomposed (not only the date spans). A row with neither is
  left untouched. Nothing is returned.
  '''
  nothing_to_clean = (
    not row.find('sup')
    and not row.find('span', class_ = 'bday dtstart published updated')
  )
  if nothing_to_clean:
    return

  for unwanted in row.find_all(['sup', 'span']):
    unwanted.decompose()



def get_list_data(row):

  '''
  Return the text held by an infobox cell, as a list when it has several values.

  - If the cell contains <li> items: a list with the text of each item.
  - Otherwise, if it contains a <br>: a list with every stripped text
    fragment of the cell (each fragment becomes its own entry, so a
    parenthetical following a name ends up as a separate element).
  - Otherwise: the whole cell text as a single string.

  Non-breaking spaces are replaced by regular spaces in every case.
  '''

  def _plain_spaces(text):
    # Swap non-breaking spaces for ordinary ones
    return text.replace('\xa0', ' ')

  if row.find('li'):
    entries = []
    for item in row.find_all('li'):
      entries.append(_plain_spaces(item.get_text(' ', strip = True)))
    return entries

  if row.find('br'):
    return list(map(_plain_spaces, row.stripped_strings))

  return _plain_spaces(row.get_text(' ', strip = True))



def get_movie_info(url):

  '''
  Get the movie information from the infobox of the given url link.

  The page is downloaded and the first table with class 'infobox vevent'
  is read row by row: the first row gives the 'Title', and every later row
  that has both a header cell (<th>) and a data cell (<td>) becomes one
  key/value pair. Values are cleaned with clean_tag and shaped by
  get_list_data, so a value is either a string or a list of strings.

  Parameters:
    url: address of the page to scrape.

  Returns:
    dict mapping the infobox labels (plus 'Title') to their values.

  Raises:
    ValueError: if the page has no 'infobox vevent' table.
  '''

  ### Loading the page
  source = requests.get(url)
  # Name the parser explicitly so results do not depend on which optional
  # parsers are installed
  webpage = BeautifulSoup(source.content, 'html.parser')

  info_box = webpage.find('table', class_ = 'infobox vevent')
  if info_box is None:
    # Fail with a message that names the page rather than an opaque
    # "'NoneType' object has no attribute 'find_all'"
    raise ValueError('No movie infobox found at ' + str(url))

  info_table = info_box.find_all('tr')


  ### Store the movie info
  movie = {}

  for index, row in enumerate(info_table):

    if index == 0:
      movie['Title'] = row.find('th').get_text(' ', strip = True)
      continue

    clean_tag(row)  # Clean tags

    header_cell = row.find('th')
    data_cell = row.find('td')
    # Rows lacking a header or a data cell carry no key/value pair. Checking
    # for None explicitly (instead of a bare except) keeps unexpected errors
    # and keyboard interrupts visible.
    if header_cell is None or data_cell is None:
      continue

    movie_key = header_cell.get_text(' ', strip = True)
    movie[movie_key] = get_list_data(data_cell)

  return movie


In [None]:
## Let's try to get the movie information
get_movie_info('https://en.wikipedia.org/wiki/Sister_Act_(franchise)#Sister_Act_3_(TBA)')

{'Created by': 'Paul Rudnick',
 'Film(s)': ['Sister Act',
  '(1992)',
  'Sister Act 2: Back in the Habit',
  '(1993)'],
 'Musical(s)': 'Sister Act',
 'Original work': 'Theatrical film',
 'Owner': ['Walt Disney Studios', '(', 'The Walt Disney Company', ')'],
 'Soundtrack(s)': ['Sister Act', 'Sister Act 2: Back in the Habit'],
 'Title': 'Sister Act'}

In [None]:
a = get_movie_info('https://en.wikipedia.org/wiki/Born_in_China')
# Show one "key  :  value" line per infobox field
for key, value in a.items():
  print(key, ' : ', value)

Title  :  Born in China
Traditional  :  我們誕生在中國
Simplified  :  我们诞生在中国
Directed by  :  Lu Chuan
Screenplay by  :  ['David Fowler', 'Brian Leith', 'Phil Chapman', 'Lu Chuan']
Produced by  :  ['Roy Conli', 'Brian Leith', 'Phil Chapman']
Narrated by  :  ['John Krasinski', '(American release)', 'Zhou Xun', '(Chinese release)', 'Claire Keim', '(French release)']
Edited by  :  Matthew Meech
Music by  :  Barnaby Taylor
Production companies  :  ['Disneynature', 'Shanghai Media Group', 'Chuan Films', 'Brian Leith Productions']
Distributed by  :  ['Walt Disney Studios Motion Pictures', 'Shanghai Media Group', '(China)']
Release dates  :  ['August 12, 2016 (China)', 'April 21, 2017 (United States)', 'August 23, 2017 (France)']
Running time  :  76 minutes
Countries  :  ['United States', 'China', 'France']
Languages  :  ['English', 'Mandarin', 'French']
Budget  :  $5–10 million
Box office  :  $25.1 million


## TASK 2: LIST OF DISNEY'S MOVIES
#### 2.1. Loading page

In [None]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
# Stop here with a clear HTTP error instead of parsing an error page
source.raise_for_status()
# Name the parser explicitly so the parse tree does not depend on which
# optional parsers happen to be installed in the environment
webpage = BeautifulSoup(source.content, 'html.parser')

#### 2.2. Get the movies' relative links and access the movie data

In [None]:
# Every link inside an italic title of the sortable wiki tables
movies = webpage.select('.wikitable.sortable i a')

base_path = 'https://en.wikipedia.org/'
movie_info_list = []


# Scrape each linked page; pages that cannot be scraped are reported and skipped
for movie_link in movies:

  try:
    relative_path = movie_link['href']
    # urljoin resolves the href against the site root, so an href starting
    # with '/' does not yield 'https://en.wikipedia.org//wiki/...'
    full_path = urljoin(base_path, relative_path)
    movie_info_list.append(get_movie_info(full_path))

  except Exception as e:
    # Best effort: show the error and the link text, then carry on with the rest
    print(e)
    print(movie_link.get_text())


'NoneType' object has no attribute 'find_all'
61
'NoneType' object has no attribute 'find_all'
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
The Thief
'NoneType' object has no attribute 'find_all'
Tom Sawyer
'NoneType' object has no attribute 'find_all'
Tower of Terror
'NoneType' object has no attribute 'find_all'
FC Barcelona
'NoneType' object has no attribute 'find_all'
Young Woman and the Sea


In [None]:
# Number of links found, then number of pages scraped successfully (one per line)
print(len(movies), len(movie_info_list), sep = '\n')

534
526


The pages listed in the exception output above could not be scraped. This is because the linked pages either do not exist or are not actual movie pages (e.g. the original novel, music, a city, etc.), so no movie infobox was found on them. In this case, we can skip them.

#### 2.3. Save and Reload movie data

In [None]:
import json

def save_file(title, data):
  '''
  Write data as JSON to the file named title.

  The file is opened for writing with UTF-8 encoding (an existing file is
  overwritten). Non-ASCII characters are written as-is rather than escaped,
  and the output is indented by 2 spaces.
  '''
  dump_options = {'ensure_ascii': False, 'indent': 2}
  with open(title, mode = 'w', encoding = 'utf-8') as json_file:
    json.dump(data, json_file, **dump_options)


save_file('disney_data.json', movie_info_list)

## Download the file from Google Colab to the local system.
## google.colab only exists on Colab, so the download is skipped elsewhere
## instead of failing the cell after the file has already been saved.
try:
  from google.colab import files
except ImportError:
  print('Not running on Google Colab: disney_data.json was saved but not downloaded.')
else:
  files.download('disney_data.json')