# Web Scraping Disney movies

Goal: Scrape and clean data from Wikipedia using Beautiful Soup & Regex, add movie information via OMBD API and save as DataFrame, ready for analysis

In [112]:
import requests
from bs4 import BeautifulSoup
import re
import json
import os
import pandas as pd

### 1. Get wiki infobox for Toy Story 3

In [113]:
toystory = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')

In [114]:
soup = BeautifulSoup(toystory.content)

In [115]:
info_box = soup.find(class_='infobox vevent')
info_rows = info_box.find_all('tr')

In [116]:
movie_info = {}

def multi_data(r):
    if r.find('li'):
        return [li.get_text(' ', strip=True) for li in r.find_all('li')]
    else:
        return r.get_text(' ', strip=True)

for index,row in enumerate(info_rows):
    if index==0:
        movie_info['title']=row.find('th').get_text()
    else:
        header = row.find('th')
        if header:
            movie_info[row.find('th').get_text(' ', strip=True)]=multi_data(row.find('td'))
             
movie_info

{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Produced by': 'Darla K. Anderson',
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Music by': 'Randy Newman',
 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['June\xa012,\xa02010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June\xa018,\xa02010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200\xa0million [1]',
 'Box office': '$1.067\xa0billion [1]'}

### 2. List of dictionnaries with infoboxes of all Disney movies

In [117]:
disneymovies = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

In [118]:
soup = BeautifulSoup(disneymovies.content)

In [119]:
wiki = []

for tr in soup.find_all('tbody'):
    for i in tr.find_all('tr'):
        for a in i.find_all('i'):
            for href in a.find_all('a'):
                wiki.append(href.get('href'))
                
wiki = wiki[1:]
wiki[2]

'/wiki/Pinocchio_(1940_film)'

In [9]:
disney_movies=[]

for href in wiki:
    
    try:
        url = requests.get('https://en.wikipedia.org' + href)
        soup = BeautifulSoup(url.content)
        info_box = soup.find(class_='infobox vevent')
        info_rows = info_box.find_all('tr')

        movie_info = {}

        def multi_data(r):
            if r.find('li'):
                return [li.get_text(' ', strip=True) for li in r.find_all('li')]
            elif r.find('br'):
                return [text for text in r.stripped_strings]
            else:
                return r.get_text(' ', strip=True)

        for index,row in enumerate(info_rows):
            if index==0:
                movie_info['title']=row.find('th').get_text()
            else:
                header = row.find('th')
                if header:
                    movie_info[row.find('th').get_text(' ', strip=True)]=multi_data(row.find('td'))

        disney_movies.append(movie_info)
        
    except Exception as e:
        print(href)
        print(e)

/wiki/Zorro_(1957_TV_series)#Theatrical
'NoneType' object has no attribute 'find'
/wiki/Zorro_(1957_TV_series)#Theatrical
'NoneType' object has no attribute 'find'
/wiki/True-Life_Adventures
'NoneType' object has no attribute 'find_all'
/wiki/The_Omega_Connection
'NoneType' object has no attribute 'find'
/wiki/True-Life_Adventures#Films
'NoneType' object has no attribute 'find_all'
/wiki/Tim_Federle#Fiction
'NoneType' object has no attribute 'find_all'


In [10]:
len(disney_movies)

449

In [11]:
def save_json(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [12]:
save_json('disney_movies.json',disney_movies)

In [13]:
def load_json(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [42]:
disney_movies = load_json('disney_movies.json')

### 3. Clean-up of data

3.1 remove references

In [43]:
disney_movies[2]['Budget']

'$2.6 million [3]'

In [44]:
for movie in disney_movies:
    for k,v in movie.items():
        if isinstance(v,str):
                movie[k] = re.sub(r'\s\[\d+\]','',v)
        else:
            for index,item in enumerate(v):
                v[index] = re.sub(r'\s\[\d+\]','',item)              
disney_movies[2]['Budget']

'$2.6 million'

3.2. remove non-breaking space

In [45]:
disney_movies[2]['Box office']

'$164\xa0million'

In [46]:
for movie in disney_movies:
    for k,v in movie.items():
        if isinstance(v,str):
                movie[k] = v.replace('\xa0',' ')
        else:
            for index,item in enumerate(v):
                v[index] = item.replace('\xa0',' ')
disney_movies[2]['Box office']

'$164 million'

In [47]:
def min_to_int(running_time):
    if running_time == 'N/A':
        return None
    if isinstance(running_time, list):
        first_entry = running_time[0]
        return int(first_entry.split(' ')[0])
    else:
        return int(running_time.split(' ')[0])

3.3 running time to int

In [48]:
disney_movies[2]['Running time']

'88 minutes'

In [49]:
for movie in disney_movies:
    movie['Running time (int)'] = min_to_int(movie.get('Running time','N/A'))
disney_movies[2]['Running time (int)']

88

3.4. release date to dt

In [50]:
def date_convert(date):
    if isinstance(date, list):
        date = date[0]
    if date == 'N/A':
        return None
    date = date.split('(')[0].strip()
    return date

In [51]:
disney_movies[2]['Release date']

['February 7, 1940 ( 1940-02-07 ) ( Center Theatre )',
 'February 23, 1940 ( 1940-02-23 ) (United States)']

In [52]:
for movie in disney_movies:
    movie['Release date (datetime)'] = date_convert(movie.get('Release date','N/A'))
disney_movies[2]['Release date (datetime)']

'February 7, 1940'

3.5. budget & box office to int format

In [53]:
def dollar_budgetbox(ex):
    if ex == 'N/A':
        return None
    if isinstance(ex,list):
        ex = ex[0]
    if bool(re.search('\$',ex)):
        ex = re.search('\$(.*)',ex).group()
        if bool(re.search('million',ex)):
            if bool(re.search('\d+\.\d+',ex)):
                return (int(re.search('\$.?\d+',ex).group()[1:])*1000000
                        + int(re.search('\.\d+',ex).group()[1:])
                        *10**(6-len(re.search('\.\d+',ex).group()[1:])))
            else:
                return (int(re.search('\$.?\d+',ex).group()[1:])*1000000)
        else:
            if bool(re.search('\d+\.\d+',ex)):
                return int(re.search('\$.?\d+',''.join(re.split('\.',ex))).group()[1:])
            else:
                return int(re.search('\$.?\d+',''.join(re.split('\,',ex))).group()[1:])

In [54]:
for movie in disney_movies:
    movie['budget in dollars'] = dollar_budgetbox(movie.get('Budget','N/A'))
disney_movies[2]['budget in dollars']

2600000

In [55]:
for movie in disney_movies:
    movie['box in dollars'] = dollar_budgetbox(movie.get('Box office','N/A'))
disney_movies[2]['box in dollars']

164000000

In [56]:
disney_movies[2]

{'title': 'Pinocchio',
 'Directed by': ['Ben Sharpsteen',
  'Hamilton Luske',
  'Bill Roberts',
  'Norman Ferguson',
  'Jack Kinney',
  'Wilfred Jackson',
  'T. Hee'],
 'Story by': ['Ted Sears',
  'Otto Englander',
  'Webb Smith',
  'William Cottrell',
  'Joseph Sabo',
  'Erdman Penner',
  'Aurelius Battaglia'],
 'Based on': ['The Adventures of Pinocchio', 'by', 'Carlo Collodi'],
 'Produced by': 'Walt Disney',
 'Starring': ['Cliff Edwards',
  'Dickie Jones',
  'Christian Rub',
  'Walter Catlett',
  'Charles Judels',
  'Evelyn Venable',
  'Frankie Darro'],
 'Music by': ['Leigh Harline', 'Paul J. Smith'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['February 7, 1940 ( 1940-02-07 ) ( Center Theatre )',
  'February 23, 1940 ( 1940-02-23 ) (United States)'],
 'Running time': '88 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$2.6 million',
 'Box office': '$164 million',
 'Running time (int)': 88,
 '

### 4. Adding ratings from movie API

In [57]:
def get_ombd(title):
    ombd_url = 'http://www.omdbapi.com/?'
    full_url = ombd_url + 'apikey=' + os.environ['OMDB_KEY'] + '&t=' + title
    return requests.get(full_url).json()

get_ombd('One Hundred and One Dalmatians')

{'Title': 'One Hundred and One Dalmatians',
 'Year': '1961',
 'Rated': 'G',
 'Released': '25 Jan 1961',
 'Runtime': '79 min',
 'Genre': 'Animation, Adventure, Comedy',
 'Director': 'Clyde Geronimi, Hamilton Luske, Wolfgang Reitherman',
 'Writer': 'Bill Peet, Dodie Smith',
 'Actors': "Rod Taylor, Betty Lou Gerson, J. Pat O'Malley",
 'Plot': 'When a litter of Dalmatian puppies are abducted by the minions of Cruella de Vil, the parents must find them before she uses them for a diabolical fashion statement.',
 'Language': 'English',
 'Country': 'United States',
 'Awards': '1 win & 1 nomination total',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BZGMyMjE4OGUtNGZmMC00YzdmLThkMWYtZWIzMmEzNjA4MzVkXkEyXkFqcGdeQXVyMTQxNzMzNDI@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '7.3/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '98%'},
  {'Source': 'Metacritic', 'Value': '83/100'}],
 'Metascore': '83',
 'imdbRating': '7.3',
 'imdbVotes': '159,069',
 'imdbID': 'tt0

In [58]:
def get_tomato(movie):
    score = movie.get('Ratings',[])
    for s in score:
        if s['Source'] == 'Rotten Tomatoes':
            return s['Value']
    return None

get_tomato(get_ombd('One Hundred and One Dalmatians'))

'98%'

In [59]:
for movie in disney_movies:
    title = movie['title']
    movie['imbd'] = get_ombd(title).get('imdbRating',None)
    movie['metascore'] = get_ombd(title).get('Metascore',None)
    movie['rotten_tomatoes'] = get_tomato(get_ombd(title))

In [60]:
disney_movies[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Produced by': 'Walt Disney',
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Music by': 'George Bruns',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961 ( 1961-01-25 )'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Release date (datetime)': 'January 25, 1961',
 'budget in dollars': 3600000,
 'box in dollars': 303000000,
 'imbd': '7.3',
 'metascore': '83',
 'rotten_tomatoes': '98%'}

### 5. Save in dataframe and csv

In [101]:
import pandas as pd
df = pd.DataFrame(disney_movies)

In [102]:
def release_datetime(date):

    ftms = ["%B %d, %Y","%d %B %Y"]

    for fmt in ftms:
        try:
            return pd.to_datetime(date,format=fmt)
        except:
            pass

In [103]:
df['Release date (datetime)'] = df['Release date (datetime)'].apply(release_datetime)

In [104]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 449 entries, 0 to 448
Data columns (total 37 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   title                    449 non-null    object        
 1   Production company       190 non-null    object        
 2   Release date             443 non-null    object        
 3   Running time             437 non-null    object        
 4   Country                  401 non-null    object        
 5   Language                 432 non-null    object        
 6   Box office               374 non-null    object        
 7   Running time (int)       437 non-null    float64       
 8   Release date (datetime)  437 non-null    datetime64[ns]
 9   budget in dollars        284 non-null    float64       
 10  box in dollars           369 non-null    float64       
 11  imbd                     435 non-null    object        
 12  metascore                435 non-nul

In [105]:
df.drop(df.columns[[2,3,6,21,23,26,28,30,31,32,33,34,35,36]],axis=1,inplace=True)

In [106]:
df.shape

(449, 23)

In [107]:
df[df.title=='One Hundred and One Dalmatians']

Unnamed: 0,title,Production company,Country,Language,Running time (int),Release date (datetime),budget in dollars,box in dollars,imbd,metascore,...,Based on,Produced by,Starring,Music by,Distributed by,Story by,Cinematography,Edited by,Screenplay by,Production companies
50,One Hundred and One Dalmatians,Walt Disney Productions,United States,English,79.0,1961-01-25,3600000.0,303000000.0,7.3,83,...,"[The Hundred and One Dalmatians, by, Dodie Smith]",Walt Disney,"[Rod Taylor, Cate Bauer, Betty Lou Gerson, Ben...",George Bruns,Buena Vista Distribution,Bill Peet,,"[Roy M. Brewer, Jr., Donald Halliday]",,


In [108]:
df.to_csv('disney_movies.csv')