In [1]:
from bs4 import BeautifulSoup
import requests
import json

###### Scraping data for disney movies from wikipedia

In [2]:
# To get info box of a single movie wiki page.

# sometimes there are multiple values in <td> and we want them to be in list, for that this function.
def get_td_values(td_tag): #this function will be called by function 'get_info_box'
    if td_tag.find('li'):  #when <td> has multiple values in <li> tags.
        #returning a list via list comprehension.
        return [li.get_text(' ', strip= True).replace('\xa0', ' ') for li in td_tag.find_all('li')] 
    elif td_tag.find('br'): #when <td> has multiple values in <br> tags.
        return [text for text in td_tag.stripped_strings] #strings separated by <br> are now in list.
    else:  #when <td> has only one content.
        return td_tag.get_text(' ', strip= True).replace('\xa0', ' ')
        
def clean_tags(soup_obj):
    for tag in soup_obj.find_all(['sup', 'span']): #cleaning some tags whose content we don't want via get_text()
        tag.decompose()
        
def get_info_box(url):
    
    source = requests.get(url).content
    soup = BeautifulSoup(source, 'html.parser')
    info_box = soup.find('table', class_= 'infobox vevent')
    all_tr = info_box.find_all('tr') #getting all table rows where our content lies.
    clean_tags(soup)
    
    movie_info = {}
    for index, each_row in enumerate(all_tr):
        if index == 0:
            movie_info['Title'] = each_row.find('th').get_text(' ', strip= True)
        else:
            header = each_row.find('th')
            if header:
                row_key = each_row.find('th').get_text(' ', strip= True) #words in th will be joined by space and whitespace will be gone
                row_content = get_td_values(each_row.find('td')) #calling a function on <td>.
                movie_info[row_key] = row_content 
    return movie_info
# get_info_box('https://en.wikipedia.org/wiki/Ponyo')

In [3]:
# Code to loop through list of disney movies and run above function on each movie page.
source = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films').content
soup = BeautifulSoup(source, 'html.parser')
movies_info_list = [] 
base_url = 'https://en.wikipedia.org/'
movies_list = soup.select('.wikitable.sortable i a') #it is finding 'wikitable sortable' class in that, finds <i> having <a> tag. 
for movie in movies_list:
    try:
        relative_url = movie['href']
        full_url = base_url + relative_url
        movies_info_list.append(get_info_box(full_url))
    except Exception as e:
        print(e)
        print(movie.get_text())

'NoneType' object has no attribute 'find'
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find_all'
True-Life Adventures


In [4]:
len(movies_info_list)

437

###### Saving/loading scraped data

In [3]:
def save_data(filename, data):
    with open(filename, 'w', encoding= 'utf-8') as file:
        json.dump(data, file, ensure_ascii= False, indent= 2)
        
def load_data(filename):
    with open(filename, encoding= 'utf-8') as file:
        return json.load(file)

In [6]:
save_data('disney_movies_data.json', movies_info_list) #saving data in json so can be accessed easily next time.

In [4]:
# to access saved data
movies_info_list = load_data('disney_movies_data.json')

###### Running time conversions

In [8]:
# function takes movie dict and gives int if 'Running time' present in dict.
def int_running_time(movie):
    if 'Running time' in movie.keys(): #if there is running time.
        if isinstance(movie['Running time'], list): #when running time is in list with multiple values.
            return int(movie['Running time'][0].split(' ')[0]) #for just int 
        else:  #no list just single value
            return int(movie['Running time'].split(' ')[0])
    return None  #when movie dict has no 'Running time'.      

for movie in movies_info_list:
    movie['Running Time(Min)'] = int_running_time(movie) #new key added to dict.

save_data('disney_movies_data.json', movies_info_list) #saving modifications made in variable 'movies_info_list'.

###### Money conversions

In [6]:
import re

num = r'\d+(,\d{3})*\.*\d*' #'856', '122,322', '12.7' etc.
word = r'(thousand|million|billion)' 
money_num = fr'\${num}' 
money_num_word = fr'{money_num}(-|\sto\s|–)?({num})?\s{word}'  #fr'' for both formatted and raw string.

def money_word_conversion(string_word): #takes thousand,million,billion can gives their num factors.
    money_word_dict = {'thousand': 1000, 'million': 1000000, 'billion': 1000000000}
    return money_word_dict.get(string_word.lower(), 1)  #if string_word not in dict, return 1.


# takes diff. forms of money like '$856', '$122,322', '$12.7 million' and returns its single number value.
def money_conversion(money):
    if money == 'N/A':
        return None
    
    if isinstance(money, list):  #when there are multiple money values given in a list.
        money = money[0]

    #if there is match along with money word like '$12.7 million'
    if re.search(money_num_word, money, flags= re.I):  #flags= re.I ignores case of word. 
        string_match = re.search(money_num_word, money, flags= re.I).group()
        string_num = re.search(num, string_match).group() #finding only num from entire string match. 
        string_word = re.search(word, string_match, flags= re.I).group() #only money word from entire string match. 
        return (float(string_num.replace(',', '')) * money_word_conversion(string_word))

    #without money word like '$122,322'
    elif re.search(money_num, money): 
        string_num = re.search(num, money).group() 
        return float(string_num.replace(',', '')) #returning float money without unwanted chars. 
    
    else:
        return None

In [7]:
#cleaning diff. forms of money values into a single float values in new dict keys.
for movie in movies_info_list:
    movie['Budget($)'] = money_conversion(movie.get('Budget', 'N/A'))
    movie['Box office($)'] = money_conversion(movie.get('Box office', 'N/A'))

save_data('disney_movies_data.json', movies_info_list) #saving modifications made in variable 'movies_info_list'.

###### Datetime cleaning

In [2]:
from datetime import datetime

def date_cleaning(date): #removing brackets and all uneccessry stuff.
    return date.split('(')[0].strip() #.strip() strips whitespaces from start and end.

def date_conversion(date):
    if isinstance(date, list): 
        date = date[0]
    if date == 'N/A':
        return None
    
    date_str = date_cleaning(date)
    
    date_formats = ['%B %d, %Y', '%d %B %Y']  #trying different formats that are present to be converted in datetime obj.
    for fmt in date_formats:
        try:
            return datetime.strptime(date_str, fmt).date()
        except Exception as e:
            pass
            
    return None
    
# date_conversion('23 December 2017 ( Spencer House )')

In [25]:
for movie in movies_info_list:
    movie['Release date(datetime)'] = date_conversion(movie.get('Release date', 'N/A')) 

In [5]:
# Now we will use pickle to save our data as jason doesn't allow datetime objects.
import pickle

def pickle_save(filename, data):
    with open(filename, 'wb') as file:
        pickle.dump(data, file)

def pickle_load(filename):
    with open(filename, 'rb') as file:
        return pickle.load(file)

In [None]:
pickle_save('cleaned_disney_movies_data.pickle', movies_info_list) #saving modifications done.

###### Attaching IMDB, Rotten Tomatoes and metascore

In [6]:
movies_data = pickle_load('cleaned_disney_movies_data.pickle')

In [13]:
import urllib
import os #as we will be using local envirnment vars for api key.

def get_omdb_info(title):
    base_url = 'http://www.omdbapi.com/?'
    parameters = {'apikey': os.environ['omdb_api_key'], 't':title} #according to api these parameters will be required
    params_encoded = urllib.parse.urlencode(parameters) #joining parameters in url form.
    full_url = base_url + params_encoded
    return requests.get(full_url).json()  #to get the html response in json format

def get_rotten_tomatoes(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None


In [None]:
for movie in movies_data:
    title = movie['Title']
    omdb_info = get_omdb_info(title)
    movie['IMDB score'] = omdb_info.get('imdbRating', None)
    movie['Metascore'] = omdb_info.get('Metascore', None)
    movie['Rotten Tomatoes'] = get_rotten_tomatoes(omdb_info)
    print('s')

pickle_save('cleaned_disney_movies_data.pickle', movies_data) #saving modifications

###### Final saving

In [7]:
movies_data_copy = [movie.copy() for movie in movies_data] #as lists are mutable

In [20]:
# Easy way to get it into csv :)
import pandas as pd
df = pd.DataFrame(movies_data_copy)
df.to_csv('disney_movies_data_final.csv')