In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import requests
from bs4 import BeautifulSoup as bs

In [2]:
def get_content_value (row):
    if row.find_all('li'):
        return [li.get_text(' ', strip = True).replace('\xa0',' ') for li in row.find_all('li')]
    elif row.find('br'):
        return [text for text in row.stripped_strings]  
    else:
        return row.find('td').get_text(' ', strip = True).replace('\xa0',' ')
    
   

#  Removing the reference tagsd
def clean_tag(soup):    
    for tag in soup.find_all(['sup','span']):
        tag.decompose()



def grab_info_box(movie_url):
    
    r = requests.get(movie_url)
    webpage = bs (r.content)

    clean_tag(webpage)
    
    info_box = webpage.find(class_ = 'infobox vevent')
    rows = info_box.find_all('tr')
  
    movie_info = {}
    for index,value in enumerate(rows):
        if index == 0:
            movie_info ['title'] = value.find('th').get_text(' ', strip = True)
        else:
            header = value.find('th')
            if header:
                caption = value.find('th').get_text(' ', strip = True)
                content = get_content_value (value)
                movie_info[caption] = content
            
    return movie_info

In [3]:
# Now we want to get the link of all movies in URL = "https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films"

disney_films = requests.get ('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')
disney_movies = bs (disney_films.content)
movies = disney_movies.select('.wikitable.sortable i a')

base_path = 'https://en.wikipedia.org/'
movie_list = []
for index, movie in enumerate(movies):
    try:
        link = movie['href']
        full_path = base_path + link
        title = movie['title']
        movie_list.append(grab_info_box(full_path))
    except:
        continue

In [4]:
        # Saving json
import json

with open('disney_movie_df.json', 'w', encoding='utf-8') as f:
    json.dump(movie_list, f,ensure_ascii = False, indent =2)


Now, we saved our data and we are doing some more cleaning as much as possible


In [5]:
# loading data
import json
with open('disney_movie_df.json', encoding='utf-8') as f:
    data = json.load(f)

 Some sub tasks to do:
 ~~ 1- clean up the refrences [1], [2], etc ~~
 ~~ 2- convert running time to int ~~
 3- convert date into datetime object
 ~~ 4- split up the long string ~~
 ~~5- convert budget and box office to numbers ~~

In [7]:
run_time = [film.get('Running time', 'N/A') for film in data]


In [8]:
def running_time_to_int(running_time) :
    
    if isinstance (running_time, list):
        return int (running_time[1].split(' ')[0])
    elif running_time == 'N/A':
        return None
    else:
        return int (running_time.split(' ')[0]) 



In [9]:
run_time_int = [running_time_to_int(times) for times in run_time]

In [10]:
for i, movies in enumerate(data):
    movies ['run time int'] = run_time_int[i]


    

In [11]:
##  5- convert budget and box office to numbers 

[t.get('Budget', 'N/A') for t in data]


['$1.5 million',
 '$2.6 million',
 '$2.28 million',
 '$600,000',
 '$950,000',
 '$858,000',
 'N/A',
 '$788,000',
 'N/A',
 '$1.35 million',
 '$2.125 million',
 'N/A',
 '$1.5 million',
 '$1.5 million',
 'N/A',
 '$2.2 million',
 '$1.8 million',
 '$3 million',
 'N/A',
 '$4 million',
 '$2 million',
 '$300,000',
 '$1.8 million',
 'N/A',
 '$5 million',
 'N/A',
 '$4 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$700,000',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$6 million',
 'under $1 million or $1,250,000',
 'N/A',
 '$2 million',
 'N/A',
 'N/A',
 '$2.5 million',
 'N/A',
 'N/A',
 '$4 million',
 '$3.6 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$3 million',
 'N/A',
 '$3 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$3 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$4.4–6 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$4 million',
 'N/A',
 '$5 million',
 'N/A',
 'N/A',
 'N/A',
 'N/A',
 '$5 mill

In [12]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value

'''
money_conversion("$12.2 million") --> 12200000 ## Word syntax
money_conversion("$790,000") --> 790000        ## Value syntax
'''
def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
        
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None



In [13]:
for movie in data:
    movie['budget (int)'] = money_conversion(movie.get('Budget', 'N/A'))
    movie['Box office (int)'] = money_conversion(movie.get('Box office', 'N/A'))

In [14]:
dates = [movie.get('Release dates', 'N/A') for movie in data]
dates

[['December 21, 1937 ( Carthay Circle Theatre )',
  'February 4, 1938 (United States)'],
 ['February 7, 1940 ( Center Theatre )', 'February 23, 1940 (United States)'],
 'N/A',
 'N/A',
 ['October 23, 1941 (New York City)', 'October 31, 1941 (U.S.)'],
 ['August 9, 1942 ( London )', 'August 21, 1942 (United States)'],
 ['August 24, 1942 (Rio de Janeiro)',
  'February 6, 1943 (Boston)',
  'February 19, 1943 (United States)'],
 'N/A',
 ['December 21, 1944 (Mexico City)', 'February 3, 1945 (United States)'],
 ['April 20, 1946 (New York City)', 'August 15, 1946 (United States)'],
 ['November 12, 1946 (premiere)', 'November 20, 1946'],
 'N/A',
 'N/A',
 ['November 29, 1948 (Chicago)', 'January 19, 1949 (Indianapolis)'],
 'N/A',
 ['February 15, 1950 (Boston)', 'March 4, 1950 (United States)'],
 ['June 22, 1950 (London)', 'July 29, 1950 (United States)', 'July 19, 1950'],
 ['July 26, 1951 (London)',
  'July 28, 1951 (New York City)',
  'September 14, 1951 (United States)'],
 ['13 March 1952 (Lond

In [15]:
from datetime import datetime

def clean_date (date):
    return date.split('(')[0].strip()


def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
    
    if date == 'N/A':
        return None
    
    date_str = clean_date(date)
    print(date_str)

    fmts = ['%B %d, %Y', '%d %B %Y'] 

    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass

In [16]:
for date in dates:
    print (date_conversion(date))
    print()
    

December 21, 1937
1937-12-21 00:00:00

February 7, 1940
1940-02-07 00:00:00

None

None

October 23, 1941
1941-10-23 00:00:00

August 9, 1942
1942-08-09 00:00:00

August 24, 1942
1942-08-24 00:00:00

None

December 21, 1944
1944-12-21 00:00:00

April 20, 1946
1946-04-20 00:00:00

November 12, 1946
1946-11-12 00:00:00

None

None

November 29, 1948
1948-11-29 00:00:00

None

February 15, 1950
1950-02-15 00:00:00

June 22, 1950
1950-06-22 00:00:00

July 26, 1951
1951-07-26 00:00:00

13 March 1952
1952-03-13 00:00:00

None

None

None

26 October 1953
1953-10-26 00:00:00

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

June 24, 1959
1959-06-24 00:00:00

None

None

None

None

June 1960
None

None

None

None

None

None

None

None

None

None

None

None

None

None

November 14, 1962
1962-11-14 00:00:00

None

None

None

None

None

December 12, 1963
1963-12-12 00:00:00

None

None

11 December 1963
1963-12-11 00:00:00

None


In [17]:
for movie in data:
    movie['Release date in datetime'] = date_conversion(movie.get('Release dates', 'N/A'))

December 21, 1937
February 7, 1940
October 23, 1941
August 9, 1942
August 24, 1942
December 21, 1944
April 20, 1946
November 12, 1946
November 29, 1948
February 15, 1950
June 22, 1950
July 26, 1951
13 March 1952
26 October 1953
June 24, 1959
June 1960
November 14, 1962
December 12, 1963
11 December 1963
August 27, 1964
February 4, 1966
June 29, 1966
June 23, 1967
December 24, 1968
December 11, 1970
October 7, 1971
December 17, 1976
Release dates
Release dates
December 18, 1979
April 17, 1980
December 6, 1980
Release dates
3 October 1990
September 29, 1991
November 5, 1994
June 10, 1995
January 22, 1995
November 19, 1995
April 12, 1996
June 19, 1996
November 18, 1996
March 7, 1997
November 16, 1997
June 5, 1998
July 20, 1998
November 14, 1998
Release dates
June 12, 1999
May 21, 1999
November 13, 1999
December 17, 1999
May 13, 2000
December 10, 2000
February 10, 2001
June 3, 2001
July 29, 2001
October 28, 2001
February 10, 2002
June 16, 2002
July 21, 2002
October 23, 2002
November 6, 200

save data as json and csv and pickle

In [None]:
import pickle

with open('disney_movie_cleaned_data.pickle', 'wb') as f:
    pickle.dump(data, f, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
disney_movies_copy = [movie.copy() for movie in data]

for movie in disney_movies_copy:  # converting datetime to str for saving in json
    if movie['Release date in datetime']:
        movie['Release date in datetime'] = movie['Release date in datetime'].strftime('%B %d, %Y')
    else: 
        movie['Release date in datetime'] = None

import json

with open('final_data.json', 'w') as fp:
    json.dump(disney_movies_copy, fp)


In [None]:
import pandas as pd

df = pd.DataFrame(data)
df.to_csv('final_dataset.csv')