In [1]:
from bs4 import BeautifulSoup as bs
import requests
import re
import time

In [2]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

soup = bs(r.content)

contents = soup.prettify()

In [2]:
def get_listed_data(data):
    patt = data.get_text(" ", strip=True)
    if data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in data.find_all('li')]
    elif data.find('br'):
        return [text for text in data.stripped_strings]
    else:
        return data.get_text(" ", strip=True).replace("\xa0", " ")

def clean_tags(soup):
    for tag in soup.find_all('sup'):
        tag.decompose()
    

    
def get_movie_info_to_dict(href):
    req = "https://en.wikipedia.org" + href
    r = requests.get(req)
    soup = bs(r.content)
    
    clean_tags(soup)
    
    info_box = soup.find(class_ = "infobox vevent")
    info_rows = info_box.find_all("tr")

    movie_info = {}    
    for index, value in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = value.find('th').get_text(" ", strip=True)
        else:
            header = value.find('th')
            if header:  
                key = value.find(class_ = "infobox-label").get_text(" ", strip=True)
                content = get_listed_data(value.find(class_ = "infobox-data"))
                movie_info[key] = content
    return movie_info



In [3]:
get_movie_info_to_dict('/wiki/Howl%27s_Moving_Castle_(film)')

{'title': "Howl's Moving Castle",
 'Japanese': 'ハウルの動く城',
 'Hepburn': 'Hauru no Ugoku Shiro',
 'Directed by': 'Hayao Miyazaki',
 'Screenplay by': 'Hayao Miyazaki',
 'Based on': ["Howl's Moving Castle", 'by', 'Diana Wynne Jones'],
 'Produced by': 'Toshio Suzuki',
 'Starring': ['Chieko Baisho', 'Takuya Kimura', 'Akihiro Miwa'],
 'Cinematography': 'Atsushi Okui',
 'Edited by': 'Takeshi Seyama',
 'Music by': 'Joe Hisaishi',
 'Production company': 'Studio Ghibli',
 'Distributed by': 'Toho',
 'Release dates': ['5 September 2004 ( 2004-09-05 ) ( Venice )',
  '20 November 2004 ( 2004-11-20 ) (Japan)'],
 'Running time': '119 minutes',
 'Country': 'Japan',
 'Language': 'Japanese',
 'Budget': ['¥', '2.4 billion', 'US$24 million'],
 'Box office': ['¥', '23.2 billion', 'US$236 million (worldwide)']}

In [5]:
def href_preprop(href):
    regex = '^/.+'
    pattern = re.compile(regex)
    return pattern.search(href) is not None


r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")
soup = bs(r.content)

tables_of_movies = soup.find_all(class_="wikitable sortable")

tables_of_movies.pop(len(tables_of_movies)-1)

hrefs = []


for tables in tables_of_movies:
    for i in tables.find_all("i"):
        for a in i.find_all("a", href=True):
            if href_preprop(a['href']):
                hrefs.append(a['href'])
                
  



In [6]:
movies = []
for index, href in enumerate(hrefs):
    if index % 10 == 0:
        print(index)
    try:
        movies.append(get_movie_info_to_dict(href))
    except Exception as ex:
        print(href)
        print(ex)
        continue

0
10
20
30
40
/wiki/Zorro_(1957_TV_series)#Theatrical
'NoneType' object has no attribute 'get_text'
/wiki/Zorro_(1957_TV_series)#Theatrical
'NoneType' object has no attribute 'get_text'
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
/wiki/The_Beatles:_Get_Back#The_Beatles:_Get_Back_–_The_Rooftop_Concert
'NoneType' object has no attribute 'get_text'
500
/wiki/Wish_(2023_film)
'NoneType' object has no attribute 'find_all'
/wiki/Elio_(film)
'NoneType' object has no attribute 'find_all'
510


In [4]:
import json

def save_data(title, data):
    with open(title, "w", encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

In [5]:
import json

def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [8]:
movies = load_data("disney_movies_data_float.json")

In [9]:
def minutes_to_int(run_time):
    if run_time == "N/A":
        return None
    if isinstance(run_time, list):
        return int(run_time[0].split(" ")[0])
    else:
         return int(run_time.split(" ")[0])

#for movie in movies:
#    movie["Running time (int)"] = minutes_to_int(movie.get("Running time", 'N/A'))


In [12]:
import re

def get_float_nums(a):
    for index, budget in enumerate(a):
        if budget == "N/A":
            a[index] = budget #print(budget, index)
        if isinstance(budget, list):
            for elem in budget:
                pattern_1 = re.findall(r"~?\$[0-9]+[.0-9]*[–|-|—|-][0-9]+ million", elem)
                pattern_2 = re.findall(r"~?\$[0-9]+\,0+", elem)
                pattern_3 = re.findall(r"~?\$[0-9]+\.*[0-9]* million", elem) 
                if len(pattern_1)>0:
                    res = budget[0].split("-")[0].replace('$', '').replace('~', '')
                    res = res.split("-")[0]
                    res = res.split("—")[0]
                    a[index] = res.split("–")[0] + ' million'
                elif len(pattern_2)>0:
                    a[index] = pattern_2[0].replace(',000',' k').replace('$', '').replace('~', '')#print(pattern_1[0], index)
                elif len(pattern_3)>0:
                    a[index] = pattern_3[0].replace(' million',' m').replace('$', '').replace('~', '') #print(pattern_2[0], index)
                else:
                    a[index] = "N/A"

        else: 
            pattern_1 = re.findall(r"~?\$[0-9]+[.0-9]*[–|-|—|-][0-9]+ million", budget)
            pattern_2 = re.findall(r"~?\$[0-9]+\,0+", budget)
            pattern_3 = re.findall(r"~?\$[0-9]+\.*[0-9]* million", budget) 
            pattern_4 = re.findall(r"~?\$[0-9]+\,[0-9,]+", budget)
            if len(pattern_1)>0:
                res = budget.split("—")[0].replace('$', '').replace('~', '')
                res = res.split("-")[0]
                res = res.split("-")[0]
                a[index] = res.split("–")[0] + ' m'
            elif len(pattern_2)>0:
                a[index] = pattern_2[0].replace(',000',' k').replace('$', '').replace('~', '')#print(pattern_1[0], index)
            elif len(pattern_3)>0:
                a[index] = pattern_3[0].replace(' million',' m').replace('$', '').replace('~', '') #print(pattern_2[0], index)
            elif len(pattern_4)>0:
                a[index] = pattern_4[0].replace('00,000',' m').replace('$', '').replace('~', '').replace(',', '.')
            else:
                a[index] = "N/A"

    for index, budget in enumerate(a):
        if budget == "N/A":
            a[index] = None
        elif "m" in budget:
            a[index] = float(budget.replace(' m', '')) * 1000000
        elif "k" in budget:
            a[index] = float(budget.replace(' k', '')) * 1000
        else:
            a[index] = float(budget.replace(',', '').replace('.', ''))
            
    return a

[418000000.0, 164000000.0, 83300000.0, 960000.0, 1300000.0, 267399999.99999997, 1135000.0, 799000.0, 3355000.0, 3275000.0, 65000000.0, 3165000.0, 2560000.0, 575000.0, 1625000.0, 182000000.0, 4099999.9999999995, 3500000.0, 2100000.0, 87400000.0, 1000000.0, 2600000.0, None, 1750000.0, 28200000.0, 2150000.0, 187000000.0, 2100000.0, 1600000.0, 1700000.0, None, None, 2750000.0, None, 1750000.0, 6250000.0, None, 1800000.0, 2500000.0, 51600000.0, 12300000.0, 2300000.0, 1700000.0, 3100000.0, None, 3750000.0, 2300000.0, None, 40000000.0, 303000000.0, 11426000.0, 25100000.0, None, None, 4600000.0, 3500000.0, 5000000.0, None, None, None, 21745500.0, 22100000.0, 2550000.0, 3000.0, 4350000.0, 4200000.0, 22200000.0, 1600000.0, 4000.0, 2250000.0, 3500000.0, 103100000.0, 3500000.0, 1275000.0, 4000.0, 280.0, 6200000.0, 22565634.0, None, 16207116.0, 3000.0, 1900000.0, 4000.0, 378000000.0, None, 5000000.0, 21540050.0, 2250000.0, 4150000.0, 3300000.0, 51300000.0, 1300000.0, None, 5500000.0, None, 18607492

In [13]:
arr_of_float_budget = get_float_nums([movie.get("Budget", "N/A") for movie in movies])
arr_of_float_box_office = get_float_nums([movie.get("Box office", "N/A") for movie in movies])

for index, movie in enumerate(movies):
    movie["Budget (float)"] = arr_of_float_budget[index]

for index, movie in enumerate(movies):
    movie["Box office (float)"] = arr_of_float_box_office[index]

In [23]:
from datetime import datetime 

arr_of_str_data = [movie.get("Release dates", "N/A") for movie in movies]

def clean_data(date):
    return date.split("(")[0].strip()

def date_convertation(date):
    if isinstance(date, list):
        date = date[0]
    
    if date == "N/A":
        return None
    
    date_str = clean_data(date)
    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None

for date in arr_of_str_data:
    print(date_convertation(date))
    print()


1937-12-21 00:00:00

1940-02-07 00:00:00

None

None

1941-10-23 00:00:00

1942-08-09 00:00:00

1942-08-24 00:00:00

None

1944-12-21 00:00:00

1946-04-20 00:00:00

1946-11-12 00:00:00

None

None

1948-11-29 00:00:00

None

1950-02-15 00:00:00

1950-06-22 00:00:00

1951-07-26 00:00:00

1952-03-13 00:00:00

None

None

None

1953-10-26 00:00:00

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

1959-06-24 00:00:00

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

None

1962-11-14 00:00:00

None

None

None

None

None

1963-12-12 00:00:00

None

None

1963-12-11 00:00:00

None

1964-08-27 00:00:00

None

None

None

None

1966-02-04 00:00:00

1966-06-25 00:00:00

None

None

None

None

None

None

None

1967-06-23 00:00:00

None

None

None

None

1968-12-24 00:00:00

None

None

None

None

None

None

1970-12-11 00:00:00

None

None

None

None

None

None

None

None

None

In [24]:
for movie in movies:
    movie["Release dates (datetime)"] = date_convertation(movie.get("Release dates", "N/A"))

movies[1]    

{'title': 'Pinocchio',
 'Directed by': ['Ben Sharpsteen',
  'Hamilton Luske',
  'Bill Roberts',
  'Norman Ferguson',
  'Jack Kinney',
  'Wilfred Jackson',
  'T. Hee'],
 'Story by': ['Ted Sears',
  'Otto Englander',
  'Webb Smith',
  'William Cottrell',
  'Joseph Sabo',
  'Erdman Penner',
  'Aurelius Battaglia'],
 'Based on': ['The Adventures of Pinocchio', 'by', 'Carlo Collodi'],
 'Produced by': 'Walt Disney',
 'Starring': ['Cliff Edwards',
  'Dickie Jones',
  'Christian Rub',
  'Walter Catlett',
  'Charles Judels',
  'Evelyn Venable',
  'Frankie Darro'],
 'Music by': ['Leigh Harline', 'Paul J. Smith'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['February 7, 1940 ( 1940-02-07 ) ( Center Theatre )',
  'February 23, 1940 ( 1940-02-23 ) (United States)'],
 'Running time': '88 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$2.6 million',
 'Box office': '$164 million',
 'Running time (int)': 88,
 

In [42]:
import pickle

def save_date_pickle(name, data):
    with open(name, "wb") as f:
        pickle.dump(data, f)

In [43]:
import pickle

def load_date_pickle(name):
    with open(name, "rb") as f:
        return pickle.load(f)  

In [44]:
save_date_pickle("disney_movies_data_clean.pickle", movies)

In [45]:
a = load_date_pickle("disney_movies_data_clean.pickle")

In [67]:
import os
import urllib
import requests

def get_omdb_info(title):
    base_url = "https://www.omdbapi.com/?"
    parameters = {
        "apikey":"e29e5c69", #os.environ['OMDb_API_Key'],
        "t":title
    }
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get("Ratings", [])
    for rating in ratings:
        if rating['Source'] == "Rotten Tomatoes":
            return rating["Value"]
    return None


In [68]:
for movie in movies:
    title = movie["title"]
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get("imdbRating", None)
    movie['Metascore'] = omdb_info.get("Metascore", None)
    movie['Rotten Tomatoes'] = get_rotten_tomato_score(omdb_info)

In [70]:
save_date_pickle("disney_movies_data_clean_final.pickle", movies)

In [71]:
import pandas as pd

df = pd.DataFrame(movies)
df.to_csv("disney_movies_data_clean_final.csv")

In [73]:
df.head(-5)

Unnamed: 0,title,Directed by,Written by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Release dates,...,Countries,Color process,Production companies,Japanese,Hepburn,Adaptation by,Traditional,Simplified,Original title,Layouts by
0,Snow White and the Seven Dwarfs,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...","[Snow White, by The, Brothers Grimm]",Walt Disney,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Frank Churchill, Paul Smith, Leigh Harline]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( 1937-12-21 ) ( Carthay Ci...",...,,,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,"[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( 1940-02-07 ) ( Center Thea...",...,,,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...",,,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,Walt Disney Productions,RKO Radio Pictures,,...,,,,,,,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",,Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,,...,,,,,,,,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...",,"[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Edward Brophy, Verna Felton, Cliff Edwards, H...","[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 ( 1941-10-23 ) (New York Cit...",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
498,Disenchanted,Adam Shankman,,"[Characters, by, Bill Kelly]","[Barry Josephson, Barry Sonnenfeld, Amy Adams]","[Amy Adams, Patrick Dempsey, Maya Rudolph, Yve...",Alan Menken,,Disney+,"[November 16, 2022 ( 2022-11-16 ) ( El Capitan...",...,,,"[Walt Disney Pictures, Josephson Entertainment...",,,,,,,
499,Strange World,Don Hall,,,Roy Conli,"[Jake Gyllenhaal, Dennis Quaid, Jaboukie Young...",Henry Jackman,,"[Walt Disney Studios, Motion Pictures]","[November 15, 2022 ( 2022-11-15 ) ( El Capitan...",...,,,"[Walt Disney Pictures, Walt Disney Animation S...",,,,,,,
500,Diary of a Wimpy Kid: Rodrick Rules,Luke Cormican,,"[Diary of a Wimpy Kid: Rodrick Rules, by Jeff ...",Jeff Kinney,"[Brady Noon, Hunter Dillon, Ethan William Chil...",John Paesano,,Disney+,,...,"[United States, Canada]",,"[Walt Disney Pictures, Bardel Entertainment]",,,,,,,
501,Night at the Museum: Kahmunrah Rises Again,Matt Danner,,"[Characters, by, Thomas Lennon, Robert Ben Gar...",Shawn Levy,"[Joshua Bassett, Jamie Demetriou, Alice Isaaz,...",John Paesano,,"[Disney+ (Worldwide), Alibaba Pictures (China)]",,...,"[United States, Canada, China]",,"[Walt Disney Pictures, 21 Laps Entertainment, ...",,,,,,,
