# Wep scraping Wikipedia Disney page.

- Web scrape movies list on disney page. 
- cleaning data 
- Using API to add more info to the data such as ratings. 
- create the data frame with pandas. 


In [1]:
# loading the necessary libraries. 
import pandas as pd 
import json  # to save and load data.
import requests 
from bs4 import BeautifulSoup as bs 
import re
from datetime import datetime
import pickle

In [2]:
# testing the method with only one movie in the list, 
# and then apply it to movies list. 

r = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3')

soup = bs(r.content)

# printing the contents to make sure the webpage loaded. 

#contents = soup.prettify()
#print(contents)

In [3]:
# finding the info tables in the HTML file

box = soup.find(class_='infobox vevent')
rows_info = box.find_all('tr')
for row in rows_info:
  print(row.prettify())

<tr>
 <th class="summary" colspan="2" style="text-align:center;font-size:125%;font-weight:bold;font-size:110%;font-style:italic;">
  Toy Story 3
 </th>
</tr>

<tr>
 <td colspan="2" style="text-align:center">
  <a class="image" href="/wiki/File:Toy_Story_3_poster.jpg" title="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3.">
   <img alt="All of the toys packed close together, holding up a large numeral '3', with Buzz, who is putting a friendly arm around Woody's shoulder, and Woody holding the top of the 3." class="thumbborder" data-file-height="326" data-file-width="220" decoding="async" height="326" src="//upload.wikimedia.org/wikipedia/en/6/69/Toy_Story_3_poster.jpg" width="220"/>
  </a>
  <div style="font-size:95%;padding:0.35em 0.35em 0.25em;line-height:1.25em;">
   Theatrical release poster
  </div>
 </td>
</tr>

<tr>
 <th scope="row" style="white-space:nowra

In [7]:
# getting the values as a dic. 

# writing function to help with the errors and extra characters. 
def get_content_value(row_data):
  if row_data.find('li'):
    return [li.get_text(' ', strip=True).replace('\xa0', ' ') for li in row_data.find_all('li')]
  else:
    return row_data.get_text(' ', strip=True).replace('\xa0',' ')

# dic for the movie
movie_info = {}

for index , row in enumerate(rows_info):
  if index == 0:
    movie_info['title'] = row.find('th').get_text(' ',strip=True)
  elif index == 1:
    continue
  else:
    content_key = row.find('th').get_text(' ', strip=True)
    content_value = get_content_value(row.find('td'))
    movie_info[content_key] = content_value

movie_info


{'title': 'Toy Story 3',
 'Directed by': 'Lee Unkrich',
 'Produced by': 'Darla K. Anderson',
 'Screenplay by': 'Michael Arndt',
 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'],
 'Starring': ['Tom Hanks',
  'Tim Allen',
  'Joan Cusack',
  'Don Rickles',
  'Wallace Shawn',
  'John Ratzenberger',
  'Estelle Harris',
  'Blake Clark',
  'Jeff Pidgeon',
  'Ned Beatty',
  'Michael Keaton',
  'Jodi Benson',
  'John Morris'],
 'Music by': 'Randy Newman',
 'Cinematography': ['Jeremy Lasky', 'Kim White'],
 'Edited by': 'Ken Schretzmann',
 'Production company': ['Walt Disney Pictures', 'Pixar Animation Studios'],
 'Distributed by': 'Walt Disney Studios Motion Pictures',
 'Release date': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )',
  'June 18, 2010 ( 2010-06-18 ) (United States)'],
 'Running time': '103 minutes [1]',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$200 million [1]',
 'Box office': '$1.067 billion [1]'}

### looping through the movies list in the main page, and apply the previous method to all movies. 

In [8]:
r = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films')

soup = bs(r.content)

# printing test 
#print(soup.prettify())

In [9]:
# navigate the movies locaction.
movies = soup.select('.wikitable.sortable i')
movies[0:5]

[<i><a href="/wiki/Academy_Award_Review_of_Walt_Disney_Cartoons" title="Academy Award Review of Walt Disney Cartoons">Academy Award Review of Walt Disney Cartoons</a></i>,
 <i><a href="/wiki/Snow_White_and_the_Seven_Dwarfs_(1937_film)" title="Snow White and the Seven Dwarfs (1937 film)">Snow White and the Seven Dwarfs</a></i>,
 <i><a href="/wiki/Pinocchio_(1940_film)" title="Pinocchio (1940 film)">Pinocchio</a></i>,
 <i><a href="/wiki/Fantasia_(1940_film)" title="Fantasia (1940 film)">Fantasia</a></i>,
 <i><a href="/wiki/The_Reluctant_Dragon_(1941_film)" title="The Reluctant Dragon (1941 film)">The Reluctant Dragon</a></i>]

In [10]:

# handling all errors. 
def get_content_value(row_data):
    if row_data.find("li"):
        return [li.get_text(" ", strip=True).replace("\xa0", " ") for li in row_data.find_all("li")]
    elif row_data.find("br"):
        return [text for text in row_data.stripped_strings]
    else:
        return row_data.get_text(" ", strip=True).replace("\xa0", " ")

# remove all the unwanted tags.
def clean_tags(soup):
    for tag in soup.find_all(["sup", "span"]):
        tag.decompose()



# taking a URL and returns a dic.
def get_info_box(url):

    r = requests.get(url)
    soup = bs(r.content)
    info_box = soup.find(class_="infobox vevent")
    info_rows = info_box.find_all("tr")
    
    clean_tags(soup)

    movie_info = {}
    for index, row in enumerate(info_rows):
        if index == 0:
            movie_info['title'] = row.find("th").get_text(" ", strip=True)
        else:
            header = row.find('th')
            if header:
                content_key = row.find("th").get_text(" ", strip=True)
                content_value = get_content_value(row.find("td"))
                movie_info[content_key] = content_value
            
    return movie_info

In [11]:
# testing 

get_info_box('https://en.wikipedia.org/wiki/Bambi')

{'title': 'Bambi',
 'Directed by': ['Supervising director',
  'David Hand',
  'Sequence directors',
  'James Algar',
  'Samuel Armstrong',
  'Graham Heid',
  'Bill Roberts',
  'Paul Satterfield',
  'Norman Wright'],
 'Produced by': 'Walt Disney',
 'Story by': ['Story direction',
  'Perce Pearce',
  'Story adaptation',
  'Larry Morey',
  'Story development',
  'Vernon Stallings',
  'Melvin Shaw',
  'Carl Fallberg',
  'Chuck Couch',
  'Ralph Wright'],
 'Based on': ['Bambi, a Life in the Woods', 'by', 'Felix Salten'],
 'Starring': 'see below',
 'Music by': ['Frank Churchill', 'Edward H. Plumb'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release date': ['August 9, 1942 (World Premiere-London)',
  'August 13, 1942 (Premiere-New York City)',
  'August 21, 1942 (U.S.)'],
 'Running time': '70 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$858,000',
 'Box office': '$267.4 million'}

In [14]:



r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")


soup = bs(r.content)
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []

# printing every 10 indexes to or through an exception
for index, movie in enumerate(movies):
    if index % 10 == 0:
        print(index)
    try:
        relative_path = movie['href']
        full_path = base_path + relative_path
        title = movie['title']
        
        movie_info_list.append(get_info_box(full_path))
        
    except Exception as e:
        print(movie.get_text())
        print(e)

0
10
20
30
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
50
60
70
80
90
100
110
120
True-Life Adventures
'NoneType' object has no attribute 'find_all'
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440


In [15]:
# using json to save/load data 

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)

def load_data(title):
    with open(title, encoding="utf-8") as f:
        return json.load(f)

In [16]:
save_data("disney_data_cleaned.json", movie_info_list)

In [17]:
# load the data 
movie_info_list = load_data("disney_data_cleaned.json")

In [18]:
# exploring the data to identify any issues. 

movie_info_list[40]

{'title': 'Sleeping Beauty',
 'Directed by': ['Clyde Geronimi (supervising)',
  'Eric Larson',
  'Wolfgang Reitherman',
  'Les Clark'],
 'Produced by': 'Walt Disney',
 'Written by': 'Erdman Penner',
 'Story by': ['Milt Banta',
  'Winston Hibler',
  'Bill Peet',
  'Joe Rinaldi',
  'Ted Sears',
  'Ralph Wright'],
 'Based on': ['Sleeping Beauty', 'by', 'Charles Perrault'],
 'Starring': ['Mary Costa',
  'Bill Shirley',
  'Eleanor Audley',
  'Verna Felton',
  'Barbara Luddy',
  'Barbara Jo Allen',
  'Taylor Holmes',
  'Bill Thompson'],
 'Narrated by': 'Marvin Miller',
 'Music by': "George Bruns (adapted from Tchaikovsky's Sleeping Beauty Ballet )",
 'Edited by': ['Roy M. Brewer Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 29, 1959'],
 'Running time': '75 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$6 million',
 'Box office': '$51.6 million (United States/Ca

In [19]:
print([movie.get('Running time', 'N/A') for movie in movie_info_list])

['41 minutes (74 minutes 1966 release)', '83 minutes', '88 minutes', '126 minutes', '74 minutes', '64 minutes', '70 minutes', '42 minutes', '65 min.', '71 minutes', '75 minutes', '94 minutes', '73 minutes', '75 minutes', '82 minutes', '68 minutes', '74 minutes', '96 minutes', '75 minutes', '84 minutes', '77 minutes', '92 minutes', '69 minutes', '81 minutes', ['60 minutes (VHS version)', '71 minutes (original)'], '127 minutes', '92 minutes', '76 minutes', '75 minutes', '73 minutes', '85 minutes', '81 minutes', '70 minutes', '90 min.', '80 minutes', '75 minutes', '83 minutes', '83 minutes', '72 minutes', '97 minutes', '75 minutes', '104 minutes', '93 minutes', '105 minutes', '95 minutes', '97 minutes', '134 minutes', '69 minutes', '92 minutes', '126 minutes', '79 minutes', '97 minutes', '128 minutes', '74 minutes', '91 minutes', '105 minutes', '98 minutes', '130 minutes', '89 min.', '93 minutes', '67 minutes', '98 minutes', '100 minutes', '118 minutes', '103 Minutes', '110 minutes', '80 

In [20]:
# - removing "minutes"
# - change the minutes to int. 

def minutes_to_integer(running_time):
    if running_time == "N/A":
        return None
    
    if isinstance(running_time, list):
        return int(running_time[0].split(" ")[0])
    else: # is a string
        return int(running_time.split(" ")[0])

for movie in movie_info_list:
    movie['Running time (int)'] = minutes_to_integer(movie.get('Running time', "N/A"))

In [21]:
# testing
print([movie.get('Running time (int)', 'N/A') for movie in movie_info_list])

[41, 83, 88, 126, 74, 64, 70, 42, 65, 71, 75, 94, 73, 75, 82, 68, 74, 96, 75, 84, 77, 92, 69, 81, 60, 127, 92, 76, 75, 73, 85, 81, 70, 90, 80, 75, 83, 83, 72, 97, 75, 104, 93, 105, 95, 97, 134, 69, 92, 126, 79, 97, 128, 74, 91, 105, 98, 130, 89, 93, 67, 98, 100, 118, 103, 110, 80, 79, 91, 91, 97, 118, 139, 92, 131, 87, 116, 93, 110, 110, 131, 101, 108, 84, 78, 75, 164, 106, 110, 99, 113, 108, 112, 93, 91, 93, 100, 100, 79, 96, 113, 89, 118, 92, 88, 92, 87, 93, 93, 93, 90, 83, 96, 88, 89, 91, 93, 92, 97, 100, 100, 89, 91, 112, 115, 95, 91, 95, 104, 74, 48, 77, 104, 128, 101, 94, 104, 90, 100, 88, 93, 98, 100, 112, 84, 98, 97, 114, 96, 100, 109, 83, 90, 107, 96, 103, 91, 95, 105, 113, 80, 101, 89, 74, 90, 89, 110, 74, 93, 84, 83, 69, 77, 107, 93, 88, 108, 84, 121, 89, 104, 90, 86, 84, 108, 107, 96, 98, 105, 108, 94, 106, 102, 88, 102, 102, 97, 111, 100, 96, 98, 78, 81, 108, 89, 99, 89, 81, 92, 100, 89, 79, 91, 101, 104, 103, 86, 105, 93, 92, 98, 95, 93, 87, 93, 87, 128, 86, 95, 114, 93, 

In [22]:
# Changing the budgets and replace "thousend,million,billion with int"
# dealing with errors and issues on the budgets data. 
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|–)?({number})?\s({amounts})"
value_re = rf"\${number}"

def word_to_value(word):
    value_dict = {"thousand": 1000, "million": 1000000, "billion": 1000000000}
    return value_dict[word]

def parse_word_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    word = re.search(amounts, string, flags=re.I).group().lower()
    word_value = word_to_value(word)
    return value*word_value

def parse_value_syntax(string):
    value_string = re.search(number, string).group()
    value = float(value_string.replace(",", ""))
    return value


def money_conversion(money):
    if money == "N/A":
        return None

    if isinstance(money, list):
        money = money[0]
        
    word_syntax = re.search(word_re, money, flags=re.I)
    value_syntax = re.search(value_re, money)

    if word_syntax:
        return parse_word_syntax(word_syntax.group())

    elif value_syntax:
        return parse_value_syntax(value_syntax.group())

    else:
        return None


In [23]:

# adding the budget float , and box office float. 
for movie in movie_info_list:
    movie['Budget (float)'] = money_conversion(movie.get('Budget', "N/A"))
    movie['Box office (float)'] = money_conversion(movie.get('Box office', "N/A"))

In [24]:
# convert date into datetime. 

dates = [movie.get('Release date', 'N/A') for movie in movie_info_list]

def clean_date(date):
    return date.split("(")[0].strip()

def date_conversion(date):
    if isinstance(date, list):
        date = date[0]
        
    if date == "N/A":
        return None
        
    date_str = clean_date(date)

    fmts = ["%B %d, %Y", "%d %B %Y"]
    for fmt in fmts:
        try:
            return datetime.strptime(date_str, fmt)
        except:
            pass
    return None

In [25]:
# creating a datetime release date. 

for movie in movie_info_list:
    movie['Release date (datetime)'] = date_conversion(movie.get('Release date', 'N/A'))

In [26]:
# testing 

movie_info_list[50]

{'title': 'One Hundred and One Dalmatians',
 'Directed by': ['Clyde Geronimi', 'Hamilton Luske', 'Wolfgang Reitherman'],
 'Produced by': 'Walt Disney',
 'Story by': 'Bill Peet',
 'Based on': ['The Hundred and One Dalmatians', 'by', 'Dodie Smith'],
 'Starring': ['Rod Taylor',
  'Cate Bauer',
  'Betty Lou Gerson',
  'Ben Wright',
  'Bill Lee (singing voice)',
  'Lisa Davis',
  'Martha Wentworth'],
 'Music by': 'George Bruns',
 'Edited by': ['Roy M. Brewer, Jr.', 'Donald Halliday'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': ['January 25, 1961'],
 'Running time': '79 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$3.6 million',
 'Box office': '$303 million',
 'Running time (int)': 79,
 'Budget (float)': 3600000.0,
 'Box office (float)': 303000000.0,
 'Release date (datetime)': datetime.datetime(1961, 1, 25, 0, 0)}

In [27]:

# using pickle to save/load data 
# becuase Object of type 'datetime' is not JSON serializable.

def save_data_pickle(name, data):
    with open(name, 'wb') as f:
        pickle.dump(data, f)

def load_data_pickle(name):
    with open(name, 'rb') as f:
        return pickle.load(f)

In [28]:
save_data_pickle("disney_movie_data_cleaned_more.pickle", movie_info_list)

In [29]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [30]:
a == movie_info_list

True

## adding ratings columns via API

In [31]:
import urllib

def get_omdb_info(title):
    base_url = "http://www.omdbapi.com/?"
    parameters = {"apikey": '51f035e3', 't': title}
    params_encoded = urllib.parse.urlencode(parameters)
    full_url = base_url + params_encoded
    return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info):
    ratings = omdb_info.get('Ratings', [])
    for rating in ratings:
        if rating['Source'] == 'Rotten Tomatoes':
            return rating['Value']
    return None

In [32]:
# testing 

get_omdb_info("toy story 3")

{'Title': 'Toy Story 3',
 'Year': '2010',
 'Rated': 'G',
 'Released': '18 Jun 2010',
 'Runtime': '103 min',
 'Genre': 'Animation, Adventure, Comedy, Family, Fantasy',
 'Director': 'Lee Unkrich',
 'Writer': 'John Lasseter (story by), Andrew Stanton (story by), Lee Unkrich (story by), Michael Arndt (screenplay by)',
 'Actors': 'Tom Hanks, Tim Allen, Joan Cusack, Ned Beatty',
 'Plot': "The toys are mistakenly delivered to a day-care center instead of the attic right before Andy leaves for college, and it's up to Woody to convince the other toys that they weren't abandoned and to return home.",
 'Language': 'English, Spanish',
 'Country': 'USA',
 'Awards': 'Won 2 Oscars. Another 59 wins & 95 nominations.',
 'Poster': 'https://m.media-amazon.com/images/M/MV5BMTgxOTY4Mjc0MF5BMl5BanBnXkFtZTcwNTA4MDQyMw@@._V1_SX300.jpg',
 'Ratings': [{'Source': 'Internet Movie Database', 'Value': '8.3/10'},
  {'Source': 'Rotten Tomatoes', 'Value': '98%'},
  {'Source': 'Metacritic', 'Value': '92/100'}],
 'Metas

In [33]:
# adding all the ratings ( IMDB , metascore , rotten tomatoes)

for movie in movie_info_list:
    title = movie['title']
    omdb_info = get_omdb_info(title)
    movie['imdb'] = omdb_info.get('imdbRating', None)
    movie['metascore'] = omdb_info.get('Metascore', None)
    movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [34]:
# testing 

movie_info_list[30]

{'title': 'The Great Locomotive Chase',
 'Directed by': 'Francis D. Lyon',
 'Produced by': ['Lawrence Edward Watkin', 'Walt Disney'],
 'Written by': 'Lawrence Edward Watkin',
 'Starring': ['Fess Parker',
  'Jeffrey Hunter',
  'John Lupton',
  'Jeff York',
  'Slim Pickens'],
 'Music by': 'Paul J. Smith',
 'Cinematography': 'Charles Boyle',
 'Edited by': 'Ellsworth Hoagland',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'June 8, 1956',
 'Running time': '85 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$1.7 million (US)',
 'Running time (int)': 85,
 'Budget (float)': None,
 'Box office (float)': 1700000.0,
 'Release date (datetime)': datetime.datetime(1956, 6, 8, 0, 0),
 'imdb': '6.9',
 'metascore': 'N/A',
 'rotten_tomatoes': '53%'}

In [35]:
# saving the data 
save_data_pickle('disney_movies.pickle', movie_info_list)

In [36]:
movie_info_copy = [movie.copy() for movie in movie_info_list]

In [37]:
# saving the data as json 
# converting datetime to str. 

for movie in movie_info_copy:
    current_date = movie['Release date (datetime)']
    if current_date:
        movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
    else:
        movie['Release date (datetime)'] = None

In [38]:
save_data("disney_data_final.json", movie_info_copy)

In [39]:
# reading the data in pandas and making sure everything working as expected.

df = pd.DataFrame(movie_info_list)

In [40]:
df.head()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Cinematography,Edited by,Screenplay by,Production companies,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
0,Academy Award Review of,Walt Disney Productions,"[May 19, 1937]",41 minutes (74 minutes 1966 release),United States,English,$45.472,41.0,,45.472,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,"[December 21, 1937 ( Carthay Circle Theatre , ...",83 minutes,United States,English,$418 million,83.0,1490000.0,418000000.0,...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$164 million,88.0,2600000.0,164000000.0,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,"[November 13, 1940]",126 minutes,United States,English,$76.4–$83.3 million,126.0,2280000.0,83300000.0,...,James Wong Howe,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,"[June 20, 1941]",74 minutes,United States,English,"$960,000 (worldwide rentals)",74.0,600000.0,960000.0,...,Bert Giennon,Paul Weatherwax,,,,,,,,


In [41]:
df.tail()

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Cinematography,Edited by,Screenplay by,Production companies,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
433,Soul,"[Walt Disney Pictures, Pixar Animation Studios]","[October 11, 2020 ( BFI London Film Festival )...",100 minutes,United States,English,,100.0,150000000.0,,...,"[Matt Aspbury, Ian Megibben]",Kevin Nolting,,,,,,,,
434,Raya and the Last Dragon,"[Walt Disney Pictures, Walt Disney Animation S...","[March 12, 2021]",,United States,English,,,,,...,,,,,,,,,,
435,Cruella,"[Walt Disney Pictures, Gunn Films, Marc Platt ...",,,United States,English,,,,,...,Nicolas Karakatsanis,Tatiana S. Riegel,"[Aline Brosh McKenna, Jez Butterworth, Dana Fo...",,,,,,,
436,Jungle Cruise,"[Walt Disney Pictures, Davis Entertainment, Se...","[July 30, 2021 (United States)]",,United States,English,,,,,...,Flavio Labiano,Joel Negron,"[Michael Green, Glenn Ficarra, John Requa]",,,,,,,
437,The Beatles: Get Back,,[27 August 2021],,"[United Kingdom, New Zealand, United States]",English,,,,,...,,Jabez Olssen,,"[Walt Disney Pictures, Apple Corps, WingNut Fi...",,,,,,


In [42]:
df.sample(2)

Unnamed: 0,title,Production company,Release date,Running time,Country,Language,Box office,Running time (int),Budget (float),Box office (float),...,Cinematography,Edited by,Screenplay by,Production companies,Japanese,Hepburn,Adaptation by,Animation by,Traditional,Simplified
12,Fun and Fancy Free,Walt Disney Productions,"[September 27, 1947]",73 minutes,United States,English,"$3,165,000 (worldwide rentals)",73.0,,3165000.0,...,,Jack Bachom,,,,,,,,
37,The Light in the Forest,Walt Disney Productions,"[July 9, 1958]",83 minutes,United States,English,,83.0,,,...,Ellsworth Fredericks,,,,,,,,,


In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 438 entries, 0 to 437
Data columns (total 34 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   title                    438 non-null    object        
 1   Production company       399 non-null    object        
 2   Release date             436 non-null    object        
 3   Running time             427 non-null    object        
 4   Country                  434 non-null    object        
 5   Language                 436 non-null    object        
 6   Box office               367 non-null    object        
 7   Running time (int)       427 non-null    float64       
 8   Budget (float)           276 non-null    float64       
 9   Box office (float)       357 non-null    float64       
 10  Release date (datetime)  433 non-null    datetime64[ns]
 11  imdb                     427 non-null    object        
 12  metascore                427 non-nul