**Disney Dataset Creation**

In [56]:
import requests
from bs4 import BeautifulSoup as bs

## Task#1 : Get Info Box Toy Story 3(Store in Python Dictionary)

In [57]:
r = requests.get("https://en.wikipedia.org/wiki/Toy_Story_3")

# Convert to t a beautifyl soup object
soup = bs(r.content)

In [58]:
info_box = soup.find(class_= "infobox vevent")
info_rows = info_box.find_all("tr")


Enumerate() method adds a counter to an iterable and returns it in a form of enumerating object. This enumerated object can then be used directly for loops or converted into a list of tuples using the list() method.

In [59]:
def get_content_value(row_data) : 
  # If row contains a list , iterate through that list and fetch all elements
  
  # Replace \xa0 with white space
  # and set strip = True which removes leading and trailing whitespaces
  
  if row_data.find("li") : 
    return [li.get_text(" ", strip = True).replace("\xa0", " ") for li in row_data.find_all("li")]
  
  else : 
    return row_data.get_text(" ", strip=True).replace("\xa0", " ")

In [60]:
movie_info = {}

for index , row in enumerate(info_rows) :
  # First will be movie title
  if index == 0 :
    movie_info['title'] = row.find("th").get_text(" ", strip = True)
  
  # Index == 1 tr is about image so skip it
  elif index == 1 :
    continue
  
  else :
    content_key = row.find("th").get_text(" ", strip = True)
    content_value = get_content_value(row.find("td"))
    
    movie_info[content_key] = content_value
    
    
print(movie_info)

{'title': 'Toy Story 3', 'Directed by': 'Lee Unkrich', 'Screenplay by': 'Michael Arndt', 'Story by': ['John Lasseter', 'Andrew Stanton', 'Lee Unkrich'], 'Produced by': 'Darla K. Anderson', 'Starring': ['Tom Hanks', 'Tim Allen', 'Joan Cusack', 'Don Rickles', 'Wallace Shawn', 'John Ratzenberger', 'Estelle Harris', 'Ned Beatty', 'Michael Keaton', 'Jodi Benson', 'John Morris'], 'Cinematography': ['Jeremy Lasky', 'Kim White'], 'Edited by': 'Ken Schretzmann', 'Music by': 'Randy Newman', 'Production companies': ['Walt Disney Pictures', 'Pixar Animation Studios'], 'Distributed by': 'Walt Disney Studios Motion Pictures', 'Release dates': ['June 12, 2010 ( 2010-06-12 ) ( Taormina Film Fest )', 'June 18, 2010 ( 2010-06-18 ) (United States)'], 'Running time': '103 minutes [1]', 'Country': 'United States', 'Language': 'English', 'Budget': '$200 million [1]', 'Box office': '$1.067 billion [1]'}


## Task#2 : Get info box for all movies

In [102]:
def get_content_value(row_data) : 
  # If row contains a list , iterate through that list and fetch all elements
  
  # Replace \xa0 with white space
  # and set strip = True which removes leading and trailing whitespaces
  
  if row_data.find("li") : 
    return [li.get_text(" ", strip = True).replace("\xa0", " ") for li in row_data.find_all("li")]
  
  # To split long strings of names together
  elif row_data.find("br") :
    return [text for text in row_data.stripped_strings]
    
  else : 
    return row_data.get_text(" ", strip=True).replace("\xa0", " ")

#To remove rferences just need to remove the "sup" tag
def clean_tags(soup) :
  for tag in soup.find_all(["sup" , "span"]) :
    tag.decompose()
    
def get_info_box(url) :   
  
  r = requests.get(url)

# Convert to t a beautifyl soup object
  soup = bs(r.content)

  contents = soup.prettify()
  
  info_box = soup.find(class_= "infobox vevent")
  info_rows = info_box.find_all("tr")
 
  clean_tags(soup)
  
  movie_info = {}

  for index , row in enumerate(info_rows) :
    # First will be movie title
    if index == 0 :
      movie_info['title'] = row.find("th").get_text(" ", strip = True)
  
    # Index == 1 tr is about image so skip it
    else :
      header = row.find("th")
      if header : 
        content_key = row.find("th").get_text(" ", strip = True)
        
        content_value = get_content_value(row.find("td"))
    
        movie_info[content_key] = content_value
    
  return movie_info

In [103]:
get_info_box("https://en.wikipedia.org/wiki/The_Nightmare_Before_Christmas")

{'title': 'The Nightmare Before Christmas',
 'Directed by': 'Henry Selick',
 'Screenplay by': 'Caroline Thompson',
 'Adaptation by': 'Michael McDowell',
 'Based on': ['Story and characters', 'by Tim Burton'],
 'Produced by': ['Tim Burton', 'Denise Di Novi'],
 'Starring': ['Danny Elfman',
  'Chris Sarandon',
  "Catherine O'Hara",
  'William Hickey',
  'Glenn Shadix',
  'Paul Reubens',
  'Ken Page',
  'Ed Ivory'],
 'Cinematography': 'Pete Kozachik',
 'Edited by': 'Stan Webb',
 'Music by': 'Danny Elfman',
 'Production companies': ['Touchstone Pictures', 'Skellington Productions'],
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release dates': ['October 13, 1993 (limited)',
  'October 29, 1993 (United States)'],
 'Running time': '76 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$24 million',
 'Box office': '$91.5 million'}

In [104]:
r = requests.get("https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films")

# Convert int beautifyl soup object
soup = bs(r.content)

# Get movies that have a link/url
movies = soup.select(".wikitable.sortable i a")

base_path = "https://en.wikipedia.org/"

movie_info_list = []
for index , movie in enumerate(movies) :# 
  if index % 20 == 0:
    print(index)
  try : 
    relative_path = movie['href']
    title = movie['title']
    full_path = base_path + relative_path
    movie_info_list.append(get_info_box(full_path))
    
  except Exception as e : 
    print(movie.get_text())
    print(e)

0
20
40
Zorro the Avenger
'NoneType' object has no attribute 'find'
The Sign of Zorro
'NoneType' object has no attribute 'find'
60
80
100
120
140
160
180
200
220
240
260
280
300
320
340
360
380
400
420
440
460
480
The Beatles: Get Back – The Rooftop Concert
'NoneType' object has no attribute 'find'
500
Wish
'NoneType' object has no attribute 'find_all'
Elio
'NoneType' object has no attribute 'find_all'
61
'NoneType' object has no attribute 'find_all'
All Night Long
'NoneType' object has no attribute 'find'
Big Thunder Mountain Railroad
'NoneType' object has no attribute 'find_all'
520
Keeper of the Lost Cities
'NoneType' object has no attribute 'find_all'
Muppet Man
'NoneType' object has no attribute 'find_all'
One Thousand and One Nights
'NoneType' object has no attribute 'find_all'
Shrunk
'NoneType' object has no attribute 'find'
Sister Act 3
'NoneType' object has no attribute 'find'
The Graveyard Book
'NoneType' object has no attribute 'find_all'
The Thief
'NoneType' object has no a

In [105]:
len(movie_info_list)
 

529

### Save / Reload Movie Data

In [30]:
import json

def save_data(title , data) : 
  with open(title , 'w' , encoding = 'utf-8') as f :
    json.dump(data , f , ensure_ascii = False , indent = 2 )

In [31]:
import json

def load_data(title) :
  with open(title , encoding = "utf-8") as f : 
    return json.load(f)

In [107]:
save_data("disney_data.json" , movie_info_list)

## Task#3 : Clean our data

In [8]:
movie_info_list = load_data("disney_data.json")

### Subtasks
- Clean up references [1]
- Convert running time into an integer
- Convert dates into datetime object
- Split up the long strings
- Convert Budget and Box office to numbers

#### Clean up references (remove [1] [2] etc)

clean_tags method created to remove the "sup" tag also "span" tag removed to clean even further

#### Split up long strings

Earlier break statements were handled with joining content with " " 

Now when we get br tag we store the elements in a list and return the list

The following elif condition was added to get_content() function

elif row_data.find("br") :
    return [text for text in row_data.stripped_strings]

#### Convert running time into an integer

In [9]:
movie_time = [movie.get('Running time', 'N/A') for movie in movie_info_list]

In [10]:
int(movie_time[0].split(" ")[0])

83

In [11]:
def minutes_to_integer(running_time) :
  if running_time == "N/A" :
    return None
  
    #If it is a list
  elif isinstance(running_time , list) :
    return int(running_time[0].split(" ")[0])
  # If it is a string
  else :
    return int(running_time.split(" ")[0])

In [123]:
for movie in movie_info_list :
  movie['Running time (int)'] = minutes_to_integer(movie.get("Running time" , "N/A"))

In [12]:
movie_info_list[-100]

{'title': 'Zootopia',
 'Directed by': ['Byron Howard', 'Rich Moore'],
 'Screenplay by': ['Jared Bush', 'Phil Johnston'],
 'Story by': ['Byron Howard',
  'Rich Moore',
  'Jared Bush',
  'Jim Reardon',
  'Josie Trinidad',
  'Phil Johnston',
  'Jennifer Lee'],
 'Produced by': 'Clark Spencer',
 'Starring': ['Ginnifer Goodwin',
  'Jason Bateman',
  'Idris Elba',
  'Jenny Slate',
  'Nate Torrence',
  'Bonnie Hunt',
  'Don Lake',
  'Tommy Chong',
  'J. K. Simmons',
  'Octavia Spencer',
  'Alan Tudyk',
  'Shakira'],
 'Cinematography': ['Nathan Warner (layout)', 'Brian Leach (lighting)'],
 'Edited by': ['Fabienne Rawley', 'Jeremy Milton'],
 'Music by': 'Michael Giacchino',
 'Production companies': ['Walt Disney Pictures',
  'Walt Disney Animation Studios'],
 'Distributed by': ['Walt Disney Studios', 'Motion Pictures'],
 'Release dates': ['February 13, 2016 (Belgium)',
  'March 4, 2016 (United States)'],
 'Running time': '108 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budge

#### Convert Budget and Box office to numbers

In [13]:
print([movie.get('Budget' , 'N/A') for movie in movie_info_list])

['$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million', 'N/A', '$2.2 million', '$1.8 million', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', '$6.3 

In [17]:
import re

amounts = r"thousand|million|billion"
number = r"\d+(,\d{3})*\.*\d*"

word_re = rf"\${number}(-|\sto\s|-)?({number})?\s({amounts})"
value_re = rf"\${number}"


# money_convesion("$12.2 million") = 12200000 -> word syntax
# money_conversion("$790,000") = 79000 -> value syntax

def word_to_value(word) :
  value_dict = {"thousand" : 1000 , "million" : 1000000 , "billion" : 1000000000}
  return value_dict[word]

def parse_word_syntax(string) :
  value_string = re.search(number , string).group()
  value = float(value_string.replace("," , ""))
  word = re.search(amounts , string).group()
  word_value = word_to_value(word)
  return value * word_value
  
def parse_value_syntax(string) :
  value_string = re.search(number , string).group()
  value = float(value_string.replace("," , ""))
  return value

def money_conversion(money) :
  
  if isinstance(money,list) :
    money = money[0]
  
  value_syntax = re.search(value_re , money)
  word_syntax = re.search(word_re , money)
  
  if word_syntax :
    return parse_word_syntax(word_syntax.group())

  elif value_syntax :
    return parse_value_syntax(value_syntax.group())
    

In [18]:
for movie in movie_info_list :
  movie['Budget (float)'] = money_conversion(movie.get('Budget' , 'N/A'))
  movie['Box office (float)'] = money_conversion(movie.get('Box office' , 'N/A'))

In [20]:
movie_info_list[286]

{'title': 'Miracle',
 'Directed by': "Gavin O'Connor",
 'Written by': ['Eric Guggenheim', 'Mike Rich'],
 'Produced by': ['Mark Ciardi',
  'Gordon Gray',
  'Ross Greenburg',
  'Justis Greene',
  'Jon Mone',
  "Greg O'Connor"],
 'Starring': ['Kurt Russell', 'Patricia Clarkson', 'Noah Emmerich'],
 'Cinematography': 'Dan Stoloff',
 'Edited by': ['John Gilroy', 'Daric Loo'],
 'Music by': 'Mark Isham',
 'Production companies': ['Walt Disney Pictures', 'Mayhem Pictures'],
 'Distributed by': 'Buena Vista Pictures Distribution',
 'Release date': ['February 20, 2004'],
 'Running time': '136 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$28 million',
 'Box office': '$64.5 million',
 'Budget (float)': 28000000.0,
 'Box office (float)': 64500000.0}

#### Convert Dates into datetime

In [32]:
# June 28 , 1950
from datetime import datetime

dates = [movie.get('Release date' , 'N/A') for movie in movie_info_list]

def clean_date(date) :
  return date.split("(")[0].strip()

def date_conversion(date) :
  if isinstance(date , list) :
    date = date[0]
  
  if date == "N/A" :
    return None
  
  date_str = clean_date(date)
  print(date_str)

  fmts= ["%B %d, %Y" , "%d %B %Y"]
  
  for fmt in fmts :
    try : 
      return datetime.strptime(date_str, fmt)
    except : 
      pass
  
  return None

In [33]:
for movie in movie_info_list :
  movie['Release date (datetime)'] = date_conversion(movie.get('Release date' , 'N/A')) 
  

November 13, 1940
June 27, 1941
July 17, 1943
September 27, 1947
May 27, 1948
October 5, 1949
February 5, 1953
July 23, 1953
November 10, 1953
August 17, 1954
December 23, 1954
May 25, 1955
June 22, 1955
September 14, 1955
December 22, 1955
June 8, 1956
July 18, 1956
September 4, 1956
December 20, 1956
June 19, 1957
August 28, 1957
December 25, 1957
July 8, 1958
August 12, 1958
December 25, 1958
January 29, 1959
March 19, 1959
November 10, 1959
January 21, 1960
February 24, 1960
May 19, 1960
November 1, 1960
December 21, 1960
January 25, 1961
March 16, 1961
June 21, 1961
July 12, 1961
July 17, 1961
December 14, 1961
April 5, 1962
May 17, 1962
June 6, 1962
September 26, 1962
November 7, 1962
January 16, 1963
March 29, 1963
June 1, 1963
July 7, 1963
November 20, 1963
March 12, 1964
February 11, 1964
July 2, 1964
November 10, 1964
December 18, 1964
August 18, 1965
December 2, 1965
October 1, 1966
December 1, 1966
February 8, 1967
June 15, 1967
July 12, 1967
October 18, 1967
October 19, 19

In [34]:
movie_info_list[29]

{'title': 'The Great Locomotive Chase',
 'Directed by': 'Francis D. Lyon',
 'Written by': 'Lawrence Edward Watkin',
 'Produced by': ['Lawrence Edward Watkin', 'Walt Disney'],
 'Starring': ['Fess Parker',
  'Jeffrey Hunter',
  'John Lupton',
  'Jeff York',
  'Slim Pickens'],
 'Cinematography': 'Charles Boyle',
 'Edited by': 'Ellsworth Hoagland',
 'Music by': 'Paul J. Smith',
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'Buena Vista Distribution',
 'Release date': 'June 8, 1956',
 'Running time': '85 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Box office': '$1.7 million (US)',
 'Budget (float)': None,
 'Box office (float)': 1700000.0,
 'Release date (datetime)': datetime.datetime(1956, 6, 8, 0, 0)}

## Save data

In [25]:
import pickle

def save_data_pickle(name , data) : 
  with open(name , 'wb') as f :
    pickle.dump(data , f)

In [2]:
import pickle

def load_data_pickle(name) : 
  with open(name , 'rb') as f :
    return pickle.load(f)

In [37]:
save_data_pickle("disney_movie_data_cleaned_more.pickle" , movie_info_list)

In [38]:
a = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [39]:
a[5]

{'title': 'Bambi',
 'Directed by': ['Supervising director',
  'David Hand',
  'Sequence directors',
  'James Algar',
  'Samuel Armstrong',
  'Graham Heid',
  'Bill Roberts',
  'Paul Satterfield',
  'Norman Wright'],
 'Story by': ['Story direction',
  'Perce Pearce',
  'Story adaptation',
  'Larry Morey',
  'Story development',
  'Vernon Stallings',
  'Melvin Shaw',
  'Carl Fallberg',
  'Chuck Couch',
  'Ralph Wright'],
 'Based on': ['Bambi, a Life in the Woods', 'by', 'Felix Salten'],
 'Produced by': 'Walt Disney',
 'Starring': 'see below',
 'Music by': ['Frank Churchill', 'Edward H. Plumb'],
 'Production company': 'Walt Disney Productions',
 'Distributed by': 'RKO Radio Pictures',
 'Release dates': ['August 9, 1942 ( London )',
  'August 21, 1942 (United States)'],
 'Running time': '70 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget': '$858,000',
 'Box office': '$267.4 million',
 'Budget (float)': 858000.0,
 'Box office (float)': 267399999.99999997,
 'Release d

In [40]:
a == movie_info_list

True

## Attach IMDB / Rotten Tomatoes / Metascore Scores

In [3]:
movie_info_list = load_data_pickle("disney_movie_data_cleaned_more.pickle")

In [4]:
#http://www.omdbapi.com/?apikey=[yourkey]&

In [20]:
import requests
import urllib
import os

def get_omdb_info(title) : 
  base_url = "http://www.omdbapi.com/?"
  parameters = {'i' : 'tt3896198' ,'apikey' : os.environ['OMDB_API_KEY'] , 't' : title}
  params_encoded = urllib.parse.urlencode(parameters)
  full_url = base_url + params_encoded
  return requests.get(full_url).json()

def get_rotten_tomato_score(omdb_info) :
  ratings = omdb_info.get('Ratings' , [])
  for rating in ratings : 
    if rating['Source'] == 'Rotten Tomatoes' :
      return rating['Value']
  return None

In [22]:
for movie in movie_info_list :
  title = movie['title']
  omdb_info = get_omdb_info(title)
  movie['imdb'] = omdb_info.get('imdbRating' , None)
  movie['metascore'] = omdb_info.get('Metascore' , None)
  movie['rotten_tomatoes'] = get_rotten_tomato_score(omdb_info)

In [23]:
movie_info_list[-50]

{'title': 'Flora & Ulysses',
 'Directed by': 'Lena Khan',
 'Screenplay by': 'Brad Copeland',
 'Based on': ['Flora & Ulysses', 'by', 'Kate DiCamillo'],
 'Produced by': 'Gil Netter',
 'Starring': ['Matilda Lawler',
  'Alyson Hannigan',
  'Ben Schwartz',
  'Anna Deavere Smith',
  'Danny Pudi',
  'Benjamin Evan Ainsworth',
  'Janeane Garofalo',
  'Kate Micucci'],
 'Cinematography': 'Andrew Dunn',
 'Edited by': 'Jamie Gross',
 'Music by': 'Jake Monaco',
 'Production companies': ['Walt Disney Pictures', 'Netter Productions'],
 'Distributed by': 'Disney+',
 'Release date': ['February 19, 2021'],
 'Running time': '95 minutes',
 'Country': 'United States',
 'Language': 'English',
 'Budget (float)': None,
 'Box office (float)': None,
 'Release date (datetime)': datetime.datetime(2021, 2, 19, 0, 0),
 'imdb': '6.2',
 'metascore': '62',
 'rotten_tomatoes': None}

In [26]:
save_data_pickle('disney_movie_data_final.pickle'  , movie_info_list)

## Task#5 : Save Data as JSON and CSV

In [27]:
# Replace datetime with string

movie_info_copy = [movie.copy() for movie in movie_info_list]

In [28]:
for movie in movie_info_copy : 
  current_date = movie['Release date (datetime)']
  if current_date :
    movie['Release date (datetime)'] = current_date.strftime("%B %d, %Y")
  else :
    movie['Release date (datetime)'] = None

In [32]:
save_data('disney_data_final.json' , movie_info_copy)

### Convert Data to CSV

In [33]:
import pandas as pd

df = pd.DataFrame(movie_info_list)

In [35]:
df.head(100)

Unnamed: 0,title,Directed by,Written by,Based on,Produced by,Starring,Music by,Production company,Distributed by,Release dates,Running time,Country,Language,Budget,Box office,Budget (float),Box office (float),Release date (datetime),imdb,metascore,rotten_tomatoes,Story by,Narrated by,Cinematography,Release date,Edited by,Languages,Screenplay by,Countries,Color process,Production companies,Japanese,Hepburn,Adaptation by,Traditional,Simplified,Original title,Layouts by,Music,Lyrics,Book,Basis,Productions,Awards
0,Snow White and the Seven Dwarfs,"[David Hand, William Cottrell, Wilfred Jackson...","[Ted Sears, Richard Creedon, Otto Englander, D...","[Snow White, by The, Brothers Grimm]",Walt Disney,"[Adriana Caselotti, Lucille La Verne, Harry St...","[Frank Churchill, Paul Smith, Leigh Harline]",Walt Disney Productions,RKO Radio Pictures,"[December 21, 1937 ( Carthay Circle Theatre ),...",83 minutes,United States,English,$1.49 million,$418 million,1490000.0,418000000.0,NaT,7.6,96,,,,,,,,,,,,,,,,,,,,,,,,
1,Pinocchio,"[Ben Sharpsteen, Hamilton Luske, Bill Roberts,...",,"[The Adventures of Pinocchio, by, Carlo Collodi]",Walt Disney,"[Cliff Edwards, Dickie Jones, Christian Rub, W...","[Leigh Harline, Paul J. Smith]",Walt Disney Productions,RKO Radio Pictures,"[February 7, 1940 ( Center Theatre ), February...",88 minutes,United States,English,$2.6 million,$164 million,2600000.0,164000000.0,NaT,7.5,99,100%,"[Ted Sears, Otto Englander, Webb Smith, Willia...",,,,,,,,,,,,,,,,,,,,,,
2,Fantasia,"[Samuel Armstrong, James Algar, Bill Roberts, ...",,,"[Walt Disney, Ben Sharpsteen]","[Leopold Stokowski, Deems Taylor]",See program,Walt Disney Productions,RKO Radio Pictures,,126 minutes,United States,English,$2.28 million,$76.4–$83.3 million (United States and Canada),2280000.0,83300000.0,1940-11-13,7.7,96,95%,"[Joe Grant, Dick Huemer]",Deems Taylor,James Wong Howe,"[November 13, 1940]",,,,,,,,,,,,,,,,,,,
3,The Reluctant Dragon,"[Alfred Werker, (live action), Hamilton Luske,...","[Live-action:, Ted Sears, Al Perkins, Larry Cl...",,Walt Disney,"[Robert Benchley, Frances Gifford, Buddy Peppe...","[Frank Churchill, Larry Morey]",Walt Disney Productions,RKO Radio Pictures,,74 minutes,United States,English,"$600,000","$960,000 (worldwide rentals)",600000.0,960000.0,1941-06-27,6.8,,100%,,,Bert Glennon,"[June 27, 1941]",Paul Weatherwax,,,,,,,,,,,,,,,,,,
4,Dumbo,"[Ben Sharpsteen, Norman Ferguson, Wilfred Jack...",,"[Dumbo, the Flying Elephant, by, Helen Aberson...",Walt Disney,"[Edward Brophy, Verna Felton, Cliff Edwards, H...","[Frank Churchill, Oliver Wallace]",Walt Disney Productions,RKO Radio Pictures,"[October 23, 1941 (New York City), October 31,...",64 minutes,United States,English,"$950,000",>$1.3 million (est. United States/Canada renta...,950000.0,1300000.0,NaT,7.2,96,98%,"[Joe Grant, Dick Huemer]",John McLeish,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,The Boatniks,Norman Tokar,Arthur Julian,,Ron Miller,"[Robert Morse, Stefanie Powers, Phil Silvers, ...","[Bruce Belland, Robert F. Brunner, Franklyn Ma...",Walt Disney Productions,Buena Vista Distribution,,100 minutes,United States,English,,"$18,607,492",,18607492.0,1970-07-01,5.5,,,Mary Roth,,William E. Snyder,"July 1, 1970",Cotton Warburton,,,,,,,,,,,,,,,,,,
96,The Wild Country,Robert Totten,"[Calvin Clements Jr., Paul Savage]",,Ron Miller,"[Steve Forrest, Vera Miles]",Robert F. Brunner,Walt Disney Productions,Buena Vista Distribution,,100 minutes,United States,English,,$4 million (rentals) (US/Canada),,4000000.0,1970-12-15,6.2,,,,,Frank V. Phillips,"[December 15, 1970]",Robert Stafford,,,,,,,,,,,,,,,,,,
97,The Aristocats,Wolfgang Reitherman,,"[Tom McGowan, Tom Rowe]","[Winston Hibler, Wolfgang Reitherman]","[Phil Harris, Eva Gabor, Sterling Holloway, Sc...",George Bruns,Walt Disney Productions,Buena Vista Distribution,"[December 11, 1970 (premiere), December 24, 19...",79 minutes,United States,English,$4 million,$191 million,4000000.0,191000000.0,NaT,7.1,66,63%,"[Ken Anderson, Larry Clemmons, Eric Cleworth, ...",,,,Tom Acosta,,,,,,,,,,,,,,,,,,
98,The Barefoot Executive,Robert Butler,,,Bill Anderson,"[Kurt Russell, Joe Flynn, Wally Cox, Heather N...","[Robert F. Brunner, Franklyn Marks, Bruce Bell...",Walt Disney Productions,Buena Vista Distribution,,96 minutes,United States,English,,,,,1971-03-17,5.9,55,83%,"[Lila Garrett, Bernie Kahn, Stewart C. Billett]",Kurt Russell,Charles F. Wheeler,"March 17, 1971",Robert Stafford,,Joseph L. McEveety,,,,,,,,,,,,,,,,


In [36]:
df.shape

(529, 44)

In [37]:
df.to_csv("disney_movie_data_final.csv")

In [38]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 529 entries, 0 to 528
Data columns (total 44 columns):
 #   Column                   Non-Null Count  Dtype         
---  ------                   --------------  -----         
 0   title                    529 non-null    object        
 1   Directed by              525 non-null    object        
 2   Written by               218 non-null    object        
 3   Based on                 290 non-null    object        
 4   Produced by              516 non-null    object        
 5   Starring                 491 non-null    object        
 6   Music by                 519 non-null    object        
 7   Production company       209 non-null    object        
 8   Distributed by           527 non-null    object        
 9   Release dates            194 non-null    object        
 10  Running time             514 non-null    object        
 11  Country                  465 non-null    object        
 12  Language                 505 non-nul