# Walt Disney Films 

        Related Data was scraped from wikipedia, converted to a json dataset and then cleaned using Beautiful Soup and Pandas.

## Task 1: Get info box (store in python dictionary)

In [1]:
from bs4 import BeautifulSoup
import requests

In [None]:
toy_story = requests.get('https://en.wikipedia.org/wiki/Toy_Story_3').text
soup = BeautifulSoup(toy_story,'lxml')

In [None]:
info_box = soup.find('table', class_ = 'infobox vevent')
print(info_box.prettify())

In [None]:
# extracting labels and values to be stored in the dictionary
keys = soup.find_all(class_= 'infobox-label' )
values = soup.find_all(class_= 'infobox-data' )
movie_name = soup.find(class_='infobox-above summary').text

#creating dictionary
info_dict = {}
info_dict['Name'] = movie_name
for key, value in zip(keys, values):
    info_dict[key.text] = value.text.replace("\n", ",").replace("\xa0", "")
    
info_dict

## Task 2: Get info box of all movies

In [None]:
movies = requests.get('https://en.wikipedia.org/wiki/List_of_Walt_Disney_Pictures_films').text
soup = BeautifulSoup(movies,'lxml')
links = soup.select('.wikitable.sortable i a')
movie_info_list = []

In [None]:
# including only released movies
type(links)
links = links[0:495] # slicing links resultset to avoid errors

In [None]:
def clean_tags(soup): # removes citation signs and numeric dates (redundant)
    for tag in soup.find_all(['sup', 'span']):
        tag.decompose()
        
def get_infobox(url):
    movie = requests.get(url).text
    soup = BeautifulSoup(movie,'lxml')
    info_box = soup.find('table', class_ = 'infobox vevent')
    
    # extracting labels and values to be stored in the dictionary
    keys = soup.find_all(class_= 'infobox-label' )
    values = soup.find_all(class_= 'infobox-data' )
    movie_name = soup.find(class_='infobox-above summary').text
    
    clean_tags(soup)
    
    #creating dictionary
    info_dict = {}
    info_dict['Name'] = movie_name
    for key, value in zip(keys, values):
        info_dict[key.text] = value.text.replace("\n", ",").replace("\xa0", "")

    return info_dict

In [None]:
for link in links:
        link = 'https://en.wikipedia.org' + link.get('href')
        movie_info_list.append(get_infobox(link))
        print(link)

In [None]:
movie_info_list

In [None]:
len(movie_info_list)

### Saving and loading data as json

In [2]:
import json

def save_data(title, data):
    with open(title, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=4)

In [3]:
def load_data(title):
    with open(title, encoding='utf-8') as f:
        return json.load(f)

In [None]:
save_data('disney_movies.json', movie_info_list)

## Data Cleaning

In [4]:
movie_info_list = load_data('disney_movies.json')

### Removing non-existent values

In [5]:
import pandas as pd
import numpy as np

In [6]:
movie_data = pd.read_json('disney_movies.json')
movie_data

Unnamed: 0,Name,Productioncompany,Distributed by,Release date,Running time,Country,Language,Box office,Directed by,Written by,...,Suggested by,Layouts by,Original concept by,Studio,Director,Original language,Producers,Editor,Production companies,Distributor
0,Academy Award Review of Walt Disney Cartoons,Walt Disney Productions,United Artists,",May19,1937,",41 minutes (74 minutes 1966 release),United States,English,$45.472,,,...,,,,,,,,,,
1,Snow White and the Seven Dwarfs,Walt Disney Productions,RKO Radio Pictures,,83 minutes,United States,English,$418 million,"Supervising Director ,David Hand,,Sequence Dir...",",Ted Sears,Richard Creedon,Otto Englander,Dick...",...,,,,,,,,,,
2,Pinocchio,Walt Disney Productions,RKO Radio Pictures,,88 minutes,United States,English,$164million,"Supervising Directors,Ben Sharpsteen,Hamilton ...",,...,,,,,,,,,,
3,Fantasia,Walt Disney Productions,RKO Radio Pictures,",November13,1940,",126 minutes,United States,English,$76.4–$83.3 million (United States and Canada),",Samuel Armstrong,James Algar,Bill Roberts,Pau...",,...,,,,,,,,,,
4,The Reluctant Dragon,Walt Disney Productions,RKO Radio Pictures,",June27,1941,",74 minutes,United States,English,"$960,000 (worldwide rentals)",Alfred Werker (live action)Hamilton Luske (ani...,Live-action:Ted SearsAl PerkinsLarry ClemmonsB...,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,Polar Bear,Disneynature,Disney+,",April22,2022,",84 minutes,United States,English,,",Alastair Fothergill,Jeff Wilson,",,...,,,,,,,,,,
491,Chip 'n Dale: Rescue Rangers,,Disney+,,96 minutes,United States,English,"$636,163",Akiva Schaffer,",Dan Gregor,Doug Mand,",...,,,,,,,,,,
492,Hollywood Stargirl,,Disney+,,105 minutes,United States,English,,Julia Hart,",Jordan Horowitz,Julia Hart,",...,,,,,,,,,,
493,Lightyear,,Walt Disney StudiosMotion Pictures,,100 minutes,United States,English,$213.4 million,Angus MacLane,,...,,,,,,,,,,


In [7]:
movie_data.isnull().values.any()

True

In [8]:
columns = movie_data.columns[movie_data.isnull().any()].tolist()
columns

['Productioncompany',
 'Distributed by',
 'Release date',
 'Running time',
 'Country',
 'Language',
 'Box office',
 'Directed by',
 'Written by',
 'Based on',
 'Produced by',
 'Starring',
 'Music by',
 'Release dates',
 'Budget',
 'Story by',
 'Narrated by',
 'Cinematography',
 'Edited by',
 'Languages',
 'Screenplay by',
 'Released',
 'Label',
 'Countries',
 'Genre',
 'Length',
 'Producer',
 'Color process',
 'Recorded',
 'Created by',
 'Country of origin',
 'No. of seasons',
 'No. of episodes',
 'Executive producers',
 'Production company',
 'Original network',
 'Picture format',
 'Audio format',
 'Original release',
 'Related shows',
 'Productioncompanies',
 'Japanese',
 'Hepburn',
 'Adaptation by',
 'Animation by',
 'Traditional',
 'Simplified',
 'Developer(s)',
 'Publisher(s)',
 'Platform(s)',
 'Release',
 'Genre(s)',
 'Mode(s)',
 'Compiler',
 '',
 'Original title',
 'Suggested by',
 'Layouts by',
 'Original concept by',
 'Studio',
 'Director',
 'Original language',
 'Producers',


In [9]:
# Check if all values in column 'B' are NaN
for column in columns:
    if movie_data[column].isnull().all():
        print(f"All values in the column {column} are NaN")
    else:
        print(f"All values in the column {column} are not NaN")

All values in the column Productioncompany are not NaN
All values in the column Distributed by are not NaN
All values in the column Release date are not NaN
All values in the column Running time are not NaN
All values in the column Country are not NaN
All values in the column Language are not NaN
All values in the column Box office are not NaN
All values in the column Directed by are not NaN
All values in the column Written by are not NaN
All values in the column Based on are not NaN
All values in the column Produced by are not NaN
All values in the column Starring are not NaN
All values in the column Music by are not NaN
All values in the column Release dates are not NaN
All values in the column Budget are not NaN
All values in the column Story by are not NaN
All values in the column Narrated by are not NaN
All values in the column Cinematography are not NaN
All values in the column Edited by are not NaN
All values in the column Languages are not NaN
All values in the column Screenpla

since all above columns have NaN values hence we cannot consider to drop an entire column. Hence we will consider dropping columns that are not needed. We donot require following columns:
-distributor
-studio
-songwriter
-layouts
-orignal concept
-compiler
-mode
-production companies
-platforms
-publishers
-developers
-no.of seasons
-no.of episodes
-simplified
-traditional
-animation by
-adaptation by
-hepburn
-japanese
-related shows
-audio format
-picture format
-orignal network
-recorded
-color process

In [10]:


movie_data = movie_data.drop(columns =['Distributor', 'Studio', '', 'Layouts by', 'Original concept by', 'Compiler', 'Mode(s)',
                         'Production companies', 'Platform(s)', 'Publisher(s)', 'Developer(s)', 'No. of seasons',
                         'No. of episodes', 'Simplified', 'Traditional', 'Animation by', 'Adaptation by', 'Hepburn',
                         'Japanese', 'Related shows', 'Audio format', 'Picture format', 'Original network', 'Recorded',
                         'Color process'])

In [11]:
movie_data = movie_data.drop(columns =['Original title', 'Original language','Producers', 'Productioncompanies'])

In [12]:
movie_data = movie_data.drop(columns =['Created by', 'Country of origin','Suggested by', 'Executive producers', 'Production company'])

In [13]:

movie_data = movie_data.drop(columns=['Length', 'Label', 'Editor'])

In [14]:

movie_data.columns

Index(['Name', 'Productioncompany', 'Distributed by', 'Release date',
       'Running time', 'Country', 'Language', 'Box office', 'Directed by',
       'Written by', 'Based on', 'Produced by', 'Starring', 'Music by',
       'Release dates', 'Budget', 'Story by', 'Narrated by', 'Cinematography',
       'Edited by', 'Languages', 'Screenplay by', 'Released', 'Countries',
       'Genre', 'Producer', 'Original release', 'Release', 'Genre(s)',
       'Director'],
      dtype='object')

In [15]:
movie_data[['Release date', 'Release dates', 'Released' ,'Original release', 'Release']]

Unnamed: 0,Release date,Release dates,Released,Original release,Release
0,",May19,1937,",,,,
1,,",December21,1937 (Carthay Circle Theatre),Febr...",,,
2,,",February7,1940 (Center Theatre),February23,19...",,,
3,",November13,1940,",,,,
4,",June27,1941,",,,,
...,...,...,...,...,...
490,",April22,2022,",,,,
491,,",May19,2022 (Hollywood, California),May20,2022...","May20,2022",,
492,,",May23,2022 (El Capitan Theatre),June3,2022 (U...",,,
493,,",June8,2022 (El Capitan Theatre),June17,2022 (...",,,




as you can see that released, orignal releases and release are redundant and mostly empty, they will be deleted and later on release date and release dates will be combined to create a single column

In [16]:
movie_data = movie_data.drop(columns=['Released', 'Original release', 'Release'])

In [17]:
movie_data.columns

Index(['Name', 'Productioncompany', 'Distributed by', 'Release date',
       'Running time', 'Country', 'Language', 'Box office', 'Directed by',
       'Written by', 'Based on', 'Produced by', 'Starring', 'Music by',
       'Release dates', 'Budget', 'Story by', 'Narrated by', 'Cinematography',
       'Edited by', 'Languages', 'Screenplay by', 'Countries', 'Genre',
       'Producer', 'Genre(s)', 'Director'],
      dtype='object')

In [18]:

movie_data[['Edited by', 'Director' ,'Directed by', 'Countries']]

Unnamed: 0,Edited by,Director,Directed by,Countries
0,,,,
1,,,"Supervising Director ,David Hand,,Sequence Dir...",
2,,,"Supervising Directors,Ben Sharpsteen,Hamilton ...",
3,,,",Samuel Armstrong,James Algar,Bill Roberts,Pau...",
4,Paul Weatherwax,,Alfred Werker (live action)Hamilton Luske (ani...,
...,...,...,...,...
490,,,",Alastair Fothergill,Jeff Wilson,",
491,Brian Olds,,Akiva Schaffer,
492,Shayar BhansaliTracey Wadmore-Smith,,Julia Hart,
493,Anthony J. Greenberg,,Angus MacLane,


In [19]:
movie_data = movie_data.drop(columns=['Countries', 'Director'])

In [20]:
movie_data[['Productioncompany', 'Produced by' ,'Genre', 'Genre(s)']]

Unnamed: 0,Productioncompany,Produced by,Genre,Genre(s)
0,Walt Disney Productions,,,
1,Walt Disney Productions,Walt Disney,,
2,Walt Disney Productions,Walt Disney,,
3,Walt Disney Productions,",Walt Disney,Ben Sharpsteen,",,
4,Walt Disney Productions,Walt Disney,,
...,...,...,...,...
490,Disneynature,",Alastair Fothergill,Keith Scholey,Roy Conli,J...",,
491,,",David Hoberman,Todd Lieberman,",,
492,,",Ellen Goldsmith-Vein,Lee Stollman,",,
493,,Galyn Susman,,


In [21]:
movie_data = movie_data.drop(columns=['Genre', 'Genre(s)'])

In [22]:
movie_data.columns

Index(['Name', 'Productioncompany', 'Distributed by', 'Release date',
       'Running time', 'Country', 'Language', 'Box office', 'Directed by',
       'Written by', 'Based on', 'Produced by', 'Starring', 'Music by',
       'Release dates', 'Budget', 'Story by', 'Narrated by', 'Cinematography',
       'Edited by', 'Languages', 'Screenplay by', 'Producer'],
      dtype='object')

In [23]:
movie_data[['Language', 'Languages']] # to be merged

Unnamed: 0,Language,Languages
0,English,
1,English,
2,English,
3,English,
4,English,
...,...,...
490,English,
491,English,
492,English,
493,English,


In [24]:
movie_data["Language"] + movie_data["Languages"]

0      NaN
1      NaN
2      NaN
3      NaN
4      NaN
      ... 
490    NaN
491    NaN
492    NaN
493    NaN
494    NaN
Length: 495, dtype: object

In [25]:
movie_data["Language"].astype('str')

0      English
1      English
2      English
3      English
4      English
        ...   
490    English
491    English
492    English
493    English
494        nan
Name: Language, Length: 495, dtype: object

In [26]:
movie_data['hello'] = movie_data["Language"].astype('str') + movie_data["Languages"]

In [27]:
movie_data['hello'].loc[400:450]

400                                         NaN
401                            nanFrenchEnglish
402                                         NaN
403                                         NaN
404                                         NaN
405                                         NaN
406                                         NaN
407                                         NaN
408                                         NaN
409                                         NaN
410                                         NaN
411                                         NaN
412                                         NaN
413                                         NaN
414                                         NaN
415                                         NaN
416                                         NaN
417                                         NaN
418                                         NaN
419                                         NaN
420                                     

as seen that even by combining the two columns the results are not sufficent to deduce any conclusion regarding the language of the movie hence it is suitable to drop these columns as well.

In [28]:
movie_data = movie_data.drop(columns=['Language', 'Languages'])

In [29]:
movie_data.columns

Index(['Name', 'Productioncompany', 'Distributed by', 'Release date',
       'Running time', 'Country', 'Box office', 'Directed by', 'Written by',
       'Based on', 'Produced by', 'Starring', 'Music by', 'Release dates',
       'Budget', 'Story by', 'Narrated by', 'Cinematography', 'Edited by',
       'Screenplay by', 'Producer', 'hello'],
      dtype='object')

In [30]:
movie_data = movie_data.drop(columns=['hello'])

In [31]:
movie_data[['Productioncompany', 'Producer', 'Release date', 'Release dates', 'Written by', 'Produced by']].head(5)

Unnamed: 0,Productioncompany,Producer,Release date,Release dates,Written by,Produced by
0,Walt Disney Productions,,",May19,1937,",,,
1,Walt Disney Productions,,,",December21,1937 (Carthay Circle Theatre),Febr...",",Ted Sears,Richard Creedon,Otto Englander,Dick...",Walt Disney
2,Walt Disney Productions,,,",February7,1940 (Center Theatre),February23,19...",,Walt Disney
3,Walt Disney Productions,,",November13,1940,",,,",Walt Disney,Ben Sharpsteen,"
4,Walt Disney Productions,,",June27,1941,",,Live-action:Ted SearsAl PerkinsLarry ClemmonsB...,Walt Disney


As for producer, production company and produced by, they need to be combined since all of them deal with the production of the film. Same with release date and realse dates

In [32]:
movie_data['Production by'] = movie_data['Productioncompany'].astype('str') + ", " + movie_data['Producer'].astype('str') + ", " + movie_data['Produced by'].astype('str')
movie_data['Production by'].loc[400:450]

400                                   nan, nan, Joe Roth
401                 nan, nan, Grady CandlerAlix Tidmarsh
402                                   nan, nan, Kori Rae
403         nan, nan, ,Jerry Bruckheimer,Gore Verbinski,
404                      nan, nan, Tracy Balthazor-Flynn
405                 nan, nan, Anna McRobertsRobert Vince
406                            nan, nan, Peter Del Vecho
407         nan, nan, Alison OwenIan ColliePhilip Steuer
408            nan, nan, ,David Hoberman,Todd Lieberman,
409                           nan, nan, Jenni Magee-Cook
410                                        nan, nan, nan
411    nan, nan, Michael MandtNeil MandtJoe RothMark ...
412                                   nan, nan, Joe Roth
413                             nan, nan, Ferrell Barron
414    nan, nan, Rhea KapoorAnil KapoorSiddharth Roy ...
415            nan, nan, Shawn LevyDan LevineLisa Henson
416                                  nan, nan, Roy Conli
417    nan, nan, ,Rob Marshall,

In [33]:
movie_data['Release'] = movie_data['Release date'].astype('str') + ", " + movie_data['Release dates'].astype('str')
movie_data['Release']

0                                      ,May19,1937,, nan
1      nan, ,December21,1937 (Carthay Circle Theatre)...
2      nan, ,February7,1940 (Center Theatre),February...
3                                 ,November13,1940,, nan
4                                     ,June27,1941,, nan
                             ...                        
490                                  ,April22,2022,, nan
491    nan, ,May19,2022 (Hollywood, California),May20...
492    nan, ,May23,2022 (El Capitan Theatre),June3,20...
493    nan, ,June8,2022 (El Capitan Theatre),June17,2...
494                                   ,June24,2022,, nan
Name: Release, Length: 495, dtype: object

Now that the columns are combined we can delete the ones used in the combination

In [34]:
movie_data = movie_data.drop(columns=['Release date', 'Release dates', 'Productioncompany', 'Producer', 'Produced by'])

In [35]:
movie_data.columns

Index(['Name', 'Distributed by', 'Running time', 'Country', 'Box office',
       'Directed by', 'Written by', 'Based on', 'Starring', 'Music by',
       'Budget', 'Story by', 'Narrated by', 'Cinematography', 'Edited by',
       'Screenplay by', 'Production by', 'Release'],
      dtype='object')

now for the columns merged, all the nan strings present in the dataset need to be cleared

In [36]:
movie_data[['Release', 'Production by', ]]

Unnamed: 0,Release,Production by
0,",May19,1937,, nan","Walt Disney Productions, nan, nan"
1,"nan, ,December21,1937 (Carthay Circle Theatre)...","Walt Disney Productions, nan, Walt Disney"
2,"nan, ,February7,1940 (Center Theatre),February...","Walt Disney Productions, nan, Walt Disney"
3,",November13,1940,, nan","Walt Disney Productions, nan, ,Walt Disney,Ben..."
4,",June27,1941,, nan","Walt Disney Productions, nan, Walt Disney"
...,...,...
490,",April22,2022,, nan","Disneynature, nan, ,Alastair Fothergill,Keith ..."
491,"nan, ,May19,2022 (Hollywood, California),May20...","nan, Joe LisantiBrian Tyler, ,David Hoberman,T..."
492,"nan, ,May23,2022 (El Capitan Theatre),June3,20...","nan, nan, ,Ellen Goldsmith-Vein,Lee Stollman,"
493,"nan, ,June8,2022 (El Capitan Theatre),June17,2...","nan, nan, Galyn Susman"


In [37]:
movie_data[['Release','Production by' ]].replace(' nan', "")

Unnamed: 0,Release,Production by
0,",May19,1937,, nan","Walt Disney Productions, nan, nan"
1,"nan, ,December21,1937 (Carthay Circle Theatre)...","Walt Disney Productions, nan, Walt Disney"
2,"nan, ,February7,1940 (Center Theatre),February...","Walt Disney Productions, nan, Walt Disney"
3,",November13,1940,, nan","Walt Disney Productions, nan, ,Walt Disney,Ben..."
4,",June27,1941,, nan","Walt Disney Productions, nan, Walt Disney"
...,...,...
490,",April22,2022,, nan","Disneynature, nan, ,Alastair Fothergill,Keith ..."
491,"nan, ,May19,2022 (Hollywood, California),May20...","nan, Joe LisantiBrian Tyler, ,David Hoberman,T..."
492,"nan, ,May23,2022 (El Capitan Theatre),June3,20...","nan, nan, ,Ellen Goldsmith-Vein,Lee Stollman,"
493,"nan, ,June8,2022 (El Capitan Theatre),June17,2...","nan, nan, Galyn Susman"


In [38]:
# removing nan strings
movie_data['Release'] = movie_data['Release'].apply(lambda x: x.replace("nan", ""))
movie_data['Production by'] = movie_data['Production by'].apply(lambda x: x.replace("nan", ""))

# now replacing all extra commas
movie_data['Release'] = movie_data['Release'].apply(lambda x: x.replace(", ,", ","))
movie_data['Production by'] = movie_data['Production by'].apply(lambda x: x.replace(", ,", ","))
movie_data['Release'] = movie_data['Release'].apply(lambda x: x.replace(",,", ","))
movie_data['Production by'] = movie_data['Production by'].apply(lambda x: x.replace(",,", ","))

# removing commas on left and right
movie_data['Production by'] = movie_data['Production by'].str.lstrip(',')
movie_data['Production by'] = movie_data['Production by'].str.rstrip(',')

movie_data['Release'] = movie_data['Release'].str.lstrip(',')
movie_data['Release'] = movie_data['Release'].str.rstrip(',')

movie_data[['Release', 'Production by']]

Unnamed: 0,Release,Production by
0,"May19,1937,","Walt Disney Productions,"
1,"December21,1937 (Carthay Circle Theatre),Febru...","Walt Disney Productions, Walt Disney"
2,"February7,1940 (Center Theatre),February23,194...","Walt Disney Productions, Walt Disney"
3,"November13,1940,","Walt Disney Productions, ,Walt Disney,Ben Shar..."
4,"June27,1941,","Walt Disney Productions, Walt Disney"
...,...,...
490,"April22,2022,","Disneynature, ,Alastair Fothergill,Keith Schol..."
491,"May19,2022 (Hollywood, California),May20,2022 ...","Joe LisantiBrian Tyler,David Hoberman,Todd Li..."
492,"May23,2022 (El Capitan Theatre),June3,2022 (Un...",",Ellen Goldsmith-Vein,Lee Stollman"
493,"June8,2022 (El Capitan Theatre),June17,2022 (U...",Galyn Susman


In [39]:
movie_data

Unnamed: 0,Name,Distributed by,Running time,Country,Box office,Directed by,Written by,Based on,Starring,Music by,Budget,Story by,Narrated by,Cinematography,Edited by,Screenplay by,Production by,Release
0,Academy Award Review of Walt Disney Cartoons,United Artists,41 minutes (74 minutes 1966 release),United States,$45.472,,,,,,,,,,,,"Walt Disney Productions,","May19,1937,"
1,Snow White and the Seven Dwarfs,RKO Radio Pictures,83 minutes,United States,$418 million,"Supervising Director ,David Hand,,Sequence Dir...",",Ted Sears,Richard Creedon,Otto Englander,Dick...",Snow Whiteby The Brothers Grimm,",Adriana Caselotti,Lucille La Verne,Harry Stoc...",",Frank Churchill,Paul Smith,Leigh Harline,",$1.49 million,,,,,,"Walt Disney Productions, Walt Disney","December21,1937 (Carthay Circle Theatre),Febru..."
2,Pinocchio,RKO Radio Pictures,88 minutes,United States,$164million,"Supervising Directors,Ben Sharpsteen,Hamilton ...",,The Adventures of Pinocchioby Carlo Collodi,",Cliff Edwards,Dickie Jones,Christian Rub,Walt...",Leigh HarlinePaul J. Smith,$2.6 million,Ted SearsOtto EnglanderWebb SmithWilliam Cottr...,,,,,"Walt Disney Productions, Walt Disney","February7,1940 (Center Theatre),February23,194..."
3,Fantasia,RKO Radio Pictures,126 minutes,United States,$76.4–$83.3 million (United States and Canada),",Samuel Armstrong,James Algar,Bill Roberts,Pau...",,,",Leopold Stokowski,Deems Taylor,",See program,$2.28 million,",Joe Grant,Dick Huemer,",Deems Taylor,James Wong Howe,,,"Walt Disney Productions, ,Walt Disney,Ben Shar...","November13,1940,"
4,The Reluctant Dragon,RKO Radio Pictures,74 minutes,United States,"$960,000 (worldwide rentals)",Alfred Werker (live action)Hamilton Luske (ani...,Live-action:Ted SearsAl PerkinsLarry ClemmonsB...,,Robert BenchleyFrances GiffordBuddy PepperNana...,Frank ChurchillLarry Morey,"$600,000",,,Bert Glennon,Paul Weatherwax,,"Walt Disney Productions, Walt Disney","June27,1941,"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
490,Polar Bear,Disney+,84 minutes,United States,,",Alastair Fothergill,Jeff Wilson,",,,,Harry Gregson-Williams,,,Catherine Keener,,,,"Disneynature, ,Alastair Fothergill,Keith Schol...","April22,2022,"
491,Chip 'n Dale: Rescue Rangers,Disney+,96 minutes,United States,"$636,163",Akiva Schaffer,",Dan Gregor,Doug Mand,",Rescue Rangers properties and charactersby Dis...,",John Mulaney,Andy Samberg,KiKi Layne,Will Arn...",Brian Tyler,~$70 million,,,Larry Fong,Brian Olds,,"Joe LisantiBrian Tyler,David Hoberman,Todd Li...","May19,2022 (Hollywood, California),May20,2022 ..."
492,Hollywood Stargirl,Disney+,105 minutes,United States,,Julia Hart,",Jordan Horowitz,Julia Hart,",Charactersby Jerry Spinelli,",Grace VanderWaal,Elijah Richardson,Tyrel Jack...",Michael Penn (songs) Rob Simonsen Duncan Bli...,,,,Bryce Fortner,Shayar BhansaliTracey Wadmore-Smith,,",Ellen Goldsmith-Vein,Lee Stollman","May23,2022 (El Capitan Theatre),June3,2022 (Un..."
493,Lightyear,Walt Disney StudiosMotion Pictures,100 minutes,United States,$213.4 million,Angus MacLane,,,",Chris Evans,Keke Palmer,Peter Sohn,Taika Wait...",Michael Giacchino,$200 million,",Angus MacLane,Matthew Aldrich,Jason Headley,",,",Jeremy Lasky (camera),Ian Megibben (lighting),",Anthony J. Greenberg,",Jason Headley,Angus MacLane,",Galyn Susman,"June8,2022 (El Capitan Theatre),June17,2022 (U..."


look for any null values for release date, which need to be removed as such movies are of no use

In [40]:
movie_data['Release'].isnull().values.any()

False

this means that all movies are valid

## Convert running time to integer

In [41]:
# removing movies with no running time
movie_data['Running time'] = movie_data['Running time'].astype('str')

In [42]:
# removing non numeric strings
for i in range(len(movie_data['Running time'])):
    movie_data['Running time'][i] = movie_data['Running time'][i].split(" ")[0]

movie_data['Running time']

0       41
1       83
2       88
3      126
4       74
      ... 
490     84
491     96
492    105
493    100
494    nan
Name: Running time, Length: 495, dtype: object

In [43]:
# removing non numeric strings
count = 0
for i in range(len(movie_data['Running time'])):
    if movie_data['Running time'][i].isnumeric() == False:
        print(movie_data['Running time'][i])
        count = count + 1

print(f"count: {count}" )

22–24
22–24
nan
nan
nan
84minutes
nan
,157
nan
count: 9


As evident, nine values are alphanumric. Hence, we ignore the movies with no running time since it is practically impossible. Second, for the ranges, we provide average. And for extra strings we convert them to the numeric substring.|

In [44]:
movie_data = movie_data.drop(movie_data[movie_data['Running time'] == "nan"].index)

In [45]:
print(movie_data.loc[(movie_data['Running time'] == "84minutes")].index)
movie_data.at[413,'Running time'] = "84"

print(movie_data.loc[(movie_data['Running time'] == ",157")].index)
movie_data.at[486,'Running time'] = "157"

Int64Index([413], dtype='int64')
Int64Index([486], dtype='int64')


In [46]:
print(movie_data.loc[(movie_data['Running time'] == "22–24")].index)
movie_data.at[43,'Running time'] = "23"
movie_data.at[48,'Running time'] = "23"

Int64Index([43, 48], dtype='int64')


In [47]:
movie_data['Running time'].astype(int)

0       41
1       83
2       88
3      126
4       74
      ... 
489     91
490     84
491     96
492    105
493    100
Name: Running time, Length: 490, dtype: int64

## Cleaning "Budget" 

In [48]:
movie_data['Budget']

0                NaN
1      $1.49 million
2       $2.6 million
3      $2.28 million
4           $600,000
           ...      
489              NaN
490              NaN
491     ~$70 million
492              NaN
493     $200 million
Name: Budget, Length: 490, dtype: object

In [51]:
movie_data["Budget"] = movie_data["Budget"].astype(str)  # cast to string

# all the string surgery goes in here
movie_data["Budget"] = movie_data["Budget"].astype(str).replace('million', '')  # assuming ',' is the thousand's separator in your locale

In [52]:
movie_data['Budget']

0                nan
1      $1.49 million
2       $2.6 million
3      $2.28 million
4           $600,000
           ...      
489              nan
490              nan
491     ~$70 million
492              nan
493     $200 million
Name: Budget, Length: 490, dtype: object

In [54]:
movie_data['Budget']

0                nan
1      $1.49 million
2       $2.6 million
3      $2.28 million
4           $600,000
           ...      
489              nan
490              nan
491     ~$70 million
492              nan
493     $200 million
Name: Budget, Length: 490, dtype: object

In [56]:
print([movie.get('Budget', 'N/A') for movie in movie_info_list])

['N/A', '$1.49 million', '$2.6 million', '$2.28 million', '$600,000', '$950,000', '$858,000', 'N/A', '$788,000', 'N/A', '$1.35 million', '$2.125 million', 'N/A', '$1.5 million', '$1.5 million ', 'N/A', '$2.2 million', '$1,800,000', '$3 million', 'N/A', '$4 million', '$2 million', '$300,000', '$1.8 million', 'N/A', '$5 million', 'N/A', '$4 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$700,000', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$6 million', 'under $1 million or $1,250,000', 'N/A', 'N/A', '$2 million', 'N/A', 'N/A', '$2.5 million', 'N/A', 'N/A', 'N/A', '$4 million', '$3.6 million', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$3 million', 'N/A', 'N/A', 'N/A', 'N/A', '$4.4–6million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4million', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', '$5 million', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', 'N/A', '$4 million', 'N/A', 'N

In [103]:
import re
movie_data['Budget'] = movie_data['Budget'].replace({"\$": "", "~": "", ",": ""}, regex=True)

for movie in movie_info_list:
    m_budget = movie.get("Budget")
    m_budget = str(m_budget).replace("$", "").replace("~", "").replace(",", "")
    movie['Budget'] = m_budget
    print(movie['Budget'])
    if "under" in m_budget:
        m_budget = str(m_budget).replace("under 1 million or ", "")
        
    elif "-" in m_budget:
        m_budget = str(m_budget).replace("million", "").replace("-", " ").replace(".21", "")
        
        #extracting numbers from range 
        num_list = re.findall(r'\d+', m_budget)
        
        # taking average of extracted numbers then converting to million
        m_budget = int(int(num_list[0])+ int(num_list[1]))/2*1000000
        
    elif "around" in m_budget:
        m_budget = str(m_budget).replace(m_budget,"8700000")
    
    elif "estimated" in m_budget:
        m_budget = str(m_budget).replace("(estimated)", "").replace(" ", "")
        if "million" in m_budget:
            m_budget = int(m_budget.replace("million", ""))*1000000
        
    elif "US" in m_budget:
        m_budget = str(m_budget).split("US")[1].replace(" million)", "").replace(" million", "").replace("million)", "")
        
        if "15–19.2" in m_budget:
            num_list = re.findall(r'\d+', m_budget)
            m_budget = str(int(int(num_list[0])+ int(num_list[1]))/2*1000000)
            
        else:
            m_budget = str(float(m_budget)*100000)
            
    elif "gross" in m_budget:
        m_budget = str(float(str(m_budget).split("(gross)")[0].replace(" million ", ""))*1000000) 
        
    elif "million" in m_budget:
        m_budget = m_budget.split("million")[0]
        num_list = re.findall('\d+', m_budget )
        if len(num_list) == 2:
            m_budget = str(int(int(num_list[0])+ int(num_list[1]))/2*1000000)
        else:
            m_budget = int(num_list[0])*1000000 

    movie['Budget'] = str(m_budget)  

None
25000000.0
4000000.0
15000000.0
600000
950000
858000
None
788000
None
18000000.0
63500000.0
None
3000000.0
3000000.0
None
2000000.0
1800000
3000000
None
4000000
2000000
300000
4500000.0
None
5000000
None
4000000
None
None
None
None
None
None
700000
None
None
None
None
None
6000000
1250000
None
None
2000000
None
None
3500000.0
None
None
None
4000000
4500000.0
None
None
None
None
3000000
None
3000000
None
None
None
None
None
None
None
None
None
3000000
None
None
None
None
4000000
None
None
None
None
None
None
None
None
None
None
None
4000000
None
5000000
None
None
None
None
5000000
None
None
None
None
None
None
4000000
None
None
None
4500000.0
None
None
None
None
None
None
None
None
5000000
None
None
None
None
8000000
None
None
None
None
None
None
1000000
None
None
None
None
5000000
None
None
None
6000000.0
None
10000000
None
None
3000000
None
None
None
15000000.0
20000000
None
9000000
None
7000000.0
20000000
None
None
18000000
12000000
14000000
10000000
17000000
5000000
unknown
200