In [1]:
import os
import requests as rq
import pandas as pd
import numpy as np
import json
import re


In [2]:
# the filepath for each data file
wiki_movie_file = os.path.join('Data', 'wikipedia-movies.json')
ratings_file = os.path.join('Data', 'ratings.csv')
movie_meta_data_file = os.path.join('Data', 'movies_metadata.csv')


In [3]:
# Load the files
with open(wiki_movie_file, 'r') as file1:
    raw_movie_json = json.load(file1)
    file1.close()

# file 2, not cleaning this one yet
# raw_ratings_df = pd.read_csv(ratings_file, sep=',', header=0)

# file 3, not cleaning this one yet
# raw_meta_data_df = pd.read_csv(movie_meta_data_file, sep=',', low_memory=False)


In [4]:
# Explore the data, commenting results
raw_movie_json # Results: list of dicts, each movie is it's own dict
len(raw_movie_json) # Result: 7311 Movies
len(raw_movie_json[10].keys()) # changing the index looking for number of keys; 10 random indexes Results: 17 to 22 keys
raw_movie_df = pd.DataFrame(raw_movie_json) # Results: 7311 rows × 193 columns with lots of Nan fields, not every movie has the same "22" keys
raw_movie_df

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Predecessor,Founders,Area served,Products,Services,Russian,Hebrew,Revenue,Operating income,Polish
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990.0,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,,,,,,,,,,
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990.0,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet",James Foley,"[Ric Kidney, Robert Redlin]","[James Foley, Robert Redlin]",,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",...,,,,,,,,,,
2,https://en.wikipedia.org/wiki/Air_America_(film),1990.0,https://www.imdb.com/title/tt0099005/,Air America,Roger Spottiswoode,Daniel Melnick,"[John Eskow, Richard Rush]",,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",...,,,,,,,,,,
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990.0,https://www.imdb.com/title/tt0099012/,Alice,Woody Allen,Robert Greenhut,,,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",...,,,,,,,,,,
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990.0,https://www.imdb.com/title/tt0099018/,Almost an Angel,John Cornell,John Cornell,,,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7306,https://en.wikipedia.org/wiki/Holmes_%26_Watson,2018.0,https://www.imdb.com/title/tt1255919/,Holmes & Watson,Etan Cohen,"[Will Ferrell, Adam McKay, Jimmy Miller, Clayt...",Etan Cohen,,"[Sherlock Holmes, and, Dr. Watson, by, Sir Art...","[Will Ferrell, John C. Reilly, Rebecca Hall, R...",...,,,,,,,,,,
7307,https://en.wikipedia.org/wiki/Vice_(2018_film),2018.0,https://www.imdb.com/title/tt6266538/,Vice,Adam McKay,"[Brad Pitt, Dede Gardner, Jeremy Kleiner, Kevi...",,,,"[Christian Bale, Amy Adams, Steve Carell, Sam ...",...,,,,,,,,,,
7308,https://en.wikipedia.org/wiki/On_the_Basis_of_Sex,2018.0,https://www.imdb.com/title/tt4669788/,On the Basis of Sex,Mimi Leder,Robert W. Cort,,,,"[Felicity Jones, Armie Hammer, Justin Theroux,...",...,,,,,,,,,,
7309,https://en.wikipedia.org/wiki/Destroyer_(2018_...,2018.0,https://www.imdb.com/title/tt7137380/,Destroyer,Karyn Kusama,"[Fred Berger, Phil Hay, Matt Manfredi]",,,,"[Nicole Kidman, Sebastian Stan, Toby Kebbell, ...",...,,,,,,,,,,


In [5]:
# Explore the data, commenting results
raw_movie_df.info() # Result: too much data to display any granular info about the columns
raw_movie_df.isnull().sum() # Result: 10 columns visible, Null values seem to range from 158 to 7310; find the minimum number
min(raw_movie_df.isnull().sum())  # Results: 158 is the minimum, url and year columns are the lowest
raw_movie_df.keys().to_list() # Results: column names suggest more than movies; tv shows, people, and possibly books also.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7311 entries, 0 to 7310
Columns: 193 entries, url to Polish
dtypes: float64(1), object(192)
memory usage: 10.8+ MB


['url',
 'year',
 'imdb_link',
 'title',
 'Directed by',
 'Produced by',
 'Screenplay by',
 'Story by',
 'Based on',
 'Starring',
 'Narrated by',
 'Music by',
 'Cinematography',
 'Edited by',
 'Productioncompany ',
 'Distributed by',
 'Release date',
 'Running time',
 'Country',
 'Language',
 'Budget',
 'Box office',
 'Written by',
 'Genre',
 'Theme music composer',
 'Country of origin',
 'Original language(s)',
 'Producer(s)',
 'Editor(s)',
 'Production company(s)',
 'Original network',
 'Original release',
 'Productioncompanies ',
 'Executive producer(s)',
 'Production location(s)',
 'Distributor',
 'Picture format',
 'Audio format',
 'Voices of',
 'Followed by',
 'Composer(s)',
 'Created by',
 'Also known as',
 'Opening theme',
 'No. of episodes',
 'Preceded by',
 'Author',
 'Publisher',
 'Publication date',
 'Media type',
 'Pages',
 'ISBN',
 'OCLC',
 'LC Class',
 'Cover artist',
 'Series',
 'Set in',
 'Adaptation by',
 'Suggested by',
 'Biographical data',
 'Born',
 'Died',
 'Resti

In [6]:
# select for records with an imdb link, 'Director' or 'Directed by', and not references to tv shows.
cleaning_wiki = [movie for movie in raw_movie_json if (('imdb_link' and ('Directed by' or 'Director')) in movie.keys()) and ('No. of episodes' not in movie.keys())]
cleaning_df = pd.DataFrame(cleaning_wiki)
cleaning_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7099 entries, 0 to 7098
Data columns (total 74 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   url                     7099 non-null   object
 1   year                    7099 non-null   int64 
 2   imdb_link               7074 non-null   object
 3   title                   7098 non-null   object
 4   Directed by             7099 non-null   object
 5   Produced by             6737 non-null   object
 6   Screenplay by           2323 non-null   object
 7   Story by                1004 non-null   object
 8   Based on                2196 non-null   object
 9   Starring                6913 non-null   object
 10  Narrated by             283 non-null    object
 11  Music by                6502 non-null   object
 12  Cinematography          6396 non-null   object
 13  Edited by               6398 non-null   object
 14  Productioncompany       4539 non-null   object
 15  Dist

In [7]:
# # lots of the columns are languages:  "Arabic", "Japanese", "Mandarin", "Polish", "Yiddish", "Romanized"
cleaning_df[cleaning_df['Arabic'].notnull()]['Arabic']

6856    قضية رقم ٢٣
7081      کفرناحوم‎
Name: Arabic, dtype: object

In [8]:
# staring to define a function to clean the data. troubleshooting and ajustments were decided in the cell below this one.
def cleaning_movie(movie):
    """wiki_movie_file is a json file containing a list of dictionaries with each index as a different movie 
    with different key:value pairs. This function is specially designed to clean that specific dataset. 
    Wikipedia is managed by many people and they do not all use the same words.
    """
    alternate_titles = dict()
    fixed_movie = dict(movie)
###  Start with alternate titles stored in language keys and merge them into one.
###  Language keys i could find:      
    language_keys = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']
    for key in language_keys:
        if key in fixed_movie.keys():
            alternate_titles[key] = fixed_movie[key]
            fixed_movie.pop(key)
        else:
            pass
##  if there were alternate titles, add them to the movie.       
    if len(alternate_titles) > 0:
        fixed_movie['alternate_titles'] = alternate_titles
    else:
        pass
    return fixed_movie



In [9]:
## using this cell to debug the above function and decide on key(s) to keep, merge, and delete. 
## Starting with the alternate titles hidden in the language keys.
language_keys1 = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']

cleaning_wiki2 = [cleaning_movie(movie) for movie in cleaning_wiki]

## find a column to remove, add it to the list, check the columns again.
columns = sorted(pd.DataFrame(cleaning_wiki2).columns.to_list())
columns


['Actor control',
 'Adaptation by',
 'Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Country of origin',
 'Created by',
 'Directed by',
 'Distributed by',
 'Distributor',
 'Edited by',
 'Editor(s)',
 'Engine(s)',
 'Executive producer(s)',
 'Followed by',
 'Format(s)',
 'Genre',
 'Genre(s)',
 'Language',
 'Music by',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Original release',
 'Picture format',
 'Preceded by',
 'Produced by',
 'Producer(s)',
 'Production company',
 'Production company(s)',
 'Production location(s)',
 'Productioncompanies ',
 'Productioncompany ',
 'Release date',
 'Release(s)',
 'Running time',
 'Screen story by',
 'Screenplay by',
 'Starring',
 'Story by',
 'Suggested by',
 'Theme music composer',
 'Voices of',
 'Written by',
 'alternate_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [10]:
## Where I am at for this point. 
pd.DataFrame(cleaning_wiki2).info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7099 entries, 0 to 7098
Data columns (total 55 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   url                     7099 non-null   object
 1   year                    7099 non-null   int64 
 2   imdb_link               7074 non-null   object
 3   title                   7098 non-null   object
 4   Directed by             7099 non-null   object
 5   Produced by             6737 non-null   object
 6   Screenplay by           2323 non-null   object
 7   Story by                1004 non-null   object
 8   Based on                2196 non-null   object
 9   Starring                6913 non-null   object
 10  Narrated by             283 non-null    object
 11  Music by                6502 non-null   object
 12  Cinematography          6396 non-null   object
 13  Edited by               6398 non-null   object
 14  Productioncompany       4539 non-null   object
 15  Dist

In [11]:
def cleaning_movie(movie):
    """wiki_movie_file is a json file containing a list of dictionaries with each index as a different movie 
    with different key:value pairs. This function is specially designed to clean that specific dataset. 
    Wikipedia is managed by many people and they do not all use the same words.
    """
    alternate_titles = dict()
    fixed_movie = dict(movie)
###  Start with alternate titles stored in language keys and merge them into one.
###  Language keys i could find:      
    language_keys = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']
    for key in language_keys:
        if key in fixed_movie.keys():
            alternate_titles[key] = fixed_movie[key]
            fixed_movie.pop(key)
        else:
            pass
##  if there were alternate titles, add them to the movie.       
    if len(alternate_titles) > 0:
        fixed_movie['alternate_titles'] = alternate_titles
    else:
        pass
        
###  Alternative titles are fixed, now merge columns that are similar.     
    keys_to_merge = {'Director':'Directed by', 'Country': 'Country of origin', 'Distributor(s)':'Distributed by',
                     'Editor(s)':'Edited by',  'Language':'Original language(s)', 'Producer(s)':'Produced by',
                     'Genre(s)': 'Genre', 'Composer(s)': ['Music by', 'Theme music composer'], 
                     'Release date': ['Release(s)', 'Original release'], 'Distributor(s)':['Distributed by','Distributor'],
                     'Writer(s)':['Written by', 'Story by', 'Screenplay by', 'Screen story by', 'Adaptation by'],
                     'Production Comapany': ['Production company', 'Production company(s)', 'Productioncompanies ','Productioncompany ']  }
##  item = key, from the key:value pair and the key I want; values = the movie key(s) I do not want. 
    for item in keys_to_merge: 
        if type(keys_to_merge[item]) == type(list()):
            for n in keys_to_merge[item]:
                if n in fixed_movie.keys():
                    fixed_movie[item] = fixed_movie.pop(n)
                else:
                    pass
        else: 
            if keys_to_merge[item] in fixed_movie.keys():
                fixed_movie[item] = fixed_movie.pop(keys_to_merge[item])
            else:
                pass
    
    return fixed_movie




In [12]:
## using this cell to debug the above function and decide on key(s) to keep, merge, and delete. 
## keys that are the same idea need merged, key = the key I am keeping, Value = similar key(s).
keys_to_merge1 = {'Director':'Directed by', 'Country': 'Country of origin', 'Distributor(s)':'Distributed by',
                 'Editor(s)':'Edited by',  'Language':'Original language(s)', 'Producer(s)':'Produced by',
                 'Genre(s)': 'Genre', 'Composer(s)': ['Music by', 'Theme music composer'], 
                 'Release date': ['Release(s)', 'Original release'], 'Distributor(s)':['Distributed by','Distributor'],
                 'Writer(s)':['Written by', 'Story by', 'Screenplay by', 'Screen story by', 'Adaptation by'],
                 'Production Comapany': ['Production company', 'Production company(s)', 'Productioncompanies ','Productioncompany ']  }


cleaning_wiki3 = [cleaning_movie(movie) for movie in cleaning_wiki]

## find common columns, add them to the dictionary and check the columns again.
columns = sorted(pd.DataFrame(cleaning_wiki3).columns.to_list())
columns



['Actor control',
 'Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor(s)',
 'Editor(s)',
 'Engine(s)',
 'Executive producer(s)',
 'Followed by',
 'Format(s)',
 'Genre(s)',
 'Language',
 'Narrated by',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production Comapany',
 'Production location(s)',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Voices of',
 'Writer(s)',
 'alternate_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [13]:
## Where I am at for this point. 
cleaning_wiki_df3 = pd.DataFrame(cleaning_wiki3)
cleaning_wiki_df3.info()
# Down to 38 columns from 193.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7099 entries, 0 to 7098
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   url                     7099 non-null   object
 1   year                    7099 non-null   int64 
 2   imdb_link               7074 non-null   object
 3   title                   7098 non-null   object
 4   Based on                2196 non-null   object
 5   Starring                6913 non-null   object
 6   Narrated by             283 non-null    object
 7   Cinematography          6396 non-null   object
 8   Release date            7067 non-null   object
 9   Running time            6956 non-null   object
 10  Country                 6860 non-null   object
 11  Language                7014 non-null   object
 12  Budget                  4774 non-null   object
 13  Box office              5530 non-null   object
 14  Director                7099 non-null   object
 15  Dist

In [14]:
# extract the imdb id from the imdb link to link with other data sets.
cleaning_wiki_df3['imdb_link']   
pattern = r'(tt\d{7})'
cleaning_wiki_df3['imdb_id'] = cleaning_wiki_df3['imdb_link'].str.extract(pattern)

# checking for duplicate data
len(cleaning_wiki_df3['imdb_id'].unique()) # 7074 rows have imdb links, 7032 are unique. There are duplicate movies in the dataset.

7032

In [15]:
## Drop the duplicate rows
cleaning_wiki_df4 = pd.DataFrame(cleaning_wiki_df3.drop_duplicates(subset='imdb_id'))
cleaning_wiki_df4

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Narrated by,Cinematography,Release date,Running time,...,Created by,Preceded by,Suggested by,alternate_titles,Animation by,Color process,Engine(s),Actor control,Format(s),imdb_id
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...","Andrew ""Dice"" Clay",Oliver Wood,"[July 11, 1990, (, 1990-07-11, )]",102 minutes,...,,,,,,,,,,tt0098987
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",,Mark Plummer,"[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",114 minutes,...,,,,,,,,,,tt0098994
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",,Roger Deakins,"[August 10, 1990, (, 1990-08-10, )]",113 minutes,...,,,,,,,,,,tt0099005
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",,Carlo Di Palma,"[December 25, 1990, (, 1990-12-25, )]",106 minutes,...,,,,,,,,,,tt0099012
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",,Russell Boyd,"December 19, 1990",95 minutes,...,,,,,,,,,,tt0099018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7094,https://en.wikipedia.org/wiki/Holmes_%26_Watson,2018,https://www.imdb.com/title/tt1255919/,Holmes & Watson,"[Sherlock Holmes, and, Dr. Watson, by, Sir Art...","[Will Ferrell, John C. Reilly, Rebecca Hall, R...",,Oliver Wood,"[December 25, 2018, (, 2018-12-25, ), (United ...",90 minutes,...,,,,,,,,,,tt1255919
7095,https://en.wikipedia.org/wiki/Vice_(2018_film),2018,https://www.imdb.com/title/tt6266538/,Vice,,"[Christian Bale, Amy Adams, Steve Carell, Sam ...",,Greig Fraser,"[December 11, 2018, (, 2018-12-11, ), (, Samue...",132 minutes,...,,,,,,,,,,tt6266538
7096,https://en.wikipedia.org/wiki/On_the_Basis_of_Sex,2018,https://www.imdb.com/title/tt4669788/,On the Basis of Sex,,"[Felicity Jones, Armie Hammer, Justin Theroux,...",,Michael Grady,"[November 8, 2018, (, 2018-11-08, ), (, AFI Fe...",120 minutes,...,,,,,,,,,,tt4669788
7097,https://en.wikipedia.org/wiki/Destroyer_(2018_...,2018,https://www.imdb.com/title/tt7137380/,Destroyer,,"[Nicole Kidman, Sebastian Stan, Toby Kebbell, ...",,Julie Kirkwood,"[August 31, 2018, (, 2018-08-31, ), (, Telluri...",123 minutes,...,,,,,,,,,,tt7137380


In [16]:
# drop columns that are 90% or more null
columns_to_keep = [column for column in cleaning_wiki_df4 if cleaning_wiki_df4[column].isnull().sum() < (len(cleaning_wiki_df4['url'])*0.9)]
cleaning_wiki_df5 = cleaning_wiki_df4[columns_to_keep]
cleaning_wiki_df5.info() # down to 21 useful columns; from 193 messy, where 80%+ of the cells were null Values.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7098
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   url                  7032 non-null   object
 1   year                 7032 non-null   int64 
 2   imdb_link            7031 non-null   object
 3   title                7031 non-null   object
 4   Based on             2181 non-null   object
 5   Starring             6850 non-null   object
 6   Cinematography       6343 non-null   object
 7   Release date         7000 non-null   object
 8   Running time         6893 non-null   object
 9   Country              6798 non-null   object
 10  Language             6948 non-null   object
 11  Budget               4738 non-null   object
 12  Box office           5486 non-null   object
 13  Director             7032 non-null   object
 14  Distributor(s)       6677 non-null   object
 15  Editor(s)            6486 non-null   object
 16  Produc

In [17]:
## Data types:  Budget, Box office, and Running time should be numbers not strings/lists/objects, Release Date and year should be a Datetime.  
# cleaning_wiki_df5.groupby(['Box office']).count() # error because mixture of data types
[oddity for oddity in cleaning_wiki_df5['Box office'] if type(oddity)==type(str())]  # sensible, not that dirty
[oddity for oddity in cleaning_wiki_df5['Box office'] if type(oddity)==type(list())]  # lists are a dirty mess. multiple currency types and odd entries
[oddity for oddity in cleaning_wiki_df5['Box office'] if (type(oddity)!=type(list())) and (type(oddity)!=type(str())) ] # lots of nan values
[(index, oddity) for index, oddity in enumerate(cleaning_wiki_df5['Box office']) if type(oddity)==type(list())]  # looking for a specific movie to check the wiki page

[(34, ['US$', '4,212,828']),
 (54, ['$6,698,361 (', 'United States', ')', '[2]']),
 (74, ['$6,488,144', '(US)', '[1]']),
 (126, ['US$1,531,489', '(domestic)']),
 (130, ['US$', '4,803,039']),
 (137, ['$92,706', '(domestic)']),
 (178, ['$3,331', '(USA)']),
 (204, ['$739,104', '(North America)', '[2]']),
 (211, ['$1.2 million', '(US)', '[1]']),
 (255, ['$14.6 million', '(North America)', '[3]']),
 (272, ['$38 million', '(US)', '[2]']),
 (279, ['$57.5 million', '(North America)', '[1]']),
 (339, ['£739,989 (UK)', '[1]', '$4,413,473 (US)', '[1]']),
 (344, ['$4,654,288 (', 'US', ')', '[1]']),
 (376, ['$6.4 million', '(North America)', '[1]']),
 (412, ['$46.7 million', '[4]', '[3]', '(USA)']),
 (488, ['$14.1 million', '[', 'citation needed', ']']),
 (512, ['$10.7 million', '(North America)', '[2]']),
 (532, ['$75.5 million', '(North America)', '[2]']),
 (564, ['$27.2 million', '(North America)', '[3]']),
 (615, ['HK$2,662,446', '(Hong Kong)']),
 (648, ['$13,747,138', '70,542 admissions (Franc

In [18]:
box_office = cleaning_wiki_df5['Box office'].dropna() # no point in modifying a nan value
len(box_office) # Results: 5486 items
box_office

0          $21.4 million
1           $2.7 million
2            $57,718,089
3             $7,331,647
4       $6,939,946 (USA)
              ...       
7093       $19.4 million
7094       $41.9 million
7095       $76.1 million
7096       $38.4 million
7097        $5.5 million
Name: Box office, Length: 5486, dtype: object

In [19]:
len(box_office[box_office.map(lambda x: type(x) != str)]) # 136 items are lists, not strings
box_office[box_office.map(lambda x: type(x) != str)]


34                           [US$, 4,212,828]
54      [$6,698,361 (, United States, ), [2]]
74                    [$6,488,144, (US), [1]]
126                [US$1,531,489, (domestic)]
130                          [US$, 4,803,039]
                        ...                  
7003               [$99.6, million, [4], [5]]
7017                   [$365.6, million, [1]]
7018                         [$53.8, million]
7038                     [$435, million, [7]]
7071                   [$529.3, million, [4]]
Name: Box office, Length: 136, dtype: object

In [20]:
# make a regular expression pattern for the strings
pattern1 = r'(\$\d+\.?\d*\s*[bm]illi?on)'  # searching for pattern that is similar to: "$45.3 million/billion"

# counting occurances of the first pattern
box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False).sum()  # 3826 items in the list with this pattern
matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False) # save the filter

box_office.str.extract(pattern1, flags=re.IGNORECASE).dropna()  # View the items matching pattern #1

  box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False).sum()  # 3826 items in the list with this pattern
  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False) # save the filter


Unnamed: 0,0
0,$21.4 million
1,$2.7 million
10,$195.3 million
11,$53.2 million
12,$15.7 million
...,...
7093,$19.4 million
7094,$41.9 million
7095,$76.1 million
7096,$38.4 million


In [21]:
pattern2 = r'(\$\d+(?:[,\.]\d{3})+\.?\d*)(?!\s*[bm]illi?on)'   # searching for pattern that is similar to $123,456,789.0
# counting occurances of the second pattern
box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False).sum()  # 1490 items in the list with this pattern
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False) # save the filter
box_office.str.extract(pattern2, flags=re.IGNORECASE).dropna()  # view the items matching pattern 2

  box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False).sum()  # 1490 items in the list with this pattern
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False) # save the filter


Unnamed: 0,0
2,"$57,718,089"
3,"$7,331,647"
4,"$6,939,946"
9,"$855,810"
15,"$12,626,043"
...,...
7045,"$401,463"
7051,"$260,136"
7062,"$100,116"
7067,"$19,996"


In [22]:
box_office[~matches_from_one & ~matches_from_two] # this result matches the search I did Earlier for data types.
problem_lists = box_office[~matches_from_one & ~matches_from_two]

# solving for the easy ones, they fit the pattern and just the wrong data type
for i in problem_lists.index:
    if len(re.findall(pattern1, str(problem_lists[i]))) == 1: # fits the pattern
        problem_lists[i] = re.findall(pattern1, str(problem_lists[i]))[0]
    elif len(re.findall(pattern2, str(problem_lists[i]))) == 1: # fits the pattern
        problem_lists[i] = re.findall(pattern2, str(problem_lists[i]))[0]
    elif len(re.findall(pattern1, str(problem_lists[i]))) > 1: # saving for easier parsing later
        problem_lists[i] = re.findall(pattern1, str(problem_lists[i]))[1]
    elif len(re.findall(pattern2, str(problem_lists[i]))) > 1:  # saving for easier parsing later
        problem_lists[i] = re.findall(pattern2, str(problem_lists[i]))[1]
    else:
        pass
    
problem_lists

34                [US$, 4,212,828]
54                      $6,698,361
74                      $6,488,144
110             $4.35-4.37 million
126                     $1,531,489
                   ...            
7003    [$99.6, million, [4], [5]]
7017        [$365.6, million, [1]]
7018              [$53.8, million]
7038          [$435, million, [7]]
7071        [$529.3, million, [4]]
Name: Box office, Length: 162, dtype: object

In [31]:
# testing to see if i can make the list parsing easier by removing some of the [\d] wiki references

problem_lists.str.replace("\[\d+\]\s*", "")
problem_lists

  problem_lists.str.replace("\[\d+\]\s*", "")


34          $4,212,828
110      $4.35 million
130         $4,803,039
602              $5000
734        $11,146,270
             ...      
7003     $99.6 million
7017    $365.6 million
7018     $53.8 million
7038      $435 million
7071    $529.3 million
Name: Box office, Length: 67, dtype: object

In [32]:
# save the changes to the box_office
for p in problem_lists.index:
    for b in box_office.index:
        if p == b:
#             if type(problem_lists[p]) == str:
            box_office[b] = problem_lists[p]
#             else:
#                 continue
        else:
            continue

# Rerun the two filters to see what still needs fixed
matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)            
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
box_office[~matches_from_one & ~matches_from_two]

  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)


602                              $5000
1073                        35,254,617
1483                        £3 million
1870                      ¥1.1 billion
2037                               N/A
2096                              $309
2672       926,423 admissions (France)
3096    [$32, [2], –33.1 million, [1]]
3640                               TBA
3889          CN¥3.650 million (China)
4128                        £7,385,434
4318                            $20-30
4575           $45.2k (only in Turkey)
5465                             £2.56
5802                          413 733$
6034                           Unknown
6391                              $111
6392                              $588
6615                    less than $372
6865                           8 crore
Name: Box office, dtype: object

In [33]:
modified_p1 = r"(\$\d{1,3}\.?\d*',\s*'[bm]illi?on)"  # for when 43.5 Million is broken into '4.35' and 'million'
modified_p2 = r"(\$', '\d{1,3}(?:[,\.]\d{3})+\.?\d*)"
pattern3 = r"(\$\d*\.?\d*)(?:[-—–]\d*\.?\d*)(\s[bm]illi?on)" # modified pattern #1 version 3
pattern4 = r"(\$\s\d{1,3}(?:[,\.]\d{3})+\.?\d*)" # modified pattern #2 version 3
pattern5 = r"(\$',\s*'\d{1,3}\.?\d*\s[bm]illi?on)" # modified pattern #1 version 4
pattern6 = r"^(\$\d{3,})" # new pattern
problem_lists = box_office[~matches_from_one & ~matches_from_two]
for i in problem_lists.index:  
    if len(re.findall(modified_p1, str(problem_lists[i]))) > 0:
        problem_lists[i] = re.findall(modified_p1, str(problem_lists[i]))[0].replace("', '", " ")
#         print(i, problem_lists[i])
    elif len(re.findall(modified_p2, str(problem_lists[i]))) >0:
        problem_lists[i] = re.findall(modified_p2, str(problem_lists[i]))[0].replace("', '", "")
    elif len(re.findall(pattern3, str(problem_lists[i]))) >0:
        fix = re.findall(pattern3, str(problem_lists[i]))
        problem_lists[i] = fix[0][0] + fix[0][1]
    elif len(re.findall(pattern4, str(problem_lists[i])))>0:
        problem_lists[i] = re.findall(pattern4, str(problem_lists[i]))[0].replace(" ", "")
    elif len(re.findall(pattern5, str(problem_lists[i])))>0:
        problem_lists[i] = re.findall(pattern5, str(problem_lists[i]))[0].replace("', '", "")
    elif len(re.findall(pattern6, str(problem_lists[i])))>0:
        problem_lists[i] = re.findall(pattern6, str(problem_lists[i]))[0]
    else:
        pass
    
problem_lists

602                              $5000
1073                        35,254,617
1483                        £3 million
1870                      ¥1.1 billion
2037                               N/A
2096                              $309
2672       926,423 admissions (France)
3096    [$32, [2], –33.1 million, [1]]
3640                               TBA
3889          CN¥3.650 million (China)
4128                        £7,385,434
4318                            $20-30
4575           $45.2k (only in Turkey)
5465                             £2.56
5802                          413 733$
6034                           Unknown
6391                              $111
6392                              $588
6615                    less than $372
6865                           8 crore
Name: Box office, dtype: object

In [34]:
## merge the fixes from problem_lists into box_office
for p in problem_lists.index:
    for b in box_office.index:
        if p == b:
            box_office[b] = problem_lists[p]
            

matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
matches_from_three = box_office.str.contains(pattern6, flags=re.IGNORECASE, na=False)
box_office[~matches_from_one & ~matches_from_two & ~matches_from_three]

  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
  matches_from_three = box_office.str.contains(pattern6, flags=re.IGNORECASE, na=False)


1073                        35,254,617
1483                        £3 million
1870                      ¥1.1 billion
2037                               N/A
2672       926,423 admissions (France)
3096    [$32, [2], –33.1 million, [1]]
3640                               TBA
3889          CN¥3.650 million (China)
4128                        £7,385,434
4318                            $20-30
4575           $45.2k (only in Turkey)
5465                             £2.56
5802                          413 733$
6034                           Unknown
6615                    less than $372
6865                           8 crore
Name: Box office, dtype: object

In [35]:
## not going to salvage these rows, change them to np.nan. Lots of unknown meaning and some requires currency conversion.
problem_lists = box_office[~matches_from_one & ~matches_from_two & ~matches_from_three]
for i in problem_lists.index:
    problem_lists[i] = np.nan
    
problem_lists    

1073    NaN
1483    NaN
1870    NaN
2037    NaN
2672    NaN
3096    NaN
3640    NaN
3889    NaN
4128    NaN
4318    NaN
4575    NaN
5465    NaN
5802    NaN
6034    NaN
6615    NaN
6865    NaN
Name: Box office, dtype: object

In [36]:
# pur the new np.nan's into box_office
for p in problem_lists.index:
    for b in box_office.index:
        if p == b:
            box_office[b] = problem_lists[p]

matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
matches_from_three = box_office.str.contains(pattern6, flags=re.IGNORECASE, na=False)
box_office[~matches_from_one & ~matches_from_two & ~matches_from_three]

  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
  matches_from_three = box_office.str.contains(pattern6, flags=re.IGNORECASE, na=False)


1073    NaN
1483    NaN
1870    NaN
2037    NaN
2672    NaN
3096    NaN
3640    NaN
3889    NaN
4128    NaN
4318    NaN
4575    NaN
5465    NaN
5802    NaN
6034    NaN
6615    NaN
6865    NaN
Name: Box office, dtype: object

In [37]:
## replace the old box office info in the DataFrame with the newly filtered box office info. 
for b in box_office.index:
    for c in cleaning_wiki_df5.index:
        if b == c:
            if box_office[b] != cleaning_wiki_df5['Box office'][c]:
                cleaning_wiki_df5['Box office'][c] = box_office[b]
                break
            else:
#                 print(b, cleaning_wiki_df5['Box office'][c], box_office[b])
##                 found some non number strings missed from the first filtering that should have been caught
                fix1 = re.findall(pattern1, box_office[b])
                fix2 = re.findall(pattern2, box_office[b])
                if (len(fix1) >0) and (fix1[0] != box_office[b]):
                    cleaning_wiki_df5['Box office'][c] = fix1[0]
                    break
                elif len(fix2) >0 and (fix2[0] != box_office[b]):
                    cleaning_wiki_df5['Box office'][c] = fix2[0]
                    break
                else:
                    break
cleaning_wiki_df5['Box office']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df5['Box office'][c] = fix2[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df5['Box office'][c] = fix1[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df5['Box office'][c] = box_office[b]


0       $21.4 million
1        $2.7 million
2         $57,718,089
3          $7,331,647
4          $6,939,946
            ...      
7094    $41.9 million
7095    $76.1 million
7096    $38.4 million
7097     $5.5 million
7098              NaN
Name: Box office, Length: 7032, dtype: object

In [38]:
# def parse_dollars(s):   
    # patterns to change into numbers
    
    # if s is string, then change it, otherwise np.nan
        # if input is of the form $###.## Million, pattern #1
            # Remove dollar sign and "million", convert to float and multiply by 1,000,000
            # return value
            
        # elif input is of the form $###.## Billion, formerly pattern #1 also, now pattern #2

            # Remove dollar sign and "billion", convert to float and multiply by 1,000,000,000
            # return value
            
        # elif input is of the form $###,###,###, formerly pattern #2, now pattern #3
        
            # Remove dollar sign and commas, convert to float
            # return value

    # else: return np.nan 
    
    

In [39]:
test= re.search("(\d+\.?\d*)", cleaning_wiki_df5['Box office'][0])
print(type(test), "  ||  ", test)
print(test[0], type(test[0]), type(float(test[0])), f"  |  test * Million = {float(test[0])*1000000}")


<class 're.Match'>   ||   <re.Match object; span=(1, 5), match='21.4'>
21.4 <class 'str'> <class 'float'>   |  test * Million = 21400000.0


In [40]:
def parse_dollars(s):   
    # patterns to change into numbers
    p1 = "(\$\d+\.?\d*\s*milli?on)" # pulled from pattern #1 above
    p2 = "(\$\d+\.?\d*\s*billi?on)" # pulled from pattern #1 above
    
    # if s is string, then change it, otherwise np.nan
    if type(s) == str:
  
        # if input is of the form $###.## Million, pattern #1
        if re.match(p1, s, flags=re.IGNORECASE):
            # Remove dollar sign and "million", convert to float and multiply by 1,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000
            
            # return value
            return num
            
        # elif input is of the form $###.## Billion, formerly pattern #1 also, now pattern #2
        elif re.match(p2, s, flags=re.IGNORECASE):
        # Remove dollar sign and "billion", convert to float and multiply by 1,000,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000000
            # return value
            return num

            
        # elif input is of the form $###,###,###, formerly pattern #2, now pattern #3
        
            # Remove dollar sign and commas, convert to float
            # return value

    # else: return np.nan 
    
    

In [41]:
test = re.findall(r"(?:\$)(\d+(?:[,\.]\d{3})+\.?\d*)", cleaning_wiki_df5["Box office"][2])
print(test, test[0], test[0].replace(",", ""), float(test[0].replace(",", "")))

['57,718,089'] 57,718,089 57718089 57718089.0


In [42]:
def parse_dollars(s):   
    # patterns to change into numbers
    p1 = "(\$\d+\.?\d*\s*milli?on)" # pulled from pattern #1 above
    p2 = "(\$\d+\.?\d*\s*billi?on)" # pulled from pattern #1 above
    p3 = "(\$\d+(?:[,\.]\d{3})+\.?\d*)"  # pulled from pattern #2 above
    
    # if s is string, then change it, otherwise np.nan
    if type(s) == str:
  
        # if input is of the form $###.## Million, pattern #1
        if re.match(p1, s, flags=re.IGNORECASE):
            # Remove dollar sign and "million", convert to float and multiply by 1,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000
            
            # return value
            return num
            
        # elif input is of the form $###.## Billion, formerly pattern #1 also, now pattern #2
        elif re.match(p2, s, flags=re.IGNORECASE):
        # Remove dollar sign and "billion", convert to float and multiply by 1,000,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000000
            # return value
            return num

            
        # elif input is of the form $###,###,###, formerly pattern #2, now pattern #3
        elif re.match(p3, s, flags=re.IGNORECASE):
            # Remove dollar sign and commas, convert to float
            num = float(re.findall(r"(?:\$)(\d+(?:[,\.]\d{3})+\.?\d*)", s)[0].replace(",", ""))

            # return value
            return num
    # else: return np.nan 
    else: 
        return np.nan
    
    

In [43]:
# find the first instance of "billion" to test parse_dollars()

for c in cleaning_wiki_df5.index:
    if type(cleaning_wiki_df5['Box office'][c]) == str:
        if re.findall(pattern1, cleaning_wiki_df5['Box office'][c], flags=re.IGNORECASE):
            if "billion" in cleaning_wiki_df5['Box office'][c].lower():
                print(c, cleaning_wiki_df5['Box office'][c])
                break
cleaning_wiki_df5['Box office'].head()        

766 $1.030 billion


0    $21.4 million
1     $2.7 million
2      $57,718,089
3       $7,331,647
4       $6,939,946
Name: Box office, dtype: object

In [44]:
cleaning_wiki_df6 = pd.DataFrame(cleaning_wiki_df5)
# Choose the indexes for the test
print(cleaning_wiki_df6['Box office'][0], cleaning_wiki_df6['Box office'][766], cleaning_wiki_df6['Box office'][2])

# test parse_dollars()
print(parse_dollars(cleaning_wiki_df6['Box office'][0]), parse_dollars(cleaning_wiki_df6['Box office'][766]), parse_dollars(cleaning_wiki_df6['Box office'][2]))

# The function works where I need it to, check for where the if statements miss something.

$21.4 million $1.030 billion $57,718,089
21400000.0 1030000000.0 57718089.0


In [45]:
for c in cleaning_wiki_df6.index:
    test = parse_dollars(cleaning_wiki_df6['Box office'][c])
    if type(test) != float:
        print(c, test, type(test), cleaning_wiki_df6['Box office'][c])

602 None <class 'NoneType'> $5000
2096 None <class 'NoneType'> $309
6391 None <class 'NoneType'> $111
6392 None <class 'NoneType'> $588


In [46]:
def parse_dollars(s):   
    # patterns to change into numbers
    p1 = "(\$\d+\.?\d*\s*milli?on)" # pulled from pattern #1 above
    p2 = "(\$\d+\.?\d*\s*billi?on)" # pulled from pattern #1 above
    p3 = "(\$\d+(?:[,\.]\d{3})+\.?\d*)"  # pulled from pattern #2 above
    
    # if s is string, then change it, otherwise np.nan
    if type(s) == str:
  
        # if input is of the form $###.## Million, pattern #1
        if re.match(p1, s, flags=re.IGNORECASE):
            # Remove dollar sign and "million", convert to float and multiply by 1,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000
            
            # return value
            return num
            
        # elif input is of the form $###.## Billion, formerly pattern #1 also, now pattern #2
        elif re.match(p2, s, flags=re.IGNORECASE):
        # Remove dollar sign and "billion", convert to float and multiply by 1,000,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000000
            # return value
            return num

            
        # elif input is of the form $###,###,###, formerly pattern #2, now pattern #3
        elif re.match(p3, s, flags=re.IGNORECASE):
            # Remove dollar sign and commas, convert to float
            num = float(re.findall(r"(?:\$)(\d+(?:[,\.]\d{3})+\.?\d*)", s)[0].replace(",", ""))

            # return value
            return num
        else: 
            if len(s) <= 6:
                if re.match("(\$\d{1,6}$)", s, flags=re.IGNORECASE):
                    num = float(s.replace("$", ""))
                    return num

    else: 
        return np.nan


In [47]:
for c in cleaning_wiki_df6.index:
    cleaning_wiki_df6['Box office'][c] = parse_dollars(cleaning_wiki_df6['Box office'][c])
    
cleaning_wiki_df6['Box office']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df6['Box office'][c] = parse_dollars(cleaning_wiki_df6['Box office'][c])


0       21400000.0
1        2700000.0
2       57718089.0
3        7331647.0
4        6939946.0
           ...    
7094    41900000.0
7095    76100000.0
7096    38400000.0
7097     5500000.0
7098           NaN
Name: Box office, Length: 7032, dtype: object

In [48]:
# cleaning_wiki_df6.groupby(['Budget']).count()  # error because of lists and strings again.
def find_dollars(s):
    # declared patterns from before
    p1 = "(\$\d+\.?\d*\s*[bm]illi?on)" # pulled from pattern #1 above
    p2 = "(\$\d+(?:[,\.]\d{3})+\.?\d*)"
    mp1 = r"(\$\d{1,3}\.?\d*',\s*'[bm]illi?on)"  # for when 43.5 Million is broken into '4.35' and 'million'
    mp2 = r"(\$', '\d{1,3}(?:[,\.]\d{3})+\.?\d*)"  # modified pattern #2 version 2 
    p3 = r"(\$\d*\.?\d*)(?:[-—–]\d*\.?\d*)(\s[bm]illi?on)" # modified pattern #1 version 3
    p4 = r"(\$\s\d{1,3}(?:[,\.]\d{3})+\.?\d*)" # modified pattern #2 version 3
    p5 = r"(\$',\s*'\d{1,3}\.?\d*\s[bm]illi?on)" # modified pattern #1 version 4
    p6 = r"^(\$\d{3,6})" # new pattern
    if len(re.findall(p1, str(s))) == 1:
        num = re.findall(p1, str(s))[0]
        return num
    elif len(re.findall(p2, str(s))) == 1:
        num = re.findall(p2, str(s))[0]
        return num
    elif len(re.findall(p1, str(s))) > 1: 
        num = re.findall(p1, str(s))[1]
        return num
    elif len(re.findall(p2, str(problem_lists[i]))) > 1:  
        num = re.findall(p2, str(s))[1]
        return num
    elif len(re.findall(mp1, str(s))) > 0:
        num = re.findall(mp1, str(s))[0].replace("', '", " ")
        return num
    elif len(re.findall(mp2, str(s))) >0:
        num = re.findall(mp2, str(s))[0].replace("', '", "")
        return num
    elif len(re.findall(p3, str(s))) >0:
        fix = re.findall(p3, str(s))
        num = fix[0][0] + fix[0][1]
        return num
    elif len(re.findall(p4, str(s)))>0:
        num = re.findall(p4, str(s))[0].replace(" ", "")
        return num
    elif len(re.findall(p5, str(s)))>0:
        num = re.findall(p5, str(s))[0].replace("', '", "")
        return num
    elif len(re.findall(p6, str(s)))>0:
        num = re.findall(p6, str(s))[0]
        return num
    else:
        return np.nan


In [49]:
budget = cleaning_wiki_df6['Budget']
for b in budget.index:
    print(budget[b], find_dollars(budget[b]), parse_dollars(find_dollars(budget[b])))
    

$20 million $20 million 20000000.0
$6 million $6 million 6000000.0
$35 million $35 million 35000000.0
$12 million $12 million 12000000.0
$25 million $25 million 25000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$50 million $50 million 50000000.0
$22 million $22 million 22000000.0
nan nan nan
$29 million $29 million 29000000.0
$40 million $40 million 40000000.0
$7 million $7 million 7000000.0
['$2,500,000', '[', 'citation needed', ']'] $2,500,000 2500000.0
nan nan nan
$20 million $20 million 20000000.0
nan nan nan
nan nan nan
nan nan nan
$47 million $47 million 47000000.0
nan nan nan
nan nan nan
$15 million $15 million 15000000.0
nan nan nan
$10 million $10 million 10000000.0
nan nan nan
$10 million $10 million 10000000.0
nan nan nan
nan nan nan
nan nan nan
$13 million $13 million 13000000.0
nan nan nan
nan nan nan
nan nan nan
$5.2 million $5.2 million 5200000.0
nan nan nan
$17.5 million $17.5 million 17500000.0
nan nan nan
nan nan nan
US$18 million $18 million 1800

nan nan nan
nan nan nan
$55 million $55 million 55000000.0
nan nan nan
$17 million $17 million 17000000.0
$50 million $50 million 50000000.0
nan nan nan
$7 million $7 million 7000000.0
nan nan nan
$3 million $3 million 3000000.0
$60 million $60 million 60000000.0
$24 million $24 million 24000000.0
$33–40 million $33 million 33000000.0
nan nan nan
nan nan nan
$15 million $15 million 15000000.0
nan nan nan
$30 million $30 million 30000000.0
nan nan nan
$1.3 million $1.3 million 1300000.0
$20 million $20 million 20000000.0
$12.5 million $12.5 million 12500000.0
nan nan nan
$11.7 million $11.7 million 11700000.0
$42 million $42 million 42000000.0
nan nan nan
$35 million $35 million 35000000.0
$20 million $20 million 20000000.0
$40 million $40 million 40000000.0
$25 million $25 million 25000000.0
$26 million $26 million 26000000.0
$20 million (estimate) $20 million 20000000.0
nan nan nan
$20 million $20 million 20000000.0
nan nan nan
$20 million $20 million 20000000.0
$5 million $5 million 

nan nan nan
$62 million $62 million 62000000.0
['$27,575', '$230,000 (post)'] nan nan
['$45 million', '[', 'citation needed', ']'] $45 million 45000000.0
nan nan nan
nan nan nan
$40 million $40 million 40000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$22 million $22 million 22000000.0
$35 million $35 million 35000000.0
nan nan nan
$14 million $14 million 14000000.0
['$23 million', '[1]', '[', 'dead link', ']'] $23 million 23000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$12 million $12 million 12000000.0
$5 million $5 million 5000000.0
$190,000 $190,000 190000.0
$55 million $55 million 55000000.0
nan nan nan
nan nan nan
$7.8 million $7.8 million 7800000.0
nan nan nan
$45 million $45 million 45000000.0
$17 million $17 million 17000000.0
$18 million $18 million 18000000.0
$8 million $8 million 8000000.0
$25-30 million $25 million 25000000.0
nan nan nan
$1 million $1 million 1000000.0
nan nan nan
$12 million $12 million 12000000.0
$1 million (estimated) $1 million 1000

$6 million $6 million 6000000.0
nan nan nan
$22 million $22 million 22000000.0
$30 million $30 million 30000000.0
$16 million $16 million 16000000.0
$20 million $20 million 20000000.0
nan nan nan
$172–175 million $172 million 172000000.0
$800,000 $800,000 800000.0
$17 million $17 million 17000000.0
$7 million $7 million 7000000.0
nan nan nan
nan nan nan
$30 million $30 million 30000000.0
nan nan nan
$20 million $20 million 20000000.0
nan nan nan
nan nan nan
$75 million $75 million 75000000.0
nan nan nan
nan nan nan
$25 million $25 million 25000000.0
nan nan nan
$ 24,000,000 $24,000,000 24000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$25 million $25 million 25000000.0
nan nan nan
nan nan nan
nan nan nan
$7 million $7 million 7000000.0
$9 million $9 million 9000000.0
$3.3 million $3.3 million 3300000.0
nan nan nan
nan nan nan
nan nan nan
$12 million $12 million 12000000.0
nan nan nan
$35 million $35 million 35000000.0
nan nan nan
$15 million $15 million 15000000.0


nan nan nan
nan nan nan
$4 million $4 million 4000000.0
$100,000 $100,000 100000.0
$18 million $18 million 18000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$1,000,000 $1,000,000 1000000.0
nan nan nan
$15 million $15 million 15000000.0
$7,000,000 $7,000,000 7000000.0
$4.7 million $4.7 million 4700000.0
nan nan nan
$6 million $6 million 6000000.0
$36 million $36 million 36000000.0
nan nan nan
nan nan nan
nan nan nan
$19 million $19 million 19000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$32 million $32 million 32000000.0
$250,000 $250,000 250000.0
nan nan nan
nan nan nan
nan nan nan
$8 million $8 million 8000000.0
nan nan nan
$5 million $5 million 5000000.0
nan nan nan
$75 million $75 million 75000000.0
$80 million $80 million 80000000.0
$90 million $90 million 90000000.0
$15 million $15 million 15000000.0
nan nan nan
nan nan nan
$8,500,000 $8,500,000 8500000.0
$28 million $28 million 28000000.0
$116 million $116 million 116000000.0
$20,000,000 $20,000,000 20000000.0

nan nan nan
$70 million $70 million 70000000.0
['$2 million', 'USD', '[1]'] $2 million 2000000.0
nan nan nan
nan nan nan
$5 million $5 million 5000000.0
$40 million $40 million 40000000.0
$2,000,000 (est.) $2,000,000 2000000.0
$73 million $73 million 73000000.0
$60 million $60 million 60000000.0
nan nan nan
nan nan nan
$35 million $35 million 35000000.0
$73–80 million $73 million 73000000.0
$25 million $25 million 25000000.0
$70 million $70 million 70000000.0
$50 million $50 million 50000000.0
$1.1 million $1.1 million 1100000.0
$20 million $20 million 20000000.0
['$23', 'million', '[2]'] $23 million 23000000.0
nan nan nan
$52 million $52 million 52000000.0
$60 million $60 million 60000000.0
$20 million $20 million 20000000.0
$45 million $45 million 45000000.0
$14 million $14 million 14000000.0
$20 million $20 million 20000000.0
$30 million $30 million 30000000.0
$23 million $23 million 23000000.0
$18 million $18 million 18000000.0
$85-90 million $85 million 85000000.0
nan nan nan
$6 m

nan nan nan
$14 million $14 million 14000000.0
$75 million $75 million 75000000.0
nan nan nan
$8.5 million $8.5 million 8500000.0
$33 million $33 million 33000000.0
$8.5 million $8.5 million 8500000.0
$100 million $100 million 100000000.0
$60 million $60 million 60000000.0
$70 million $70 million 70000000.0
$32 million $32 million 32000000.0
$15 million $15 million 15000000.0
$28 million $28 million 28000000.0
$3 million $3 million 3000000.0
$41.3 million $41.3 million 41300000.0
$8 million $8 million 8000000.0
$55 million $55 million 55000000.0
['$75', 'million', '[2]'] $75 million 75000000.0
$24 million $24 million 24000000.0
$1.2 million $1.2 million 1200000.0
$42 million $42 million 42000000.0
$62 million $62 million 62000000.0
$100 million $100 million 100000000.0
$30 million $30 million 30000000.0
$107 million $107 million 107000000.0
nan nan nan
$60 million $60 million 60000000.0
$3 million $3 million 3000000.0
$64.4 million $64.4 million 64400000.00000001
nan nan nan
$35 millio

$15 million $15 million 15000000.0
$15 million $15 million 15000000.0
$4 million $4 million 4000000.0
$85 million $85 million 85000000.0
nan nan nan
$25 million $25 million 25000000.0
$10,000,000 (estimated) $10,000,000 10000000.0
$60 million $60 million 60000000.0
$78 million $78 million 78000000.0
$15 million $15 million 15000000.0
$11 million $11 million 11000000.0
nan nan nan
$5,000,000 $5,000,000 5000000.0
$17 million $17 million 17000000.0
nan nan nan
$35 million $35 million 35000000.0
$68 million $68 million 68000000.0
$19 million $19 million 19000000.0
$40 million $40 million 40000000.0
$3 million $3 million 3000000.0
$3 million $3 million 3000000.0
$33 million $33 million 33000000.0
['$', '500,000'] $500,000 500000.0
nan nan nan
nan nan nan
$30 million $30 million 30000000.0
$26 million $26 million 26000000.0
$94 million $94 million 94000000.0
$20,000 $20,000 20000.0
nan nan nan
$20 million $20 million 20000000.0
$30 million $30 million 30000000.0
$12 million $12 million 12000

nan nan nan
$5 million $5 million 5000000.0
nan nan nan
$1.2 million $1.2 million 1200000.0
$25 million $25 million 25000000.0
$40 million $40 million 40000000.0
$12 million $12 million 12000000.0
$50 million $50 million 50000000.0
$75 million $75 million 75000000.0
$8 million $8 million 8000000.0
$150 million $150 million 150000000.0
$16 million $16 million 16000000.0
nan nan nan
nan nan nan
$70 million $70 million 70000000.0
$10 million $10 million 10000000.0
nan nan nan
$16 million $16 million 16000000.0
$80 million $80 million 80000000.0
$23 million $23 million 23000000.0
$1 million $1 million 1000000.0
$200 million $200 million 200000000.0
$30 million $30 million 30000000.0
nan nan nan
$60 million $60 million 60000000.0
nan nan nan
$90 million $90 million 90000000.0
$1.5 million $1.5 million 1500000.0
$65,000 $65,000 65000.0
$20 million $20 million 20000000.0
$45 million $45 million 45000000.0
$27 million $27 million 27000000.0
$45 million $45 million 45000000.0
$25 million $25 mi

nan nan nan
nan nan nan
nan nan nan
$4,000,000 $4,000,000 4000000.0
nan nan nan
$900,000 (estimated) $900,000 900000.0
$27 million $27 million 27000000.0
nan nan nan
$6 million $6 million 6000000.0
nan nan nan
$20 million $20 million 20000000.0
$40 million $40 million 40000000.0
$15 million $15 million 15000000.0
nan nan nan
$15 million $15 million 15000000.0
nan nan nan
$135 million $135 million 135000000.0
nan nan nan
$7,000,000 (US) $7,000,000 7000000.0
$30 million $30 million 30000000.0
$75 million $75 million 75000000.0
nan nan nan
nan nan nan
$150 million $150 million 150000000.0
$30 million $30 million 30000000.0
$35 million $35 million 35000000.0
nan nan nan
$35 million $35 million 35000000.0
nan nan nan
nan nan nan
$3 million $3 million 3000000.0
$750,000 $750,000 750000.0
['USD', '$4,000,000'] $4,000,000 4000000.0
nan nan nan
$450,000 (estimated) $450,000 450000.0
nan nan nan
nan nan nan
$5,000,000 (estimate) $5,000,000 5000000.0
$30,000 $30,000 30000.0
$25 million $25 millio

$70 million $70 million 70000000.0
nan nan nan
$45 million $45 million 45000000.0
$20 million $20 million 20000000.0
$80 million $80 million 80000000.0
$10 million $10 million 10000000.0
nan nan nan
$20 million $20 million 20000000.0
$67.5–70 million $67.5 million 67500000.0
$150 million $150 million 150000000.0
$12.5 million $12.5 million 12500000.0
$20 million $20 million 20000000.0
$37 million $37 million 37000000.0
$12 million $12 million 12000000.0
$55 million $55 million 55000000.0
nan nan nan
US$58 million $58 million 58000000.0
$3.4 million $3.4 million 3400000.0
$225 million $225 million 225000000.0
$55 million $55 million 55000000.0
$25 million $25 million 25000000.0
$7 million $7 million 7000000.0
$25 million $25 million 25000000.0
$167 million $167 million 167000000.0
['$185', 'million', '[3]'] $185 million 185000000.0
$80 million $80 million 80000000.0
nan nan nan
nan nan nan
$45 million $45 million 45000000.0
$25 million $25 million 25000000.0
$32 million $32 million 3200

$24 million $24 million 24000000.0
$25 million $25 million 25000000.0
$14 million $14 million 14000000.0
$21 million $21 million 21000000.0
$2.9 million $2.9 million 2900000.0
$52 million $52 million 52000000.0
nan nan nan
$35 million $35 million 35000000.0
['$13 million', '[', 'citation needed', ']'] $13 million 13000000.0
$40 million $40 million 40000000.0
$7 million $7 million 7000000.0
$32 million $32 million 32000000.0
$3,200,000 $3,200,000 3200000.0
$6 million $6 million 6000000.0
$100 million $100 million 100000000.0
$25 million $25 million 25000000.0
$80 million $80 million 80000000.0
$112 million $112 million 112000000.0
nan nan nan
$16 million $16 million 16000000.0
['$250 million', '(Shared with', 'Part 2', ')', '[2]', '[3]'] $250 million 250000000.0
$50 million $50 million 50000000.0
$36 million $36 million 36000000.0
['$120 million', '(gross)', '$100 million', '(net)', '[2]'] $100 million 100000000.0
$165 million $165 million 165000000.0
nan nan nan
$15 million $15 million

$12,000 $12,000 12000.0
nan nan nan
nan nan nan
nan nan nan
$100 million $100 million 100000000.0
$100,000 $100,000 100000.0
$1.3–2 million $1.3 million 1300000.0
nan nan nan
nan nan nan
nan nan nan
$30 million $30 million 30000000.0
nan nan nan
$31 million $31 million 31000000.0
$1.3 million $1.3 million 1300000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$325,000 $325,000 325000.0
$39 million $39 million 39000000.0
Less than $10 million $10 million 10000000.0
$14 million $14 million 14000000.0
nan nan nan
nan nan nan
$57-75 million $57 million 57000000.0
nan nan nan
nan nan nan
nan nan nan
$14 million $14 million 14000000.0
$25 million $25 million 25000000.0
$40 million $40 million 40000000.0
$23 million $23 million 23000000.0
nan nan nan
$42 million $42 million 42000000.0
$7 million $7 million 7000000.0
$10 million $10 million 10000000.0
$2 million $2 million 2000000.0
$30 million $30 million 30000000.0
$85 million $85 million 85000000.0
$6.9 million $6.9 million 6900000.0
$78

nan nan nan
$15 million $15 million 15000000.0
$50 million $50 million 50000000.0
$2 million $2 million 2000000.0
nan nan nan
nan nan nan
nan nan nan
$725,000 $725,000 725000.0
$46 million $46 million 46000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$84 million $84 million 84000000.0
$5 million $5 million 5000000.0
$38 million $38 million 38000000.0
['$130-154 million', '[4]', '[', 'dead link', ']', '[5]'] $130 million 130000000.0
nan nan nan
$15 million $15 million 15000000.0
nan nan nan
$30 million $30 million 30000000.0
$28 million $28 million 28000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$35 million $35 million 35000000.0
nan nan nan
$20 million $20 million 20000000.0
nan nan nan
nan nan nan
$90 million $90 million 90000000.0
nan nan nan
nan nan nan
nan nan nan
under $1 million $1 million 1000000.0
nan nan nan
$30 million $30 million 30000000.0
nan nan nan
nan nan nan
$105 million $105 million 105000000.0
nan nan nan
$25 million $25 million 25000000.0
nan nan

$11 million $11 million 11000000.0
$15 million $15 million 15000000.0
$34 million $34 million 34000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$50 million $50 million 50000000.0
$5–10 million $5 million 5000000.0
$30–40 million $30 million 30000000.0
nan nan nan
nan nan nan
$1 million $1 million 1000000.0
$2 million $2 million 2000000.0
nan nan nan
$1.9 million $1.9 million 1900000.0
nan nan nan
$20 million $20 million 20000000.0
$25 million $25 million 25000000.0
['$444–495.2 million (gross)', '[2]', '[3]', '$365.5 million (net)', '[2]'] $365.5 million 365500000.0
['£', '12 million', '[3]'] nan nan
$3 million $3 million 3000000.0
$35 million $35 million 35000000.0
$1.4 million $1.4 million 1400000.0
$150 million $150 million 150000000.0
$29–31 million $29 million 29000000.0
$35 million $35 million 35000000.0
$180–190 million $180 million 180000000.0
['¥1.15 billion', '(', '$10.5 million', ')'] $10.5 million 10500000.0
$37–52 million $37 million 37000000.0
$110 million $110 m

$3.5 million $3.5 million 3500000.0
nan nan nan
$25 million $25 million 25000000.0
$32 million $32 million 32000000.0
$13.5 million $13.5 million 13500000.0
$4 million $4 million 4000000.0
$8.6 millon $8.6 millon 8600000.0
nan nan nan
$316–400 million $316 million 316000000.0
$6 million $6 million 6000000.0
$8 million US $8 million 8000000.0
nan nan nan
$12 million $12 million 12000000.0
nan nan nan
nan nan nan
nan nan nan
$10.5 million $10.5 million 10500000.0
$30 million $30 million 30000000.0
$6 million $6 million 6000000.0
nan nan nan
nan nan nan
$2.9 million $2.9 million 2900000.0
$110 million $110 million 110000000.0
$10 million $10 million 10000000.0
$3.5 million $3.5 million 3500000.0
$5.5 million $5.5 million 5500000.0
$275–300 million $275 million 275000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$19 million $19 million 19000000.0
$35 million $35 million 35000000.0
$3 million $3 million 3000000.0
nan nan nan
nan nan nan
$70 million $70 million 70000000.0
nan nan nan

In [50]:
for b in cleaning_wiki_df6.index:
    cleaning_wiki_df6['Budget'][b] = parse_dollars(find_dollars(cleaning_wiki_df6['Budget'][b]))

cleaning_wiki_df6['Budget']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df6['Budget'][b] = parse_dollars(find_dollars(cleaning_wiki_df6['Budget'][b]))


0       20000000.0
1        6000000.0
2       35000000.0
3       12000000.0
4       25000000.0
           ...    
7094    42000000.0
7095    60000000.0
7096    20000000.0
7097     9000000.0
7098           NaN
Name: Budget, Length: 7032, dtype: object

In [57]:
# clean the release date column find 
release_date = cleaning_wiki_df6['Release date']
count_s = 0
count_l = 0
count_o =0
for r in release_date.index:
    if type(release_date[r]) == str:
        count_s += 1
    elif type(release_date[r]) == list:
        count_l += 1
    elif release_date[r] == np.nan:
        count+=1
    else:
        count_o += 1

print(f"String count: {count_s} \n List count: {count_l} \n Nan count: {count_o} ")    

String count: 474 
 List count: 6526 
 Nan count: 32 


In [70]:
release_date

0                       [July 11, 1990, (, 1990-07-11, )]
1       [May 17, 1990, (, 1990-05-17, ), (Cannes Film ...
2                     [August 10, 1990, (, 1990-08-10, )]
3                   [December 25, 1990, (, 1990-12-25, )]
4                                       December 19, 1990
                              ...                        
7094    [December 25, 2018, (, 2018-12-25, ), (United ...
7095    [December 11, 2018, (, 2018-12-11, ), (, Samue...
7096    [November 8, 2018, (, 2018-11-08, ), (, AFI Fe...
7097    [August 31, 2018, (, 2018-08-31, ), (, Telluri...
7098                 [28 December 2018, (, 2018-12-28, )]
Name: Release date, Length: 7032, dtype: object

In [78]:
# used this cell to find the patterns
for r in release_date.index:
    if re.findall(date_p1, str(release_date[r]), flags=re.IGNORECASE):  # Month DD, YYYY
        spam = re.findall(date_p1, str(release_date[r]), flags=re.IGNORECASE)[0]
    elif re.findall(date_p2, str(release_date[r]), flags=re.IGNORECASE): # DD Month YYYY
        spam = re.findall(date_p2, str(release_date[r]), flags=re.IGNORECASE)[0]
    elif re.findall(date_p3, str(release_date[r]), flags=re.IGNORECASE): # Month YYYY
        spam = re.findall(date_p3, str(release_date[r]), flags=re.IGNORECASE)[0]
    elif re.findall(date_p4, str(release_date[r]), flags=re.IGNORECASE): # YYYY
        spam = re.findall(date_p4, str(release_date[r]), flags=re.IGNORECASE)[0]

    else:
        print(r, release_date[r])





214 nan
1047 nan
1176 nan
1237 nan
1306 nan
1350 nan
1640 nan
1649 nan
1782 nan
1841 nan
1927 nan
1973 nan
2107 nan
2114 nan
2250 nan
2771 nan
3001 nan
3050 nan
3198 nan
3248 nan
3618 nan
3957 nan
4008 nan
4015 nan
4154 nan
4221 nan
4264 nan
5526 nan
5575 nan
5601 nan
5899 nan
6819 nan


In [77]:
# date patterns to parse
# Month DD, YYYY
date_p1 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*[123]?\d,\s*\d{4}"

# DD Month YYYY
date_p2 = r"[123]?\d\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{4}"

# Month YYYY
date_p3 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December),?\s*\d{4}"

# YYYY
date_p4 = r"\d{4}"


In [87]:
# used this cell to find the patterns
def find_dates(s):
    # declared patterns
    p1 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*[123]?\d,\s*\d{4}"
    p2 = r"[123]?\d\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{4}"
    p3 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December),?\s*\d{4}"
    p4 = r"\d{4}"
    if re.findall(p1, str(s), flags=re.IGNORECASE):  # Month DD, YYYY
        spam = re.findall(p1, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p2, str(s), flags=re.IGNORECASE): # DD Month YYYY
        spam = re.findall(p2, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p3, str(s), flags=re.IGNORECASE): # Month YYYY
        spam = re.findall(p3, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p4, str(s), flags=re.IGNORECASE): # YYYY
        spam = re.findall(p4, str(s), flags=re.IGNORECASE)[0]
        return spam
    else:
        return np.nan



In [91]:
for r in release_date.index:
    release_date[r] = find_dates(release_date[r])

release_date

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  release_date[r] = find_dates(release_date[r])


0 July 11, 1990
1 May 17, 1990
2 August 10, 1990
3 December 25, 1990
4 December 19, 1990
5 March 22, 1990
6 October 6, 1990
7 March 8, 1991
8 March 7, 1990
9 February 23, 1990
10 June 8, 1990
11 July 18, 1990
12 October 5, 1990
13 December 20, 1990
14 May 25, 1990
15 March 9, 1990
16 March 2, 1990
17 June 22, 1990
18 May 18, 1990
19 February 11, 1990
20 October 12, 1990
21 March 16, 1990
22 December 21, 1990
23 September 8, 1990
24 January 18, 1990
25 May 18, 1990
26 September 23, 1990
27 December 14, 1990
28 April 21, 1990
29 April 3, 1990
30 February 25, 1990
31 1990
32 May 1, 1990
33 November 9, 1990
34 November 2, 1990
35 January 31, 1990
36 1988
37 May 11, 1990
38 September 28, 1990
39 December 23, 1990
40 March 9, 1990
41 September 1990
42 April 13, 1990
43 March 14, 1990
44 May 4, 1990
45 October 19, 1990
46 1990
47 August 24, 1990
48 January 26, 1990
49 June 27, 1990
50 February 4, 1994
51 June 10, 1990
52 December 2, 1990
53 May 11, 1990
54 August 24, 1990
55 January 1990
56 O

641 September 11, 1992
642 December 17, 1992
643 October 30, 1992
644 February 21, 1992
645 June 1, 1992
646 June 8, 1992
647 April 3, 1992
648 March 11, 1992
649 December 6, 1992
650 October 1, 1992
651 April 17, 1992
652 December 18, 1992
653 December 25, 1992
654 September 4, 1992
655 May 2, 1992
656 July 3, 1992
657 October 9, 1992
658 August 3, 1992
659 July 10, 1992
660 June 26, 1992
661 December 16, 1992
662 1992
663 May 13, 1992
664 June 16, 1992
665 February 14, 1992
666 September 11, 1992
667 August 7, 1992
668 March 27, 1992
669 April 24, 1992
670 1994
671 September 11, 1992
672 April 24, 1992
673 October 23, 1992
674 July 15, 1992
675 June 3, 1993
676 November 19, 1993
677 April 2, 1993
678 September 17, 1993
679 February 4, 1994
680 September 17, 1993
681 January 15, 1993
682 March 5, 1993
683 July 23, 1993
684 January 6, 1993
685 October 9, 1992
686 January 22, 1993
687 August 20, 1993
688 December 3, 1993
689 December 25, 1993
691 April 23, 1993
692 March 5, 1993
693 Oct

1068 October 21, 1994
1069 September 16, 1994
1070 1994
1071 October 21, 1994
1072 September 9, 1994
1073 August 24, 1994
1074 February 18, 1994
1075 March 11, 1994
1076 June 3, 1994
1077 May 9, 1994
1078 December 21, 1994
1079 September 30, 1994
1080 October 28, 1994
1081 July 22, 1994
1083 July 31, 1994
1084 December 25, 1994
1085 September 15, 1994
1086 November 11, 1994
1087 September 3, 1994
1088 September 30, 1994
1089 June 21, 1996
1091 September 12, 1994
1092 April 13, 1994
1093 July 1, 1994
1094 1994
1095 September 10, 1994
1096 March 11, 1994
1097 October 28, 1994
1098 February 1, 1994
1099 September 2, 1994
1100 1994
1101 September 23, 1994
1102 July 15, 1994
1103 October 7, 1994
1104 December 16, 1994
1105 June 10, 1994
1106 October 28, 1994
1107 November 18, 1994
1108 October 28, 1994
1109 December 23, 1994
1110 December 1994
1111 February 25, 1994
1112 April 15, 1994
1113 November 18, 1994
1114 September 10, 1994
1115 March 15, 1994
1116 September 23, 1994
1117 March 12, 

1515 October 4, 1996
1516 December 23, 1996
1517 November 15, 1996
1518 December 6, 1996
1519 March 5, 1997
1520 March 5, 1996
1521 September 17, 1996
1522 December 13, 1996
1523 November 1, 1996
1524 August 5, 1992
1525 February 1996
1526 March 29, 1996
1527 October 18, 1996
1528 March 22, 1996
1530 January 12, 1996
1531 July 13, 1996
1532 December 1, 1996
1533 March 1, 1996
1534 May 31, 1996
1535 1996
1536 June 19, 1998
1537 January 12, 1996
1538 November 1996
1539 March 15, 1996
1540 1996
1541 May 31, 1996
1542 August 2, 1996
1543 November 1996
1544 November 15, 1996
1545 September 27, 1996
1546 June 21, 1996
1547 August 9, 1996
1548 December 25, 1996
1549 December 8, 1996
1550 December 14, 1996
1551 March 15, 1996
1552 September 27, 1996
1553 January 12, 1996
1554 March 11, 1996
1555 1996
1556 April 19, 1996
1557 March 29, 1996
1558 August 16, 1996
1559 March 8, 1996
1560 April 12, 1996
1561 September 13, 1996
1562 1996
1563 1996
1564 August 30, 1996
1565 September 20, 1996
1566 Ju

1960 December 25, 1997
1961 May 14, 1997
1962 December 10, 1997
1963 January 19, 1997
1964 June 20, 1997
1965 November 21, 1997
1966 August 22, 1997
1967 February 25, 1997
1969 March 18, 1997
1970 November 23, 1997
1971 24 October 1997
1972 February 20, 1998
1973 nan
1974 September 25, 1997
1975 January 15, 1997
1976 May 19, 1997
1977 March 14, 1997
1978 May 11, 1997
1979 January 25, 1997
1980 April 1997
1981 1997
1982 November 7, 1997
1983 November 14, 1997
1984 August 22, 1997
1985 3 October 1997
1986 April 18, 1997
1987 November 21, 1997
1988 January 31, 1997
1989 December 22, 1952
1990 July 2, 1997
1991 September 5, 1997
1992 January 17, 1997
1993 1997
1994 August 22, 1997
1995 February 22, 1997
1996 August 22, 1997
1997 November 21, 1997
1998 October 10, 1997
1999 December 19, 1997
2000 April 18, 1997
2001 June 20, 1997
2002 September 17, 1997
2003 April 1997
2004 August 30, 1997
2005 October 22, 1996
2006 May 2, 1997
2007 October 7, 1997
2008 April 17, 1998
2009 August 23, 1997
2

2448 April 9, 1999
2449 December 10, 1999
2450 January 1999
2451 October 1, 1999
2452 July 23, 1999
2453 February 2, 1999
2454 October 8, 1999
2455 May 5, 2000
2456 October 29, 1999
2457 December 29, 1999
2458 16 April 1999
2459 April 30, 1999
2460 August 25, 1999
2461 November 5, 1999
2462 July 23, 1999
2463 June 4, 1999
2464 July 31, 1999
2465 September 24, 1999
2466 January 30, 1999
2467 January 22, 1999
2468 June 11, 1999
2469 February 26, 1999
2470 December 12, 1999
2471 March 19, 1999
2472 October 9, 1999
2473 1999
2474 July 16, 1999
2475 November 17, 1999
2476 April 16, 1999
2477 November 10, 1999
2478 October 8, 1999
2479 April 23, 1999
2480 May 21, 1999
2481 September 10, 1999
2482 December 31, 1999
2483 May 11, 1999
2484 December 17, 1999
2485 1999
2486 December 22, 1999
2487 September 13, 1999
2488 September 3, 1999
2489 March 31, 1999
2490 February 12, 1999
2491 18 October 1999
2492 August 20, 1999
2493 May 14, 1999
2494 10 December 1999
2495 March 26, 1999
2496 September 1

2925 9 May 2001
2926 May 16, 2001
2927 May 4, 2001
2928 September 14, 2001
2929 September 7, 2001
2930 September 19, 2011
2931 April 19, 2002
2932 October 12, 2001
2933 December 14, 2001
2934 September 8, 2001
2935 August 31, 2001
2936 December 7, 2001
2937 October 26, 2001
2938 November 2, 2001
2939 April 27, 2001
2940 August 3, 2001
2941 August 7, 2001
2942 August 2, 2001
2943 November 21, 2001
2944 May 21, 2001
2945 January 26, 2001
2946 December 13, 2001
2947 July 27, 2001
2948 January 19, 2001
2949 June 29, 2001
2950 August 3, 2001
2951 August 17, 2001
2952 February 10, 2001
2953 2001
2954 October 19, 2001
2955 September 7, 2001
2956 October 5, 2001
2957 August 3, 2001
2958 April 24, 2001
2959 January 12, 2001
2960 February 9, 2001
2961 March 23, 2001
2962 July 4, 2001
2963 July 13, 2001
2964 2001
2965 March 2, 2001
2966 September 13, 2001
2967 August 10, 2001
2968 November 9, 2001
2969 December 20, 2001
2970 December 18, 2001
2971 April 22, 2001
2972 November 1, 2001
2973 Novembe

3390 October 3, 2003
3391 January 18, 2003
3392 June 2002
3393 October 4, 2004
3394 December 25, 2003
3395 22 November 2003
3396 April 4, 2003
3397 October 17, 2003
3398 March 16, 2003
3399 June 28, 2003
3401 January 20, 2003
3402 December 16, 2003
3403 13 May 2003
3404 October 24, 2003
3405 April 25, 2003
3406 January 25, 2003
3407 June 27, 2003
3408 June 13, 2003
3409 October 17, 2003
3410 September 26, 2003
3411 August 8, 2003
3412 October 20, 2003
3413 October 3, 2003
3414 July 7, 2003
3415 September 19, 2003
3416 June 21, 2003
3417 February 7, 2003
3418 August 10, 2003
3419 July 2, 2003
3420 January 17, 2003
3421 July 17, 2003
3422 December 12, 2003
3423 July 25, 2003
3424 October 3, 2003
3425 December 12, 2003
3426 October 8, 2004
3427 March 7, 2003
3428 June 30, 2003
3429 March 8, 2003
3430 October 17, 2003
3431 January 17, 2003
3432 July 7, 2003
3433 July 18, 2003
3434 November 26, 2003
3435 January 22, 2003
3436 October 21, 2003
3437 September 26, 2003
3438 September 19, 2003


3892 February 14, 2005
3893 3 August 2005
3894 October 7, 2005
3895 September 2, 2005
3896 August 26, 2005
3897 August 19, 2005
3898 January 23, 2005
3899 September 16, 2005
3900 October 7, 2005
3901 September 4, 2005
3902 June 23, 2005
3903 September 30, 2005
3904 October 20, 2005
3905 July 15, 2005
3906 February 4, 2005
3907 13 May 2005
3908 January 7, 2005
3909 January 22, 2006
3910 7 December 2005
3911 April 29, 2005
3912 November 23, 2005
3913 November 11, 2005
3914 November 30, 2006
3915 September 11, 2006
3916 April 21, 2006
3917 March 3, 2006
3919 2006
3920 June 14, 2006
3921 October 3, 2006
3922 August 18, 2006
3923 November 4, 2006
3924 2006
3925 March 16, 2006
3926 September 13, 2006
3927 June 17, 2006
3928 September 22, 2006
3929 April 28, 2006
3930 16 September 2006
3931 July 28, 2006
3932 September 22, 2006
3933 2006
3934 September 22, 2006
3935 June 24, 2006
3936 September 1, 2006
3937 January 27, 2006
3938 April 29, 2006
3939 July 28, 2006
3940 December 8, 2006
3941 Feb

4345 November 30, 2007
4346 August 29, 2007
4347 February 2, 2007
4348 November 2, 2007
4349 September 6, 2007
4350 April 20, 2007
4351 November 16, 2007
4352 April 26, 2007
4353 November 5, 2007
4354 March 30, 2007
4355 December 21, 2007
4356 2007
4357 2007
4358 July 25, 2007
4359 August 3, 2007
4360 September 14, 2007
4361 February 16, 2007
4362 February 16, 2007
4363 January 20, 2007
4364 April 30, 2007
4365 September 7, 2007
4366 December 15, 2007
4367 March 18, 2007
4368 13 July 2007
4369 August 4, 2007
4370 18 June 2007
4371 October 1, 2007
4372 January 25, 2007
4373 December 21, 2007
4374 January 18, 2007
4375 January 5, 2007
4376 October 19, 2007
4377 April 27, 2007
4378 5 October 2007
4379 June 1, 2007
4380 June 14, 2007
4381 August 8, 2007
4382 February 14, 2007
4383 2007
4384 October 26, 2007
4385 November 2, 2007
4386 April 2008
4387 April 29, 2007
4388 March 16, 2007
4389 August 31, 2007
4390 May 11, 2007
4391 October 5, 2007
4393 April 4, 2007
4394 22 May 2007
4395 Septem

4808 February 8, 2008
4809 December 12, 2008
4810 April 22, 2008
4811 January 19, 2008
4812 February 22, 2008
4813 September 4, 2008
4814 September 5, 2008
4815 July 25, 2008
4816 December 9, 2008
4817 June 6, 2008
4818 September 7, 2008
4819 January 17, 2009
4820 March 19, 2009
4821 November 13, 2009
4822 September 9, 2009
4823 July 29, 2009
4824 April 3, 2009
4825 November 7, 2009
4826 April 3, 2009
4827 July 31, 2009
4828 September 4, 2009
4829 December 23, 2009
4830 October 23, 2009
4831 August 29, 2008
4832 May 4, 2009
4833 July 24, 2009
4834 June 12, 2010
4835 December 4, 2009
4836 January 17, 2008
4837 October 8, 2009
4838 December 10, 2009
4839 June 5, 2009
4840 November 20, 2009
4841 August 6, 2009
4842 September 8, 2007
4843 10 September 2009
4844 September 11, 2009
4845 August 28, 2009
4846 November 20, 2009
4847 October 19, 2009
4848 November 6, 2009
4849 January 9, 2009
4850 December 4, 2009
4852 March 13, 2009
4853 July 10, 2009
4854 April 3, 2009
4855 September 6, 2009
4

5259 April 22, 2011
5260 December 16, 2011
5261 January 24, 2011
5262 September 2, 2011
5263 January 23, 2011
5264 April 8, 2011
5265 11 November 2011
5266 April 15, 2011
5267 June 24, 2011
5268 March 8, 2011
5269 March 4, 2011
5270 July 8, 2011
5271 March 16, 2011
5272 September 11, 2010
5273 June 24, 2011
5274 February 18, 2011
5275 October 14, 2011
5276 April 8, 2011
5277 October 30, 2011
5278 April 28, 2011
5279 January 21, 2011
5280 September 9, 2011
5281 July 19, 2011
5282 1 September 2011
5283 June 18, 2011
5284 January 23, 2011
5285 August 5, 2011
5286 February 18, 2011
5287 27 July 2011
5288 August 11, 2011
5289 September 11, 2010
5290 September 3, 2011
5291 August 26, 2011
5292 July 29, 2011
5293 July 19, 2011
5294 December 22, 2011
5295 4 September 2010
5296 September 10, 2011
5297 22 January 2011
5298 March 17, 2011
5299 January 14, 2011
5301 September 21, 2011
5302 November 6, 2010
5303 October 28, 2011
5304 September 30, 2011
5305 May 20, 2011
5306 February 25, 2011
5307 

5723 March 9, 2012
5724 April 13, 2012
5725 August 24, 2012
5727 October 31, 2012
5728 August 3, 2012
5729 April 13, 2012
5730 September 21, 2012
5731 November 16, 2012
5732 January 20, 2012
5733 April 19, 2012
5734 January 22, 2012
5736 February 6, 2012
5737 February 24, 2012
5738 July 27, 2012
5739 May 18, 2012
5740 September 28, 2012
5741 January 27, 2012
5742 March 28, 2012
5743 October 29, 2012
5744 June 1, 2012
5745 December 19, 2012
5746 July 30, 2013
5747 June 4, 2013
5748 August 30, 2013
5749 January 17, 2013
5750 March 1, 2013
5751 May 22, 2013
5752 January 15, 2013
5753 April 12, 2013
5754 December 6, 2013
5755 June 6, 2013
5757 August 15, 2013
5758 January 23, 2013
5759 March 22, 2013
5760 May 1, 2013
5761 September 11, 2012
5762 January 19, 2013
5763 May 22, 2013
5764 September 27, 2013
5765 December 8, 2013
5766 November 24, 2013
5767 November 1, 2013
5768 December 6, 2012
5769 March 28, 2013
5770 August 30, 2012
5771 September 10, 2012
5772 September 14, 2012
5773 Januar

6211 January 17, 2014
6212 January 20, 2014
6213 June 5, 2014
6214 June 20, 2014
6215 June 27, 2014
6216 January 24, 2014
6217 June 19, 2014
6218 July 2, 2014
6219 June 14, 2014
6220 July 2, 2014
6221 July 2, 2014
6222 January 19, 2014
6223 June 26, 2014
6224 May 9, 2014
6225 April 12, 2014
6226 January 18, 2014
6227 July 18, 2014
6228 July 15, 2014
6229 July 18, 2014
6230 July 18, 2014
6231 18 July 2014
6232 January 18, 2014
6233 July 25, 2014
6234 January 19, 2014
6235 July 25, 2014
6236 July 25, 2014
6237 July 25, 2014
6238 August 1, 2014
6239 July 21, 2014
6240 August 5, 2014
6241 June 5, 2014
6242 August 4, 2014
6243 August 8, 2014
6244 July 9, 2014
6245 July 29, 2014
6246 August 13, 2014
6247 August 4, 2014
6248 August 11, 2014
6249 January 19, 2014
6250 January 21, 2014
6251 August 18, 2014
6252 August 22, 2014
6253 January 18, 2014
6254 August 22, 2014
6255 August 19, 2014
6256 August 29, 2014
6257 September 8, 2014
6258 September 12, 2014
6259 September 12, 2014
6260 March 7, 

6670 January 20, 2017
6671 September 9, 2016
6672 April 7, 2017
6673 April 5, 2017
6674 March 30, 2017
6675 September 9, 2016
6676 April 7, 2017
6677 April 7, 2017
6678 April 4, 2017
6679 April 22, 2016
6680 October 15, 2016
6681 September 11, 2016
6682 April 14, 2017
6683 August 12, 2016
6684 April 21, 2017
6685 September 11, 2016
6686 April 26, 2017
6687 April 28, 2017
6688 January 23, 2016
6689 April 10, 2017
6690 April 22, 2017
6691 September 12, 2015
6692 8 May 2017
6693 May 2, 2017
6694 June 1, 2016
6695 May 12, 2017
6696 12 September 2016
6697 May 4, 2017
6698 May 19, 2017
6699 May 19, 2017
6700 September 2, 2016
6701 May 13, 2017
6702 May 11, 2017
6703 May 26, 2017
6704 May 24, 2017
6705 May 15, 2017
6706 May 21, 2017
6707 May 22, 2017
6708 April 29, 2017
6709 June 9, 2017
6710 June 5, 2017
6711 January 23, 2017
6712 May 23, 2017
6713 June 12, 2017
6714 June 14, 2017
6715 12 June 2017
6716 June 14, 2017
6717 June 18, 2017
6718 May 24, 2017
6719 January 20, 2017
6720 September 6

In [95]:
cleaning_wiki_df6['Release date'] = pd.to_datetime(release_date.str.extract(f"({date_p1}|{date_p2}|{date_p3}|{date_p4})", flags=re.IGNORECASE)[0], infer_datetime_format=True)
cleaning_wiki_df6['Release date']

0      1990-07-11
1      1990-05-17
2      1990-08-10
3      1990-12-25
4      1990-12-19
          ...    
7094   2018-12-25
7095   2018-12-11
7096   2018-11-08
7097   2018-08-31
7098   2018-12-28
Name: Release date, Length: 7032, dtype: datetime64[ns]

In [120]:
running_time = cleaning_wiki_df6['Running time']
running_time

0                                 102 minutes
1                                 114 minutes
2                                 113 minutes
3                                 106 minutes
4                                  95 minutes
                        ...                  
7094                               90 minutes
7095                              132 minutes
7096                              120 minutes
7097                              123 minutes
7098    Variable; 90 minutes for default path
Name: Running time, Length: 7032, dtype: object

In [166]:
# use this cell to find the patterns and write them in the cell below
for r in running_time.index:
    if re.findall(time_p1, str(running_time[r]), flags=re.IGNORECASE): # 120 minutes, 120 min, 120 min., '120', 'min'
        spam = re.findall(time_p1, str(running_time[r]), flags=re.IGNORECASE)[0]
#         print(r, spam) # ready to convert to an int
    elif re.findall(time_p2, str(running_time[r]), flags=re.IGNORECASE): # 1h 48m, 70m
        spam = re.findall(time_p2, str(running_time[r]), flags=re.IGNORECASE)[0]
        minutes = int(re.findall(r"(\d+)m$", spam, flags=re.IGNORECASE)[0])
        try:
            hours = int(re.findall(r"(\d+)h", spam, flags=re.IGNORECASE)[0])*60 
            minutes+=hours
        except:
            pass
        
#         print(r, spam, type(spam), minutes)
    elif re.findall(time_p3, str(running_time[r]), flags=re.IGNORECASE): # 1 record says "4 hours" 
        spam = re.findall(time_p3, str(running_time[r]), flags=re.IGNORECASE)[0]
        hours = int(re.findall(r"(\d+)\s*h", spam, flags=re.IGNORECASE)[0])*60 
#         print(r, spam, type(spam), hours)
    else:
        pass
#         print(r, running_time[r])

3936 4 hours <class 'str'> 240


In [161]:
# patterns found in running_time using the cell above
time_p1 = r"(\d+)(?:', ')?\s*min\.?(?:utes)?" # 120 minutes, 120 min, 120 min., '120', 'min'
time_p2 = r"((?:\d+\s*h)?\s*\d+m)"# 1h 48m, 70m
time_p3 = r"(\d+\s*hours?)" # one record says "4 Hours"

In [176]:
# make the changes to running_time for an easier number conversion
def find_time(s):
    # Declared patterns
    p1 = r"(\d+)(?:', ')?\s*min\.?(?:utes)?" # 120 minutes, 120 min, 120 min., '120', 'min'
    p2 = r"((?:\d+\s*h)?\s*\d+m)"# 1h 48m, 70m
    p3 = r"(\d+\s*hours?)" # one record says "4 Hours"
    if type(s) == float:
        return s
    if re.findall(p1, str(s), flags=re.IGNORECASE): # 120 minutes, 120 min, 120 min., '120', 'min'
        spam = re.findall(p1, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p2, str(s), flags=re.IGNORECASE): # 1h 48m, 70m
        spam = re.findall(p2, str(s), flags=re.IGNORECASE)[0]
        egg = int(re.findall(r"(\d+)m$", spam, flags=re.IGNORECASE)[0])
        try:
            hours = int(re.findall(r"(\d+)h", spam, flags=re.IGNORECASE)[0])*60 
            egg+=hours
        except:
            pass

        return egg
    elif re.findall(p3, str(s), flags=re.IGNORECASE): # 1 record says "4 hours" 
        spam = re.findall(p3, str(s), flags=re.IGNORECASE)[0]
        egg = int(re.findall(r"(\d+)\s*h", spam, flags=re.IGNORECASE)[0])*60
        return egg
    else:
        return float(np.nan)

In [177]:
# test the find_time() function
for r in running_time.index:
    print(r, find_time(running_time[r]), running_time[r])

0 102 102 minutes
1 114 114 minutes
2 113 113 minutes
3 106 106 minutes
4 95 95 minutes
5 95 95 minutes
6 100 100 minutes
7 99 99 minutes
8 50 50 minutes
9 102 102 min
10 93 93 minutes
11 110 110 minutes
12 126 126 minutes
13 121 121 minutes
14 118 118 minutes
15 99 99 minutes
16 90 90 minutes
17 94 94 minutes
18 110 110 minutes
19 190 190 minutes
20 85 85 minutes
21 102 102 minutes
22 126 126 minutes
23 96 96 minutes
24 97 97 minutes
25 97 97 minutes
26 93 93 min
27 97 97 minutes
28 32 32 min.
29 98 98 minutes
30 nan nan
31 95 95 minutes
32 98 98 minutes
33 84 84 minutes
34 101 101 min
35 97 97 min
36 86 86 minutes
37 99 99 minutes
38 97 97 minutes
39 138 138 minutes
40 99 99 minutes
41 85 85 minutes
42 91 91 min.
43 85 85 minutes
44 95 95 minutes
45 181 181 minutes
46 nan nan
47 95 95 minutes
48 93 93 minutes
49 108 108 minutes
50 120 120 minutes
51 95 95 minutes
52 100 100 minutes
53 94 94 minutes
54 111 111 minutes
55 103 103 minutes
56 106 106 minutes
57 105 105 minutes
58 124 124

536 112 112 minutes
537 108 ['108 minutes', '141 minutes', "(director's cut)"]
538 128 128 minutes
539 108 108 minutes
540 110 110 minutes
541 118 118 minutes
542 103 103 minutes
543 85 85 minutes
544 129 129 minutes
545 84 ['84 minutes (theatrical)', "91 minutes (unrated director's cut)"]
546 102 102 minutes
547 92 92 minutes
548 117 117 minutes
549 98 98 minutes
550 202 202 minutes
551 104 104 minutes
552 95 ['95 minutes', '[1]', '92 minutes', '[2]', '(Edited cut)']
553 100 100 minutes
554 84 84 min
555 106 106 minutes
556 51 51 minutes
557 99 99 minutes
558 97 97 minutes
559 108 108 minutes
560 104 104 minutes
561 88 88 minutes
562 110 110 minutes
563 89 89 minutes
564 88 88 minutes
565 87 87 min
566 108 108 minutes
567 119 119 minutes
568 86 86 minutes
569 119 119 minutes
570 99 99 minutes
571 95 95 minutes
572 121 121 minutes
573 105 105 minutes
574 67 67 minutes
575 103 103 minutes
576 99 99 minutes
577 110 110 minutes
578 93 93 minutes
579 94 94 minutes
580 105 105 minutes
581 8

1531 91 91 minutes
1532 50 50 minutes
1533 93 93 minutes
1534 103 103 minutes
1535 76 76 minutes
1536 nan nan
1537 90 90 minutes
1538 94 94 minutes
1539 94 94 minutes
1540 88 88 minutes
1541 100 100 minutes
1542 120 120 minutes
1543 118 118 minutes
1544 162 162 minutes
1545 nan nan
1546 114 114 minutes
1547 101 101 minutes
1548 129 129 minutes
1549 101 101 minutes
1550 134 134 minutes
1551 133 133 minutes
1552 118 118 minutes
1553 101 101 minutes
1554 90 90 minutes
1555 nan varies
1556 91 91 minutes
1557 109 109 minutes
1558 116 116 minutes
1559 98 98 minutes
1560 96 96 minutes
1561 99 99 minutes
1562 84 84 min
1563 90 90 minutes
1564 101 101 minutes
1565 103 103 minutes
1566 98 98 minutes
1567 95 95 Minutes
1568 93 93 minutes
1569 107 107 minutes
1570 nan nan
1571 100 100 min.
1572 102 102 minutes
1573 102 102 minutes
1574 110 110 minutes
1575 108 108 minutes
1576 98 98 minutes
1577 99 99 min.
1578 99 99 minutes
1579 87 87 minutes
1580 120 120 minutes
1581 92 92 minutes
1582 110 110 m

1974 137 137 minutes
1975 134 134 minutes
1976 129 129 minutes
1977 108 108 minutes
1978 103 103 minutes
1979 108 108 minutes
1980 96 96 minutes
1981 11 11 min.
1982 114 114 minutes
1983 94 94 minutes
1984 105 105 minutes
1985 97 97 minutes
1986 108 108 min.
1987 104 104 minutes
1988 105 105 minutes
1989 93 93 minutes
1990 98 98 minutes
1991 127 127 minutes
1992 117 117 minutes
1993 83 83 min.
1994 106 106 minutes
1995 118 118 minutes
1996 97 97 minutes
1997 95 95 minutes
1998 99 99 minutes
1999 98 98 minutes
2000 107 107 minutes
2001 104 104 minutes
2002 93 93 minutes
2003 108 108 minutes
2004 93 93 minutes
2005 113 113 minutes
2006 93 93 minutes
2007 88 88 min.
2008 101 101 minutes
2009 111 111 minutes
2010 98 98 minutes
2011 85 85 minutes
2012 176 ['176 minutes', '(2 parts)']
2013 82 82 min
2014 91 91 min.
2015 84 84 minutes
2016 96 96 min.
2017 90 90 mins.
2018 120 120 minutes
2019 102 102 min.
2020 94 94 minutes
2021 132 132 minutes
2022 109 109 minutes
2023 108 108 minutes
2024 1

2489 136 136 minutes
2490 131 131 minutes
2491 158 158 minutes
2492 102 102 minutes
2493 116 116 minutes
2494 103 103 minutes
2495 92 92 minutes
2496 91 91 minutes
2497 112 112 minutes
2498 125 125 minutes
2499 87 87 minutes
2500 97 97 minutes
2501 123 123 minutes
2502 93 93 minutes
2503 246 246 minutes
2504 119 119 minutes
2505 121 121 minutes
2506 107 107 minutes
2507 133 133 minutes
2508 95 95 minutes
2509 124 124 minutes
2510 103 103 minutes
2511 89 89 minutes
2512 100 100 minutes
2513 126 126 minutes
2514 130 130 minutes
2515 61 61 minutes
2516 95 95 minutes
2517 104 104 minutes
2518 nan nan
2519 101 101 minutes
2520 124 124 minutes
2521 124 124 minutes
2522 105 105 minutes
2523 133 133 minutes
2524 100 100 minutes
2525 90 90 minutes
2526 95 95 minutes
2527 116 116 minutes
2528 94 94 minutes
2529 97 97 minutes
2530 85 85 minutes
2531 96 96 minutes
2532 107 107 minutes
2533 105 105 minutes
2534 111 111 minutes
2535 127 127 minutes
2536 81 81 minutes
2537 105 105 minutes
2538 133 ['

3043 132 132 minutes
3044 141 141 minutes
3045 75 75 minutes
3046 99 99 minutes
3047 89 89 minutes
3048 109 109 minutes
3049 113 113 minutes
3050 nan nan
3051 104 104 minutes
3052 116 116 minutes
3053 95 95 minutes
3054 94 94 minutes
3055 100 100 minutes
3056 108 108 minutes
3057 90 90 minutes
3058 81 81 minutes
3059 113 113 minutes
3060 131 131 minutes
3061 88 88 minutes
3062 113 113 minutes
3063 94 94 minutes
3064 95 95 minutes
3065 133 133 minutes
3066 104 104 minutes
3067 102 102 minutes
3068 109 109 minutes
3069 85 85 min.
3070 98 98 minutes
3071 116 116 minutes
3072 104 104 minutes
3073 120 120 minutes
3074 76 76 minutes
3075 100 100 minutes
3076 109 109 minutes
3077 90 90 minutes
3078 116 116 minutes
3079 107 107 minutes
3080 93 93 minutes
3081 107 107 minutes
3082 101 101 minutes
3083 105 105 min.
3084 123 123 minutes
3085 84 84 minutes
3086 130 ['130 minutes', '125 minutes', '(', 'TIFF', ')']
3088 168 168 minutes
3089 90 90 minutes
3090 93 93 minutes
3091 92 92 minutes
3092 94

3546 122 122 minutes
3547 136 136 minutes
3548 80 80 minutes
3549 96 96 minutes
3550 76 76 minutes
3551 121 121 minutes
3553 106 106 minutes
3554 115 115 minutes
3555 111 111 minutes
3556 115 115 minutes
3557 110 110 minutes
3558 89 89 minutes
3559 88 88 minutes
3560 102 102 minutes
3561 97 97 minutes
3562 100 100 minutes
3563 136 136 minutes
3564 126 ['126 minutes', '142 minutes', "(Director's cut)"]
3565 118 118 minutes
3566 114 114 minutes
3567 104 104 minutes
3568 87 87 minutes
3569 93 93 minutes
3570 108 108 minutes
3571 121 121 minutes
3572 118 118 minutes
3573 111 111 minutes
3574 120 120 minutes
3575 146 146 minutes
3576 130 130 minutes
3577 101 101 minutes
3578 90 90 minutes
3579 97 97 minutes
3580 115 115 minutes
3581 131 131 minutes
3582 99 99 minutes
3583 132 132 minutes
3584 90 90 minutes
3585 106 106 minutes
3586 135 135 minutes
3587 128 128 minutes
3588 55 55 minutes
3589 135 135 minutes
3590 126 126 minutes
3591 104 104 minutes
3592 86 86 minutes
3593 105 105 minutes
35

4434 87 87 minutes
4435 138 138 minutes
4436 85 85 minutes
4437 116 116 minutes
4438 86 ['US domestic version:', '86 minutes', 'Original version:', '99 minutes']
4439 90 90 minutes
4440 84 84 minutes
4441 92 92 minutes
4442 116 116 minutes
4443 123 123 minutes
4444 94 94 minutes
4445 88 88 minutes
4446 101 101 minutes
4447 101 101 minutes
4448 97 97 minutes
4449 105 105 minutes
4450 115 115 minutes
4451 93 93 minutes
4452 135 135 minutes
4453 98 98 minutes
4454 121 121 minutes
4455 148 148 minutes
4456 99 99 minutes
4457 102 102 minutes
4458 87 87 minutes
4459 106 106 minutes
4460 96 96 minutes
4461 108 108 minutes
4462 95 95 minutes
4463 79 79 minutes
4464 109 109 minutes
4465 128 128 minutes
4466 129 129 minutes
4467 116 116 minutes
4468 152 152 minutes
4469 106 106 minutes
4470 96 96 minutes
4471 117 117 minutes
4472 91 91 minutes
4473 90 90 minutes
4474 88 88 minutes
4475 109 109 minutes
4476 129 129 minutes
4477 98 98 minutes
4478 139 139 minutes
4479 124 124 minutes
4480 158 158 

4905 153 153 minutes
4906 102 102 minutes
4907 129 129 minutes
4908 92 92 minutes
4909 88 88 minutes
4910 91 91 minutes
4911 100 100 minutes
4912 94 94 minutes
4913 113 113 minutes
4914 98 98 minutes
4915 105 105 minutes
4916 131 131 minutes
4917 102 102 minutes
4918 105 105 minutes
4919 87 87 minutes
4920 107 107 minutes
4921 123 123 minutes
4922 108 108 minutes
4923 98 98 minutes
4924 153 153 minutes
4925 106 106 minutes
4926 118 118 minutes
4927 133 133 minutes
4928 96 96 minutes
4929 100 100 minutes
4930 nan nan
4931 120 120 minutes
4932 85 85 minutes
4933 102 102 minutes
4934 76 ['76 minutes', '[1]', '85 minutes', '(Extended edition)', '[2]']
4935 123 123 minutes
4936 121 121 minutes
4937 90 90 minutes
4938 90 90 minutes
4939 102 102 minutes
4940 100 100 minutes
4941 118 118 minutes
4942 97 97 minutes
4943 97 97 minutes
4944 116 116 minutes
4945 109 109 minutes
4946 95 95 minutes
4947 135 135 minutes
4948 94 94 minutes
4949 103 103 minutes
4950 90 90 minutes
4951 94 94 minutes
495

5476 107 107 minutes
5477 90 90 minutes
5478 120 120 minutes
5479 nan nan
5480 112 112 minutes
5481 nan nan
5482 80 80 minutes
5483 143 143 minutes
5484 98 98 minutes
5485 87 87 minutes
5486 90 90 minutes
5487 92 92 minutes
5488 74 74 minutes
5489 73 73 minutes
5490 94 94 min
5491 82 82 minutes
5492 76 ['76 minutes', '(Part 1)', '76 minutes', '(Part 2)', '148 minutes', '(Deluxe Edition)']
5493 101 101 minutes
5494 nan nan
5495 131 131 minutes
5496 85 85 minutes
5497 nan nan
5498 93 93 minutes
5500 nan nan
5501 102 102 minutes
5502 82 82 minutes
5503 83 83 minutes
5504 99 99 minutes
5505 89 89 minutes
5506 89 89 minutes
5507 90 90 minutes
5508 107 107 minutes
5509 80 80 minutes
5510 104 104 minutes
5511 79 79 minutes
5512 80 80 minutes
5513 70 70 minutes
5514 nan nan
5515 135 135 minutes
5516 92 92 minutes
5517 106 106 minutes
5518 93 93 minutes
5519 82 82 minutes
5520 151 151 minutes
5521 99 99 minutes
5522 90 90 minutes
5523 95 95 minutes
5524 84 84 minutes
5525 94 94 minutes
5526 nan

6258 99 99 minutes
6259 107 107 minutes
6260 87 87 minutes
6261 84 84 minutes
6262 100 100 minutes
6263 114 114 minutes
6264 113 113 minutes
6265 89 89 minutes
6266 93 93 minutes
6267 103 103 minutes
6268 101 101 minutes
6269 97 97 minutes
6270 132 132 minutes
6271 116 116 minutes
6272 97 97 minutes
6273 99 99 minutes
6274 149 149 minutes
6275 110 110 minutes
6276 110 110 minutes
6277 87 87 minutes
6278 105 105 minutes
6279 81 81 minutes
6280 92 92 minutes
6281 141 141 minutes
6282 106 106 minutes
6283 102 102 minutes
6284 119 119 minutes
6285 95 95 minutes
6286 117 117 minutes
6287 67 67 minutes
6288 108 108 minutes
6289 135 135 minutes
6290 109 109 minutes
6291 117 117 minutes
6292 nan nan
6293 100 100 minutes
6294 100 100 minutes
6295 89 89 minutes
6296 169 169 minutes
6297 88 88 minutes
6298 95 95 minutes
6299 102 102 minutes
6300 94 94 minutes
6301 90 90 minutes
6302 116 116 minutes
6303 109 109 minutes
6304 134 134 minutes
6305 123 123 minutes
6306 79 79 minutes
6307 123 123 minu

6676 101 101 minutes
6677 92 92 minutes
6678 136 136 minutes
6679 90 90 minutes
6680 141 141 minutes
6681 75 75 minutes
6682 95 95 minutes
6683 76 76 minutes
6684 100 100 minutes
6685 134 134 minutes
6686 110 110 minutes
6687 115 115 minutes
6688 89 89 minutes
6689 137 137 minutes
6690 94 94 minutes
6691 92 92 minutes
6692 126 126 minutes
6693 97 97 minutes
6694 99 99 minutes
6695 81 81 minutes
6696 92 92 minutes
6697 122 122 minutes
6698 90 90 minutes
6699 96 96 minutes
6700 96 96 minutes
6701 116 116 minutes
6702 129 129 minutes
6703 111 111 minutes
6704 122 122 minutes
6705 141 141 minutes
6706 89 89 minutes
6707 110 110 minutes
6708 91 91 minutes
6709 106 106 minutes
6710 116 116 minutes
6711 83 83 minutes
6712 102 102 minutes
6713 101 101 minutes
6714 140 140 minutes
6715 89 89 minutes
6716 105 105 minutes
6717 154 154 minutes
6718 94 94 minutes
6719 120 120 minutes
6720 115 115 minutes
6721 113 113 minutes
6722 120 120 minutes
6723 90 90 minutes
6724 88 88 minutes
6725 90 90 minu

In [178]:
# use the find_time() function
for c in cleaning_wiki_df6.index:
    cleaning_wiki_df6['Running time'][c] = find_time(cleaning_wiki_df6['Running time'][c])
    
cleaning_wiki_df6['Running time']

7098 102 minutes 102
7098 114 minutes 114
7098 113 minutes 113
7098 106 minutes 106
7098 95 minutes 95
7098 95 minutes 95
7098 100 minutes 100
7098 99 minutes 99
7098 50 minutes 50
7098 102 min 102
7098 93 minutes 93
7098 110 minutes 110
7098 126 minutes 126
7098 121 minutes 121
7098 118 minutes 118
7098 99 minutes 99
7098 90 minutes 90
7098 94 minutes 94
7098 110 minutes 110
7098 190 minutes 190
7098 85 minutes 85
7098 102 minutes 102
7098 126 minutes 126
7098 96 minutes 96
7098 97 minutes 97
7098 97 minutes 97
7098 93 min 93
7098 97 minutes 97
7098 32 min. 32
7098 98 minutes 98
7098 nan nan
7098 95 minutes 95
7098 98 minutes 98
7098 84 minutes 84
7098 101 min 101
7098 97 min 97
7098 86 minutes 86
7098 99 minutes 99
7098 97 minutes 97
7098 138 minutes 138
7098 99 minutes 99
7098 85 minutes 85
7098 91 min. 91
7098 85 minutes 85
7098 95 minutes 95
7098 181 minutes 181
7098 nan nan
7098 95 minutes 95
7098 93 minutes 93
7098 108 minutes 108
7098 120 minutes 120
7098 95 minutes 95
7098 100

7098 100 minutes 100
7098 89 minutes 89
7098 110 minutes 110
7098 117 minutes 117
7098 94 minutes 94
7098 140 minutes 140
7098 120 minutes 120
7098 89 minutes 89
7098 96 minutes 96
7098 101 minutes 101
7098 103 minutes 103
7098 93 minutes 93
7098 113 minutes 113
7098 101 minutes 101
7098 ['106 minutes (UK)', '[2]', '125 minutes (USA)'] 106
7098 95 minutes 95
7098 92 minutes 92
7098 ['105 minutes', '116 minutes (international cut)'] 105
7098 102 minutes 102
7098 90 minutes 90
7098 112 minutes 112
7098 ['108 minutes', '141 minutes', "(director's cut)"] 108
7098 128 minutes 128
7098 108 minutes 108
7098 110 minutes 110
7098 118 minutes 118
7098 103 minutes 103
7098 85 minutes 85
7098 129 minutes 129
7098 ['84 minutes (theatrical)', "91 minutes (unrated director's cut)"] 84
7098 102 minutes 102
7098 92 minutes 92
7098 117 minutes 117
7098 98 minutes 98
7098 202 minutes 202
7098 104 minutes 104
7098 ['95 minutes', '[1]', '92 minutes', '[2]', '(Edited cut)'] 95
7098 100 minutes 100
7098 84 m

7098 107 minutes 107
7098 121 minutes 121
7098 90 minutes 90
7098 88 minutes 88
7098 110 minutes 110
7098 98 minutes 98
7098 122 minutes 122
7098 108 minutes 108
7098 89 minutes 89
7098 101 minutes 101
7098 nan nan
7098 78 minutes 78
7098 nan nan
7098 120 minutes 120
7098 117 minutes 117
7098 109 minutes 109
7098 90 minutes 90
7098 76 minutes 76
7098 96 minutes 96
7098 86 minutes 86
7098 73 minutes 73
7098 94 minutes 94
7098 110 minutes 110
7098 133 minutes 133
7098 85 minutes 85
7098 nan nan
7098 88 minutes 88
7098 119 minutes 119
7098 140 minutes 140
7098 106 minutes 106
7098 98 minutes 98
7098 82 minutes 82
7098 118 minutes 118
7098 108 min. 108
7098 102 minutes 102
7098 100 minutes 100
7098 76 minutes 76
7098 105 minutes 105
7098 123 minutes 123
7098 101 minutes 101
7098 127 minutes 127
7098 103 minutes 103
7098 90 minutes 90
7098 89 minutes 89
7098 110 minutes 110
7098 98 minutes 98
7098 96 minutes 96
7098 97 minutes 97
7098 96 minutes 96
7098 112 min. 112
7098 126 minutes 126
709

7098 135 minutes 135
7098 149 minutes 149
7098 105 minutes 105
7098 36 minutes 36
7098 96 minutes 96
7098 92 min. 92
7098 95 minutes 95
7098 109 minutes 109
7098 96 minutes 96
7098 93 minutes 93
7098 92 minutes 92
7098 90 minutes 90
7098 94 minutes 94
7098 144 minutes 144
7098 ['88 minutes', '75 minutes (Edited version)'] 88
7098 99 minutes 99
7098 127 minutes 127
7098 93 minutes 93
7098 97 minutes 97
7098 117 minutes 117
7098 92 minutes 92
7098 20 minutes 20
7098 95 minutes 95
7098 122 minutes 122
7098 90 minutes 90
7098 109 minutes 109
7098 96 minutes 96
7098 87 min. 87
7098 98 minutes 98
7098 139 minutes 139
7098 99 minutes 99
7098 89 minutes 89
7098 82 minutes 82
7098 98 minutes 98
7098 93 minutes 93
7098 126 minutes 126
7098 105 minutes 105
7098 nan nan
7098 103 minutes 103
7098 nan nan
7098 90 minutes 90
7098 93 minutes 93
7098 86 minutes 86
7098 109 minutes 109
7098 115 minutes 115
7098 97 minutes 97
7098 90 minutes 90
7098 124 minutes 124
7098 128 minutes 128
7098 110 minutes 1

7098 125 minutes 125
7098 87 minutes 87
7098 93 minutes 93
7098 119 minutes 119
7098 nan nan
7098 170 minutes 170
7098 103 minutes 103
7098 94 minutes 94
7098 131 minutes 131
7098 100 minutes 100
7098 108 minutes 108
7098 100 minutes 100
7098 90 minutes 90
7098 96 minutes 96
7098 113 minutes 113
7098 116 minutes 116
7098 88 minutes 88
7098 108 minutes 108
7098 117 minutes 117
7098 84 minutes 84
7098 86 minutes 86
7098 121 minutes 121
7098 100 minutes 100
7098 119 minutes 119
7098 116 minutes 116
7098 99 minutes 99
7098 103 minutes 103
7098 101 minutes 101
7098 90 minutes 90
7098 123 minutes 123
7098 77 minutes 77
7098 122 minutes 122
7098 107 minutes 107
7098 95 minutes 95
7098 103 minutes 103
7098 102 min. 102
7098 91 minutes 91
7098 148 minutes 148
7098 90 minutes 90
7098 157 minutes 157
7098 114 minutes 114
7098 119 minutes 119
7098 109 minutes 109
7098 128 minutes 128
7098 168 minutes 168
7098 95 minutes 95
7098 95 minutes 95
7098 101 minutes 101
7098 91 minutes 91
7098 112 minutes

7098 127 minutes 127
7098 110 minutes 110
7098 118 minutes 118
7098 103 minutes 103
7098 104 minutes 104
7098 86 minutes 86
7098 92 minutes 92
7098 104 minutes 104
7098 111 minutes 111
7098 99 minutes 99
7098 93 minutes 93
7098 95 minutes 95
7098 102 minutes 102
7098 106 minutes 106
7098 86 minutes 86
7098 99 minutes 99
7098 98 minutes 98
7098 93 minutes 93
7098 97 minutes 97
7098 121 minutes 121
7098 84 minutes 84
7098 101 minutes 101
7098 100 minutes 100
7098 142 minutes 142
7098 116 minutes 116
7098 88 minutes 88
7098 85 minutes 85
7098 144 min 144
7098 91 minutes 91
7098 78 minutes 78
7098 124 minutes 124
7098 141 minutes 141
7098 90 minutes 90
7098 109 minutes 109
7098 89 minutes 89
7098 85 minutes 85
7098 78 minutes 78
7098 90 minutes 90
7098 95 minutes 95
7098 96 minutes 96
7098 95 minutes 95
7098 92 minutes 92
7098 106 minutes 106
7098 95 minutes 95
7098 80 minutes 80
7098 104 minutes 104
7098 96 minutes 96
7098 98 minutes 98
7098 101 minutes 101
7098 88 minutes 88
7098 96 minu

7098 143 minutes 143
7098 100 minutes 100
7098 77 minutes 77
7098 106 minutes 106
7098 113 minutes 113
7098 87 minutes 87
7098 123 minutes 123
7098 ['103 minutes', '107 minutes (extended cut)'] 103
7098 119 minutes 119
7098 152 minutes 152
7098 nan nan
7098 101 minutes 101
7098 98 minutes 98
7098 92 minutes 92
7098 91 minutes 91
7098 103 minutes 103
7098 92 minutes 92
7098 96 minutes 96
7098 86 minutes 86
7098 106 minutes 106
7098 90 minutes 90
7098 138 minutes 138
7098 92 minutes 92
7098 127 minutes 127
7098 104 minutes 104
7098 128 minutes 128
7098 106 minutes 106
7098 89 minutes 89
7098 141 minutes 141
7098 86 minutes 86
7098 131 minutes 131
7098 106 minutes 106
7098 92 minutes 92
7098 127 minutes 127
7098 87 minutes 87
7098 109 minutes 109
7098 101 minutes 101
7098 96 minutes 96
7098 93 minutes 93
7098 96 minutes 96
7098 98 minutes 98
7098 88 minutes 88
7098 91 minutes 91
7098 99 minutes 99
7098 103 minutes 103
7098 97 minutes 97
7098 74 minutes 74
7098 98 minutes 98
7098 128 minut

7098 93 minutes 93
7098 87 minutes 87
7098 130 minutes 130
7098 88 minutes 88
7098 90 minutes 90
7098 92 minutes 92
7098 88 minutes 88
7098 108 minutes 108
7098 93 minutes 93
7098 111 minutes 111
7098 120 minutes 120
7098 103 minutes 103
7098 87 minutes 87
7098 80 minutes 80
7098 94 minutes 94
7098 111 minutes 111
7098 99 minutes 99
7098 97 minutes 97
7098 117 minutes 117
7098 104 minutes 104
7098 113 minutes 113
7098 91 mins. 91
7098 118 minutes 118
7098 83 minutes 83
7098 93 minutes 93
7098 125 minutes 125
7098 105 minutes 105
7098 101 minutes 101
7098 132 minutes 132
7098 95 minutes 95
7098 84 minutes 84
7098 95 minutes 95
7098 84 minutes 84
7098 138 minutes 138
7098 86 minutes 86
7098 88 minutes 88
7098 96 minutes 96
7098 113 minutes 113
7098 86 minutes 86
7098 88 minutes 88
7098 nan nan
7098 122 minutes 122
7098 87 minutes 87
7098 80 minutes 80
7098 110 minutes 110
7098 100 min 100
7098 135 minutes 135
7098 106 minutes 106
7098 86 minutes 86
7098 105 minutes 105
7098 167 minutes 1

7098 95 minutes 95
7098 124 minutes 124
7098 101 minutes 101
7098 122 minutes 122
7098 80 minutes 80
7098 98 minutes 98
7098 94 minutes 94
7098 112 minutes 112
7098 101 minutes 101
7098 99 minutes 99
7098 91 minutes 91
7098 97 minutes 97
7098 113 minutes 113
7098 92 minutes 92
7098 99 minutes 99
7098 115 minutes 115
7098 126 minutes 126
7098 102 min 102
7098 92 minutes 92
7098 123 minutes 123
7098 87 minutes 87
7098 88 minutes 88
7098 85 minutes 85
7098 88 minutes 88
7098 96 minutes 96
7098 84 minutes 84
7098 109 minutes 109
7098 90 minutes 90
7098 90 minutes 90
7098 127 minutes 127
7098 111 minutes 111
7098 60 minutes 60
7098 89 minutes 89
7098 109 minutes 109
7098 75 minutes 75
7098 85 minutes 85
7098 97 minutes 97
7098 93 minutes 93
7098 107 minutes 107
7098 113 minutes 113
7098 85 minutes 85
7098 116 minutes 116
7098 108 minutes 108
7098 118 minutes 118
7098 106 minutes 106
7098 93 minutes 93
7098 158 minutes 158
7098 118 minutes 118
7098 87 minutes 87
7098 143 minutes 143
7098 82 

7098 107 min 107
7098 92 minutes 92
7098 94 minutes 94
7098 101 minutes 101
7098 110 minutes 110
7098 99 minutes 99
7098 92 minutes 92
7098 107 minutes 107
7098 100 minutes 100
7098 105 minutes 105
7098 88 minutes 88
7098 93 minutes 93
7098 80 minutes 80
7098 88 minutes 88
7098 119 minutes 119
7098 109 minutes 109
7098 89 minutes 89
7098 108 minutes 108
7098 101 minutes 101
7098 88 minutes 88
7098 105 minutes 105
7098 104 minutes 104
7098 95 minutes 95
7098 nan nan
7098 95 minutes 95
7098 108 minutes 108
7098 112 minutes 112
7098 117 minutes 117
7098 75 minutes 75
7098 106 minutes 106
7098 132 minutes 132
7098 95 minutes 95
7098 119 minutes 119
7098 108 minutes 108
7098 118 minutes 118
7098 82 minutes 82
7098 99 minutes 99
7098 115 minutes 115
7098 104 minutes 104
7098 108 minutes 108
7098 118 minutes 118
7098 113 minutes 113
7098 107 minutes 107
7098 107 minutes 107
7098 90 minutes 90
7098 117 minutes 117
7098 101 minutes 101
7098 96 minutes 96
7098 91 minutes 91
7098 88 minutes 88
70

7098 86 minutes 86
7098 88 minutes 88
7098 109 minutes 109
7098 82 minutes 82
7098 89 minutes 89
7098 90 minutes 90
7098 124 minutes 124
7098 102 minutes 102
7098 96 minutes 96
7098 105 minutes 105
7098 111 minutes 111
7098 90 minutes 90
7098 91 minutes 91
7098 1 hour 32 minutes 32
7098 101 minutes 101
7098 88 minutes 88
7098 90 minutes 90
7098 77 minutes 77
7098 84 minutes 84
7098 136 minutes 136
7098 113 minutes 113
7098 97 minutes 97
7098 nan nan
7098 82 minutes 82
7098 107 minutes 107
7098 90 minutes 90
7098 120 minutes 120
7098 nan nan
7098 112 minutes 112
7098 nan nan
7098 80 minutes 80
7098 143 minutes 143
7098 98 minutes 98
7098 87 minutes 87
7098 90 minutes 90
7098 92 minutes 92
7098 74 minutes 74
7098 73 minutes 73
7098 94 min 94
7098 82 minutes 82
7098 ['76 minutes', '(Part 1)', '76 minutes', '(Part 2)', '148 minutes', '(Deluxe Edition)'] 76
7098 101 minutes 101
7098 nan nan
7098 131 minutes 131
7098 85 minutes 85
7098 nan nan
7098 93 minutes 93
7098 nan nan
7098 102 minutes

7098 60 minutes 60
7098 149 minutes 149
7098 101 minutes 101
7098 97 minutes 97
7098 93 minutes 93
7098 121 minutes 121
7098 97 minutes 97
7098 94 minutes 94
7098 108 minutes 108
7098 100 minutes 100
7098 98 minutes 98
7098 94 minutes 94
7098 143 minutes 143
7098 100 minutes 100
7098 90 minutes 90
7098 89 minutes 89
7098 96 minutes 96
7098 82 minutes 82
7098 94 minutes 94
7098 104 minutes 104
7098 130 minutes 130
7098 ['94 minutes', 'UK version:', '98 minutes'] 94
7098 108 minutes 108
7098 130 minutes 130
7098 115 minutes 115
7098 113 minutes 113
7098 86 minutes 86
7098 115 minutes 115
7098 124 minutes 124
7098 93 minutes 93
7098 99 minutes 99
7098 104 minutes 104
7098 119 minutes 119
7098 92 minutes 92
7098 116 minutes 116
7098 130 minutes 130
7098 132 minutes 132
7098 95 minutes 95
7098 129 minutes 129
7098 nan nan
7098 86 minutes 86
7098 115 minutes 115
7098 114 minutes 114
7098 118 minutes 118
7098 93 minutes 93
7098 88 minutes 88
7098 112 minutes 112
7098 86 minutes 86
7098 106 mi

7098 82 minutes 82
7098 86 minutes 86
7098 91 minutes 91
7098 137 minutes 137
7098 82 minutes 82
7098 94 minutes 94
7098 nan nan
7098 99 minutes 99
7098 83 minutes 83
7098 90 minutes 90
7098 99 minutes 99
7098 89 minutes 89
7098 90 minutes 90
7098 107 minutes 107
7098 113 minutes 113
7098 141 minutes 141
7098 119 minutes 119
7098 101 minutes 101
7098 87 minutes 87
7098 95 minutes 95
7098 120 minutes 120
7098 115 minutes 115
7098 ['93 minutes', '101 minutes', '(extended version)'] 93
7098 130 minutes 130
7098 103 minutes 103
7098 105 minutes 105
7098 114 minutes 114
7098 120 minutes 120
7098 104 minutes 104
7098 98 minutes 98
7098 121 minutes 121
7098 124 minutes 124
7098 105 minutes 105
7098 103 minutes 103
7098 94 minutes 94
7098 111 minutes 111
7098 115 minutes 115
7098 126 minutes 126
7098 91 minutes 91
7098 117 minutes 117
7098 125 minutes 125
7098 106 minutes 106
7098 123 minutes 123
7098 91 minutes 91
7098 131 minutes 131
7098 108 minutes 108
7098 147 minutes 147
7098 116 minutes

7098 96 minutes 96
7098 109 minutes 109
7098 100 minutes 100
7098 115 minutes 115
7098 97 minutes 97
7098 95 minutes 95
7098 92 minutes 92
7098 140 minutes 140
7098 107 minutes 107
7098 117 minutes 117
7098 112 minutes 112
7098 102 minutes 102
7098 109 minutes 109
7098 110 minutes 110
7098 92 minutes 92
7098 107 minutes 107
7098 112 minutes 112
7098 85 minutes 85
7098 103 minutes 103
7098 118 minutes 118
7098 110 minutes 110
7098 110 minutes 110
7098 107 minutes 107
7098 83 minutes 83
7098 87 minutes 87
7098 93 minutes 93
7098 111 minutes 111
7098 101 minutes 101
7098 86 minutes 86
7098 98 minutes 98
7098 106 minutes 106
7098 90 minutes 90
7098 91 minutes 91
7098 140 minutes 140
7098 120 minutes 120
7098 106 minutes 106
7098 92 minutes 92
7098 103 minutes 103
7098 91 minutes 91
7098 102 minutes 102
7098 90 minutes 90
7098 101 minutes 101
7098 121 minutes 121
7098 99 minutes 99
7098 109 minutes 109
7098 107 minutes 107
7098 100 minutes 100
7098 105 minutes 105
7098 84 minutes 84
7098 11