In [1]:
import os
import requests as rq
import pandas as pd
import numpy as np
import json
import re


In [2]:
# the filepath for each data file
wiki_movie_file = os.path.join('Data', 'wikipedia-movies.json')
ratings_file = os.path.join('Data', 'ratings.csv')
movie_meta_data_file = os.path.join('Data', 'movies_metadata.csv')


In [3]:
# Load the files
with open(wiki_movie_file, 'r') as file1:
    raw_movie_json = json.load(file1)
    file1.close()

# file 2, not cleaning this one yet
# raw_ratings_df = pd.read_csv(ratings_file, sep=',', header=0)

# file 3
raw_meta_data_df = pd.read_csv(movie_meta_data_file, sep=',', low_memory=False)


In [4]:
# Explore the data, commenting results
raw_movie_json # Results: list of dicts, each movie is it's own dict
len(raw_movie_json) # Result: 7311 Movies
len(raw_movie_json[10].keys()) # changing the index looking for number of keys; 10 random indexes Results: 17 to 22 keys
raw_movie_df = pd.DataFrame(raw_movie_json) # Results: 7311 rows × 193 columns with lots of Nan fields, not every movie has the same "22" keys
raw_movie_df

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Predecessor,Founders,Area served,Products,Services,Russian,Hebrew,Revenue,Operating income,Polish
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990.0,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,,,,,,,,,,
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990.0,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet",James Foley,"[Ric Kidney, Robert Redlin]","[James Foley, Robert Redlin]",,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",...,,,,,,,,,,
2,https://en.wikipedia.org/wiki/Air_America_(film),1990.0,https://www.imdb.com/title/tt0099005/,Air America,Roger Spottiswoode,Daniel Melnick,"[John Eskow, Richard Rush]",,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",...,,,,,,,,,,
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990.0,https://www.imdb.com/title/tt0099012/,Alice,Woody Allen,Robert Greenhut,,,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",...,,,,,,,,,,
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990.0,https://www.imdb.com/title/tt0099018/,Almost an Angel,John Cornell,John Cornell,,,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7306,https://en.wikipedia.org/wiki/Holmes_%26_Watson,2018.0,https://www.imdb.com/title/tt1255919/,Holmes & Watson,Etan Cohen,"[Will Ferrell, Adam McKay, Jimmy Miller, Clayt...",Etan Cohen,,"[Sherlock Holmes, and, Dr. Watson, by, Sir Art...","[Will Ferrell, John C. Reilly, Rebecca Hall, R...",...,,,,,,,,,,
7307,https://en.wikipedia.org/wiki/Vice_(2018_film),2018.0,https://www.imdb.com/title/tt6266538/,Vice,Adam McKay,"[Brad Pitt, Dede Gardner, Jeremy Kleiner, Kevi...",,,,"[Christian Bale, Amy Adams, Steve Carell, Sam ...",...,,,,,,,,,,
7308,https://en.wikipedia.org/wiki/On_the_Basis_of_Sex,2018.0,https://www.imdb.com/title/tt4669788/,On the Basis of Sex,Mimi Leder,Robert W. Cort,,,,"[Felicity Jones, Armie Hammer, Justin Theroux,...",...,,,,,,,,,,
7309,https://en.wikipedia.org/wiki/Destroyer_(2018_...,2018.0,https://www.imdb.com/title/tt7137380/,Destroyer,Karyn Kusama,"[Fred Berger, Phil Hay, Matt Manfredi]",,,,"[Nicole Kidman, Sebastian Stan, Toby Kebbell, ...",...,,,,,,,,,,


In [5]:
# Explore the data, commenting results
raw_movie_df.info() # Result: too much data to display any granular info about the columns
raw_movie_df.isnull().sum() # Result: 10 columns visible, Null values seem to range from 158 to 7310; find the minimum number
min(raw_movie_df.isnull().sum())  # Results: 158 is the minimum, url and year columns are the lowest
raw_movie_df.keys().to_list() # Results: column names suggest more than movies; tv shows, people, and possibly books also.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7311 entries, 0 to 7310
Columns: 193 entries, url to Polish
dtypes: float64(1), object(192)
memory usage: 10.8+ MB


['url',
 'year',
 'imdb_link',
 'title',
 'Directed by',
 'Produced by',
 'Screenplay by',
 'Story by',
 'Based on',
 'Starring',
 'Narrated by',
 'Music by',
 'Cinematography',
 'Edited by',
 'Productioncompany ',
 'Distributed by',
 'Release date',
 'Running time',
 'Country',
 'Language',
 'Budget',
 'Box office',
 'Written by',
 'Genre',
 'Theme music composer',
 'Country of origin',
 'Original language(s)',
 'Producer(s)',
 'Editor(s)',
 'Production company(s)',
 'Original network',
 'Original release',
 'Productioncompanies ',
 'Executive producer(s)',
 'Production location(s)',
 'Distributor',
 'Picture format',
 'Audio format',
 'Voices of',
 'Followed by',
 'Composer(s)',
 'Created by',
 'Also known as',
 'Opening theme',
 'No. of episodes',
 'Preceded by',
 'Author',
 'Publisher',
 'Publication date',
 'Media type',
 'Pages',
 'ISBN',
 'OCLC',
 'LC Class',
 'Cover artist',
 'Series',
 'Set in',
 'Adaptation by',
 'Suggested by',
 'Biographical data',
 'Born',
 'Died',
 'Resti

In [6]:
# select for records with an imdb link, 'Director' or 'Directed by', and not references to tv shows.
cleaning_wiki = [movie for movie in raw_movie_json if (('imdb_link' and ('Directed by' or 'Director')) in movie.keys()) and ('No. of episodes' not in movie.keys())]
cleaning_df = pd.DataFrame(cleaning_wiki)
cleaning_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7099 entries, 0 to 7098
Data columns (total 74 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   url                     7099 non-null   object
 1   year                    7099 non-null   int64 
 2   imdb_link               7074 non-null   object
 3   title                   7098 non-null   object
 4   Directed by             7099 non-null   object
 5   Produced by             6737 non-null   object
 6   Screenplay by           2323 non-null   object
 7   Story by                1004 non-null   object
 8   Based on                2196 non-null   object
 9   Starring                6913 non-null   object
 10  Narrated by             283 non-null    object
 11  Music by                6502 non-null   object
 12  Cinematography          6396 non-null   object
 13  Edited by               6398 non-null   object
 14  Productioncompany       4539 non-null   object
 15  Dist

In [7]:
# # lots of the columns are languages:  "Arabic", "Japanese", "Mandarin", "Polish", "Yiddish", "Romanized"
cleaning_df[cleaning_df['Arabic'].notnull()]['Arabic']

6856    قضية رقم ٢٣
7081      کفرناحوم‎
Name: Arabic, dtype: object

In [8]:
# staring to define a function to clean the data. troubleshooting and ajustments were decided in the cell below this one.
def cleaning_movie(movie):
    """wiki_movie_file is a json file containing a list of dictionaries with each index as a different movie 
    with different key:value pairs. This function is specially designed to clean that specific dataset. 
    Wikipedia is managed by many people and they do not all use the same words.
    """
    alternate_titles = dict()
    fixed_movie = dict(movie)
###  Start with alternate titles stored in language keys and merge them into one.
###  Language keys i could find:      
    language_keys = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']
    for key in language_keys:
        if key in fixed_movie.keys():
            alternate_titles[key] = fixed_movie[key]
            fixed_movie.pop(key)
        else:
            pass
##  if there were alternate titles, add them to the movie.       
    if len(alternate_titles) > 0:
        fixed_movie['alternate_titles'] = alternate_titles
    else:
        pass
    return fixed_movie



In [9]:
## using this cell to debug the above function and decide on key(s) to keep, merge, and delete. 
## Starting with the alternate titles hidden in the language keys.
language_keys1 = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']

cleaning_wiki2 = [cleaning_movie(movie) for movie in cleaning_wiki]

## find a column to remove, add it to the list, check the columns again.
columns = sorted(pd.DataFrame(cleaning_wiki2).columns.to_list())
columns


['Actor control',
 'Adaptation by',
 'Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Country of origin',
 'Created by',
 'Directed by',
 'Distributed by',
 'Distributor',
 'Edited by',
 'Editor(s)',
 'Engine(s)',
 'Executive producer(s)',
 'Followed by',
 'Format(s)',
 'Genre',
 'Genre(s)',
 'Language',
 'Music by',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Original release',
 'Picture format',
 'Preceded by',
 'Produced by',
 'Producer(s)',
 'Production company',
 'Production company(s)',
 'Production location(s)',
 'Productioncompanies ',
 'Productioncompany ',
 'Release date',
 'Release(s)',
 'Running time',
 'Screen story by',
 'Screenplay by',
 'Starring',
 'Story by',
 'Suggested by',
 'Theme music composer',
 'Voices of',
 'Written by',
 'alternate_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [10]:
## Where I am at for this point. 
pd.DataFrame(cleaning_wiki2).info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7099 entries, 0 to 7098
Data columns (total 55 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   url                     7099 non-null   object
 1   year                    7099 non-null   int64 
 2   imdb_link               7074 non-null   object
 3   title                   7098 non-null   object
 4   Directed by             7099 non-null   object
 5   Produced by             6737 non-null   object
 6   Screenplay by           2323 non-null   object
 7   Story by                1004 non-null   object
 8   Based on                2196 non-null   object
 9   Starring                6913 non-null   object
 10  Narrated by             283 non-null    object
 11  Music by                6502 non-null   object
 12  Cinematography          6396 non-null   object
 13  Edited by               6398 non-null   object
 14  Productioncompany       4539 non-null   object
 15  Dist

In [11]:
def cleaning_movie(movie):
    """wiki_movie_file is a json file containing a list of dictionaries with each index as a different movie 
    with different key:value pairs. This function is specially designed to clean that specific dataset. 
    Wikipedia is managed by many people and they do not all use the same words.
    """
    alternate_titles = dict()
    fixed_movie = dict(movie)
###  Start with alternate titles stored in language keys and merge them into one.
###  Language keys i could find:      
    language_keys = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']
    for key in language_keys:
        if key in fixed_movie.keys():
            alternate_titles[key] = fixed_movie[key]
            fixed_movie.pop(key)
        else:
            pass
##  if there were alternate titles, add them to the movie.       
    if len(alternate_titles) > 0:
        fixed_movie['alternate_titles'] = alternate_titles
    else:
        pass
        
###  Alternative titles are fixed, now merge columns that are similar.     
    keys_to_merge = {'Director':'Directed by', 'Country': 'Country of origin', 'Distributor(s)':'Distributed by',
                     'Editor(s)':'Edited by',  'Language':'Original language(s)', 'Producer(s)':'Produced by',
                     'Genre(s)': 'Genre', 'Composer(s)': ['Music by', 'Theme music composer'], 
                     'Release date': ['Release(s)', 'Original release'], 'Distributor(s)':['Distributed by','Distributor'],
                     'Writer(s)':['Written by', 'Story by', 'Screenplay by', 'Screen story by', 'Adaptation by'],
                     'Production Comapany': ['Production company', 'Production company(s)', 'Productioncompanies ','Productioncompany ']  }
##  item = key, from the key:value pair and the key I want; values = the movie key(s) I do not want. 
    for item in keys_to_merge: 
        if type(keys_to_merge[item]) == type(list()):
            for n in keys_to_merge[item]:
                if n in fixed_movie.keys():
                    fixed_movie[item] = fixed_movie.pop(n)
                else:
                    pass
        else: 
            if keys_to_merge[item] in fixed_movie.keys():
                fixed_movie[item] = fixed_movie.pop(keys_to_merge[item])
            else:
                pass
    
    return fixed_movie




In [12]:
## using this cell to debug the above function and decide on key(s) to keep, merge, and delete. 
## keys that are the same idea need merged, key = the key I am keeping, Value = similar key(s).
keys_to_merge1 = {'Director':'Directed by', 'Country': 'Country of origin', 'Distributor(s)':'Distributed by',
                 'Editor(s)':'Edited by',  'Language':'Original language(s)', 'Producer(s)':'Produced by',
                 'Genre(s)': 'Genre', 'Composer(s)': ['Music by', 'Theme music composer'], 
                 'Release date': ['Release(s)', 'Original release'], 'Distributor(s)':['Distributed by','Distributor'],
                 'Writer(s)':['Written by', 'Story by', 'Screenplay by', 'Screen story by', 'Adaptation by'],
                 'Production Comapany': ['Production company', 'Production company(s)', 'Productioncompanies ','Productioncompany ']  }


cleaning_wiki3 = [cleaning_movie(movie) for movie in cleaning_wiki]

## find common columns, add them to the dictionary and check the columns again.
columns = sorted(pd.DataFrame(cleaning_wiki3).columns.to_list())
columns



['Actor control',
 'Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor(s)',
 'Editor(s)',
 'Engine(s)',
 'Executive producer(s)',
 'Followed by',
 'Format(s)',
 'Genre(s)',
 'Language',
 'Narrated by',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production Comapany',
 'Production location(s)',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Voices of',
 'Writer(s)',
 'alternate_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [13]:
## Where I am at for this point. 
cleaning_wiki_df3 = pd.DataFrame(cleaning_wiki3)
cleaning_wiki_df3.info()
# Down to 38 columns from 193.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7099 entries, 0 to 7098
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   url                     7099 non-null   object
 1   year                    7099 non-null   int64 
 2   imdb_link               7074 non-null   object
 3   title                   7098 non-null   object
 4   Based on                2196 non-null   object
 5   Starring                6913 non-null   object
 6   Narrated by             283 non-null    object
 7   Cinematography          6396 non-null   object
 8   Release date            7067 non-null   object
 9   Running time            6956 non-null   object
 10  Country                 6860 non-null   object
 11  Language                7014 non-null   object
 12  Budget                  4774 non-null   object
 13  Box office              5530 non-null   object
 14  Director                7099 non-null   object
 15  Dist

In [14]:
# extract the imdb id from the imdb link to link with other data sets.
cleaning_wiki_df3['imdb_link']   
pattern = r'(tt\d{7})'
cleaning_wiki_df3['imdb_id'] = cleaning_wiki_df3['imdb_link'].str.extract(pattern)

# checking for duplicate data
len(cleaning_wiki_df3['imdb_id'].unique()) # 7074 rows have imdb links, 7032 are unique. There are duplicate movies in the dataset.

7032

In [15]:
## Drop the duplicate rows
cleaning_wiki_df4 = pd.DataFrame(cleaning_wiki_df3.drop_duplicates(subset='imdb_id'))
cleaning_wiki_df4

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Narrated by,Cinematography,Release date,Running time,...,Created by,Preceded by,Suggested by,alternate_titles,Animation by,Color process,Engine(s),Actor control,Format(s),imdb_id
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...","Andrew ""Dice"" Clay",Oliver Wood,"[July 11, 1990, (, 1990-07-11, )]",102 minutes,...,,,,,,,,,,tt0098987
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",,Mark Plummer,"[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",114 minutes,...,,,,,,,,,,tt0098994
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",,Roger Deakins,"[August 10, 1990, (, 1990-08-10, )]",113 minutes,...,,,,,,,,,,tt0099005
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",,Carlo Di Palma,"[December 25, 1990, (, 1990-12-25, )]",106 minutes,...,,,,,,,,,,tt0099012
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",,Russell Boyd,"December 19, 1990",95 minutes,...,,,,,,,,,,tt0099018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7094,https://en.wikipedia.org/wiki/Holmes_%26_Watson,2018,https://www.imdb.com/title/tt1255919/,Holmes & Watson,"[Sherlock Holmes, and, Dr. Watson, by, Sir Art...","[Will Ferrell, John C. Reilly, Rebecca Hall, R...",,Oliver Wood,"[December 25, 2018, (, 2018-12-25, ), (United ...",90 minutes,...,,,,,,,,,,tt1255919
7095,https://en.wikipedia.org/wiki/Vice_(2018_film),2018,https://www.imdb.com/title/tt6266538/,Vice,,"[Christian Bale, Amy Adams, Steve Carell, Sam ...",,Greig Fraser,"[December 11, 2018, (, 2018-12-11, ), (, Samue...",132 minutes,...,,,,,,,,,,tt6266538
7096,https://en.wikipedia.org/wiki/On_the_Basis_of_Sex,2018,https://www.imdb.com/title/tt4669788/,On the Basis of Sex,,"[Felicity Jones, Armie Hammer, Justin Theroux,...",,Michael Grady,"[November 8, 2018, (, 2018-11-08, ), (, AFI Fe...",120 minutes,...,,,,,,,,,,tt4669788
7097,https://en.wikipedia.org/wiki/Destroyer_(2018_...,2018,https://www.imdb.com/title/tt7137380/,Destroyer,,"[Nicole Kidman, Sebastian Stan, Toby Kebbell, ...",,Julie Kirkwood,"[August 31, 2018, (, 2018-08-31, ), (, Telluri...",123 minutes,...,,,,,,,,,,tt7137380


In [16]:
# drop columns that are 90% or more null
columns_to_keep = [column for column in cleaning_wiki_df4 if cleaning_wiki_df4[column].isnull().sum() < (len(cleaning_wiki_df4['url'])*0.9)]
cleaning_wiki_df5 = cleaning_wiki_df4[columns_to_keep]
cleaning_wiki_df5.info() # down to 21 useful columns; from 193 messy, where 80%+ of the cells were null Values.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7098
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   url                  7032 non-null   object
 1   year                 7032 non-null   int64 
 2   imdb_link            7031 non-null   object
 3   title                7031 non-null   object
 4   Based on             2181 non-null   object
 5   Starring             6850 non-null   object
 6   Cinematography       6343 non-null   object
 7   Release date         7000 non-null   object
 8   Running time         6893 non-null   object
 9   Country              6798 non-null   object
 10  Language             6948 non-null   object
 11  Budget               4738 non-null   object
 12  Box office           5486 non-null   object
 13  Director             7032 non-null   object
 14  Distributor(s)       6677 non-null   object
 15  Editor(s)            6486 non-null   object
 16  Produc

In [17]:
## Data types:  Budget, Box office, and Running time should be numbers not strings/lists/objects, Release Date and year should be a Datetime.  
# cleaning_wiki_df5.groupby(['Box office']).count() # error because mixture of data types
[oddity for oddity in cleaning_wiki_df5['Box office'] if type(oddity)==type(str())]  # sensible, not that dirty
[oddity for oddity in cleaning_wiki_df5['Box office'] if type(oddity)==type(list())]  # lists are a dirty mess. multiple currency types and odd entries
[oddity for oddity in cleaning_wiki_df5['Box office'] if (type(oddity)!=type(list())) and (type(oddity)!=type(str())) ] # lots of nan values
[(index, oddity) for index, oddity in enumerate(cleaning_wiki_df5['Box office']) if type(oddity)==type(list())]  # looking for a specific movie to check the wiki page

[(34, ['US$', '4,212,828']),
 (54, ['$6,698,361 (', 'United States', ')', '[2]']),
 (74, ['$6,488,144', '(US)', '[1]']),
 (126, ['US$1,531,489', '(domestic)']),
 (130, ['US$', '4,803,039']),
 (137, ['$92,706', '(domestic)']),
 (178, ['$3,331', '(USA)']),
 (204, ['$739,104', '(North America)', '[2]']),
 (211, ['$1.2 million', '(US)', '[1]']),
 (255, ['$14.6 million', '(North America)', '[3]']),
 (272, ['$38 million', '(US)', '[2]']),
 (279, ['$57.5 million', '(North America)', '[1]']),
 (339, ['£739,989 (UK)', '[1]', '$4,413,473 (US)', '[1]']),
 (344, ['$4,654,288 (', 'US', ')', '[1]']),
 (376, ['$6.4 million', '(North America)', '[1]']),
 (412, ['$46.7 million', '[4]', '[3]', '(USA)']),
 (488, ['$14.1 million', '[', 'citation needed', ']']),
 (512, ['$10.7 million', '(North America)', '[2]']),
 (532, ['$75.5 million', '(North America)', '[2]']),
 (564, ['$27.2 million', '(North America)', '[3]']),
 (615, ['HK$2,662,446', '(Hong Kong)']),
 (648, ['$13,747,138', '70,542 admissions (Franc

In [18]:
box_office = cleaning_wiki_df5['Box office'].dropna() # no point in modifying a nan value
len(box_office) # Results: 5486 items
box_office

0          $21.4 million
1           $2.7 million
2            $57,718,089
3             $7,331,647
4       $6,939,946 (USA)
              ...       
7093       $19.4 million
7094       $41.9 million
7095       $76.1 million
7096       $38.4 million
7097        $5.5 million
Name: Box office, Length: 5486, dtype: object

In [19]:
len(box_office[box_office.map(lambda x: type(x) != str)]) # 136 items are lists, not strings
box_office[box_office.map(lambda x: type(x) != str)]


34                           [US$, 4,212,828]
54      [$6,698,361 (, United States, ), [2]]
74                    [$6,488,144, (US), [1]]
126                [US$1,531,489, (domestic)]
130                          [US$, 4,803,039]
                        ...                  
7003               [$99.6, million, [4], [5]]
7017                   [$365.6, million, [1]]
7018                         [$53.8, million]
7038                     [$435, million, [7]]
7071                   [$529.3, million, [4]]
Name: Box office, Length: 136, dtype: object

In [20]:
# make a regular expression pattern for the strings
pattern1 = r'(\$\d+\.?\d*\s*[bm]illi?on)'  # searching for pattern that is similar to: "$45.3 million/billion"

# counting occurances of the first pattern
box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False).sum()  # 3826 items in the list with this pattern
matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False) # save the filter

box_office.str.extract(pattern1, flags=re.IGNORECASE).dropna()  # View the items matching pattern #1

  box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False).sum()  # 3826 items in the list with this pattern
  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False) # save the filter


Unnamed: 0,0
0,$21.4 million
1,$2.7 million
10,$195.3 million
11,$53.2 million
12,$15.7 million
...,...
7093,$19.4 million
7094,$41.9 million
7095,$76.1 million
7096,$38.4 million


In [21]:
pattern2 = r'(\$\d+(?:[,\.]\d{3})+\.?\d*)(?!\s*[bm]illi?on)'   # searching for pattern that is similar to $123,456,789.0
# counting occurances of the second pattern
box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False).sum()  # 1490 items in the list with this pattern
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False) # save the filter
box_office.str.extract(pattern2, flags=re.IGNORECASE).dropna()  # view the items matching pattern 2

  box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False).sum()  # 1490 items in the list with this pattern
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False) # save the filter


Unnamed: 0,0
2,"$57,718,089"
3,"$7,331,647"
4,"$6,939,946"
9,"$855,810"
15,"$12,626,043"
...,...
7045,"$401,463"
7051,"$260,136"
7062,"$100,116"
7067,"$19,996"


In [22]:
box_office[~matches_from_one & ~matches_from_two] # this result matches the search I did Earlier for data types.
problem_lists = box_office[~matches_from_one & ~matches_from_two]

# solving for the easy ones, they fit the pattern and just the wrong data type
for i in problem_lists.index:
    if len(re.findall(pattern1, str(problem_lists[i]))) == 1: # fits the pattern
        problem_lists[i] = re.findall(pattern1, str(problem_lists[i]))[0]
    elif len(re.findall(pattern2, str(problem_lists[i]))) == 1: # fits the pattern
        problem_lists[i] = re.findall(pattern2, str(problem_lists[i]))[0]
    elif len(re.findall(pattern1, str(problem_lists[i]))) > 1: # saving for easier parsing later
        problem_lists[i] = re.findall(pattern1, str(problem_lists[i]))[1]
    elif len(re.findall(pattern2, str(problem_lists[i]))) > 1:  # saving for easier parsing later
        problem_lists[i] = re.findall(pattern2, str(problem_lists[i]))[1]
    else:
        pass
    
problem_lists

34                [US$, 4,212,828]
54                      $6,698,361
74                      $6,488,144
110             $4.35-4.37 million
126                     $1,531,489
                   ...            
7003    [$99.6, million, [4], [5]]
7017        [$365.6, million, [1]]
7018              [$53.8, million]
7038          [$435, million, [7]]
7071        [$529.3, million, [4]]
Name: Box office, Length: 162, dtype: object

In [23]:
# testing to see if i can make the list parsing easier by removing some of the [\d] wiki references

problem_lists.str.replace("\[\d+\]\s*", "")
problem_lists

  problem_lists.str.replace("\[\d+\]\s*", "")


34                [US$, 4,212,828]
54                      $6,698,361
74                      $6,488,144
110             $4.35-4.37 million
126                     $1,531,489
                   ...            
7003    [$99.6, million, [4], [5]]
7017        [$365.6, million, [1]]
7018              [$53.8, million]
7038          [$435, million, [7]]
7071        [$529.3, million, [4]]
Name: Box office, Length: 162, dtype: object

In [24]:
# save the changes to the box_office
for p in problem_lists.index:
    for b in box_office.index:
        if p == b:
#             if type(problem_lists[p]) == str:
            box_office[b] = problem_lists[p]
#             else:
#                 continue
        else:
            continue

# Rerun the two filters to see what still needs fixed
matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)            
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
box_office[~matches_from_one & ~matches_from_two]

  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)


34                [US$, 4,212,828]
110             $4.35-4.37 million
130               [US$, 4,803,039]
602                     $5000 (US)
734                [$, 11,146,270]
                   ...            
7003    [$99.6, million, [4], [5]]
7017        [$365.6, million, [1]]
7018              [$53.8, million]
7038          [$435, million, [7]]
7071        [$529.3, million, [4]]
Name: Box office, Length: 67, dtype: object

In [25]:
modified_p1 = r"(\$\d{1,3}\.?\d*',\s*'[bm]illi?on)"  # for when 43.5 Million is broken into '4.35' and 'million'
modified_p2 = r"(\$', '\d{1,3}(?:[,\.]\d{3})+\.?\d*)"
pattern3 = r"(\$\d*\.?\d*)(?:[-—–]\d*\.?\d*)(\s[bm]illi?on)" # modified pattern #1 version 3
pattern4 = r"(\$\s\d{1,3}(?:[,\.]\d{3})+\.?\d*)" # modified pattern #2 version 3
pattern5 = r"(\$',\s*'\d{1,3}\.?\d*\s[bm]illi?on)" # modified pattern #1 version 4
pattern6 = r"^(\$\d{3,})" # new pattern
problem_lists = box_office[~matches_from_one & ~matches_from_two]
for i in problem_lists.index:  
    if len(re.findall(modified_p1, str(problem_lists[i]))) > 0:
        problem_lists[i] = re.findall(modified_p1, str(problem_lists[i]))[0].replace("', '", " ")
#         print(i, problem_lists[i])
    elif len(re.findall(modified_p2, str(problem_lists[i]))) >0:
        problem_lists[i] = re.findall(modified_p2, str(problem_lists[i]))[0].replace("', '", "")
    elif len(re.findall(pattern3, str(problem_lists[i]))) >0:
        fix = re.findall(pattern3, str(problem_lists[i]))
        problem_lists[i] = fix[0][0] + fix[0][1]
    elif len(re.findall(pattern4, str(problem_lists[i])))>0:
        problem_lists[i] = re.findall(pattern4, str(problem_lists[i]))[0].replace(" ", "")
    elif len(re.findall(pattern5, str(problem_lists[i])))>0:
        problem_lists[i] = re.findall(pattern5, str(problem_lists[i]))[0].replace("', '", "")
    elif len(re.findall(pattern6, str(problem_lists[i])))>0:
        problem_lists[i] = re.findall(pattern6, str(problem_lists[i]))[0]
    else:
        pass
    
problem_lists

34          $4,212,828
110      $4.35 million
130         $4,803,039
602              $5000
734        $11,146,270
             ...      
7003     $99.6 million
7017    $365.6 million
7018     $53.8 million
7038      $435 million
7071    $529.3 million
Name: Box office, Length: 67, dtype: object

In [26]:
## merge the fixes from problem_lists into box_office
for p in problem_lists.index:
    for b in box_office.index:
        if p == b:
            box_office[b] = problem_lists[p]
            

matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
matches_from_three = box_office.str.contains(pattern6, flags=re.IGNORECASE, na=False)
box_office[~matches_from_one & ~matches_from_two & ~matches_from_three]

  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
  matches_from_three = box_office.str.contains(pattern6, flags=re.IGNORECASE, na=False)


1073                        35,254,617
1483                        £3 million
1870                      ¥1.1 billion
2037                               N/A
2672       926,423 admissions (France)
3096    [$32, [2], –33.1 million, [1]]
3640                               TBA
3889          CN¥3.650 million (China)
4128                        £7,385,434
4318                            $20-30
4575           $45.2k (only in Turkey)
5465                             £2.56
5802                          413 733$
6034                           Unknown
6615                    less than $372
6865                           8 crore
Name: Box office, dtype: object

In [27]:
## not going to salvage these rows, change them to np.nan. Lots of unknown meaning and some requires currency conversion.
problem_lists = box_office[~matches_from_one & ~matches_from_two & ~matches_from_three]
for i in problem_lists.index:
    problem_lists[i] = np.nan
    
problem_lists    

1073    NaN
1483    NaN
1870    NaN
2037    NaN
2672    NaN
3096    NaN
3640    NaN
3889    NaN
4128    NaN
4318    NaN
4575    NaN
5465    NaN
5802    NaN
6034    NaN
6615    NaN
6865    NaN
Name: Box office, dtype: object

In [28]:
# pur the new np.nan's into box_office
for p in problem_lists.index:
    for b in box_office.index:
        if p == b:
            box_office[b] = problem_lists[p]

matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
matches_from_three = box_office.str.contains(pattern6, flags=re.IGNORECASE, na=False)
box_office[~matches_from_one & ~matches_from_two & ~matches_from_three]

  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
  matches_from_three = box_office.str.contains(pattern6, flags=re.IGNORECASE, na=False)


1073    NaN
1483    NaN
1870    NaN
2037    NaN
2672    NaN
3096    NaN
3640    NaN
3889    NaN
4128    NaN
4318    NaN
4575    NaN
5465    NaN
5802    NaN
6034    NaN
6615    NaN
6865    NaN
Name: Box office, dtype: object

In [29]:
## replace the old box office info in the DataFrame with the newly filtered box office info. 
for b in box_office.index:
    for c in cleaning_wiki_df5.index:
        if b == c:
            if box_office[b] != cleaning_wiki_df5['Box office'][c]:
                cleaning_wiki_df5['Box office'][c] = box_office[b]
                break
            else:
#                 print(b, cleaning_wiki_df5['Box office'][c], box_office[b])
##                 found some non number strings missed from the first filtering that should have been caught
                fix1 = re.findall(pattern1, box_office[b])
                fix2 = re.findall(pattern2, box_office[b])
                if (len(fix1) >0) and (fix1[0] != box_office[b]):
                    cleaning_wiki_df5['Box office'][c] = fix1[0]
                    break
                elif len(fix2) >0 and (fix2[0] != box_office[b]):
                    cleaning_wiki_df5['Box office'][c] = fix2[0]
                    break
                else:
                    break
cleaning_wiki_df5['Box office']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df5['Box office'][c] = fix2[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df5['Box office'][c] = fix1[0]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df5['Box office'][c] = box_office[b]


0       $21.4 million
1        $2.7 million
2         $57,718,089
3          $7,331,647
4          $6,939,946
            ...      
7094    $41.9 million
7095    $76.1 million
7096    $38.4 million
7097     $5.5 million
7098              NaN
Name: Box office, Length: 7032, dtype: object

In [30]:
# def parse_dollars(s):   
    # patterns to change into numbers
    
    # if s is string, then change it, otherwise np.nan
        # if input is of the form $###.## Million, pattern #1
            # Remove dollar sign and "million", convert to float and multiply by 1,000,000
            # return value
            
        # elif input is of the form $###.## Billion, formerly pattern #1 also, now pattern #2

            # Remove dollar sign and "billion", convert to float and multiply by 1,000,000,000
            # return value
            
        # elif input is of the form $###,###,###, formerly pattern #2, now pattern #3
        
            # Remove dollar sign and commas, convert to float
            # return value

    # else: return np.nan 
    
    

In [31]:
test= re.search("(\d+\.?\d*)", cleaning_wiki_df5['Box office'][0])
print(type(test), "  ||  ", test)
print(test[0], type(test[0]), type(float(test[0])), f"  |  test * Million = {float(test[0])*1000000}")


<class 're.Match'>   ||   <re.Match object; span=(1, 5), match='21.4'>
21.4 <class 'str'> <class 'float'>   |  test * Million = 21400000.0


In [32]:
def parse_dollars(s):   
    # patterns to change into numbers
    p1 = "(\$\d+\.?\d*\s*milli?on)" # pulled from pattern #1 above
    p2 = "(\$\d+\.?\d*\s*billi?on)" # pulled from pattern #1 above
    
    # if s is string, then change it, otherwise np.nan
    if type(s) == str:
  
        # if input is of the form $###.## Million, pattern #1
        if re.match(p1, s, flags=re.IGNORECASE):
            # Remove dollar sign and "million", convert to float and multiply by 1,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000
            
            # return value
            return num
            
        # elif input is of the form $###.## Billion, formerly pattern #1 also, now pattern #2
        elif re.match(p2, s, flags=re.IGNORECASE):
        # Remove dollar sign and "billion", convert to float and multiply by 1,000,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000000
            # return value
            return num

            
        # elif input is of the form $###,###,###, formerly pattern #2, now pattern #3
        
            # Remove dollar sign and commas, convert to float
            # return value

    # else: return np.nan 
    
    

In [33]:
test = re.findall(r"(?:\$)(\d+(?:[,\.]\d{3})+\.?\d*)", cleaning_wiki_df5["Box office"][2])
print(test, test[0], test[0].replace(",", ""), float(test[0].replace(",", "")))

['57,718,089'] 57,718,089 57718089 57718089.0


In [34]:
def parse_dollars(s):   
    # patterns to change into numbers
    p1 = "(\$\d+\.?\d*\s*milli?on)" # pulled from pattern #1 above
    p2 = "(\$\d+\.?\d*\s*billi?on)" # pulled from pattern #1 above
    p3 = "(\$\d+(?:[,\.]\d{3})+\.?\d*)"  # pulled from pattern #2 above
    
    # if s is string, then change it, otherwise np.nan
    if type(s) == str:
  
        # if input is of the form $###.## Million, pattern #1
        if re.match(p1, s, flags=re.IGNORECASE):
            # Remove dollar sign and "million", convert to float and multiply by 1,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000
            
            # return value
            return num
            
        # elif input is of the form $###.## Billion, formerly pattern #1 also, now pattern #2
        elif re.match(p2, s, flags=re.IGNORECASE):
        # Remove dollar sign and "billion", convert to float and multiply by 1,000,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000000
            # return value
            return num

            
        # elif input is of the form $###,###,###, formerly pattern #2, now pattern #3
        elif re.match(p3, s, flags=re.IGNORECASE):
            # Remove dollar sign and commas, convert to float
            num = float(re.findall(r"(?:\$)(\d+(?:[,\.]\d{3})+\.?\d*)", s)[0].replace(",", ""))

            # return value
            return num
    # else: return np.nan 
    else: 
        return np.nan
    
    

In [35]:
# find the first instance of "billion" to test parse_dollars()

for c in cleaning_wiki_df5.index:
    if type(cleaning_wiki_df5['Box office'][c]) == str:
        if re.findall(pattern1, cleaning_wiki_df5['Box office'][c], flags=re.IGNORECASE):
            if "billion" in cleaning_wiki_df5['Box office'][c].lower():
                print(c, cleaning_wiki_df5['Box office'][c])
                break
cleaning_wiki_df5['Box office'].head()        

766 $1.030 billion


0    $21.4 million
1     $2.7 million
2      $57,718,089
3       $7,331,647
4       $6,939,946
Name: Box office, dtype: object

In [36]:
cleaning_wiki_df6 = pd.DataFrame(cleaning_wiki_df5)
# Choose the indexes for the test
print(cleaning_wiki_df6['Box office'][0], cleaning_wiki_df6['Box office'][766], cleaning_wiki_df6['Box office'][2])

# test parse_dollars()
print(parse_dollars(cleaning_wiki_df6['Box office'][0]), parse_dollars(cleaning_wiki_df6['Box office'][766]), parse_dollars(cleaning_wiki_df6['Box office'][2]))

# The function works where I need it to, check for where the if statements miss something.

$21.4 million $1.030 billion $57,718,089
21400000.0 1030000000.0 57718089.0


In [37]:
for c in cleaning_wiki_df6.index:
    test = parse_dollars(cleaning_wiki_df6['Box office'][c])
    if type(test) != float:
        print(c, test, type(test), cleaning_wiki_df6['Box office'][c])

602 None <class 'NoneType'> $5000
2096 None <class 'NoneType'> $309
6391 None <class 'NoneType'> $111
6392 None <class 'NoneType'> $588


In [38]:
def parse_dollars(s):   
    # patterns to change into numbers
    p1 = "(\$\d+\.?\d*\s*milli?on)" # pulled from pattern #1 above
    p2 = "(\$\d+\.?\d*\s*billi?on)" # pulled from pattern #1 above
    p3 = "(\$\d+(?:[,\.]\d{3})+\.?\d*)"  # pulled from pattern #2 above
    
    # if s is string, then change it, otherwise np.nan
    if type(s) == str:
  
        # if input is of the form $###.## Million, pattern #1
        if re.match(p1, s, flags=re.IGNORECASE):
            # Remove dollar sign and "million", convert to float and multiply by 1,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000
            
            # return value
            return num
            
        # elif input is of the form $###.## Billion, formerly pattern #1 also, now pattern #2
        elif re.match(p2, s, flags=re.IGNORECASE):
        # Remove dollar sign and "billion", convert to float and multiply by 1,000,000,000
            num = float(re.search("(\d+\.?\d*)", s)[0])*1000000000
            # return value
            return num

            
        # elif input is of the form $###,###,###, formerly pattern #2, now pattern #3
        elif re.match(p3, s, flags=re.IGNORECASE):
            # Remove dollar sign and commas, convert to float
            num = float(re.findall(r"(?:\$)(\d+(?:[,\.]\d{3})+\.?\d*)", s)[0].replace(",", ""))

            # return value
            return num
        else: 
            if len(s) <= 6:
                if re.match("(\$\d{1,6}$)", s, flags=re.IGNORECASE):
                    num = float(s.replace("$", ""))
                    return num

    else: 
        return np.nan


In [39]:
for c in cleaning_wiki_df6.index:
    cleaning_wiki_df6['Box office'][c] = parse_dollars(cleaning_wiki_df6['Box office'][c])
    
cleaning_wiki_df6['Box office']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df6['Box office'][c] = parse_dollars(cleaning_wiki_df6['Box office'][c])


0       21400000.0
1        2700000.0
2       57718089.0
3        7331647.0
4        6939946.0
           ...    
7094    41900000.0
7095    76100000.0
7096    38400000.0
7097     5500000.0
7098           NaN
Name: Box office, Length: 7032, dtype: object

In [40]:
# cleaning_wiki_df6.groupby(['Budget']).count()  # error because of lists and strings again.
def find_dollars(s):
    # declared patterns from before
    p1 = "(\$\d+\.?\d*\s*[bm]illi?on)" # pulled from pattern #1 above
    p2 = "(\$\d+(?:[,\.]\d{3})+\.?\d*)"
    mp1 = r"(\$\d{1,3}\.?\d*',\s*'[bm]illi?on)"  # for when 43.5 Million is broken into '4.35' and 'million'
    mp2 = r"(\$', '\d{1,3}(?:[,\.]\d{3})+\.?\d*)"  # modified pattern #2 version 2 
    p3 = r"(\$\d*\.?\d*)(?:[-—–]\d*\.?\d*)(\s[bm]illi?on)" # modified pattern #1 version 3
    p4 = r"(\$\s\d{1,3}(?:[,\.]\d{3})+\.?\d*)" # modified pattern #2 version 3
    p5 = r"(\$',\s*'\d{1,3}\.?\d*\s[bm]illi?on)" # modified pattern #1 version 4
    p6 = r"^(\$\d{3,6})" # new pattern
    if len(re.findall(p1, str(s))) == 1:
        num = re.findall(p1, str(s))[0]
        return num
    elif len(re.findall(p2, str(s))) == 1:
        num = re.findall(p2, str(s))[0]
        return num
    elif len(re.findall(p1, str(s))) > 1: 
        num = re.findall(p1, str(s))[1]
        return num
    elif len(re.findall(p2, str(problem_lists[i]))) > 1:  
        num = re.findall(p2, str(s))[1]
        return num
    elif len(re.findall(mp1, str(s))) > 0:
        num = re.findall(mp1, str(s))[0].replace("', '", " ")
        return num
    elif len(re.findall(mp2, str(s))) >0:
        num = re.findall(mp2, str(s))[0].replace("', '", "")
        return num
    elif len(re.findall(p3, str(s))) >0:
        fix = re.findall(p3, str(s))
        num = fix[0][0] + fix[0][1]
        return num
    elif len(re.findall(p4, str(s)))>0:
        num = re.findall(p4, str(s))[0].replace(" ", "")
        return num
    elif len(re.findall(p5, str(s)))>0:
        num = re.findall(p5, str(s))[0].replace("', '", "")
        return num
    elif len(re.findall(p6, str(s)))>0:
        num = re.findall(p6, str(s))[0]
        return num
    else:
        return np.nan


In [41]:
budget = cleaning_wiki_df6['Budget']
for b in budget.index:
    print(budget[b], find_dollars(budget[b]), parse_dollars(find_dollars(budget[b])))
    

$20 million $20 million 20000000.0
$6 million $6 million 6000000.0
$35 million $35 million 35000000.0
$12 million $12 million 12000000.0
$25 million $25 million 25000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$50 million $50 million 50000000.0
$22 million $22 million 22000000.0
nan nan nan
$29 million $29 million 29000000.0
$40 million $40 million 40000000.0
$7 million $7 million 7000000.0
['$2,500,000', '[', 'citation needed', ']'] $2,500,000 2500000.0
nan nan nan
$20 million $20 million 20000000.0
nan nan nan
nan nan nan
nan nan nan
$47 million $47 million 47000000.0
nan nan nan
nan nan nan
$15 million $15 million 15000000.0
nan nan nan
$10 million $10 million 10000000.0
nan nan nan
$10 million $10 million 10000000.0
nan nan nan
nan nan nan
nan nan nan
$13 million $13 million 13000000.0
nan nan nan
nan nan nan
nan nan nan
$5.2 million $5.2 million 5200000.0
nan nan nan
$17.5 million $17.5 million 17500000.0
nan nan nan
nan nan nan
US$18 million $18 million 1800

nan nan nan
$30 million $30 million 30000000.0
nan nan nan
$1.3 million $1.3 million 1300000.0
$20 million $20 million 20000000.0
$12.5 million $12.5 million 12500000.0
nan nan nan
$11.7 million $11.7 million 11700000.0
$42 million $42 million 42000000.0
nan nan nan
$35 million $35 million 35000000.0
$20 million $20 million 20000000.0
$40 million $40 million 40000000.0
$25 million $25 million 25000000.0
$26 million $26 million 26000000.0
$20 million (estimate) $20 million 20000000.0
nan nan nan
$20 million $20 million 20000000.0
nan nan nan
$20 million $20 million 20000000.0
$5 million $5 million 5000000.0
$100,000 $100,000 100000.0
$9 million $9 million 9000000.0
nan nan nan
$20 million $20 million 20000000.0
$40 million $40 million 40000000.0
$10 million $10 million 10000000.0
$40 million $40 million 40000000.0
nan nan nan
nan nan nan
$35 million $35 million 35000000.0
$5,000,000 $5,000,000 5000000.0
$11 million $11 million 11000000.0
['$', '30 million'] $30 million 30000000.0
$6 mil

nan nan nan
$48 million $48 million 48000000.0
$25-35 million $25 million 25000000.0
$500,000 (estimated) $500,000 500000.0
$3.2 million $3.2 million 3200000.0
nan nan nan
$50 million $50 million 50000000.0
nan nan nan
nan nan nan
$13 million $13 million 13000000.0
nan nan nan
nan nan nan
$11 million $11 million 11000000.0
nan nan nan
nan nan nan
$28 million $28 million 28000000.0
$35 million $35 million 35000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
['$20 million', '[', 'not verified in body', ']'] $20 million 20000000.0
$10 million $10 million 10000000.0
nan nan nan
$10.7 million $10.7 million 10700000.0
nan nan nan
['$15 million', '[', 'citation needed', ']'] $15 million 15000000.0
$15 million $15 million 15000000.0
nan nan nan
$40 million $40 million 40000000.0
nan nan nan
nan nan nan
nan nan nan
$62 million $62 million 62000000.0
['$27,575', '$230,000 (post)'] nan nan
['$45 million', '[', 'citation needed', ']'] $45 million 45000000.0
nan nan nan
nan nan nan
$40 millio

nan nan nan
nan nan nan
['$22 million', '[', 'citation needed', ']'] $22 million 22000000.0
$6.4 million $6.4 million 6400000.0
$5 million $5 million 5000000.0
nan nan nan
$6,000,000 $6,000,000 6000000.0
nan nan nan
nan nan nan
$44 million $44 million 44000000.0
nan nan nan
$12 million $12 million 12000000.0
$24 million $24 million 24000000.0
$11 million $11 million 11000000.0
nan nan nan
$50 million $50 million 50000000.0
nan nan nan
nan nan nan
nan nan nan
$150,000 (estimated) $150,000 150000.0
nan nan nan
nan nan nan
$28 million $28 million 28000000.0
$11,000,000 (estimated) $11,000,000 11000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$55 million $55 million 55000000.0
$9.5 million $9.5 million 9500000.0
nan nan nan
nan nan nan
nan nan nan
$35 million $35 million 35000000.0
nan nan nan
nan nan nan
$19 million $19 million 19000000.0
£6,000,000 nan nan
$28 million $28 million 28000000.0
$22 million $22 million 22000000.0
nan nan nan
$50 million $50 million 50000000.0
nan nan

$8 million $8 million 8000000.0
nan nan nan
$350,000 $350,000 350000.0
nan nan nan
nan nan nan
nan nan nan
$92 million $92 million 92000000.0
$24 million $24 million 24000000.0
['$18 million', '[', 'citation needed', ']'] $18 million 18000000.0
nan nan nan
$4,000,000 (estimated) $4,000,000 4000000.0
$60 million $60 million 60000000.0
$15 million $15 million 15000000.0
$1,000,000 $1,000,000 1000000.0
$3 million $3 million 3000000.0
$300,000 $300,000 300000.0
nan nan nan
nan nan nan
nan nan nan
$38 million $38 million 38000000.0
$1,300,000 $1,300,000 1300000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$1.75 million $1.75 million 1750000.0
['$', '15,000,000 (estimated)'] $15,000,000 15000000.0
nan nan nan
nan nan nan
$3 million $3 million 3000000.0
['$4,000,000 (', 'NZD', ')'] $4,000,000 4000000.0
$50 million $50 million 50000000.0
nan nan nan
nan nan nan
nan nan nan
$85 million $85 million 85000000.0
nan nan nan
nan nan nan
$6 million $6 million 6000000.0
$6

nan nan nan
$3.5 million $3.5 million 3500000.0
$1.5 million $1.5 million 1500000.0
nan nan nan
$90 million $90 million 90000000.0
$26 million $26 million 26000000.0
$15 million $15 million 15000000.0
$46 million $46 million 46000000.0
nan nan nan
$18.5 million $18.5 million 18500000.0
nan nan nan
$19 million $19 million 19000000.0
$25 million $25 million 25000000.0
$10 million $10 million 10000000.0
$130–150 million $130 million 130000000.0
$20 million $20 million 20000000.0
$25 million $25 million 25000000.0
nan nan nan
$8 million $8 million 8000000.0
$17 million $17 million 17000000.0
$2.2 million $2.2 million 2200000.0
$70 million $70 million 70000000.0
$25 million $25 million 25000000.0
nan nan nan
nan nan nan
nan nan nan
$60 million $60 million 60000000.0
$15 million $15 million 15000000.0
nan nan nan
$30 million $30 million 30000000.0
$60 million $60 million 60000000.0
$20 million $20 million 20000000.0
nan nan nan
$5 million $5 million 5000000.0
nan nan nan
$3.5 million $3.5 mi

nan nan nan
$14 million $14 million 14000000.0
$4.5 million $4.5 million 4500000.0
nan nan nan
nan nan nan
$170 million $170 million 170000000.0
$30 million $30 million 30000000.0
nan nan nan
nan nan nan
nan nan nan
$6 million $6 million 6000000.0
$85 million $85 million 85000000.0
$43 million $43 million 43000000.0
$3.4 million $3.4 million 3400000.0
$82 million $82 million 82000000.0
nan nan nan
nan nan nan
$76 million $76 million 76000000.0
$57 million $57 million 57000000.0
$60 million $60 million 60000000.0
$7 million $7 million 7000000.0
nan nan nan
$60 million $60 million 60000000.0
$65 million $65 million 65000000.0
$51 million $51 million 51000000.0
$10 million $10 million 10000000.0
$73 million $73 million 73000000.0
$50 million $50 million 50000000.0
nan nan nan
$14 million $14 million 14000000.0
$15 million $15 million 15000000.0
$48 million $48 million 48000000.0
$20 million $20 million 20000000.0
$10 million $10 million 10000000.0
$30 million $30 million 30000000.0
nan na

$60 million $60 million 60000000.0
nan nan nan
$4 million $4 million 4000000.0
$29 million $29 million 29000000.0
$300,000 $300,000 300000.0
$85 million $85 million 85000000.0
$500,000 $500,000 500000.0
$72 million $72 million 72000000.0
$14 million $14 million 14000000.0
$33 million $33 million 33000000.0
$47 million $47 million 47000000.0
$12 million $12 million 12000000.0
nan nan nan
$139 million $139 million 139000000.0
$80 million $80 million 80000000.0
$4.5 million $4.5 million 4500000.0
$38 million $38 million 38000000.0
$115 million $115 million 115000000.0
$60 million $60 million 60000000.0
$600,000 $600,000 600000.0
$25 million $25 million 25000000.0
nan nan nan
$1.8 million $1.8 million 1800000.0
$120 million $120 million 120000000.0
$68 million $68 million 68000000.0
nan nan nan
$43 million $43 million 43000000.0
$30 million $30 million 30000000.0
$10 million $10 million 10000000.0
$10 million $10 million 10000000.0
$150,000 $150,000 150000.0
$17 million $17 million 1700000

$16 million $16 million 16000000.0
$6.5 million $6.5 million 6500000.0
nan nan nan
$3.5 million $3.5 million 3500000.0
$26 million $26 million 26000000.0
$125 million $125 million 125000000.0
nan nan nan
$15 million $15 million 15000000.0
$500,000 $500,000 500000.0
nan nan nan
nan nan nan
$25 million $25 million 25000000.0
nan nan nan
$20 million $20 million 20000000.0
$7.5 million $7.5 million 7500000.0
nan nan nan
$31 million $31 million 31000000.0
$40 million $40 million 40000000.0
$20 million $20 million 20000000.0
$50 million $50 million 50000000.0
$25 million $25 million 25000000.0
nan nan nan
$6 million $6 million 6000000.0
$45 million $45 million 45000000.0
$25 million $25 million 25000000.0
nan nan nan
['$30 million', '[', 'citation needed', ']'] $30 million 30000000.0
$45 million $45 million 45000000.0
$42 million $42 million 42000000.0
$30 million $30 million 30000000.0
nan nan nan
$2.5 million $2.5 million 2500000.0
$50 million $50 million 50000000.0
$20-21 million $20 mill

$25 million $25 million 25000000.0
$9 million $9 million 9000000.0
$30 million $30 million 30000000.0
$12 million $12 million 12000000.0
nan nan nan
$3 million $3 million 3000000.0
$28 million $28 million 28000000.0
$132 million $132 million 132000000.0
nan nan nan
$22 million $22 million 22000000.0
$40 million $40 million 40000000.0
$15 million $15 million 15000000.0
$25 million $25 million 25000000.0
$10 million $10 million 10000000.0
nan nan nan
['$25 million', '[', 'citation needed', ']'] $25 million 25000000.0
$60-87 million $60 million 60000000.0
$45 million $45 million 45000000.0
$65 million $65 million 65000000.0
$5,000,000 $5,000,000 5000000.0
nan nan nan
N/A nan nan
$55 million $55 million 55000000.0
nan nan nan
nan nan nan
nan nan nan
$23 million $23 million 23000000.0
nan nan nan
nan nan nan
$6–8 million $6 million 6000000.0
nan nan nan
nan nan nan
$55 million $55 million 55000000.0
nan nan nan
US$29,000,000 $29,000,000 29000000.0
nan nan nan
nan nan nan
$17 million $17 mil

nan nan nan
$53–67 million $53 million 53000000.0
$75 million $75 million 75000000.0
$15 million $15 million 15000000.0
nan nan nan
$47 million $47 million 47000000.0
$150 million $150 million 150000000.0
nan nan nan
$60 million $60 million 60000000.0
$13 million $13 million 13000000.0
$15 million $15 million 15000000.0
nan nan nan
$24 million $24 million 24000000.0
$25 million $25 million 25000000.0
nan nan nan
$10.2 million $10.2 million 10200000.0
$25.3 million $25.3 million 25300000.0
nan nan nan
$150 million $150 million 150000000.0
$24 million $24 million 24000000.0
$12 million $12 million 12000000.0
$85 million $85 million 85000000.0
$11 million $11 million 11000000.0
$20 million $20 million 20000000.0
$10 million $10 million 10000000.0
$23 million $23 million 23000000.0
$20 million $20 million 20000000.0
$65–80 million $65 million 65000000.0
nan nan nan
nan nan nan
nan nan nan
['$6.5', '[2]', '–$7.5 million', '[3]'] $7.5 million 7500000.0
$25.7 million $25.7 million 25700000.0


$90 million $90 million 90000000.0
$55 million $55 million 55000000.0
$30 million $30 million 30000000.0
$22 million $22 million 22000000.0
$18 million $18 million 18000000.0
$70 million $70 million 70000000.0
$250,000 $250,000 250000.0
$50 million $50 million 50000000.0
$50–60 million $50 million 50000000.0
nan nan nan
US$18.5 million $18.5 million 18500000.0
nan nan nan
$85 million $85 million 85000000.0
$10 million $10 million 10000000.0
$16 million $16 million 16000000.0
nan nan nan
$40 million $40 million 40000000.0
$50 million $50 million 50000000.0
nan nan nan
nan nan nan
$100 million $100 million 100000000.0
$15 million $15 million 15000000.0
$53 million $53 million 53000000.0
$4.5 million $4.5 million 4500000.0
nan nan nan
nan nan nan
$18 million $18 million 18000000.0
nan nan nan
$65 million $65 million 65000000.0
nan nan nan
$17.5 million $17.5 million 17500000.0
nan nan nan
nan nan nan
$25 million $25 million 25000000.0
$24 million $24 million 24000000.0
nan nan nan
$6.5 mi

$19.1 million $19.1 million 19100000.0
$21 million $21 million 21000000.0
$70 million $70 million 70000000.0
$37 million $37 million 37000000.0
$25 million $25 million 25000000.0
$17,000,000 $17,000,000 17000000.0
$50 million $50 million 50000000.0
$15 million $15 million 15000000.0
$50 million $50 million 50000000.0
$20 million $20 million 20000000.0
$25 million $25 million 25000000.0
$5 million $5 million 5000000.0
$40 million $40 million 40000000.0
$125 million $125 million 125000000.0
nan nan nan
nan nan nan
$40 million $40 million 40000000.0
$24 million $24 million 24000000.0
$35 million $35 million 35000000.0
$30 million $30 million 30000000.0
nan nan nan
$1 million $1 million 1000000.0
$90 million $90 million 90000000.0
$9 million $9 million 9000000.0
$36 million $36 million 36000000.0
$5 million $5 million 5000000.0
['$6 million', '[', 'citation needed', ']'] $6 million 6000000.0
$1.5 million $1.5 million 1500000.0
$120 million $120 million 120000000.0
$200 million $200 million

nan nan nan
$50–60 million $50 million 50000000.0
$3 million $3 million 3000000.0
$23 million $23 million 23000000.0
$1 million $1 million 1000000.0
$207,061 $207,061 207061.0
nan nan nan
$17 million $17 million 17000000.0
nan nan nan
nan nan nan
nan nan nan
$35 million $35 million 35000000.0
$17.5 million $17.5 million 17500000.0
$1.5 million $1.5 million 1500000.0
nan nan nan
$8 million $8 million 8000000.0
nan nan nan
$18 million $18 million 18000000.0
$420,000 $420,000 420000.0
nan nan nan
$19 million $19 million 19000000.0
nan nan nan
nan nan nan
nan nan nan
$385,000 $385,000 385000.0
$35 million $35 million 35000000.0
$40–55 million $40 million 40000000.0
$30 million $30 million 30000000.0
nan nan nan
$13 million $13 million 13000000.0
$8 million $8 million 8000000.0
$250,000 $250,000 250000.0
$55 million $55 million 55000000.0
$30 million $30 million 30000000.0
$5 million $5 million 5000000.0
nan nan nan
nan nan nan
nan nan nan
nan nan nan
$78 million $78 million 78000000.0
nan 

$17 million $17 million 17000000.0
$90-100 million $90 million 90000000.0
$25 million $25 million 25000000.0
$2.4 million $2.4 million 2400000.0
nan nan nan
['$11', 'million', '[3]'] $11 million 11000000.0
nan nan nan
nan nan nan
$18 million $18 million 18000000.0
$65 million $65 million 65000000.0
$5 million $5 million 5000000.0
$7–10 million $7 million 7000000.0
$5 million $5 million 5000000.0
['$36 million (gross)', '[3]', '$30.5 million (net)', '[3]'] $30.5 million 30500000.0
$1,000,000 $1,000,000 1000000.0
$13.2 million $13.2 million 13200000.0
$5 million $5 million 5000000.0
$23–28 million $23 million 23000000.0
$34 million $34 million 34000000.0
nan nan nan
$1 million $1 million 1000000.0
$19.8 million $19.8 million 19800000.0
$3 million $3 million 3000000.0
$60 million $60 million 60000000.0
$55–73 million $55 million 55000000.0
nan nan nan
nan nan nan
['$6.5', 'million', '[2]'] $6.5 million 6500000.0
$61 million $61 million 61000000.0
nan nan nan
$16 million $16 million 160000

$5–10 million $5 million 5000000.0
$12 million $12 million 12000000.0
$90 million $90 million 90000000.0
$18 million $18 million 18000000.0
$13 million $13 million 13000000.0
$250,000 $250,000 250000.0
$200 million $200 million 200000000.0
nan nan nan
nan nan nan
$175 million $175 million 175000000.0
$42 million $42 million 42000000.0
$5,000,000 $5,000,000 5000000.0
$3 million $3 million 3000000.0
$5 million $5 million 5000000.0
$97–111 million $97 million 97000000.0
$22 million $22 million 22000000.0
$10 million $10 million 10000000.0
nan nan nan
$65–69 million $65 million 65000000.0
$230–320 million $230 million 230000000.0
nan nan nan
$60 million $60 million 60000000.0
$120–150 million $120 million 120000000.0
$38 million $38 million 38000000.0
$125-195 million $125 million 125000000.0
$2.4–5 million $2.4 million 2400000.0
nan nan nan
$890,000 $890,000 890000.0
nan nan nan
$175 million $175 million 175000000.0
$20–26 million $20 million 20000000.0
$40–45 million $40 million 40000000

In [42]:
for b in cleaning_wiki_df6.index:
    cleaning_wiki_df6['Budget'][b] = parse_dollars(find_dollars(cleaning_wiki_df6['Budget'][b]))

cleaning_wiki_df6['Budget']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df6['Budget'][b] = parse_dollars(find_dollars(cleaning_wiki_df6['Budget'][b]))


0       20000000.0
1        6000000.0
2       35000000.0
3       12000000.0
4       25000000.0
           ...    
7094    42000000.0
7095    60000000.0
7096    20000000.0
7097     9000000.0
7098           NaN
Name: Budget, Length: 7032, dtype: object

In [43]:
# clean the release date column find 
release_date = cleaning_wiki_df6['Release date']
count_s = 0
count_l = 0
count_o =0
for r in release_date.index:
    if type(release_date[r]) == str:
        count_s += 1
    elif type(release_date[r]) == list:
        count_l += 1
    elif release_date[r] == np.nan:
        count+=1
    else:
        count_o += 1

print(f"String count: {count_s} \n List count: {count_l} \n Nan count: {count_o} ")    

String count: 474 
 List count: 6526 
 Nan count: 32 


In [44]:
release_date

0                       [July 11, 1990, (, 1990-07-11, )]
1       [May 17, 1990, (, 1990-05-17, ), (Cannes Film ...
2                     [August 10, 1990, (, 1990-08-10, )]
3                   [December 25, 1990, (, 1990-12-25, )]
4                                       December 19, 1990
                              ...                        
7094    [December 25, 2018, (, 2018-12-25, ), (United ...
7095    [December 11, 2018, (, 2018-12-11, ), (, Samue...
7096    [November 8, 2018, (, 2018-11-08, ), (, AFI Fe...
7097    [August 31, 2018, (, 2018-08-31, ), (, Telluri...
7098                 [28 December 2018, (, 2018-12-28, )]
Name: Release date, Length: 7032, dtype: object

In [45]:
# date patterns to parse
# Month DD, YYYY
date_p1 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*[123]?\d,\s*\d{4}"

# DD Month YYYY
date_p2 = r"[123]?\d\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{4}"

# Month YYYY
date_p3 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December),?\s*\d{4}"

# YYYY
date_p4 = r"\d{4}"


In [46]:
# used this cell to find the patterns
for r in release_date.index:
    if re.findall(date_p1, str(release_date[r]), flags=re.IGNORECASE):  # Month DD, YYYY
        spam = re.findall(date_p1, str(release_date[r]), flags=re.IGNORECASE)[0]
    elif re.findall(date_p2, str(release_date[r]), flags=re.IGNORECASE): # DD Month YYYY
        spam = re.findall(date_p2, str(release_date[r]), flags=re.IGNORECASE)[0]
    elif re.findall(date_p3, str(release_date[r]), flags=re.IGNORECASE): # Month YYYY
        spam = re.findall(date_p3, str(release_date[r]), flags=re.IGNORECASE)[0]
    elif re.findall(date_p4, str(release_date[r]), flags=re.IGNORECASE): # YYYY
        spam = re.findall(date_p4, str(release_date[r]), flags=re.IGNORECASE)[0]

    else:
        print(r, release_date[r])





214 nan
1047 nan
1176 nan
1237 nan
1306 nan
1350 nan
1640 nan
1649 nan
1782 nan
1841 nan
1927 nan
1973 nan
2107 nan
2114 nan
2250 nan
2771 nan
3001 nan
3050 nan
3198 nan
3248 nan
3618 nan
3957 nan
4008 nan
4015 nan
4154 nan
4221 nan
4264 nan
5526 nan
5575 nan
5601 nan
5899 nan
6819 nan


In [47]:
# used this cell to find the patterns
def find_dates(s):
    # declared patterns
    p1 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*[123]?\d,\s*\d{4}"
    p2 = r"[123]?\d\s*(?:January|February|March|April|May|June|July|August|September|October|November|December)\s*\d{4}"
    p3 = r"(?:January|February|March|April|May|June|July|August|September|October|November|December),?\s*\d{4}"
    p4 = r"\d{4}"
    if re.findall(p1, str(s), flags=re.IGNORECASE):  # Month DD, YYYY
        spam = re.findall(p1, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p2, str(s), flags=re.IGNORECASE): # DD Month YYYY
        spam = re.findall(p2, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p3, str(s), flags=re.IGNORECASE): # Month YYYY
        spam = re.findall(p3, str(s), flags=re.IGNORECASE)[0]
        return spam
    elif re.findall(p4, str(s), flags=re.IGNORECASE): # YYYY
        spam = re.findall(p4, str(s), flags=re.IGNORECASE)[0]
        return spam
    else:
        return np.nan



In [48]:
for r in release_date.index:
    release_date[r] = find_dates(release_date[r])

release_date

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  release_date[r] = find_dates(release_date[r])


0           July 11, 1990
1            May 17, 1990
2         August 10, 1990
3       December 25, 1990
4       December 19, 1990
              ...        
7094    December 25, 2018
7095    December 11, 2018
7096     November 8, 2018
7097      August 31, 2018
7098     28 December 2018
Name: Release date, Length: 7032, dtype: object

In [49]:
cleaning_wiki_df6['Release date'] = pd.to_datetime(release_date.str.extract(f"({date_p1}|{date_p2}|{date_p3}|{date_p4})", flags=re.IGNORECASE)[0], infer_datetime_format=True)
cleaning_wiki_df6['Release date']

0      1990-07-11
1      1990-05-17
2      1990-08-10
3      1990-12-25
4      1990-12-19
          ...    
7094   2018-12-25
7095   2018-12-11
7096   2018-11-08
7097   2018-08-31
7098   2018-12-28
Name: Release date, Length: 7032, dtype: datetime64[ns]

In [50]:
running_time = cleaning_wiki_df6['Running time']
running_time

0                                 102 minutes
1                                 114 minutes
2                                 113 minutes
3                                 106 minutes
4                                  95 minutes
                        ...                  
7094                               90 minutes
7095                              132 minutes
7096                              120 minutes
7097                              123 minutes
7098    Variable; 90 minutes for default path
Name: Running time, Length: 7032, dtype: object

In [51]:
# patterns found in running_time using the cell below
time_p1 = r"(\d+)(?:', ')?\s*min\.?(?:utes)?" # 120 minutes, 120 min, 120 min., '120', 'min'
time_p2 = r"((?:\d+\s*h)?\s*\d+m)"# 1h 48m, 70m
time_p3 = r"(\d+\s*hours?)" # one record says "4 Hours"

In [52]:
# use this cell to find the patterns and write them in the cell above
for r in running_time.index:
    if re.findall(time_p1, str(running_time[r]), flags=re.IGNORECASE): # 120 minutes, 120 min, 120 min., '120', 'min'
        spam = re.findall(time_p1, str(running_time[r]), flags=re.IGNORECASE)[0]
#         print(r, spam) # ready to convert to an int
    elif re.findall(time_p2, str(running_time[r]), flags=re.IGNORECASE): # 1h 48m, 70m
        spam = re.findall(time_p2, str(running_time[r]), flags=re.IGNORECASE)[0]
        minutes = int(re.findall(r"(\d+)m$", spam, flags=re.IGNORECASE)[0])
        try:
            hours = int(re.findall(r"(\d+)h", spam, flags=re.IGNORECASE)[0])*60 
            minutes+=hours
        except:
            pass
        
#         print(r, spam, type(spam), minutes)
    elif re.findall(time_p3, str(running_time[r]), flags=re.IGNORECASE): # 1 record says "4 hours" 
        spam = re.findall(time_p3, str(running_time[r]), flags=re.IGNORECASE)[0]
        hours = int(re.findall(r"(\d+)\s*h", spam, flags=re.IGNORECASE)[0])*60 
#         print(r, spam, type(spam), hours)
    else:
        pass
#         print(r, running_time[r])

In [53]:
# make the changes to running_time for an easier number conversion
def find_time(s):
    # Declared patterns
    p1 = r"(\d+)(?:', ')?\s*min\.?(?:utes)?" # 120 minutes, 120 min, 120 min., '120', 'min'
    p2 = r"((?:\d+\s*h)?\s*\d+m)"# 1h 48m, 70m
    p3 = r"(\d+\s*hours?)" # one record says "4 Hours"
    if type(s) == float:
        return s
    if re.findall(p1, str(s), flags=re.IGNORECASE): # 120 minutes, 120 min, 120 min., '120', 'min'
        spam = int(re.findall(p1, str(s), flags=re.IGNORECASE)[0])
        return spam
    elif re.findall(p2, str(s), flags=re.IGNORECASE): # 1h 48m, 70m
        spam = re.findall(p2, str(s), flags=re.IGNORECASE)[0]
        egg = int(re.findall(r"(\d+)m$", spam, flags=re.IGNORECASE)[0])
        try:
            hours = int(re.findall(r"(\d+)h", spam, flags=re.IGNORECASE)[0])*60 
            egg+=hours
        except:
            pass

        return egg
    elif re.findall(p3, str(s), flags=re.IGNORECASE): # 1 record says "4 hours" 
        spam = re.findall(p3, str(s), flags=re.IGNORECASE)[0]
        egg = int(re.findall(r"(\d+)\s*h", spam, flags=re.IGNORECASE)[0])*60
        return egg
    else:
        return float(np.nan)

In [54]:
# test the find_time() function
for r in running_time.index:
    print(r, find_time(running_time[r]), running_time[r])

0 102 102 minutes
1 114 114 minutes
2 113 113 minutes
3 106 106 minutes
4 95 95 minutes
5 95 95 minutes
6 100 100 minutes
7 99 99 minutes
8 50 50 minutes
9 102 102 min
10 93 93 minutes
11 110 110 minutes
12 126 126 minutes
13 121 121 minutes
14 118 118 minutes
15 99 99 minutes
16 90 90 minutes
17 94 94 minutes
18 110 110 minutes
19 190 190 minutes
20 85 85 minutes
21 102 102 minutes
22 126 126 minutes
23 96 96 minutes
24 97 97 minutes
25 97 97 minutes
26 93 93 min
27 97 97 minutes
28 32 32 min.
29 98 98 minutes
30 nan nan
31 95 95 minutes
32 98 98 minutes
33 84 84 minutes
34 101 101 min
35 97 97 min
36 86 86 minutes
37 99 99 minutes
38 97 97 minutes
39 138 138 minutes
40 99 99 minutes
41 85 85 minutes
42 91 91 min.
43 85 85 minutes
44 95 95 minutes
45 181 181 minutes
46 nan nan
47 95 95 minutes
48 93 93 minutes
49 108 108 minutes
50 120 120 minutes
51 95 95 minutes
52 100 100 minutes
53 94 94 minutes
54 111 111 minutes
55 103 103 minutes
56 106 106 minutes
57 105 105 minutes
58 124 124

494 103 103 minutes
495 187 187 min.
496 101 101 minutes
497 104 104 minutes
498 107 107 minutes
499 98 98 minutes
500 112 112 minutes
501 96 96 minutes
502 88 88 minutes
503 110 110 minutes
504 96 96 minutes
505 140 140 minutes
506 76 76 minutes
507 138 138 minutes
508 101 101 minutes
509 124 124 minutes
510 107 107 minutes
511 102 102 minutes
512 110 110 minutes
513 96 96 min.
514 101 101 minutes
515 101 101 minutes
516 100 100 minutes
517 89 89 minutes
518 110 110 minutes
519 117 117 minutes
520 94 94 minutes
521 140 140 minutes
522 120 120 minutes
523 89 89 minutes
524 96 96 minutes
525 101 101 minutes
526 103 103 minutes
527 93 93 minutes
528 113 113 minutes
529 101 101 minutes
530 106 ['106 minutes (UK)', '[2]', '125 minutes (USA)']
531 95 95 minutes
532 92 92 minutes
533 105 ['105 minutes', '116 minutes (international cut)']
534 102 102 minutes
535 90 90 minutes
536 112 112 minutes
537 108 ['108 minutes', '141 minutes', "(director's cut)"]
538 128 128 minutes
539 108 108 minutes

1442 99 99 minutes
1443 90 90 minutes
1444 103 103 minutes
1445 105 105 minutes
1446 77 ['United States:', '77 minutes', 'Argentina:', '94 minutes', 'Germany:', '94 minutes (DVD version)']
1447 96 96 minutes
1448 81 81 minutes
1449 109 109 min
1450 90 90 min.
1451 91 91 min.
1452 84 84 minutes
1453 92 92 minutes
1454 167 167 minutes
1455 115 115 minutes
1456 113 113 minutes
1457 94 94 minutes
1458 86 ['86 minutes', '97 minutes', '(Argentina)']
1459 80 80 minutes
1460 98 98 minutes
1461 106 106 minutes
1462 98 98 minutes
1463 176 176 minutes
1464 112 112 minutes
1465 81 81 minutes
1466 87 87 minutes
1467 108 108 minutes
1468 90 90 minutes
1469 91 91 minutes
1470 107 107 minutes
1471 95 95 minutes
1472 118 118 minutes
1473 87 87 minutes
1474 90 90 minutes
1475 101 101 minutes
1476 86 86 min.
1477 91 91 minutes
1478 108 108 minutes
1479 112 112 minutes
1480 86 86 minutes
1481 100 100 minutes
1482 89 89 minutes
1483 107 107 minutes
1484 158 158 minutes
1485 108 108 minutes
1486 96 96 minut

1940 97 97 minutes
1941 67 67 minutes
1942 82 82 minutes
1943 105 105 minutes
1944 186 186 minutes
1945 112 112 minutes
1946 100 100 minutes
1947 124 124 minutes
1948 154 154 minutes
1949 108 108 min
1950 105 105 minutes
1951 100 100 min.
1952 95 95 minutes
1953 102 102 minutes
1954 86 86 minutes
1955 93 93 minutes
1956 113 113 mins
1957 117 117 minutes
1958 86 86 minutes
1959 96 96 minutes
1960 134 134 minutes
1961 138 138 minutes
1962 74 74 minutes
1963 96 96 minutes
1964 92 92 minutes
1965 101 101 minutes
1966 84 84 minutes
1967 95 95 minutes
1969 86 86 minutes
1970 360 360 minutes
1971 103 103 minutes
1972 90 90 minutes
1973 12 12 minutes
1974 137 137 minutes
1975 134 134 minutes
1976 129 129 minutes
1977 108 108 minutes
1978 103 103 minutes
1979 108 108 minutes
1980 96 96 minutes
1981 11 11 min.
1982 114 114 minutes
1983 94 94 minutes
1984 105 105 minutes
1985 97 97 minutes
1986 108 108 min.
1987 104 104 minutes
1988 105 105 minutes
1989 93 93 minutes
1990 98 98 minutes
1991 127 1

2785 104 104 minutes
2786 115 115 minutes
2787 111 111 minutes
2788 120 120 minutes
2789 125 125 minutes
2790 146 146 minutes
2791 118 118 minutes
2792 157 157 minutes
2793 95 95 minutes
2794 103 103 minutes
2795 91 91 minutes
2796 103 103 minutes
2797 94 94 minutes
2798 94 94 minutes
2799 100 100 minutes
2800 106 106 minutes
2801 85 85 minutes
2802 102 102 minutes
2803 83 83 minutes
2804 115 115 minutes
2805 109 109 minutes
2806 96 96 minutes
2807 131 131 minutes
2808 123 123 minutes
2809 83 83 minutes
2810 135 135 minutes
2811 106 106 minutes
2812 98 98 minutes
2813 111 111 minutes
2814 144 144 minutes
2815 95 95 minutes
2816 124 124 minutes
2817 96 96 minutes
2818 58 58 minutes
2819 106 106 minutes
2820 83 83 minutes
2821 98 98 minutes
2822 113 113 minutes
2823 84 84 minutes
2824 129 129 minutes
2825 114 114 minutes
2826 88 88 minutes
2827 105 105 minutes
2828 88 88 minutes
2829 86 86 minutes
2830 88 88 minutes
2831 99 99 minutes
2832 88 88 minutes
2833 87 87 minutes
2834 103 103 mi

3371 88 88 minutes
3372 91 91 minutes
3373 95 95 minutes
3374 137 137 minutes
3375 117 117 minutes
3376 109 109 minutes
3377 88 88 minutes
3378 86 ['86 minutes', '90 minutes (R-rated version)']
3379 83 83 minutes
3380 137 137 minutes
3381 96 96 minutes
3382 90 90 minutes
3383 88 88 minutes
3384 97 97 minutes
3385 91 ['91 minutes', '92 minutes', '(Unrated)']
3386 101 101 minutes
3387 139 139 minutes
3388 79 79 minutes
3389 102 102 minutes
3390 105 105 minutes
3391 99 99 minutes
3392 88 88 minutes
3393 82 82 minutes
3394 119 119 minutes
3395 113 113 minutes
3396 81 81 minutes
3397 81 81 minutes
3398 75 75 minutes
3399 143 143 minutes
3401 104 104 minutes
3402 77 77 minutes
3403 95 95 minutes
3404 109 109 minutes
3405 96 96 minutes
3406 115 115 minutes
3407 99 99 minutes
3408 80 80 minutes
3409 127 127 minutes
3410 104 104 minutes
3411 117 117 minutes
3412 84 84 minutes
3413 109 109 minutes
3414 141 141 minutes
3415 111 111 minutes
3416 101 101 minutes
3417 114 114 minutes
3418 94 94 minu

3920 78 78 min.
3921 94 94 minutes
3922 92 92 minutes
3923 80 80 minutes
3924 81 81 minutes
3925 112 112 minutes
3926 90 90 minutes
3927 90 90 minutes
3928 128 128 minutes
3929 78 78 minutes
3930 118 118 minutes
3931 95 ['95 minutes', "Director's Cut: 111 minutes"]
3932 92 92 minutes
3933 107 107 minutes
3934 nan nan
3935 90 90 min
3936 240 4 hours
3937 117 117 minutes
3938 93 93 minutes
3939 82 82 minutes
3940 138 138 minutes
3941 104 104 minutes
3942 82 82 minutes
3943 102 102 minutes
3944 86 86 min.
3945 127 127 minutes
3946 98 98 minutes
3947 117 117 minutes
3948 104 104 minutes
3949 105 105 minutes
3950 70 70 minutes
3951 83 83 minutes
3952 90 90 minutes
3953 143 143 minutes
3954 75 75 minutes
3955 90 90 minutes
3956 114 114 minutes
3957 nan nan
3958 110 110 minutes
3959 92 92 minutes
3960 91 91 minutes
3961 85 85 minutes
3962 95 95 minutes
3963 79 79 minutes
3964 93 93 minutes
3965 95 95 minutes
3966 85 85 minutes
3967 99 99 minutes
3968 84 ['84 minutes', '(European cut)', '[3]',

4366 97 97 minutes
4367 93 93 minutes
4368 85 85 minutes
4369 71 71 minutes
4370 108 108 minutes
4371 90 90 minutes
4372 84 84 minutes
4373 100 100 minutes
4374 110 110 minutes
4375 84 84 minutes
4376 84 ['84 minutes', '107 minutes (unrated version)']
4377 113 113 minutes
4378 122 122 minutes
4379 92 92 minutes
4380 100 100 minutes
4381 89 89 minutes
4382 95 95 minutes
4383 81 81 minutes
4384 98 98 minutes
4385 98 98 minutes
4386 90 90 minutes
4387 93 93 minutes
4388 89 ['89 minutes', '92 minutes', '(Unrated cut)']
4389 111 111 minutes
4390 90 90 minutes
4391 82 82 minutes
4393 105 105 minutes
4394 112 112 minutes
4395 101 101 minutes
4396 90 90 minutes
4397 105 105 minutes
4398 107 107 minutes
4399 85 85 minutes
4400 96 96 minutes
4401 117 117 minutes
4402 92 92 minutes
4403 75 75 minutes
4404 92 92 minutes
4405 nan nan
4406 88 88 minutes
4407 111 111 minutes
4408 105 105 minutes
4409 98 98 minutes
4410 113 113 minutes
4411 90 90 minutes
4412 116 116 minutes
4413 122 122 minutes
4414 

4897 98 98 minutes
4898 77 77 minutes
4899 90 90 minutes
4900 90 90 minutes
4901 90 90 minutes
4902 105 105 minutes
4903 100 100 minutes
4904 97 97 minutes
4905 153 153 minutes
4906 102 102 minutes
4907 129 129 minutes
4908 92 92 minutes
4909 88 88 minutes
4910 91 91 minutes
4911 100 100 minutes
4912 94 94 minutes
4913 113 113 minutes
4914 98 98 minutes
4915 105 105 minutes
4916 131 131 minutes
4917 102 102 minutes
4918 105 105 minutes
4919 87 87 minutes
4920 107 107 minutes
4921 123 123 minutes
4922 108 108 minutes
4923 98 98 minutes
4924 153 153 minutes
4925 106 106 minutes
4926 118 118 minutes
4927 133 133 minutes
4928 96 96 minutes
4929 100 100 minutes
4930 nan nan
4931 120 120 minutes
4932 85 85 minutes
4933 102 102 minutes
4934 76 ['76 minutes', '[1]', '85 minutes', '(Extended edition)', '[2]']
4935 123 123 minutes
4936 121 121 minutes
4937 90 90 minutes
4938 90 90 minutes
4939 102 102 minutes
4940 100 100 minutes
4941 118 118 minutes
4942 97 97 minutes
4943 97 97 minutes
4944 11

5881 142 142 minutes
5882 100 100 minutes
5883 99 99 minutes
5884 90 90 minutes
5885 101 101 minutes
5886 113 113 minutes
5887 100 100 minutes
5888 nan nan
5889 86 86 minutes
5890 88 88 minutes
5891 82 82 minutes
5892 nan nan
5893 86 86 minutes
5894 100 100 minutes
5895 nan nan
5896 117 117 minutes
5897 94 94 minutes
5898 126 126 minutes
5899 97 97 minutes
5900 161 161 minutes
5901 nan nan
5902 100 100 minutes
5903 90 90 minutes
5904 125 125 minutes
5905 99 99 minutes
5906 97 97 Minutes
5907 90 90 minutes
5908 67 67 minutes
5909 85 85 minutes
5910 146 146 minutes
5911 90 90 minutes
5912 nan nan
5913 95 95 minutes
5914 106 ['106 minutes', '100 minutes', '[1]', '(Edited cut)']
5915 90 90 minutes
5916 105 105 minutes
5917 112 112 minutes
5918 93 93 minutes
5919 83 83 minutes
5920 100 100 minutes
5921 nan nan
5922 120 120 minutes
5923 105 105 minutes
5924 106 106 minutes
5925 119 119 minutes
5926 131 131 minutes
5927 79 79 minutes
5928 88 88 minutes
5929 87 87 minutes
5930 114 114 minutes


6377 119 119 minutes
6378 115 115 minutes
6379 115 115 minutes
6380 94 94 minutes
6381 88 88 minutes
6382 100 100 minutes
6383 94 94 minutes
6384 110 110 minutes
6385 97 97 minutes
6386 137 137 minutes
6387 109 109 minutes
6388 108 108 minutes
6389 128 128 minutes
6390 95 95 minutes
6391 82 82 minutes
6392 86 86 minutes
6393 91 91 minutes
6394 137 137 minutes
6395 82 82 minutes
6396 94 94 minutes
6397 nan nan
6398 99 99 minutes
6399 83 83 minutes
6400 90 90 minutes
6401 99 99 minutes
6402 89 89 minutes
6403 90 90 minutes
6404 107 107 minutes
6405 113 113 minutes
6406 141 141 minutes
6407 119 119 minutes
6408 101 101 minutes
6409 87 87 minutes
6410 95 95 minutes
6411 120 120 minutes
6412 115 115 minutes
6413 93 ['93 minutes', '101 minutes', '(extended version)']
6414 130 130 minutes
6415 103 103 minutes
6416 105 105 minutes
6417 114 114 minutes
6418 120 120 minutes
6419 104 104 minutes
6420 98 98 minutes
6421 121 121 minutes
6422 124 124 minutes
6423 105 105 minutes
6424 103 103 minutes

6848 91 91 minutes
6849 93 93 minutes
6850 116 116 minutes
6851 103 103 minutes
6852 129 129 minutes
6853 140 140 minutes
6854 104 104 minutes
6855 143 143 minutes
6856 112 112 minutes
6857 93 93 minutes
6858 99 99 minutes
6859 104 104 minutes
6860 118 118 minutes
6861 102 102 minutes
6862 95 95 minutes
6863 110 110 minutes
6864 94 94 minutes
6865 138 138 minutes
6866 96 96 minutes
6867 96 96 minutes
6868 94 94 minutes
6869 134 134 minutes
6870 87 87 minutes
6871 127 127 minutes
6872 71 71 minutes
6873 114 114 minutes
6874 96 96 minutes
6875 109 109 minutes
6876 100 100 minutes
6877 115 115 minutes
6878 97 97 minutes
6879 95 95 minutes
6880 92 92 minutes
6881 140 140 minutes
6882 107 107 minutes
6883 117 117 minutes
6884 112 112 minutes
6885 102 102 minutes
6886 109 109 minutes
6887 110 110 minutes
6888 92 92 minutes
6889 107 107 minutes
6890 112 112 minutes
6891 85 85 minutes
6892 103 103 minutes
6893 118 118 minutes
6894 110 110 minutes
6895 110 110 minutes
6896 107 107 minutes
6897 

In [55]:
# use the find_time() function
for c in cleaning_wiki_df6.index:
    cleaning_wiki_df6['Running time'][c] = find_time(cleaning_wiki_df6['Running time'][c])
    
cleaning_wiki_df6['Running time']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_wiki_df6['Running time'][c] = find_time(cleaning_wiki_df6['Running time'][c])


0       102
1       114
2       113
3       106
4        95
       ... 
7094     90
7095    132
7096    120
7097    123
7098     90
Name: Running time, Length: 7032, dtype: object

In [56]:
print(type(cleaning_wiki_df6['Budget'][0]), "  ||  ", type(cleaning_wiki_df6['Box office'][0]))
cleaning_wiki_df6.info()

<class 'float'>   ||   <class 'float'>
<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7098
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   url                  7032 non-null   object        
 1   year                 7032 non-null   int64         
 2   imdb_link            7031 non-null   object        
 3   title                7031 non-null   object        
 4   Based on             2181 non-null   object        
 5   Starring             6850 non-null   object        
 6   Cinematography       6343 non-null   object        
 7   Release date         7000 non-null   datetime64[ns]
 8   Running time         6889 non-null   object        
 9   Country              6798 non-null   object        
 10  Language             6948 non-null   object        
 11  Budget               4696 non-null   object        
 12  Box office           5470 non-null   object        

In [57]:
# the individual cells of the columns are numbers (float/int) but the dataframe think that they are
cleaning_wiki_df6['Box office'] = pd.to_numeric(cleaning_wiki_df6['Box office'])
cleaning_wiki_df6['Budget'] = pd.to_numeric(cleaning_wiki_df6['Budget'])
cleaning_wiki_df6['Running time'] = pd.to_numeric(cleaning_wiki_df6['Running time'])


In [58]:
cleaning_wiki_df6.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7098
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   url                  7032 non-null   object        
 1   year                 7032 non-null   int64         
 2   imdb_link            7031 non-null   object        
 3   title                7031 non-null   object        
 4   Based on             2181 non-null   object        
 5   Starring             6850 non-null   object        
 6   Cinematography       6343 non-null   object        
 7   Release date         7000 non-null   datetime64[ns]
 8   Running time         6889 non-null   float64       
 9   Country              6798 non-null   object        
 10  Language             6948 non-null   object        
 11  Budget               4696 non-null   float64       
 12  Box office           5470 non-null   float64       
 13  Director             7032 non-nul

In [59]:
# moving on to the next dataset Kaggle's Movie Meta Data
raw_meta_data_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45466 non-null  object 
 1   belongs_to_collection  4494 non-null   object 
 2   budget                 45466 non-null  object 
 3   genres                 45466 non-null  object 
 4   homepage               7782 non-null   object 
 5   id                     45466 non-null  object 
 6   imdb_id                45449 non-null  object 
 7   original_language      45455 non-null  object 
 8   original_title         45466 non-null  object 
 9   overview               44512 non-null  object 
 10  popularity             45461 non-null  object 
 11  poster_path            45080 non-null  object 
 12  production_companies   45463 non-null  object 
 13  production_countries   45463 non-null  object 
 14  release_date           45379 non-null  object 
 15  re

In [76]:
raw_meta_data_df['adult'].value_counts() # some bad data

False                                                                                                                             45454
True                                                                                                                                  9
 - Written by Ørnås                                                                                                                   1
 Rune Balot goes to a casino connected to the October corporation to try to wrap up her case once and for all.                        1
 Avalanche Sharks tells the story of a bikini contest that turns into a horrifying affair when it is hit by a shark avalanche.        1
Name: adult, dtype: int64

In [81]:
raw_meta_data_df['genres'].value_counts() # looks like it should be it's own connecting table: genre name, genre id

[{'id': 18, 'name': 'Drama'}]                                                                                                         5000
[{'id': 35, 'name': 'Comedy'}]                                                                                                        3621
[{'id': 99, 'name': 'Documentary'}]                                                                                                   2723
[]                                                                                                                                    2442
[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'name': 'Romance'}]                                                                       1301
                                                                                                                                      ... 
[{'id': 28, 'name': 'Action'}, {'id': 18, 'name': 'Drama'}, {'id': 35, 'name': 'Comedy'}, {'id': 99, 'name': 'Documentary'}]             1
[{'id': 10752, 'name': 'War

In [83]:
raw_meta_data_df['belongs_to_collection'].value_counts() # another dict almost worth creating a seperate table.

{'id': 415931, 'name': 'The Bowery Boys', 'poster_path': '/q6sA4bzMT9cK7EEmXYwt7PNrL5h.jpg', 'backdrop_path': '/foe3kuiJmg5AklhtD3skWbaTMf2.jpg'}                 29
{'id': 421566, 'name': 'Totò Collection', 'poster_path': '/4ayJsjC3djGwU9eCWUokdBWvdLC.jpg', 'backdrop_path': '/jaUuprubvAxXLAY5hUfrNjxccUh.jpg'}                 27
{'id': 645, 'name': 'James Bond Collection', 'poster_path': '/HORpg5CSkmeQlAolx3bKMrKgfi.jpg', 'backdrop_path': '/6VcVl48kNKvdXOZfJPdarlUGOsk.jpg'}               26
{'id': 96887, 'name': 'Zatôichi: The Blind Swordsman', 'poster_path': '/8Q31DAtmFJjhFTwQGXghBUCgWK2.jpg', 'backdrop_path': '/bY8gLImMR5Pr9PaG3ZpobfaAQ8N.jpg'}    26
{'id': 37261, 'name': 'The Carry On Collection', 'poster_path': '/2P0HNrYgKDvirV8RCdT1rBSJdbJ.jpg', 'backdrop_path': '/38tF1LJN7ULeZAuAfP7beaPMfcl.jpg'}          25
                                                                                                                                                                  ..
{'id': 456

In [84]:
raw_meta_data_df['id'].value_counts()

141971    3
168538    2
25541     2
15028     2
11115     2
         ..
72272     1
1549      1
10171     1
38996     1
461257    1
Name: id, Length: 45436, dtype: int64

In [85]:
raw_meta_data_df['imdb_id'].value_counts()

tt1180333    3
0            3
tt0270288    2
tt0157472    2
tt0446676    2
            ..
tt0324242    1
tt0090206    1
tt0383846    1
tt1056437    1
tt6980792    1
Name: imdb_id, Length: 45417, dtype: int64

In [86]:
raw_meta_data_df['original_language'].value_counts()

en       32269
fr        2438
it        1529
ja        1350
de        1080
         ...  
zu           1
qu           1
104.0        1
la           1
si           1
Name: original_language, Length: 92, dtype: int64

In [88]:
raw_meta_data_df['spoken_languages'].value_counts()

[{'iso_639_1': 'en', 'name': 'English'}]                                                                                                                                                                                         22395
[]                                                                                                                                                                                                                                3829
[{'iso_639_1': 'fr', 'name': 'Français'}]                                                                                                                                                                                         1853
[{'iso_639_1': 'ja', 'name': '日本語'}]                                                                                                                                                                                              1289
[{'iso_639_1': 'it', 'name': 'Italiano'}]                                   

In [87]:
raw_meta_data_df['original_title'].value_counts()

Hamlet                 8
Alice in Wonderland    8
Les Misérables         7
Cinderella             7
Macbeth                7
                      ..
거미숲                    1
What Goes Up           1
12th & Delaware        1
Wonder Bar             1
Queerama               1
Name: original_title, Length: 43373, dtype: int64

In [89]:
raw_meta_data_df['title'].value_counts()

Cinderella              11
Hamlet                   9
Alice in Wonderland      9
Beauty and the Beast     8
Les Misérables           8
                        ..
Cluny Brown              1
Babies                   1
The Green Room           1
Captain Conan            1
Queerama                 1
Name: title, Length: 42277, dtype: int64

In [91]:
raw_meta_data_df['video'].value_counts()

False    45367
True        93
Name: video, dtype: int64

In [92]:
raw_meta_data_df['release_date'].value_counts()

2008-01-01    136
2009-01-01    121
2007-01-01    118
2005-01-01    111
2006-01-01    101
             ... 
1957-09-26      1
1938-11-21      1
1936-08-19      1
2010-01-27      1
1917-10-21      1
Name: release_date, Length: 17336, dtype: int64

In [98]:
cleaning_meta_df = raw_meta_data_df

In [107]:
# look at the rows with dirty data in the adult column
cleaning_meta_df[~cleaning_meta_df['adult'].isin(['True', 'False'])] # Results: the rows themselves are only incoreect data or np.nan


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
19730,- Written by Ørnås,0.065736,,"[{'name': 'Carousel Productions', 'id': 11176}...","[{'iso_3166_1': 'CA', 'name': 'Canada'}, {'iso...",1997-08-20,0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,1,,,,,,,,,
29503,Rune Balot goes to a casino connected to the ...,1.931659,,"[{'name': 'Aniplex', 'id': 2883}, {'name': 'Go...","[{'iso_3166_1': 'US', 'name': 'United States o...",2012-09-29,0,68.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,...,12,,,,,,,,,
35587,Avalanche Sharks tells the story of a bikini ...,2.185485,,"[{'name': 'Odyssey Media', 'id': 17161}, {'nam...","[{'iso_3166_1': 'CA', 'name': 'Canada'}]",2014-01-01,0,82.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,...,22,,,,,,,,,


In [108]:
cleaning_meta_df = cleaning_meta_df[cleaning_meta_df['adult'].isin(['True', 'False'])]
cleaning_meta_df['adult'] = cleaning_meta_df['adult'] == 'True'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_meta_df['adult'] = cleaning_meta_df['adult'] == 'True'


In [111]:
cleaning_meta_df['video'] = cleaning_meta_df['video'] == 'True'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_meta_df['video'] = cleaning_meta_df['video'] == 'True'


In [112]:
cleaning_meta_df.info() # adult and video are changed

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45463 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45463 non-null  bool   
 1   belongs_to_collection  4491 non-null   object 
 2   budget                 45463 non-null  object 
 3   genres                 45463 non-null  object 
 4   homepage               7779 non-null   object 
 5   id                     45463 non-null  object 
 6   imdb_id                45446 non-null  object 
 7   original_language      45452 non-null  object 
 8   original_title         45463 non-null  object 
 9   overview               44509 non-null  object 
 10  popularity             45460 non-null  object 
 11  poster_path            45077 non-null  object 
 12  production_companies   45460 non-null  object 
 13  production_countries   45460 non-null  object 
 14  release_date           45376 non-null  object 
 15  re

In [118]:
pd.to_numeric(cleaning_meta_df['budget']) # error at index 19730, string of link to jpg file
pd.to_numeric(cleaning_meta_df['budget'][19720:19770], errors='raise') 
cleaning_meta_df['budget'] = pd.to_numeric(cleaning_meta_df['budget'], errors='raise') 


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_meta_df['budget'] = pd.to_numeric(cleaning_meta_df['budget'], errors='raise')


In [121]:
cleaning_meta_df['id'] = pd.to_numeric(cleaning_meta_df['id'], errors='raise') 
cleaning_meta_df['popularity'] = pd.to_numeric(cleaning_meta_df['popularity'], errors='raise') 
cleaning_meta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45463 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   adult                  45463 non-null  bool   
 1   belongs_to_collection  4491 non-null   object 
 2   budget                 45463 non-null  int64  
 3   genres                 45463 non-null  object 
 4   homepage               7779 non-null   object 
 5   id                     45463 non-null  int64  
 6   imdb_id                45446 non-null  object 
 7   original_language      45452 non-null  object 
 8   original_title         45463 non-null  object 
 9   overview               44509 non-null  object 
 10  popularity             45460 non-null  float64
 11  poster_path            45077 non-null  object 
 12  production_companies   45460 non-null  object 
 13  production_countries   45460 non-null  object 
 14  release_date           45376 non-null  object 
 15  re

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_meta_df['id'] = pd.to_numeric(cleaning_meta_df['id'], errors='raise')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_meta_df['popularity'] = pd.to_numeric(cleaning_meta_df['popularity'], errors='raise')


In [123]:
cleaning_meta_df['release_date'] = pd.to_datetime(cleaning_meta_df['release_date'])
cleaning_meta_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 45463 entries, 0 to 45465
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   adult                  45463 non-null  bool          
 1   belongs_to_collection  4491 non-null   object        
 2   budget                 45463 non-null  int64         
 3   genres                 45463 non-null  object        
 4   homepage               7779 non-null   object        
 5   id                     45463 non-null  int64         
 6   imdb_id                45446 non-null  object        
 7   original_language      45452 non-null  object        
 8   original_title         45463 non-null  object        
 9   overview               44509 non-null  object        
 10  popularity             45460 non-null  float64       
 11  poster_path            45077 non-null  object        
 12  production_companies   45460 non-null  object        
 13  p

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  cleaning_meta_df['release_date'] = pd.to_datetime(cleaning_meta_df['release_date'])
