In [1]:
import os
import requests as rq
import pandas as pd
import numpy as np
import json
import re


In [2]:
# the filepath for each data file
wiki_movie_file = os.path.join('Data', 'wikipedia-movies.json')
ratings_file = os.path.join('Data', 'ratings.csv')
movie_meta_data_file = os.path.join('Data', 'movies_metadata.csv')


In [3]:
# Load the files
with open(wiki_movie_file, 'r') as file1:
    raw_movie_json = json.load(file1)
    file1.close()

# file 2, not cleaning this one yet
# raw_ratings_df = pd.read_csv(ratings_file, sep=',', header=0)

# file 3, not cleaning this one yet
# raw_meta_data_df = pd.read_csv(movie_meta_data_file, sep=',', low_memory=False)


In [4]:
# Explore the data, commenting results
raw_movie_json # Results: list of dicts, each movie is it's own dict
len(raw_movie_json) # Result: 7311 Movies
len(raw_movie_json[10].keys()) # changing the index looking for number of keys; 10 random indexes Results: 17 to 22 keys
raw_movie_df = pd.DataFrame(raw_movie_json) # Results: 7311 rows × 193 columns with lots of Nan fields, not every movie has the same "22" keys
raw_movie_df

Unnamed: 0,url,year,imdb_link,title,Directed by,Produced by,Screenplay by,Story by,Based on,Starring,...,Predecessor,Founders,Area served,Products,Services,Russian,Hebrew,Revenue,Operating income,Polish
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990.0,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,Renny Harlin,"[Steve Perry, Joel Silver]","[David Arnott, James Cappe, Daniel Waters]","[David Arnott, James Cappe]","[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...",...,,,,,,,,,,
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990.0,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet",James Foley,"[Ric Kidney, Robert Redlin]","[James Foley, Robert Redlin]",,"[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",...,,,,,,,,,,
2,https://en.wikipedia.org/wiki/Air_America_(film),1990.0,https://www.imdb.com/title/tt0099005/,Air America,Roger Spottiswoode,Daniel Melnick,"[John Eskow, Richard Rush]",,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",...,,,,,,,,,,
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990.0,https://www.imdb.com/title/tt0099012/,Alice,Woody Allen,Robert Greenhut,,,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",...,,,,,,,,,,
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990.0,https://www.imdb.com/title/tt0099018/,Almost an Angel,John Cornell,John Cornell,,,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7306,https://en.wikipedia.org/wiki/Holmes_%26_Watson,2018.0,https://www.imdb.com/title/tt1255919/,Holmes & Watson,Etan Cohen,"[Will Ferrell, Adam McKay, Jimmy Miller, Clayt...",Etan Cohen,,"[Sherlock Holmes, and, Dr. Watson, by, Sir Art...","[Will Ferrell, John C. Reilly, Rebecca Hall, R...",...,,,,,,,,,,
7307,https://en.wikipedia.org/wiki/Vice_(2018_film),2018.0,https://www.imdb.com/title/tt6266538/,Vice,Adam McKay,"[Brad Pitt, Dede Gardner, Jeremy Kleiner, Kevi...",,,,"[Christian Bale, Amy Adams, Steve Carell, Sam ...",...,,,,,,,,,,
7308,https://en.wikipedia.org/wiki/On_the_Basis_of_Sex,2018.0,https://www.imdb.com/title/tt4669788/,On the Basis of Sex,Mimi Leder,Robert W. Cort,,,,"[Felicity Jones, Armie Hammer, Justin Theroux,...",...,,,,,,,,,,
7309,https://en.wikipedia.org/wiki/Destroyer_(2018_...,2018.0,https://www.imdb.com/title/tt7137380/,Destroyer,Karyn Kusama,"[Fred Berger, Phil Hay, Matt Manfredi]",,,,"[Nicole Kidman, Sebastian Stan, Toby Kebbell, ...",...,,,,,,,,,,


In [5]:
# Explore the data, commenting results
raw_movie_df.info() # Result: too much data to display any granular info about the columns
raw_movie_df.isnull().sum() # Result: 10 columns visible, Null values seem to range from 158 to 7310; find the minimum number
min(raw_movie_df.isnull().sum())  # Results: 158 is the minimum, url and year columns are the lowest
raw_movie_df.keys().to_list() # Results: column names suggest more than movies; tv shows, people, and possibly books also.


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7311 entries, 0 to 7310
Columns: 193 entries, url to Polish
dtypes: float64(1), object(192)
memory usage: 10.8+ MB


['url',
 'year',
 'imdb_link',
 'title',
 'Directed by',
 'Produced by',
 'Screenplay by',
 'Story by',
 'Based on',
 'Starring',
 'Narrated by',
 'Music by',
 'Cinematography',
 'Edited by',
 'Productioncompany ',
 'Distributed by',
 'Release date',
 'Running time',
 'Country',
 'Language',
 'Budget',
 'Box office',
 'Written by',
 'Genre',
 'Theme music composer',
 'Country of origin',
 'Original language(s)',
 'Producer(s)',
 'Editor(s)',
 'Production company(s)',
 'Original network',
 'Original release',
 'Productioncompanies ',
 'Executive producer(s)',
 'Production location(s)',
 'Distributor',
 'Picture format',
 'Audio format',
 'Voices of',
 'Followed by',
 'Composer(s)',
 'Created by',
 'Also known as',
 'Opening theme',
 'No. of episodes',
 'Preceded by',
 'Author',
 'Publisher',
 'Publication date',
 'Media type',
 'Pages',
 'ISBN',
 'OCLC',
 'LC Class',
 'Cover artist',
 'Series',
 'Set in',
 'Adaptation by',
 'Suggested by',
 'Biographical data',
 'Born',
 'Died',
 'Resti

In [6]:
# select for records with an imdb link, 'Director' or 'Directed by', and not references to tv shows.
cleaning_wiki = [movie for movie in raw_movie_json if (('imdb_link' and ('Directed by' or 'Director')) in movie.keys()) and ('No. of episodes' not in movie.keys())]
cleaning_df = pd.DataFrame(cleaning_wiki)
cleaning_df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7099 entries, 0 to 7098
Data columns (total 74 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   url                     7099 non-null   object
 1   year                    7099 non-null   int64 
 2   imdb_link               7074 non-null   object
 3   title                   7098 non-null   object
 4   Directed by             7099 non-null   object
 5   Produced by             6737 non-null   object
 6   Screenplay by           2323 non-null   object
 7   Story by                1004 non-null   object
 8   Based on                2196 non-null   object
 9   Starring                6913 non-null   object
 10  Narrated by             283 non-null    object
 11  Music by                6502 non-null   object
 12  Cinematography          6396 non-null   object
 13  Edited by               6398 non-null   object
 14  Productioncompany       4539 non-null   object
 15  Dist

In [7]:
# # lots of the columns are languages:  "Arabic", "Japanese", "Mandarin", "Polish", "Yiddish", "Romanized"
cleaning_df[cleaning_df['Arabic'].notnull()]['Arabic']

6856    قضية رقم ٢٣
7081      کفرناحوم‎
Name: Arabic, dtype: object

In [8]:
# staring to define a function to clean the data. troubleshooting and ajustments were decided in the cell below this one.
def cleaning_movie(movie):
    """wiki_movie_file is a json file containing a list of dictionaries with each index as a different movie 
    with different key:value pairs. This function is specially designed to clean that specific dataset. 
    Wikipedia is managed by many people and they do not all use the same words.
    """
    alternate_titles = dict()
    fixed_movie = dict(movie)
###  Start with alternate titles stored in language keys and merge them into one.
###  Language keys i could find:      
    language_keys = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']
    for key in language_keys:
        if key in fixed_movie.keys():
            alternate_titles[key] = fixed_movie[key]
            fixed_movie.pop(key)
        else:
            pass
##  if there were alternate titles, add them to the movie.       
    if len(alternate_titles) > 0:
        fixed_movie['alternate_titles'] = alternate_titles
    else:
        pass
    return fixed_movie



In [9]:
## using this cell to debug the above function and decide on key(s) to keep, merge, and delete. 
## Starting with the alternate titles hidden in the language keys.
language_keys1 = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']

cleaning_wiki2 = [cleaning_movie(movie) for movie in cleaning_wiki]

## find a column to remove, add it to the list, check the columns again.
columns = sorted(pd.DataFrame(cleaning_wiki2).columns.to_list())
columns


['Actor control',
 'Adaptation by',
 'Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Country of origin',
 'Created by',
 'Directed by',
 'Distributed by',
 'Distributor',
 'Edited by',
 'Editor(s)',
 'Engine(s)',
 'Executive producer(s)',
 'Followed by',
 'Format(s)',
 'Genre',
 'Genre(s)',
 'Language',
 'Music by',
 'Narrated by',
 'Original language(s)',
 'Original network',
 'Original release',
 'Picture format',
 'Preceded by',
 'Produced by',
 'Producer(s)',
 'Production company',
 'Production company(s)',
 'Production location(s)',
 'Productioncompanies ',
 'Productioncompany ',
 'Release date',
 'Release(s)',
 'Running time',
 'Screen story by',
 'Screenplay by',
 'Starring',
 'Story by',
 'Suggested by',
 'Theme music composer',
 'Voices of',
 'Written by',
 'alternate_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [10]:
## Where I am at for this point. 
pd.DataFrame(cleaning_wiki2).info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7099 entries, 0 to 7098
Data columns (total 55 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   url                     7099 non-null   object
 1   year                    7099 non-null   int64 
 2   imdb_link               7074 non-null   object
 3   title                   7098 non-null   object
 4   Directed by             7099 non-null   object
 5   Produced by             6737 non-null   object
 6   Screenplay by           2323 non-null   object
 7   Story by                1004 non-null   object
 8   Based on                2196 non-null   object
 9   Starring                6913 non-null   object
 10  Narrated by             283 non-null    object
 11  Music by                6502 non-null   object
 12  Cinematography          6396 non-null   object
 13  Edited by               6398 non-null   object
 14  Productioncompany       4539 non-null   object
 15  Dist

In [11]:
def cleaning_movie(movie):
    """wiki_movie_file is a json file containing a list of dictionaries with each index as a different movie 
    with different key:value pairs. This function is specially designed to clean that specific dataset. 
    Wikipedia is managed by many people and they do not all use the same words.
    """
    alternate_titles = dict()
    fixed_movie = dict(movie)
###  Start with alternate titles stored in language keys and merge them into one.
###  Language keys i could find:      
    language_keys = ['Also known as','Arabic','Cantonese','Chinese','French', 'Hangul','Hebrew','Hepburn','Japanese','Literally',
        'Mandarin','McCune–Reischauer','Original title','Polish', 'Revised Romanization','Romanized','Russian',
        'Simplified','Traditional','Yiddish']
    for key in language_keys:
        if key in fixed_movie.keys():
            alternate_titles[key] = fixed_movie[key]
            fixed_movie.pop(key)
        else:
            pass
##  if there were alternate titles, add them to the movie.       
    if len(alternate_titles) > 0:
        fixed_movie['alternate_titles'] = alternate_titles
    else:
        pass
        
###  Alternative titles are fixed, now merge columns that are similar.     
    keys_to_merge = {'Director':'Directed by', 'Country': 'Country of origin', 'Distributor(s)':'Distributed by',
                     'Editor(s)':'Edited by',  'Language':'Original language(s)', 'Producer(s)':'Produced by',
                     'Genre(s)': 'Genre', 'Composer(s)': ['Music by', 'Theme music composer'], 
                     'Release date': ['Release(s)', 'Original release'], 'Distributor(s)':['Distributed by','Distributor'],
                     'Writer(s)':['Written by', 'Story by', 'Screenplay by', 'Screen story by', 'Adaptation by'],
                     'Production Comapany': ['Production company', 'Production company(s)', 'Productioncompanies ','Productioncompany ']  }
##  item = key, from the key:value pair and the key I want; values = the movie key(s) I do not want. 
    for item in keys_to_merge: 
        if type(keys_to_merge[item]) == type(list()):
            for n in keys_to_merge[item]:
                if n in fixed_movie.keys():
                    fixed_movie[item] = fixed_movie.pop(n)
                else:
                    pass
        else: 
            if keys_to_merge[item] in fixed_movie.keys():
                fixed_movie[item] = fixed_movie.pop(keys_to_merge[item])
            else:
                pass
    
    return fixed_movie




In [12]:
## using this cell to debug the above function and decide on key(s) to keep, merge, and delete. 
## keys that are the same idea need merged, key = the key I am keeping, Value = similar key(s).
keys_to_merge1 = {'Director':'Directed by', 'Country': 'Country of origin', 'Distributor(s)':'Distributed by',
                 'Editor(s)':'Edited by',  'Language':'Original language(s)', 'Producer(s)':'Produced by',
                 'Genre(s)': 'Genre', 'Composer(s)': ['Music by', 'Theme music composer'], 
                 'Release date': ['Release(s)', 'Original release'], 'Distributor(s)':['Distributed by','Distributor'],
                 'Writer(s)':['Written by', 'Story by', 'Screenplay by', 'Screen story by', 'Adaptation by'],
                 'Production Comapany': ['Production company', 'Production company(s)', 'Productioncompanies ','Productioncompany ']  }


cleaning_wiki3 = [cleaning_movie(movie) for movie in cleaning_wiki]

## find common columns, add them to the dictionary and check the columns again.
columns = sorted(pd.DataFrame(cleaning_wiki3).columns.to_list())
columns



['Actor control',
 'Animation by',
 'Audio format',
 'Based on',
 'Box office',
 'Budget',
 'Cinematography',
 'Color process',
 'Composer(s)',
 'Country',
 'Created by',
 'Director',
 'Distributor(s)',
 'Editor(s)',
 'Engine(s)',
 'Executive producer(s)',
 'Followed by',
 'Format(s)',
 'Genre(s)',
 'Language',
 'Narrated by',
 'Original network',
 'Picture format',
 'Preceded by',
 'Producer(s)',
 'Production Comapany',
 'Production location(s)',
 'Release date',
 'Running time',
 'Starring',
 'Suggested by',
 'Voices of',
 'Writer(s)',
 'alternate_titles',
 'imdb_link',
 'title',
 'url',
 'year']

In [13]:
## Where I am at for this point. 
cleaning_wiki_df3 = pd.DataFrame(cleaning_wiki3)
cleaning_wiki_df3.info()
# Down to 38 columns from 193.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7099 entries, 0 to 7098
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   url                     7099 non-null   object
 1   year                    7099 non-null   int64 
 2   imdb_link               7074 non-null   object
 3   title                   7098 non-null   object
 4   Based on                2196 non-null   object
 5   Starring                6913 non-null   object
 6   Narrated by             283 non-null    object
 7   Cinematography          6396 non-null   object
 8   Release date            7067 non-null   object
 9   Running time            6956 non-null   object
 10  Country                 6860 non-null   object
 11  Language                7014 non-null   object
 12  Budget                  4774 non-null   object
 13  Box office              5530 non-null   object
 14  Director                7099 non-null   object
 15  Dist

In [14]:
# extract the imdb id from the imdb link to link with other data sets.
cleaning_wiki_df3['imdb_link']   
pattern = r'(tt\d{7})'
cleaning_wiki_df3['imdb_id'] = cleaning_wiki_df3['imdb_link'].str.extract(pattern)

# checking for duplicate data
len(cleaning_wiki_df3['imdb_id'].unique()) # 7074 rows have imdb links, 7032 are unique. There are duplicate movies in the dataset.

7032

In [15]:
## Drop the duplicate rows
cleaning_wiki_df4 = pd.DataFrame(cleaning_wiki_df3.drop_duplicates(subset='imdb_id'))
cleaning_wiki_df4

Unnamed: 0,url,year,imdb_link,title,Based on,Starring,Narrated by,Cinematography,Release date,Running time,...,Created by,Preceded by,Suggested by,alternate_titles,Animation by,Color process,Engine(s),Actor control,Format(s),imdb_id
0,https://en.wikipedia.org/wiki/The_Adventures_o...,1990,https://www.imdb.com/title/tt0098987/,The Adventures of Ford Fairlane,"[Characters, by Rex Weiner]","[Andrew Dice Clay, Wayne Newton, Priscilla Pre...","Andrew ""Dice"" Clay",Oliver Wood,"[July 11, 1990, (, 1990-07-11, )]",102 minutes,...,,,,,,,,,,tt0098987
1,"https://en.wikipedia.org/wiki/After_Dark,_My_S...",1990,https://www.imdb.com/title/tt0098994/,"After Dark, My Sweet","[the novel, After Dark, My Sweet, by, Jim Thom...","[Jason Patric, Rachel Ward, Bruce Dern, George...",,Mark Plummer,"[May 17, 1990, (, 1990-05-17, ), (Cannes Film ...",114 minutes,...,,,,,,,,,,tt0098994
2,https://en.wikipedia.org/wiki/Air_America_(film),1990,https://www.imdb.com/title/tt0099005/,Air America,"[Air America, by, Christopher Robbins]","[Mel Gibson, Robert Downey Jr., Nancy Travis, ...",,Roger Deakins,"[August 10, 1990, (, 1990-08-10, )]",113 minutes,...,,,,,,,,,,tt0099005
3,https://en.wikipedia.org/wiki/Alice_(1990_film),1990,https://www.imdb.com/title/tt0099012/,Alice,,"[Alec Baldwin, Blythe Danner, Judy Davis, Mia ...",,Carlo Di Palma,"[December 25, 1990, (, 1990-12-25, )]",106 minutes,...,,,,,,,,,,tt0099012
4,https://en.wikipedia.org/wiki/Almost_an_Angel,1990,https://www.imdb.com/title/tt0099018/,Almost an Angel,,"[Paul Hogan, Elias Koteas, Linda Kozlowski]",,Russell Boyd,"December 19, 1990",95 minutes,...,,,,,,,,,,tt0099018
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7094,https://en.wikipedia.org/wiki/Holmes_%26_Watson,2018,https://www.imdb.com/title/tt1255919/,Holmes & Watson,"[Sherlock Holmes, and, Dr. Watson, by, Sir Art...","[Will Ferrell, John C. Reilly, Rebecca Hall, R...",,Oliver Wood,"[December 25, 2018, (, 2018-12-25, ), (United ...",90 minutes,...,,,,,,,,,,tt1255919
7095,https://en.wikipedia.org/wiki/Vice_(2018_film),2018,https://www.imdb.com/title/tt6266538/,Vice,,"[Christian Bale, Amy Adams, Steve Carell, Sam ...",,Greig Fraser,"[December 11, 2018, (, 2018-12-11, ), (, Samue...",132 minutes,...,,,,,,,,,,tt6266538
7096,https://en.wikipedia.org/wiki/On_the_Basis_of_Sex,2018,https://www.imdb.com/title/tt4669788/,On the Basis of Sex,,"[Felicity Jones, Armie Hammer, Justin Theroux,...",,Michael Grady,"[November 8, 2018, (, 2018-11-08, ), (, AFI Fe...",120 minutes,...,,,,,,,,,,tt4669788
7097,https://en.wikipedia.org/wiki/Destroyer_(2018_...,2018,https://www.imdb.com/title/tt7137380/,Destroyer,,"[Nicole Kidman, Sebastian Stan, Toby Kebbell, ...",,Julie Kirkwood,"[August 31, 2018, (, 2018-08-31, ), (, Telluri...",123 minutes,...,,,,,,,,,,tt7137380


In [16]:
# drop columns that are 90% or more null
columns_to_keep = [column for column in cleaning_wiki_df4 if cleaning_wiki_df4[column].isnull().sum() < (len(cleaning_wiki_df4['url'])*0.9)]
cleaning_wiki_df5 = cleaning_wiki_df4[columns_to_keep]
cleaning_wiki_df5.info() # down to 21 useful columns; from 193 messy, where 80%+ of the cells were null Values.

<class 'pandas.core.frame.DataFrame'>
Int64Index: 7032 entries, 0 to 7098
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   url                  7032 non-null   object
 1   year                 7032 non-null   int64 
 2   imdb_link            7031 non-null   object
 3   title                7031 non-null   object
 4   Based on             2181 non-null   object
 5   Starring             6850 non-null   object
 6   Cinematography       6343 non-null   object
 7   Release date         7000 non-null   object
 8   Running time         6893 non-null   object
 9   Country              6798 non-null   object
 10  Language             6948 non-null   object
 11  Budget               4738 non-null   object
 12  Box office           5486 non-null   object
 13  Director             7032 non-null   object
 14  Distributor(s)       6677 non-null   object
 15  Editor(s)            6486 non-null   object
 16  Produc

In [17]:
## Data types:  Budget, Box office, and Running time should be numbers not strings/lists/objects, Release Date and year should be a Datetime.  
# cleaning_wiki_df5.groupby(['Box office']).count() # error because mixture of data types
[oddity for oddity in cleaning_wiki_df5['Box office'] if type(oddity)==type(str())]  # sensible, not that dirty
[oddity for oddity in cleaning_wiki_df5['Box office'] if type(oddity)==type(list())]  # lists are a dirty mess. multiple currency types and odd entries
[oddity for oddity in cleaning_wiki_df5['Box office'] if (type(oddity)!=type(list())) and (type(oddity)!=type(str())) ] # lots of nan values
[(index, oddity) for index, oddity in enumerate(cleaning_wiki_df5['Box office']) if type(oddity)==type(list())]  # looking for a specific movie to check the wiki page

[(34, ['US$', '4,212,828']),
 (54, ['$6,698,361 (', 'United States', ')', '[2]']),
 (74, ['$6,488,144', '(US)', '[1]']),
 (126, ['US$1,531,489', '(domestic)']),
 (130, ['US$', '4,803,039']),
 (137, ['$92,706', '(domestic)']),
 (178, ['$3,331', '(USA)']),
 (204, ['$739,104', '(North America)', '[2]']),
 (211, ['$1.2 million', '(US)', '[1]']),
 (255, ['$14.6 million', '(North America)', '[3]']),
 (272, ['$38 million', '(US)', '[2]']),
 (279, ['$57.5 million', '(North America)', '[1]']),
 (339, ['£739,989 (UK)', '[1]', '$4,413,473 (US)', '[1]']),
 (344, ['$4,654,288 (', 'US', ')', '[1]']),
 (376, ['$6.4 million', '(North America)', '[1]']),
 (412, ['$46.7 million', '[4]', '[3]', '(USA)']),
 (488, ['$14.1 million', '[', 'citation needed', ']']),
 (512, ['$10.7 million', '(North America)', '[2]']),
 (532, ['$75.5 million', '(North America)', '[2]']),
 (564, ['$27.2 million', '(North America)', '[3]']),
 (615, ['HK$2,662,446', '(Hong Kong)']),
 (648, ['$13,747,138', '70,542 admissions (Franc

In [18]:
box_office = cleaning_wiki_df5['Box office'].dropna() # no point in modifying a nan value
len(box_office) # Results: 5486 items
box_office

0          $21.4 million
1           $2.7 million
2            $57,718,089
3             $7,331,647
4       $6,939,946 (USA)
              ...       
7093       $19.4 million
7094       $41.9 million
7095       $76.1 million
7096       $38.4 million
7097        $5.5 million
Name: Box office, Length: 5486, dtype: object

In [19]:
len(box_office[box_office.map(lambda x: type(x) != str)]) # 136 items are lists, not strings
box_office[box_office.map(lambda x: type(x) != str)]


34                           [US$, 4,212,828]
54      [$6,698,361 (, United States, ), [2]]
74                    [$6,488,144, (US), [1]]
126                [US$1,531,489, (domestic)]
130                          [US$, 4,803,039]
                        ...                  
7003               [$99.6, million, [4], [5]]
7017                   [$365.6, million, [1]]
7018                         [$53.8, million]
7038                     [$435, million, [7]]
7071                   [$529.3, million, [4]]
Name: Box office, Length: 136, dtype: object

In [20]:
# make a regular expression pattern for the strings
pattern1 = r'(\$\d+\.?\d*\s*[bm]illi?on)'  # searching for pattern that is similar to: "$45.3 million/billion"

# counting occurances of the first pattern
box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False).sum()  # 3826 items in the list with this pattern
matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False) # save the filter

box_office.str.extract(pattern1, flags=re.IGNORECASE).dropna()  # View the items matching pattern #1

  box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False).sum()  # 3826 items in the list with this pattern
  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False) # save the filter


Unnamed: 0,0
0,$21.4 million
1,$2.7 million
10,$195.3 million
11,$53.2 million
12,$15.7 million
...,...
7093,$19.4 million
7094,$41.9 million
7095,$76.1 million
7096,$38.4 million


In [21]:
pattern2 = r'(\$\d{1,3}(?:[,\.]\d{3})+\.?\d*)(?!\s*[bm]illi?on)'   # searching for pattern that is similar to $123,456,789.0
# counting occurances of the second pattern
box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False).sum()  # 1490 items in the list with this pattern
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False) # save the filter
box_office.str.extract(pattern2, flags=re.IGNORECASE).dropna()  # view the items matching pattern 2

  box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False).sum()  # 1490 items in the list with this pattern
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False) # save the filter


Unnamed: 0,0
2,"$57,718,089"
3,"$7,331,647"
4,"$6,939,946"
9,"$855,810"
15,"$12,626,043"
...,...
7045,"$401,463"
7051,"$260,136"
7062,"$100,116"
7067,"$19,996"


In [22]:
box_office[~matches_from_one & ~matches_from_two] # this result matches the search I did Earlier for data types.
problem_lists = box_office[~matches_from_one & ~matches_from_two]

# solving for the easy ones, they fit the pattern and just the wrong data type
for i in problem_lists.index:
    if len(re.findall(pattern1, str(problem_lists[i]))) == 1: # fits the pattern
        problem_lists[i] = re.findall(pattern1, str(problem_lists[i]))[0]
    elif len(re.findall(pattern2, str(problem_lists[i]))) == 1: # fits the pattern
        problem_lists[i] = re.findall(pattern2, str(problem_lists[i]))[0]
    elif len(re.findall(pattern1, str(problem_lists[i]))) > 1: # saving for easier parsing later
        problem_lists[i] = re.findall(pattern1, str(problem_lists[i]))[1]
    elif len(re.findall(pattern2, str(problem_lists[i]))) > 1:  # saving for easier parsing later
        problem_lists[i] = re.findall(pattern2, str(problem_lists[i]))[1]
    else:
        pass
    
problem_lists

34                [US$, 4,212,828]
54                      $6,698,361
74                      $6,488,144
110             $4.35-4.37 million
126                     $1,531,489
                   ...            
7003    [$99.6, million, [4], [5]]
7017        [$365.6, million, [1]]
7018              [$53.8, million]
7038          [$435, million, [7]]
7071        [$529.3, million, [4]]
Name: Box office, Length: 162, dtype: object

In [23]:
# save the changes to the box_office
for p in problem_lists.index:
    for b in box_office.index:
        if p == b:
#             if type(problem_lists[p]) == str:
            box_office[b] = problem_lists[p]
#             else:
#                 continue
        else:
            continue

# Rerun the two filters to see what still needs fixed
matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)            
matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)
box_office[~matches_from_one & ~matches_from_two]

  matches_from_one = box_office.str.contains(pattern1, flags=re.IGNORECASE, na=False)
  matches_from_two = box_office.str.contains(pattern2, flags=re.IGNORECASE, na=False)


34                [US$, 4,212,828]
110             $4.35-4.37 million
130               [US$, 4,803,039]
602                     $5000 (US)
734                [$, 11,146,270]
                   ...            
7003    [$99.6, million, [4], [5]]
7017        [$365.6, million, [1]]
7018              [$53.8, million]
7038          [$435, million, [7]]
7071        [$529.3, million, [4]]
Name: Box office, Length: 67, dtype: object

In [43]:
modified_p1 = r"(\$\d{1,3}\.?\d*',\s*'[bm]illi?on)"  # for when 43.5 Million is broken into '4.35' and 'million'
modified_p2 = r"(\$', '\d{1,3}(?:[,\.]\d{3})+\.?\d*)"
pattern3 = r"(\$\d*\.?\d*)(?:[-—–]\d*\.?\d*)(\s[bm]illi?on)"

problem_lists = box_office[~matches_from_one & ~matches_from_two]
for i in problem_lists.index:  
    if len(re.findall(modified_p1, str(problem_lists[i]))) > 0:
        problem_lists[i] = re.findall(modified_p1, str(problem_lists[i]))[0].replace("', '", " ")
#         print(i, problem_lists[i])
    elif len(re.findall(modified_p2, str(problem_lists[i]))) >0:
        problem_lists[i] = re.findall(modified_p2, str(problem_lists[i]))[0].replace("', '", "")
    elif len(re.findall(pattern3, str(problem_lists[i]))) >0:
        fix = re.findall(pattern3, str(problem_lists[i]))
        problem_lists[i] = fix[0][0] + fix[0][1]
    else:
        print(i, problem_lists[i])
        pass
    


602 $5000 (US)
961 $ 50,004
1073 35,254,617
1449 $ 11,829,959
1483 £3 million
1870 ¥1.1 billion
2037 N/A
2096 $309
2135 ['US$', '171.8 million', '[9]']
2270 ['$ 1,223,034', '(', 'domestic', ')']
2672 926,423 admissions (France)
2704 ['$', '1.7 million (US) (sub-total)']
3096 ['$32', '[2]', '–33.1 million', '[1]']
3640 TBA
3889 CN¥3.650 million (China)
4128 £7,385,434
4318 $20-30
4575 $45.2k (only in Turkey)
4677 ['USD$', '8.2 million', '[2]']
5380 ['$', '142 million', '[3]']
5465 £2.56
5802 413 733$
6034 Unknown
6391 $111k
6392 $588
6615 less than $372
6851 ['$', '41 million', '[3]']
6865 8 crore
