In [31]:
import pandas as pd
import regex
from pathlib import Path

### Load Data

In [32]:
"""
Get all Files and concatenate
"""
# Path to data files (might change)
parent_path = Path("..", "movies").absolute()

files_path: list[Path] = []
for data_path in parent_path.iterdir():
    if regex.match(".+/film_info_part_\d.csv$", str(data_path)) != None:
        files_path.append(data_path)

# read all datafiles and concatenate in pandas
dfs = []
lengths = []
for file in files_path:
    df = pd.read_csv(file)
    lengths.append(len(df))
    dfs.append(df)

dataframe = pd.concat(dfs, ignore_index=True)
dataframe

Unnamed: 0.1,Unnamed: 0,title,year,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,rottentomatoes_length,infobox
0,0.0,10 Rillington Place,1971,13 Reviews,62.0,"1,000+ Ratings",85.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1971,Drama,1h 51m,"{'Directed by': ['Richard Fleischer', '', ''],..."
1,1.0,100 Rifles,1969,2 Reviews,,500+ Ratings,41.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1969,Western,1h 50m,"{'Directed by': ['Tom Gries', '', ''], 'Screen..."
2,2.0,1001 Arabian Nights,1959,0 Reviews,,0 Ratings,,,1959,Kids & family/Fantasy,1h 15m,"{'Directed by': ['Jack Kinney', '', ''], 'Writ..."
3,3.0,10:30 P.M. Summer,1966,4 Reviews,,50+ Ratings,32.0,,1966,Drama/Mystery & thriller,1h 25m,"{'Directed by': ['Jules Dassin', '', ''], 'Wri..."
4,4.0,13 Frightened Girls,1963,3 Reviews,,50+ Ratings,17.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1963,Mystery & thriller,1h 29m,"{'Directed by': ['William Castle', '', ''], 'S..."
...,...,...,...,...,...,...,...,...,...,...,...,...
19361,,Last Chance Harvey,2008,157 Reviews,71.0,"25,000+ Ratings",53.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Romance,1h 32m,"{'Directed by': ['Joel Hopkins', '', ''], 'Wri..."
19362,,The Spirit,2008,115 Reviews,14.0,"100,000+ Ratings",25.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Action/Adventure,1h 42m,"{'Directed by': ['Frank Miller', '', ''], 'Scr..."
19363,,Valkyrie,2008,198 Reviews,62.0,"100,000+ Ratings",65.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,History/Drama,2h 0m,"{'Directed by': ['Bryan Singer', '', ''], 'Wri..."
19364,,Waltz with Bashir,2008,154 Reviews,96.0,"25,000+ Ratings",91.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Documentary/Animation,1h 30m,"{'Original title': ['', ''], 'Directed by': ['..."


### Filter titles

In [33]:
# There shouldn't be any losses here
dataframe = dataframe.loc[~(dataframe["title"].isnull() | dataframe["title"].isna())]

In [34]:
dataframe

Unnamed: 0.1,Unnamed: 0,title,year,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,rottentomatoes_length,infobox
0,0.0,10 Rillington Place,1971,13 Reviews,62.0,"1,000+ Ratings",85.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1971,Drama,1h 51m,"{'Directed by': ['Richard Fleischer', '', ''],..."
1,1.0,100 Rifles,1969,2 Reviews,,500+ Ratings,41.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1969,Western,1h 50m,"{'Directed by': ['Tom Gries', '', ''], 'Screen..."
2,2.0,1001 Arabian Nights,1959,0 Reviews,,0 Ratings,,,1959,Kids & family/Fantasy,1h 15m,"{'Directed by': ['Jack Kinney', '', ''], 'Writ..."
3,3.0,10:30 P.M. Summer,1966,4 Reviews,,50+ Ratings,32.0,,1966,Drama/Mystery & thriller,1h 25m,"{'Directed by': ['Jules Dassin', '', ''], 'Wri..."
4,4.0,13 Frightened Girls,1963,3 Reviews,,50+ Ratings,17.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1963,Mystery & thriller,1h 29m,"{'Directed by': ['William Castle', '', ''], 'S..."
...,...,...,...,...,...,...,...,...,...,...,...,...
19361,,Last Chance Harvey,2008,157 Reviews,71.0,"25,000+ Ratings",53.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Romance,1h 32m,"{'Directed by': ['Joel Hopkins', '', ''], 'Wri..."
19362,,The Spirit,2008,115 Reviews,14.0,"100,000+ Ratings",25.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Action/Adventure,1h 42m,"{'Directed by': ['Frank Miller', '', ''], 'Scr..."
19363,,Valkyrie,2008,198 Reviews,62.0,"100,000+ Ratings",65.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,History/Drama,2h 0m,"{'Directed by': ['Bryan Singer', '', ''], 'Wri..."
19364,,Waltz with Bashir,2008,154 Reviews,96.0,"25,000+ Ratings",91.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Documentary/Animation,1h 30m,"{'Original title': ['', ''], 'Directed by': ['..."


### Filter Infobox

In [35]:
"""
We want to preserve as much information as possible
The most important ones are budget, box office and release date (with month) which will have to be included
"""
import json

# Create pattern that ONLY changes " to ' when not needed for Names/language e.g.: "Edmond O'Brian"
_quote_mark_pattern = regex.compile("([A-Za-z][A-Za-z\s]+\"[A-Za-z]+)(?=[^\"]*\".*)")
_release_pattern = regex.compile(r"(\\xa0(?=\d+)(\d+))") # double enclosure group, potentially leading 0 
_structured_number_pattern = regex.compile('(?<=\".?)((\d+,?)+)(?=\")') # commas within natural numbers
_too_many_quotes_pattern = regex.compile('(\"{3,})')    # if 3 or more behind each other

def to_dict_transformer(x: str) -> dict | None:
    x = x.replace("'", "\"")    # needed for json
    # revert wanted '
    for p in _quote_mark_pattern.findall(x):
        x = x.replace(p, p.replace("\"", "'"))

    # replace broken numbers with correct ones
    # for broken, nums in _release_pattern.findall(x):
    #     x = x.replace(broken, f" {int(nums)}") 
    x = x.replace("\\xa0", " ")
    
    for broken, _ in _structured_number_pattern.findall(x):
        x = x.replace(broken, broken.replace(",", ""))

    for p in _too_many_quotes_pattern.findall(x):
        x = x.replace(p, '""')

    try:
        ret_val: dict = json.loads(x)
    except:
        ret_val = None  # most errors should now be fixed anyway
    return ret_val

# _str = 'ributed by": ["Loew"s, Inc.", "", ""], "Release date": ["J'
# print(_quote_mark_pattern.findall(_str))

"""
We can't use dataframe.transform here since then all the " and ' replacements are just reversed by panda
-> create new tags for each category at once
"""
dicts: list[dict[str, str] | None] = []
for dict_str in dataframe["infobox"]:
    dicts.append(to_dict_transformer(dict_str))


# Pattern that recognizes plural (s at end, not ies) and removes it
# words shorter than 3 letters are not plurals
_plural_pattern = regex.compile("(?<!i)(?<=[A-za-z]{3,})s(?=_?)(?![A-za-z])")

synonyms_dict = {}
local_path = Path("..", "filter").absolute()
with open(Path(local_path, "synonyms.json")) as syns:
    synonyms_list: list[dict] = json.load(syns)
    # pack into one dict
    for _dic in synonyms_list:
        for _key, _list in _dic.items():
            synonyms_dict[_key] = _list

# create reverse lookup table
reverse_synonym_dict = {item: _key for _key, _list in synonyms_dict.items() for item in _list}

def replace_synonyms(in_str: str) -> str:
    base_synonym = reverse_synonym_dict.get(in_str)
    return base_synonym if base_synonym else in_str

def normalize_keys(in_str: str) -> str:
    # basic compression
    in_str = in_str.lower().strip().replace(" ", "_")
    # somtimes numbers are somehow included which we're gonna remove to get a smaller set
    # we're also gonna remove enclosing brackets around a letter
    in_str = regex.sub("\d|\(|\)", "", in_str)
    # some tags are given are in plural some in singular. We're gonna reduce that to always singular by 
    # simply assuming that only plurals end with 's'ArithmeticError
    in_str = regex.sub("ies", "y", in_str)
    # apply regex based plural filter
    in_str = _plural_pattern.sub("", in_str)

    return in_str

category_counter: dict[str, int] = dict()
# normalize categories
for dic in dicts:
    if dic != None:
        key_copy = set(dic.keys())
        for c_key in key_copy:
            normalized_key = normalize_keys(c_key)
            
            # replace synonyms
            normalized_key = replace_synonyms(normalized_key)

            dic[normalized_key] = dic.pop(c_key)

            # add new entry or increase counter
            if category_counter.get(normalized_key):
                category_counter[normalized_key] += 1
            else:
                category_counter[normalized_key] = 1

print(category_counter)

{'produced_by': 17451, 'distributed_by': 16310, 'language': 18301, 'screenplay_by': 6186, 'based_on': 5749, 'edited_by': 16629, 'music_by': 16598, 'cinematography': 16931, 'directed_by': 18745, 'release_date': 18434, 'production_company': 13736, 'country': 18191, 'running_time': 17872, 'starring': 18193, 'box_office': 11383, 'budget': 8976, 'color_proces': 434, 'written_by': 14545, 'narrated_by': 451, 'adapted_by': 19, 'text': 3, 'original_title': 26, 'published_by': 88, 'illustrated_by': 21, 'author': 40, 'published': 17, '': 2697, 'released': 1453, 'media_type': 22, 'genre': 196, 'recorded': 586, 'length': 990, 'dialogue_by': 15, 'country_of_origin': 192, 'original_release': 200, 'original_language': 208, 'original_network': 190, 'additional_dialogue_by': 4, 'original_air_date': 3, 'featured_music': 2, 'teleplay_by': 3, 'literally': 131, 'publication_date': 32, 'theme_music_composer': 65, 'studio': 115, 'date_premiered': 16, 'place_premiered': 12, 'animation_by': 19, 'end_date': 2, '

### Convert Info Box into new Tags

In [36]:
"""
Split dictionary into single data types for easier referencing
"""
# how much percent of the tag with the most occurences another tag MUST have to be included in the dataset
INCLUSION_MARGIN = 1/3
MAX_COUNT = max(category_counter.values())
LOWER_BOUNDARY = int(INCLUSION_MARGIN * MAX_COUNT)

assert(len(dicts) == len(dataframe))

valid_tags: list[tuple[str, int]] = list(filter(lambda _tag_val_tup: _tag_val_tup[1] >= LOWER_BOUNDARY, category_counter.items()))
new_col_dict: dict[str, list] = {_key: [] for _key, _ in valid_tags}

for dic in dicts:
    for tag, _ in valid_tags:
        if dic: # could be None if not readable
            # get all rows with this tag and add as column to dataframe
            item: str | None = dic.get(tag)
            new_col_dict[tag].append(item)
        else:
            new_col_dict[tag].append(None)

# check for any lost items
for _tag, _list in new_col_dict.items():
    assert(len(_list) == len(dataframe))

# add columns to dataframe
existing_keys = list(dataframe.keys())
for _tag, _list in new_col_dict.items():
    # prevent overwriting
    if _tag in existing_keys:
        _tag += "_info"
    dataframe[_tag] = _list

### Filter length

In [37]:
"""
We might want to include the runtime and convert it into minutes only 
"""

_pattern = regex.compile("^(\d+)h (\d+)m$")
def movie_runtime_transformer(x):
    match = _pattern.match(x.strip())
    # convert into minutes only
    ret_val = None
    if match:
        ret_val = int(match.group(1))*60 + int(match.group(2))
    return ret_val

# replace original length
dataframe["rottentomatoes_length"] = dataframe["rottentomatoes_length"].transform(movie_runtime_transformer)

### Filter Review counts

In [38]:
_pattern = regex.compile("^\D*(\d+).*$")
def review_count_transformer(x: str):
    x = x.replace(",", "")  # integer are divided by ',' for every 3 digits
    match = _pattern.match(x.strip())
    # convert into number of reviews only
    ret_val = 0
    
    if match:
        ret_val = int(match.group(1))
    return ret_val

# replace original counts
dataframe["audience_count"] = dataframe["audience_count"].transform(review_count_transformer)
dataframe["critics_count"] = dataframe["critics_count"].transform(review_count_transformer)
dataframe

Unnamed: 0.1,Unnamed: 0,title,year,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,...,cinematography,directed_by,release_date,production_company,country,running_time,starring,box_office,budget,written_by
0,0.0,10 Rillington Place,1971,13,62.0,1000,85.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1971,Drama,...,"[Denys Coop, , ]","[Richard Fleischer, , ]","[29 January 1971, (UK), \n \n, ]","[Genesis Productions, Filmways Pictures]","[United Kingdom, , ]","[111 min, , ]","[Richard Attenborough, Judy Geeson, John Hurt,...",,,
1,1.0,100 Rifles,1969,2,,500,41.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1969,Western,...,"[Cecilio Paniagua, , ]","[Tom Gries, , ]","[March 26, 1969, \n \n, ]","[Marvin Schwartz Productions, ]","[United States, , ]","[110 minutes, , ]","[Jim Brown, Raquel Welch, Burt Reynolds, , ]","[$3.5 million (US/ Canada rentals), , ]","[$3920000, , ]",
2,2.0,1001 Arabian Nights,1959,0,,0,,,1959,Kids & family/Fantasy,...,,"[Jack Kinney, , ]","[December 1, 1959, \n \n, ]","[, UPA]","[United States, , ]","[75 minutes, , ]","[Jim Backus, Kathryn Grant, Dwayne Hickman, Ha...",,"[$2 million, , ]","[Dick Shaw, , Leo Salkin, , Lew Keller, E..."
3,3.0,10:30 P.M. Summer,1966,4,,50,32.0,,1966,Drama/Mystery & thriller,...,"[Gábor Pogány, , ]","[Jules Dassin, , ]","[October 24, 1966, \n \n, ]",,"[United States, , ]","[85 minutes, , ]","[Melina Mercouri, Romy Schneider, , ]",,,"[ (novel), Jules Dassin, Marguerite Duras, , ]"
4,4.0,13 Frightened Girls,1963,3,,50,17.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1963,Mystery & thriller,...,"[Gordon Avil, , ]","[William Castle, , ]","[July 1963, \n \n, ]","[William Castle Pictures, ]","[United States, , ]","[89 minutes, , ]","[Kathy Dunn, Murray Hamilton, Joyce Taylor, Hu...",,,"[Otis L. Guernsey Jr., , ]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19361,,Last Chance Harvey,2008,157,71.0,25000,53.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Romance,...,"[John de Borman, , ]","[Joel Hopkins, , ]","[December 25, 2008, \n \n, ]",,"[United States, , ]","[92 minutes, , ]","[Dustin Hoffman, Emma Thompson, Kathy Baker, J...","[$32.5 million , , ]","[$5 million, , ]","[Joel Hopkins, , ]"
19362,,The Spirit,2008,115,14.0,100000,25.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Action/Adventure,...,"[Bill Pope, , ]","[Frank Miller, , ]","[December 25, 2008, \n \n, ]","[, OddLot Entertainment DarkLot Entertainment ...","[United States, , ]","[103 minutes, , ]","[Gabriel Macht, Eva Mendes, Sarah Paulson, Dan...","[$39 million, , ]","[$60 million, , ]",
19363,,Valkyrie,2008,198,62.0,100000,65.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,History/Drama,...,"[Newton Thomas Sigel, , ]","[Bryan Singer, , ]","[December 25, 2008, (United States), \n \n, ]","[, , United Artists, Bad Hat Harry Productions...","[United States, Germany, \n \n, ]","[121 minutes, , ]","[Tom Cruise, Kenneth Branagh, Bill Nighy, Tom ...","[$201.5 million, , ]","[$75–90 million, , ]","[Nathan Alexander, Christopher McQuarrie, \n \..."
19364,,Waltz with Bashir,2008,154,96.0,25000,91.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Documentary/Animation,...,,"[Ari Folman, , ]","[13 May 2008, (, ), 5 June 2008, (Israel), C...","[, , Bridgit Folman Film Gang, Les Films d'Ici...","[Germany, France, Israel, \n \n, ]","[90 minutes, , ]","[Ari Folman, , ]","[$11.1 million, , ]","[$1.3 million, , ]","[Ari Folman, , ]"


In [39]:
test = "'Release dates': ['21\\xa0May\\xa02015', ' (', ')', '27\\xa0August\\xa02015', ' (China & Hong Kong)', '28\\xa0August\\xa02015', ' (Taiwan)', 'Cannes', '\\n "
_release_pattern.findall(test)

[('\\xa02015', '2015'), ('\\xa02015', '2015'), ('\\xa02015', '2015')]

In [40]:
dataframe["infobox"]

0        {'Directed by': ['Richard Fleischer', '', ''],...
1        {'Directed by': ['Tom Gries', '', ''], 'Screen...
2        {'Directed by': ['Jack Kinney', '', ''], 'Writ...
3        {'Directed by': ['Jules Dassin', '', ''], 'Wri...
4        {'Directed by': ['William Castle', '', ''], 'S...
                               ...                        
19361    {'Directed by': ['Joel Hopkins', '', ''], 'Wri...
19362    {'Directed by': ['Frank Miller', '', ''], 'Scr...
19363    {'Directed by': ['Bryan Singer', '', ''], 'Wri...
19364    {'Original title': ['', ''], 'Directed by': ['...
19365    {'Directed by': ['Sam Mendes', '', ''], 'Scree...
Name: infobox, Length: 19366, dtype: object

In [41]:
dataframe.keys()

Index(['Unnamed: 0', 'title', 'year', 'critics_count', 'critics_score',
       'audience_count', 'audience_score', 'suppliers_list',
       'rottentomatoes_year', 'rottentomatoes_genre', 'rottentomatoes_length',
       'infobox', 'produced_by', 'distributed_by', 'language', 'edited_by',
       'music_by', 'cinematography', 'directed_by', 'release_date',
       'production_company', 'country', 'running_time', 'starring',
       'box_office', 'budget', 'written_by'],
      dtype='object')

### Reduce Lists within entries

In [42]:
dataframe.loc[0]

Unnamed: 0                                                             0.0
title                                                  10 Rillington Place
year                                                                  1971
critics_count                                                           13
critics_score                                                         62.0
audience_count                                                        1000
audience_score                                                        85.0
suppliers_list           [('vudu', 'Rent/buy'), ('amazon-prime-video-us...
rottentomatoes_year                                                   1971
rottentomatoes_genre                                                 Drama
rottentomatoes_length                                                111.0
infobox                  {'Directed by': ['Richard Fleischer', '', ''],...
produced_by                          [Leslie Linder, Martin Ransohoff, , ]
distributed_by           

In [43]:
"""
Due to the data collection process, the infobox contains many empty elements
These need to be reduced and ideally into one element only
"""
def reduce_list(lis: list[str] | None) -> list:
    if lis:
        reduced_list = []
        for item in lis:
            item = item.replace("\n", "").strip().lower()
            if item:
                reduced_list.append(item)
        return reduced_list
    else:
        return None

# transform data
for _tag in new_col_dict.keys():
    dataframe[_tag] = dataframe[_tag].transform(reduce_list)
    # print(dataframe[_tag].transform(reduce_list))

### Convert Release dates

In [44]:
dataframe["release_date"]

0                                  [29 january 1971, (uk)]
1                                         [march 26, 1969]
2                                       [december 1, 1959]
3                                       [october 24, 1966]
4                                              [july 1963]
                               ...                        
19361                                  [december 25, 2008]
19362                                  [december 25, 2008]
19363                 [december 25, 2008, (united states)]
19364    [13 may 2008, (, ), 5 june 2008, (israel), can...
19365    [december 26, 2008, (united states), january 3...
Name: release_date, Length: 19366, dtype: object

In [45]:
"""
There are some different variants for writing the date unfortunately so we will use regex to extrapolate month day and year and bring it into the following format:
<yyyy>-<mm>-<dd>
furthermore, there are different release dates. We're only gonna look at the oldest release (which came first)
"""
from dateutil.parser import *

# test = dataframe["release_date"][12818]
# print(str(parse(test[0]).date()))
# print(parse(test[0]) < parse("21.05.2016"))

def get_first_date(dates: list[str] | None) -> str | None:
    if dates:
        parsed_dates = []
        for line in dates:
            try:
                _date = parse(line)
                parsed_dates.append(_date)
            except:
                # unrecognizable date or simply something different (e.g. countries)
                continue
        # get oldest date as str
        return str(min(parsed_dates).date()) if parsed_dates else None
    return None

dataframe["release_date"] = dataframe["release_date"].transform(get_first_date)

# recalculate index
dataframe.reset_index(inplace=True, drop=True)

In [46]:
# amount of valid dates
len(dataframe["release_date"].loc[~dataframe["release_date"].isnull()])

18055

### Filter years

In [47]:
"""
There might occur some differences between the Release Date on Wikipedia and Rotten Tomatoes possibly indicating a different movie alltogether
We're gonna simply remove the ones having a difference bigger than 1 year
"""

def _get_year(entry: str) -> int:
    parse(entry).year

wiki_list_year = dataframe["year"].copy()
wiki_list_year[dataframe["release_date"].notnull()] = dataframe["release_date"][dataframe["release_date"].notnull()].transform(_get_year)

_filter = abs(wiki_list_year - dataframe["rottentomatoes_year"]) > 1
different_movies = dataframe.loc[_filter]

print(f"removed {len(different_movies)} titles")
dataframe = dataframe.loc[~_filter]
# remove duplicates
dataframe.drop_duplicates(["title", "release_date"], inplace=True)
dataframe

removed 222 titles


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe.drop_duplicates(["title", "release_date"], inplace=True)


Unnamed: 0.1,Unnamed: 0,title,year,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,...,cinematography,directed_by,release_date,production_company,country,running_time,starring,box_office,budget,written_by
0,0.0,10 Rillington Place,1971,13,62.0,1000,85.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1971,Drama,...,[denys coop],[richard fleischer],1971-01-29,"[genesis productions, filmways pictures]",[united kingdom],[111 min],"[richard attenborough, judy geeson, john hurt]",,,
1,1.0,100 Rifles,1969,2,,500,41.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1969,Western,...,[cecilio paniagua],[tom gries],1969-03-26,[marvin schwartz productions],[united states],[110 minutes],"[jim brown, raquel welch, burt reynolds]",[$3.5 million (us/ canada rentals)],[$3920000],
2,2.0,1001 Arabian Nights,1959,0,,0,,,1959,Kids & family/Fantasy,...,,[jack kinney],1959-12-01,[upa],[united states],[75 minutes],"[jim backus, kathryn grant, dwayne hickman, ha...",,[$2 million],"[dick shaw, leo salkin, lew keller, ed nofzige..."
3,3.0,10:30 P.M. Summer,1966,4,,50,32.0,,1966,Drama/Mystery & thriller,...,[gábor pogány],[jules dassin],1966-10-24,,[united states],[85 minutes],"[melina mercouri, romy schneider]",,,"[(novel), jules dassin, marguerite duras]"
4,4.0,13 Frightened Girls,1963,3,,50,17.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1963,Mystery & thriller,...,[gordon avil],[william castle],1963-07-21,[william castle pictures],[united states],[89 minutes],"[kathy dunn, murray hamilton, joyce taylor, hu...",,,[otis l. guernsey jr.]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19360,,Defiance,2008,189,59.0,250000,72.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,History/Drama,...,[eduardo serra],[edward zwick],2008-12-31,"[paramount vantage, bedford falls productions,...",[united states],[137 minutes],"[daniel craig, liev schreiber, jamie bell, ale...",[$51.2 million],[$32 million],"[clayton frohman, edward zwick]"
19361,,Last Chance Harvey,2008,157,71.0,25000,53.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Romance,...,[john de borman],[joel hopkins],2008-12-25,,[united states],[92 minutes],"[dustin hoffman, emma thompson, kathy baker, j...",[$32.5 million],[$5 million],[joel hopkins]
19362,,The Spirit,2008,115,14.0,100000,25.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Action/Adventure,...,[bill pope],[frank miller],2008-12-25,[oddlot entertainment darklot entertainment li...,[united states],[103 minutes],"[gabriel macht, eva mendes, sarah paulson, dan...",[$39 million],[$60 million],
19364,,Waltz with Bashir,2008,154,96.0,25000,91.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Documentary/Animation,...,,[ari folman],2008-05-13,"[bridgit folman film gang, les films d'ici, ra...","[germany, france, israel]",[90 minutes],[ari folman],[$11.1 million],[$1.3 million],[ari folman]


### Box Office/Budget Filter

In [48]:
cols = [k for k in new_col_dict.keys()]
dataframe[cols]

Unnamed: 0,produced_by,distributed_by,language,edited_by,music_by,cinematography,directed_by,release_date,production_company,country,running_time,starring,box_office,budget,written_by
0,"[leslie linder, martin ransohoff]",[columbia pictures],[english],[ernest walter],[john dankworth],[denys coop],[richard fleischer],1971-01-29,"[genesis productions, filmways pictures]",[united kingdom],[111 min],"[richard attenborough, judy geeson, john hurt]",,,
1,[marvin schwartz],[20th century fox],"[english, spanish]",[robert l. simpson],[jerry goldsmith],[cecilio paniagua],[tom gries],1969-03-26,[marvin schwartz productions],[united states],[110 minutes],"[jim brown, raquel welch, burt reynolds]",[$3.5 million (us/ canada rentals)],[$3920000],
2,[stephen bosustow],[columbia pictures],[english],,[george duning],,[jack kinney],1959-12-01,[upa],[united states],[75 minutes],"[jim backus, kathryn grant, dwayne hickman, ha...",,[$2 million],"[dick shaw, leo salkin, lew keller, ed nofzige..."
3,"[jules dassin, anatole litvak]",[lopert pictures corporation],[english],[roger dwyre],[cristóbal halffter],[gábor pogány],[jules dassin],1966-10-24,,[united states],[85 minutes],"[melina mercouri, romy schneider]",,,"[(novel), jules dassin, marguerite duras]"
4,[william castle],[columbia pictures],[english],[edwin h. bryant],[van alexander],[gordon avil],[william castle],1963-07-21,[william castle pictures],[united states],[89 minutes],"[kathy dunn, murray hamilton, joyce taylor, hu...",,,[otis l. guernsey jr.]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19360,"[edward zwick, pieter jan brugge]","[(north america), essential entertainment (int...","[english, russian]",[steven rosenblum],[james newton howard],[eduardo serra],[edward zwick],2008-12-31,"[paramount vantage, bedford falls productions,...",[united states],[137 minutes],"[daniel craig, liev schreiber, jamie bell, ale...",[$51.2 million],[$32 million],"[clayton frohman, edward zwick]"
19361,"[tim perell, nicola usborne]","[(united states), (international), overture fi...",[english],[robin sales],[dickon hinchliffe],[john de borman],[joel hopkins],2008-12-25,,[united states],[92 minutes],"[dustin hoffman, emma thompson, kathy baker, j...",[$32.5 million],[$5 million],[joel hopkins]
19362,"[deborah del prete, gigi pritzker, michael e. ...",[sony pictures],[english],[gregory nussbaum],[david newman],[bill pope],[frank miller],2008-12-25,[oddlot entertainment darklot entertainment li...,[united states],[103 minutes],"[gabriel macht, eva mendes, sarah paulson, dan...",[$39 million],[$60 million],
19364,"[ari folman, serge lalou, gerhard meixner, yae...",[sony pictures classics],[hebrew],[nili feller],[max richter],,[ari folman],2008-05-13,"[bridgit folman film gang, les films d'ici, ra...","[germany, france, israel]",[90 minutes],[ari folman],[$11.1 million],[$1.3 million],[ari folman]


In [49]:
inflation_table_US = pd.read_csv(Path(local_path, "inflation_rate_usa.csv"), dtype=float)
inflation_table_US["Year"] = inflation_table_US["Year"].transform(lambda x: int(x))
# TODO use IMF Table for different countries
FIRST_YEAR_US = 1913
LAST_YEAR_US = 2022
MONTH_MAPPING = {
    1: "Jan",
    2: "Feb",
    3: "Mar",
    4: "Apr",
    5: "May",
    6: "June",
    7: "July",
    8: "Aug",
    9: "Sep",
   10: "Oct",
   11: "Nov",
   12: "Dec",
}
"""
Inflation Table contains CPI in Columns Jan-Dec and Avg, Dec-Dec and Avg-Avg is already given in percent
"""

def inflation_calculator_US(value: float, from_year: int, to_year: int = 2022, from_month: int = None, to_month: int = None) -> float:
    """
    Calculate Inflation per year or even per month if <from_month> and <to_month> are specified
    Inflation can be efficiently calculated with CPI money*(<to_cpi> / <from_cpi>)
    Rounded to 2 decimals
    """
    if from_year < FIRST_YEAR_US or to_year > LAST_YEAR_US:
        if to_year < FIRST_YEAR_US or from_year > LAST_YEAR_US:
            raise ValueError("years are out of range of dataset")
    if from_month:
        if from_month < 1 or from_month > 12:
            raise ValueError("from months are out of range")
    if to_month:
        if to_month < 1 or to_month > 12:
            raise ValueError("to months are out of range")

    _from_table = inflation_table_US.loc[from_year-FIRST_YEAR_US]
    _to_table = inflation_table_US.loc[to_year-FIRST_YEAR_US]

    _from_pci: float = _from_table[MONTH_MAPPING[from_month]] if from_month else _from_table["Avg"]
    _to_pci: float = _to_table[MONTH_MAPPING[to_month]] if to_month else _to_table["Avg"]

    return round(value * (_to_pci / _from_pci), 2)

# inflation_calculator_US(1.0, 1913, 1917, 1, 12)

In [50]:
CURRENCY_CPI_MAPPING: dict[str: str] = {
    "US_DOLLAR" : "United States",
    "NZ_DOLLAR" : "New Zealand",
    "AUS_DOLLAR" : "Australia",
    "HK_DOLLAR" : "China, P.R.: Hong Kong",
    "NT_DOLLAR" : "China, P.R.: Hong Kong", # "Taiwan" is not in the Database assume same growth as Hong Kong
    "MX_DOLLAR" : "Mexico",
    "GB_POUND" : "United Kingdom",
    "EURO" : "Euro Area",
    "JP_YEN" : "Japan",
    "CN_YUAN" : "China, P.R.: Mainland",
    "RUBLES" : "Russian Federation",
    "SWED_KRON" : "Sweden",
    "INDIAN_RUPEE" : "India",
    "TURK_LIRA" : "Türkiye, Rep of",
    "MALAYSIA_RM" : "Malaysia",
}
inflation_table = pd.read_csv(Path(local_path, "CPI_timeSeries", "CPI_simple.csv"))

# TODO use IMF Table for different countries
FIRST_YEAR = 1950
LAST_YEAR = 2022

# TODO inflation calculator for different countries and conversion Rate!!

def inflation_calculator(currency: str, value: float, from_year: int, to_year: int = 2022, from_month: int = None, to_month: int = None) -> float:
    """
    Calculate Inflation per year or even per month if <from_month> and <to_month> are specified
    Inflation can be efficiently calculated with CPI money*(<to_cpi> / <from_cpi>)
    Rounded to 2 decimals.

    @param currency must be part of the translation_dict_CPI mapping!
    
    It is recommended to calculate up to 2017, and the convert to US $ since the exchange database wasn't updated anymore since may 2018
    """
    if from_year < FIRST_YEAR or to_year > LAST_YEAR:
        if to_year < FIRST_YEAR or from_year > LAST_YEAR:
            raise ValueError("years are out of range of dataset")
    if from_month:
        if from_month < 1 or from_month > 12:
            raise ValueError("from months are out of range")
    if to_month:
        if to_month < 1 or to_month > 12:
            raise ValueError("to months are out of range")

    _from_index = f"{from_year}" + (f"M{from_month}" if from_month else "")
    _to_index = f"{to_year}" + (f"M{to_month}" if to_month else "")

    country_inflation_table = inflation_table.loc[inflation_table["Country Name"] == CURRENCY_CPI_MAPPING[currency]].reset_index()

    _from_pci = country_inflation_table[_from_index][0]
    _to_pci = country_inflation_table[_to_index][0]

    return round(value * (_to_pci / _from_pci), 2)

In [51]:
print(inflation_calculator("US_DOLLAR", 1.0, 1950, 1980, 1, 12))
print(inflation_calculator_US(1.0, 1950, 1980, 1, 12))
# Close enough

3.49
3.67


In [52]:
CURRENCY_EXCHANGE_MAPPING: dict[str: str|None] = {
    "US_DOLLAR" : "U.S. Dollar",
    "NZ_DOLLAR" : "New Zealand Dollar",
    "AUS_DOLLAR" : "Australian Dollar",
    "HK_DOLLAR" : None,   # Hong Kong is not in the Database
    "NT_DOLLAR" : None,   # Taiwan is not in the Database
    "MX_DOLLAR" : "Mexican Peso",
    "GB_POUND" : "U.K. Pound Sterling",
    "EURO" : "Euro",
    "JP_YEN" : "Japanese Yen",
    "CN_YUAN" : "Chinese Yuan",
    "RUBLES" : "Russian Ruble",
    "SWED_KRON" : "Swedish Krona",
    "INDIAN_RUPEE" : "Indian Rupee",
    "TURK_LIRA" : None,    # Turkey is not in the Database
    "MALAYSIA_RM" : "Malaysian Ringgit",
}
exchange_table = pd.read_csv(Path(local_path, "exchange_rate", "currency_exchange_rates.csv"))
# take only one timestamp where No value is NaN
exchange_table = exchange_table.loc[exchange_table["Date"] == "2017-1-18"]

CURRENCY_EXCHANGE_VALUE_MAPPING: dict[str, float] = {
    base_key: float(exchange_table[ex_key]) for base_key, ex_key in CURRENCY_EXCHANGE_MAPPING.items() if ex_key
}
# add missing avg exchange rates for 2017, copied manually from https://www.exchangerates.org.uk
update_dict = {
    "HK_DOLLAR" : 7.7925,
    "NT_DOLLAR" : 30.4252,
    "TURK_LIRA" : 3.6462,
}
CURRENCY_EXCHANGE_VALUE_MAPPING.update(update_dict)

def convert_into_dollar(currency: str, value: float) -> float:
    """
    Takes a currency from CURRENCY_EXCHANGE_VALUE_MAPPING and converts it into dollar.
    This is ONLY somewhat accurate for January of 2017!!!
    """
    return value / CURRENCY_EXCHANGE_VALUE_MAPPING[currency]
    

### Combine Both functions to get money in US$ for 2022

In [53]:
def convert_value_to_recent_us_dollar(currency: str, value: float, from_year: int, from_month: int|None = None) -> float:
    """
    @param currency original currency of <value>
    @param value amount of original currency
    @param from_year how much inflation took place since the value was recent
    @param from_month which month in <from_year> 
    """
    exchangable_value = inflation_calculator(currency, value, from_year, 2017, from_month, 1)
    new_value = convert_into_dollar(currency, exchangable_value)
    return inflation_calculator("US_DOLLAR", new_value, 2017, 2022, 1, 12)

convert_value_to_recent_us_dollar("JP_YEN", 100, 1970, 1)
# CURRENCY_EXCHANGE_VALUE_MAPPING

3.48

In [54]:
"""
First we need to convert all written text into a natural number (e.g. 46.75 million = 46750000)
After that, the really difficult part starts by accounting for inflation and different currencies
We want US$ only at the end
"""

_number_pattern = regex.compile("(\d+)")
_reduced_mbillion_pattern = regex.compile("^((\d+\.?\d*)\s?(millions?|billions?|crores?))(?=.*)")
_mbillion_pattern = regex.compile("(?<=.*\s|\W)((\d+\.?\d*)\s?(millions?|billions?|crores?))(?=.*)")
# some entries already or only contain an equivalent for decades later than when the movei was released
# this needs to be regarded for the inflation adjustments
_equivalent_pattern = regex.compile(".*[\s\(]((\w?\w?\D)(\d+)\sin\s(\d{4})).*") # groups -> capture, currency, amount, year

"""
Predefined Pattern for the Value type of the return dict of money_heuristic
This is required for the inflation calculator to adjust the values more correctly
if one of the ATTRIBUTES items is None, use the original entry data to infer the used values
"""
CURRENCY = str|None
YEAR = int|None
MONEY_ATTRIBUTES = list[CURRENCY, YEAR]
AMOUNT_ATTRIBUTES = 2

NON_US_DOLLAR_SCORE = 6
# each tuple has Name of Currency/Country and associated score (loosley based on conversion factor)
# sorted with decreasing score!
_optional_value_pattern = "()"
# patterns first and last group are always the POTENTIAL number (maybe doesn't exist)
_currencies_patterns: list[tuple[str, "regex.Pattern", int]] = [
    ("US_DOLLAR", regex.compile("(\d+)?\s?((us)?\$)\s?(\d+)?"), 10),    # score must only be applied if NONE of the other _DOLLAR patterns matched!
    ("NZ_DOLLAR", regex.compile("(\d+)?\s?((au?)\$)\s?(\d+)?"), NON_US_DOLLAR_SCORE),
    ("AUS_DOLLAR", regex.compile("(\d+)?\s?((nz)\$)\s?(\d+)?"), NON_US_DOLLAR_SCORE),
    ("HK_DOLLAR", regex.compile("(\d+)?\s?((hk)\$)\s?(\d+)?"), NON_US_DOLLAR_SCORE),
    ("NT_DOLLAR", regex.compile("(\d+)?\s?((nt)\$)\s?(\d+)?"), NON_US_DOLLAR_SCORE),
    ("MX_DOLLAR", regex.compile("(\d+)?\s?((mx)\$)\s?(\d+)?"), NON_US_DOLLAR_SCORE),
    ("GB_POUND", regex.compile("(\d+)?\s?(£)\s?(\d+)?"), 5),
    ("EURO", regex.compile("(\d+)?\s?(€)\s?(\d+)?"), 5),
    ("JP_YEN", regex.compile("(\d+)?\s?((?<!cn)¥)\s?(\d+)?"), 3),
    ("CN_YUAN", regex.compile("(\d+)?\s?(cn¥|yuan|rmb)\s?(\d+)?"), 3),
    ("RUBLES", regex.compile("(\d+)?\s?(rub|₽)\s?(\d+)?"), 3),
    ("SWED_KRON", regex.compile("(\d+)?\s?(sek)\s?(\d+)?"), 3),
    ("INDIAN_RUPEE", regex.compile("(\d+)?\s?(₹)\s?(\d+)?"), 3),
    ("TURK_LIRA", regex.compile("(\d+)?\s?(₺)\s?(\d+)?"), 3),
    ("MALAYSIA_RM", regex.compile("(\d+)?\s?(rm)\s?(\d+)?"), 3),
]

def money_heuristic(entries: list[str]) -> tuple[int, MONEY_ATTRIBUTES] | None:
    """
    wikipedia often has multiple entries about the box office (global, within the country, other currencies, etc.)
    This function assigns a scoring system for these entries to estimate the most beneficial for our dataset.
    
    @return the most useful entry based on heuristics, condensed on number only, additional ATTRIBUTES needed for conversion/inflation
    """

    def multiplier(selector: str) -> float:
        """
        the equivalent in numbers to a word
        """
        return {
            "million": 10**6,
            "millions": 10**6,
            "crore": 10**7, # east asian for 10 million, also indian currency
            "crores": 10**7,
            "billion": 10**9,
            "billions": 10**9,
        }[selector]
    
    
    def amount_heuristic(_heuristics: dict[int, int], _entries: list[str]) -> None:
        """
        modifies heuristic based on number 
        """
        numbers: list[int] = []
        # get biggest number in one entry
        for index, entry in enumerate(_entries):
            numbers.append(-1)
            for num in _number_pattern.findall(entry):
                int_num = int(num)
                if int_num > numbers[-1]:
                    numbers[-1] = int_num

        # get heuristic value based on size
        argmax_list = [0 for i in range(len(_entries))]
        for index, num in enumerate(numbers):
            for i in range(len(_entries)):
                argmax_list[i] += 1 if numbers[i] > num else 0

        # apply scores
        for index, points in enumerate(argmax_list):
            _heuristics[index] += points
    

    def currency_heuristic(_heuristics: dict[int, int], _entries: list[str]) -> dict[int, MONEY_ATTRIBUTES]:
        """
        This method tries to find out which currencies are probably used in each entry.
        We want us$ if possible since we only need to account for inflation then and not for exchange factor as well
        """
        curr_heur: dict[int, int] = {}
        _attributes: dict[int, MONEY_ATTRIBUTES] = {} 
        for index, entry in enumerate(_entries):
            curr_heur[index] = 0    # initiate default score of +0
            _attributes[index] = [None for _ in range(AMOUNT_ATTRIBUTES)]   # initiate empty attribute list

            for cur_name, pattern, score in _currencies_patterns[:0:-1]: # use inverse list and omit US$
                if pattern.findall(entry):
                    curr_heur[index] = score
                    _attributes[index][0] = cur_name
            # check if other form of dollar was already found
            cur_name, pattern, score = _currencies_patterns[0]
            if curr_heur[index] != NON_US_DOLLAR_SCORE and pattern.findall(entry):
                curr_heur[index] = score
                _attributes[index][0] = cur_name    # US_DOLLAR

        # apply scores
        for index, points in curr_heur.items():
            _heuristics[index] += points
        return _attributes


    def equivalency_heuristic(_heuristics: dict[int, int], _entries: list[str], _attributes: dict[int, MONEY_ATTRIBUTES]) -> None:
        """
        Some entries already include conversions for a more recent timestamp than when the movie originally released.
        We want to utilize this extra information since this is more often more accurate (somtimes even the only) information.
        We give an additional +5 score for more recent information.

        This method additionally also filters out the pure number for equivalency terms only
        """
        for index, entry in enumerate(_entries):
            for _capture, currency, amount, year in _equivalent_pattern.findall(entry):
                _heuristics[index] += 5
                _attributes[index][1] = int(year)    # get date of already converted money
                # get currency of converted money
                for cur_name, pattern, _ in _currencies_patterns[::-1]:
                    if pattern.findall(currency):
                        _attributes[index][0] = cur_name
                        _entries[index] = amount
                        break   # don't check for multiple possibilites


    def pure_number_filter(_entries: list[str], _attributes: dict[int, MONEY_ATTRIBUTES]) -> list[int]:
        """
        Get number with associated currency (_entries MUST have run through currency_heuristic!)

        @return list with number only, type of currency is stored in attributes
        """
        def get_max_number(_entry: str) -> int:
            max: int = 0
            for str_num in _number_pattern.findall(_entry):
                num = int(str_num)  # shouldn't raise exceptions since guarded by regex
                if num > max:
                    max = num
            return max
        
        ret_entries: list[int] = _entries.copy()
        for index, entry in enumerate(_entries):
            pattern = None
            # get correct pattern (no dictionary used because of undeterministic order within the data structure)
            for cur_name, _pattern, _ in _currencies_patterns:
                if _attributes[index][0] == cur_name:
                    pattern = _pattern
                    break
            if pattern:
                # found pattern to currency, use first and last capture group to get amount
                for capture_groups in pattern.findall(entry):
                    if capture_groups[-1]:  # is empty if no number behind currency (is most likely the case)
                        ret_entries[index] = int(capture_groups[-1])
                    elif capture_groups[0]:  # is empty if no number before currency (happens somtimes)
                        ret_entries[index] = int(capture_groups[0])
                    else:   # Can happen due to equivalency_heuristic. should not happen otherwise
                        ret_entries[index] = get_max_number(entry)
            else:
                # unidentified currency, just get highest number, currency will be inferred by country of origin (or rather just Dollar?)
                ret_entries[index] = get_max_number(entry)
        return ret_entries


    # filter unwanted characters ',' (in numbers)
    filtered_entries: list[str] = [item.replace(",", "").strip().lower() for item in entries]

    # we assume a box office to include numbers in their string (could also be year, but most of the times)
    # if there isn't a number, concatenate the string with the entry left to it (or right in case its the first index) 
    to_keep: list[int] = []
    for index, entry in enumerate(filtered_entries):
        if not _number_pattern.findall(entry):
            if (index-1) not in to_keep and index+1 < len(filtered_entries):
                filtered_entries[index+1] = f"{entry} {filtered_entries[index+1]}"
            elif to_keep:
                filtered_entries[to_keep[-1]] += f" {entry}"
        else:
            to_keep.append(index)
    # remove entry that was concatenated
    _filtered_entries = []
    for keep in to_keep:
        _filtered_entries.append(filtered_entries[keep])
    del filtered_entries
    filtered_entries = _filtered_entries

    # convert 'million' 'billion' 'crore' into numbers
    for index, entry in enumerate(filtered_entries):
        # print(entry)
        filtered_entries[index] = entry
        # try this pattern first in case string starts directly with number
        for capture, _float, mbil in _reduced_mbillion_pattern.findall(entry):
            replace_with = str(int(float(_float) * multiplier(mbil)))
            # crore is special case since its also the currency itself and not only a multiplier
            if "crore" in capture:
                # indian rupees
                replace_with = f"₹{replace_with}"
            filtered_entries[index] = filtered_entries[index].replace(capture, replace_with)
        # try the extended pattern
        for capture, _float, mbil in _mbillion_pattern.findall(filtered_entries[index]):
            replace_with = str(int(float(_float) * multiplier(mbil)))
            # crore is special case since its also the currency itself and not only a multiplier
            if "crore" in capture:
                # indian rupees
                replace_with = f"₹{replace_with}"
            filtered_entries[index] = filtered_entries[index].replace(capture, replace_with)
    
    if filtered_entries:
        # apply heuristics, select best candidate
        # selection based on index
        heuristics: dict[int, int] = {_ind: 1 for _ind in range(len(filtered_entries))}
        
        amount_heuristic(heuristics, filtered_entries)
        attributes: dict[int, MONEY_ATTRIBUTES] = currency_heuristic(heuristics, filtered_entries)
        equivalency_heuristic(heuristics, filtered_entries, attributes)

        # get pure numbers only
        filtered_entries: list[int] = pure_number_filter(filtered_entries, attributes)
        
        # get entry with highest heuristic score
        _argmax = -1
        max_score = -1
        for index, score in heuristics.items():
            if score > max_score:
                max_score = score
                _argmax = index
        return filtered_entries[_argmax], attributes[_argmax]
    return None


In [55]:
# Local Tests
test_number = "asddhasdha asdhasiodhausdh 321 dasdhadsad"
test_number2 = "a da 123123123 asd 123"
print(_number_pattern.findall(test_number))
print(_number_pattern.findall(test_number2))

test_mb = "da d, $3.12 million das 5.21 billions"
#_mbillion_pattern.match(test_mb)
print(_reduced_mbillion_pattern.findall(test_mb))
print(_mbillion_pattern.findall(test_mb))

test_ep = "(us$26000000 in 2020"   # "d asd asdd ( us$26000000 in 2020"
print(_equivalent_pattern.findall(test_ep))

test_currency = "¥" # "cn¥" should not be found
print(_currencies_patterns[8][1].findall(test_currency))

money_heuristic(["da", "d, ", " 3.12 million das 5.21 billions", "$5.12 million", "3 crore", "to $4 million in 2020", "hk$6,384,789"])

['321']
['123123123', '123']
[]
[('3.12 million', '3.12', 'million'), ('5.21 billions', '5.21', 'billions')]
[('us$26000000 in 2020', 'us$', '26000000', '2020')]
[('', '¥', '')]


('4000000', ['US_DOLLAR', 2020])

In [56]:
# Apply heuristic selection and inflation and conversion to database

def transform_money(money_tag: str) -> "pd.DataFrame":
    box_off = dataframe[money_tag].copy()
    for key, entries in dataframe[money_tag].items():
        final_value = None
        if entries:
            val_att = money_heuristic(entries)
            if val_att:
                value: float = float(val_att[0])
                currency, year = val_att[1]   # list[CURRENCY, YEAR]
                # for simplicity, if no currency is given, we will just use dollar
                if not currency:
                    currency = "US_DOLLAR"
                if not year:
                    year = dataframe[key:key+1]["year"][key]
                date_str = dataframe[key:key+1]["release_date"][key]
                month = None
                if date_str:
                    month = parse(date_str).month
                
                final_value = convert_value_to_recent_us_dollar(currency, value, year, month)
        box_off[key:key+1][key] = final_value
    return box_off

dataframe.reset_index(inplace=True, drop=True)
dataframe["box_office"] = transform_money("box_office")
dataframe["budget"] = transform_money("budget")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["box_office"] = transform_money("box_office")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe["budget"] = transform_money("budget")


### All monetary Values are now in dollar 

In [57]:
dataframe

Unnamed: 0.1,Unnamed: 0,title,year,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,...,cinematography,directed_by,release_date,production_company,country,running_time,starring,box_office,budget,written_by
0,0.0,10 Rillington Place,1971,13,62.0,1000,85.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1971,Drama,...,[denys coop],[richard fleischer],1971-01-29,"[genesis productions, filmways pictures]",[united kingdom],[111 min],"[richard attenborough, judy geeson, john hurt]",,,
1,1.0,100 Rifles,1969,2,,500,41.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1969,Western,...,[cecilio paniagua],[tom gries],1969-03-26,[marvin schwartz productions],[united states],[110 minutes],"[jim brown, raquel welch, burt reynolds]",28530691.27,31954374.22,
2,2.0,1001 Arabian Nights,1959,0,,0,,,1959,Kids & family/Fantasy,...,,[jack kinney],1959-12-01,[upa],[united states],[75 minutes],"[jim backus, kathryn grant, dwayne hickman, ha...",,20018619.14,"[dick shaw, leo salkin, lew keller, ed nofzige..."
3,3.0,10:30 P.M. Summer,1966,4,,50,32.0,,1966,Drama/Mystery & thriller,...,[gábor pogány],[jules dassin],1966-10-24,,[united states],[85 minutes],"[melina mercouri, romy schneider]",,,"[(novel), jules dassin, marguerite duras]"
4,4.0,13 Frightened Girls,1963,3,,50,17.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1963,Mystery & thriller,...,[gordon avil],[william castle],1963-07-21,[william castle pictures],[united states],[89 minutes],"[kathy dunn, murray hamilton, joyce taylor, hu...",,,[otis l. guernsey jr.]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18816,,Defiance,2008,189,59.0,250000,72.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,History/Drama,...,[eduardo serra],[edward zwick],2008-12-31,"[paramount vantage, bedford falls productions,...",[united states],[137 minutes],"[daniel craig, liev schreiber, jamie bell, ale...",71668919.04,44793074.39,"[clayton frohman, edward zwick]"
18817,,Last Chance Harvey,2008,157,71.0,25000,53.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Romance,...,[john de borman],[joel hopkins],2008-12-25,,[united states],[92 minutes],"[dustin hoffman, emma thompson, kathy baker, j...",45492966.18,6998917.87,[joel hopkins]
18818,,The Spirit,2008,115,14.0,100000,25.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Action/Adventure,...,[bill pope],[frank miller],2008-12-25,[oddlot entertainment darklot entertainment li...,[united states],[103 minutes],"[gabriel macht, eva mendes, sarah paulson, dan...",54591559.42,83987014.5,
18819,,Waltz with Bashir,2008,154,96.0,25000,91.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2008,Documentary/Animation,...,,[ari folman],2008-05-13,"[bridgit folman film gang, les films d'ici, ra...","[germany, france, israel]",[90 minutes],[ari folman],15078280.61,1765924.75,[ari folman]


In [68]:
if "level_0" in dataframe.keys():
    dataframe.drop(columns=["level_0"], inplace=True)
if "Unnamed: 0" in dataframe.keys():
    dataframe.drop(columns=["Unnamed: 0"], inplace=True)

dataframe.to_csv(Path(local_path, "film_info_simple.csv"), index=False)

In [71]:
dataframe.to_parquet("film_info_simple.parquet", engine="pyarrow", index=True)

In [72]:
dataframe.keys()

Index(['title', 'year', 'critics_count', 'critics_score', 'audience_count',
       'audience_score', 'suppliers_list', 'rottentomatoes_year',
       'rottentomatoes_genre', 'rottentomatoes_length', 'infobox',
       'produced_by', 'distributed_by', 'language', 'edited_by', 'music_by',
       'cinematography', 'directed_by', 'release_date', 'production_company',
       'country', 'running_time', 'starring', 'box_office', 'budget',
       'written_by'],
      dtype='object')