In [1]:
import pandas as pd
import regex
from pathlib import Path

### Load Data

In [2]:
"""
Get all Files and concatenate
"""
# Path to data files (might change)
parent_path = Path(".", "src", "movies", "movies").absolute()

# Match on the bare file name instead of the full path string, so the filter does not depend on
# the platform's path separator. The raw string keeps `\d` a regex token and `\.` a literal dot.
_data_file_pattern = regex.compile(r"film_info_part_\d\.csv")

# iterdir() yields entries in arbitrary order; sorting makes the row order of the concatenated
# frame (and therefore its index labels) reproducible between runs.
files_path: list[Path] = sorted(
    data_path
    for data_path in parent_path.iterdir()
    if _data_file_pattern.fullmatch(data_path.name) is not None
)

# read all datafiles and concatenate in pandas
dfs = []
lengths = []  # number of rows read from each file, in the same order as files_path
for file in files_path:
    df = pd.read_csv(file)
    lengths.append(len(df))
    dfs.append(df)

# ignore_index=True renumbers the rows 0..n-1 across all parts
dataframe = pd.concat(dfs, ignore_index=True)
dataframe

Unnamed: 0,title,year,infobox,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,rottentomatoes_length
0,711 Ocean Drive,1950,"{'Directed by': ['Joseph M. Newman', '', ''], ...",3 Reviews,,100+ Ratings,61.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,1h 42m
1,Abbott and Costello in the Foreign Legion,1950,"{'Directed by': ['Charles Lamont', '', ''], 'W...",3 Reviews,,500+ Ratings,59.0,,1950,Comedy,1h 20m
2,Ambush,1950,"{'Directed by': ['Sam Wood', '', ''], 'Screenp...",3 Reviews,,Fewer than 50 Ratings,62.0,,1949,Western,1h 29m
3,Annie Get Your Gun,1950,"{'Directed by': [' ', ' ', 'George Sidney', 'B...",12 Reviews,100.0,"5,000+ Ratings",67.0,,1950,Musical,1h 47m
4,The Asphalt Jungle,1950,"{'Directed by': ['John Huston', '', ''], 'Scre...",35 Reviews,97.0,"5,000+ Ratings",87.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,1h 52m
...,...,...,...,...,...,...,...,...,...,...,...
12818,The Assassin,2015,"{'': ['Cìkè Niè Yǐnniáng', '', ''], 'Directed ...",128 Reviews,80.0,"5,000+ Ratings",48.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2015,Action/Drama,1h 45m
12819,Go Away Mr. Tumor,2015,"{'Directed by': ['Han Yan', '', ''], 'Starring...",1 Reviews,,250+ Ratings,80.0,"[('amazon-prime-video-us', 'Rent/buy'), ('itun...",2015,Comedy,1h 25m
12820,"Love, At First…",2015,"{'Directed by': ['Tao Hai', '', ''], 'Starring...",0 Reviews,,0 Ratings,,"[('itunes', 'Rent/buy')]",2015,Romance,1h 36m
12821,A Tale of Three Cities,2015,"{'Directed by': ['Mabel Cheung', '', ''], 'Wri...",7 Reviews,43.0,0 Ratings,,,2015,Drama/Foreign,2h 10m


### Filter titles

In [3]:
# Keep only the rows that have a title; no rows are expected to be dropped here
_has_title = dataframe["title"].notna()
dataframe = dataframe.loc[_has_title]

In [4]:
# Look at one raw infobox string (row label 12818) to see what has to be parsed later
dataframe.loc[12818, "infobox"]

"{'': ['Cìkè Niè Yǐnniáng', '', ''], 'Directed by': ['Hou Hsiao-hsien', '', ''], 'Written by': ['Hou Hsiao-hsien', 'Hsieh Hai-Meng', 'Chu T’ien-wen', 'Ah Cheng', '\\n \\n', ''], 'Produced by': ['Wen-Ying Huang', 'Liao Ching-Sung', '\\n \\n', ''], 'Starring': ['Shu Qi', 'Chang Chen', 'Zhou Yun', 'Satoshi Tsumabuki', '\\n \\n', ''], 'Cinematography': ['Mark Lee Ping Bin', '', ''], 'Edited by': ['Huang Chih-Chia', '', ''], 'Music by': ['Lim Giong', '', ''], 'Production companies': ['', ''], 'Distributed by': ['Well Go USA', 'StudioCanal', '\\n \\n', ''], 'Release dates': ['21\\xa0May\\xa02015', ' (', ')', '27\\xa0August\\xa02015', ' (China & Hong Kong)', '28\\xa0August\\xa02015', ' (Taiwan)', 'Cannes', '\\n \\n', ''], 'Running time': ['105 minutes', '', ''], 'Countries': ['Taiwan', 'China', 'Hong Kong', '\\n \\n', ''], 'Language': ['Mandarin', '', ''], 'Budget': ['90 million (', '14.9 million)', 'CN¥', 'US$', '', ''], 'Box office': [' (China)', ' (worldwide)', '\\n \\n', '']}"

### Filter years

In [5]:
"""
There might occur some differences between the Release Date on Wikipedia and Rotten Tomatoes possibly indicating a different movie alltogether
We're gonna simply remove the ones having a difference bigger than 1 year
"""
# True for rows whose Wikipedia year and Rotten Tomatoes year are more than one year apart.
# A missing year yields NaN, and `NaN > 1` is False, so such rows are kept.
_filter = (dataframe["year"] - dataframe["rottentomatoes_year"]).abs() > 1
different_movies = dataframe.loc[_filter]

print(f"removed {len(different_movies)} titles")
# Later cells assign new columns to this frame. Taking an explicit copy makes it an independent
# object instead of a slice of the previous frame, which avoids SettingWithCopyWarning.
dataframe = dataframe.loc[~_filter].copy()
dataframe

removed 1460 titles


Unnamed: 0,title,year,infobox,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,rottentomatoes_length
0,711 Ocean Drive,1950,"{'Directed by': ['Joseph M. Newman', '', ''], ...",3 Reviews,,100+ Ratings,61.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,1h 42m
1,Abbott and Costello in the Foreign Legion,1950,"{'Directed by': ['Charles Lamont', '', ''], 'W...",3 Reviews,,500+ Ratings,59.0,,1950,Comedy,1h 20m
2,Ambush,1950,"{'Directed by': ['Sam Wood', '', ''], 'Screenp...",3 Reviews,,Fewer than 50 Ratings,62.0,,1949,Western,1h 29m
3,Annie Get Your Gun,1950,"{'Directed by': [' ', ' ', 'George Sidney', 'B...",12 Reviews,100.0,"5,000+ Ratings",67.0,,1950,Musical,1h 47m
4,The Asphalt Jungle,1950,"{'Directed by': ['John Huston', '', ''], 'Scre...",35 Reviews,97.0,"5,000+ Ratings",87.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,1h 52m
...,...,...,...,...,...,...,...,...,...,...,...
12818,The Assassin,2015,"{'': ['Cìkè Niè Yǐnniáng', '', ''], 'Directed ...",128 Reviews,80.0,"5,000+ Ratings",48.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2015,Action/Drama,1h 45m
12819,Go Away Mr. Tumor,2015,"{'Directed by': ['Han Yan', '', ''], 'Starring...",1 Reviews,,250+ Ratings,80.0,"[('amazon-prime-video-us', 'Rent/buy'), ('itun...",2015,Comedy,1h 25m
12820,"Love, At First…",2015,"{'Directed by': ['Tao Hai', '', ''], 'Starring...",0 Reviews,,0 Ratings,,"[('itunes', 'Rent/buy')]",2015,Romance,1h 36m
12821,A Tale of Three Cities,2015,"{'Directed by': ['Mabel Cheung', '', ''], 'Wri...",7 Reviews,43.0,0 Ratings,,,2015,Drama/Foreign,2h 10m


### Filter length

In [6]:
"""
We might want to include the runtime and convert it into minutes only 
"""

# Dedicated name: the review-count cell rebinds the generic `_pattern`, which this function
# would otherwise pick up when it is called again later.
# Accepts "1h 42m" as well as the hour-only ("2h") and minute-only ("45m") spellings.
_runtime_pattern = regex.compile(r"^(?:(\d+)h)?\s*(?:(\d+)m)?$")

def movie_runtime_transformer(x):
    """
    Convert a runtime such as "1h 42m" into minutes.

    @param x runtime text; a number (e.g. from an earlier run of this cell) is passed through as int
    @return the runtime in minutes, or None if the value is missing or not recognised
    """
    if not isinstance(x, str):
        # Either already converted (cell re-run) or a missing value (None / NaN)
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return None

    match = _runtime_pattern.match(x.strip())
    if match is None:
        return None
    hours, minutes = match.groups()
    if hours is None and minutes is None:
        # both parts are optional in the pattern, so an empty string matches too
        return None
    # convert into minutes only
    return int(hours or 0) * 60 + int(minutes or 0)

# Overwrite the textual runtime column with its value in minutes
_runtime_minutes = dataframe["rottentomatoes_length"].transform(movie_runtime_transformer)
dataframe["rottentomatoes_length"] = _runtime_minutes

### Filter Review counts

In [7]:
# Dedicated name so this cell does not rebind the `_pattern` used by the runtime cell.
# Captures the first run of digits in the text.
_review_count_pattern = regex.compile(r"^\D*(\d+).*$")

def review_count_transformer(x: str):
    """
    Extract the count from texts such as "3 Reviews", "5,000+ Ratings" or "Fewer than 50 Ratings".

    @param x count text; a number (e.g. from an earlier run of this cell) is passed through as int
    @return the first number found in the text, or 0 if there is none or the value is missing
    """
    if not isinstance(x, str):
        # Either already converted (cell re-run) or a missing value (None / NaN)
        try:
            return int(x)
        except (TypeError, ValueError, OverflowError):
            return 0

    # integers are divided by ',' for every 3 digits
    match = _review_count_pattern.match(x.replace(",", "").strip())
    # convert into number of reviews only
    return int(match.group(1)) if match else 0

# Overwrite both textual count columns with plain integers
for _count_column in ("audience_count", "critics_count"):
    dataframe[_count_column] = dataframe[_count_column].transform(review_count_transformer)
dataframe

Unnamed: 0,title,year,infobox,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,rottentomatoes_length
0,711 Ocean Drive,1950,"{'Directed by': ['Joseph M. Newman', '', ''], ...",3,,100,61.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,102.0
1,Abbott and Costello in the Foreign Legion,1950,"{'Directed by': ['Charles Lamont', '', ''], 'W...",3,,500,59.0,,1950,Comedy,80.0
2,Ambush,1950,"{'Directed by': ['Sam Wood', '', ''], 'Screenp...",3,,50,62.0,,1949,Western,89.0
3,Annie Get Your Gun,1950,"{'Directed by': [' ', ' ', 'George Sidney', 'B...",12,100.0,5000,67.0,,1950,Musical,107.0
4,The Asphalt Jungle,1950,"{'Directed by': ['John Huston', '', ''], 'Scre...",35,97.0,5000,87.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,112.0
...,...,...,...,...,...,...,...,...,...,...,...
12818,The Assassin,2015,"{'': ['Cìkè Niè Yǐnniáng', '', ''], 'Directed ...",128,80.0,5000,48.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2015,Action/Drama,105.0
12819,Go Away Mr. Tumor,2015,"{'Directed by': ['Han Yan', '', ''], 'Starring...",1,,250,80.0,"[('amazon-prime-video-us', 'Rent/buy'), ('itun...",2015,Comedy,85.0
12820,"Love, At First…",2015,"{'Directed by': ['Tao Hai', '', ''], 'Starring...",0,,0,,"[('itunes', 'Rent/buy')]",2015,Romance,96.0
12821,A Tale of Three Cities,2015,"{'Directed by': ['Mabel Cheung', '', ''], 'Wri...",7,43.0,0,,,2015,Drama/Foreign,130.0


### Filter Infobox

In [8]:
"""
We want to preserve as much information as possible
The most important ones are budget, box office and release date (with month) which will have to be included
"""
import json

# Create pattern that ONLY changes " to ' when not needed for Names/language e.g.: "Edmond O'Brian"
_quote_mark_pattern = regex.compile(r'([A-Za-z][A-Za-z\s]+"[A-Za-z]+)(?=[^"]*".*)')
# Literal "\xa0" text followed by digits; only used for ad-hoc inspection of release dates
_release_pattern = regex.compile(r"(\\xa0(?=\d+)(\d+))") # double enclosure group, potentially leading 0 
_structured_number_pattern = regex.compile(r'(?<=".?)((\d+,?)+)(?=")') # commas within natural numbers
_too_many_quotes_pattern = regex.compile(r'("{3,})')    # if 3 or more behind each other

def to_dict_transformer(x: str) -> dict | None:
    """
    Turn the textual infobox into a dict by rewriting it into JSON and parsing it.

    @param x infobox text as stored in the dataframe
    @return the parsed infobox, or None if the value is not text or still is not valid JSON
    """
    if not isinstance(x, str):
        # missing infobox (None / NaN): nothing to parse
        return None

    x = x.replace("'", "\"")    # needed for json
    # revert wanted '
    for p in _quote_mark_pattern.findall(x):
        x = x.replace(p, p.replace("\"", "'"))

    # the literal text \xa0 (an escaped non-breaking space) becomes a plain space
    x = x.replace("\\xa0", " ")

    # drop the thousands separators inside quoted numbers
    for broken, _ in _structured_number_pattern.findall(x):
        x = x.replace(broken, broken.replace(",", ""))

    # collapse runs of 3 or more quotes into an empty string literal
    for p in _too_many_quotes_pattern.findall(x):
        x = x.replace(p, '""')

    try:
        ret_val: dict = json.loads(x)
    except json.JSONDecodeError:
        # only parse failures are swallowed; anything else (e.g. KeyboardInterrupt) propagates
        ret_val = None  # most errors should now be fixed anyway
    return ret_val

# _str = 'ributed by": ["Loew"s, Inc.", "", ""], "Release date": ["J'
# print(_quote_mark_pattern.findall(_str))

"""
We can't use dataframe.transform here since then all the " and ' replacements are just reversed by panda
-> create new tags for each category at once
"""
# One parsed infobox (or None) per row, in the same order as the dataframe rows
dicts: list[dict[str, str] | None] = [
    to_dict_transformer(infobox_text) for infobox_text in dataframe["infobox"]
]


# Pattern that recognizes plural (s at end, not ies) and removes it
# words shorter than 3 letters are not plurals
# Matches an "s" that is not preceded by "i", has at least three characters of the class
# before it, and is not followed by a character of the class.
# NOTE(review): `[A-za-z]` is probably meant to be `[A-Za-z]`. As written, the range `A-z` also
# covers `[`, `\`, `]`, `^`, `_` and the backtick, so an "s" directly followed by "_" is NOT
# removed. Changing this alters the normalized keys; check the keys in synonyms.json first.
_plural_pattern = regex.compile("(?<!i)(?<=[A-za-z]{3,})s(?=_?)(?![A-za-z])")

synonyms_dict = {}
local_path = Path(".", "src", "movies", "filter").absolute()
# Be explicit about the encoding so the platform's default encoding is never used for the JSON file
with open(Path(local_path, "synonyms.json"), encoding="utf-8") as syns:
    synonyms_list: list[dict] = json.load(syns)

# pack into one dict: base name -> list of synonyms (a later entry replaces an earlier one with the same key)
for _dic in synonyms_list:
    synonyms_dict.update(_dic)

# create reverse lookup table: synonym -> base name
reverse_synonym_dict = {item: _key for _key, _list in synonyms_dict.items() for item in _list}

def replace_synonyms(in_str: str) -> str:
    """
    Map a key to its base name from the reverse synonym table.

    @return the base name if the table has a non-empty entry for in_str, otherwise in_str itself
    """
    return reverse_synonym_dict.get(in_str) or in_str

def normalize_keys(in_str: str) -> str:
    """
    Bring an infobox key into a compact, comparable form.

    @param in_str key as found in the infobox, e.g. "Release dates"
    @return lower-cased key with underscores instead of spaces, without digits and round
            brackets, and with plural endings reduced (e.g. "release_date")
    """
    # basic compression
    in_str = in_str.lower().strip().replace(" ", "_")
    # sometimes numbers are included, which we remove to get a smaller set
    # round brackets are removed as well
    in_str = regex.sub(r"\d|\(|\)", "", in_str)
    # some tags are given in plural, some in singular. We reduce that to always singular by
    # simply assuming that only plurals end with 's'
    in_str = regex.sub("ies", "y", in_str)
    # apply regex based plural filter
    in_str = _plural_pattern.sub("", in_str)

    return in_str

# Number of infoboxes in which each normalized tag occurs
category_counter: dict[str, int] = dict()
# normalize categories (the dicts are modified in place)
for dic in dicts:
    if dic is None:
        continue

    counted_keys: set[str] = set()
    # list(dic) walks the keys in the infobox's own order. Iterating a set() made the order in
    # which tags were first seen (and with it the column order later on) depend on string hashing.
    for c_key in list(dic):
        # normalize, then replace synonyms
        normalized_key = replace_synonyms(normalize_keys(c_key))

        value = dic.pop(c_key)
        # Two original keys can end up with the same normalized name. Keep the entries of both
        # instead of letting the later one overwrite the earlier one.
        clash = dic.get(normalized_key)
        if isinstance(clash, list) and isinstance(value, list):
            value = clash + value
        dic[normalized_key] = value

        # add new entry or increase counter, at most once per infobox
        if normalized_key not in counted_keys:
            counted_keys.add(normalized_key)
            category_counter[normalized_key] = category_counter.get(normalized_key, 0) + 1

print(category_counter)

{'distributed_by': 9799, 'language': 10752, 'directed_by': 11017, 'cinematography': 10164, 'edited_by': 9999, 'color_proces': 390, 'production_company': 7861, 'box_office': 6745, 'produced_by': 10374, 'budget': 5260, 'starring': 10776, 'country': 10686, 'written_by': 8525, 'release_date': 10855, 'music_by': 9935, 'running_time': 10696, 'narrated_by': 282, 'screenplay_by': 3772, 'based_on': 3528, '': 1377, 'released': 709, 'animation_by': 9, 'author': 32, 'recorded': 287, 'text': 1, 'published_by': 38, 'published': 7, 'illustrated_by': 5, 'original_title': 14, 'dialogue_by': 13, 'length': 509, 'production_code': 2, 'episode': 4, 'original_air_date': 1, 'date_premiered': 9, 'place_premiered': 8, 'original_language': 138, 'genre': 113, 'publication_date': 28, 'page': 18, 'literally': 86, 'of_episode': 6, 'original_network': 116, 'related': 3, 'original_release': 134, 'picture_format': 54, 'audio_format': 45, 'executive_producer': 70, 'country_of_origin': 131, 'of_season': 2, 'created_by':

In [9]:
# Scratch check: run `_release_pattern` on a fragment of infobox text to see which groups it captures
test = "'Release dates': ['21\\xa0May\\xa02015', ' (', ')', '27\\xa0August\\xa02015', ' (China & Hong Kong)', '28\\xa0August\\xa02015', ' (Taiwan)', 'Cannes', '\\n "
_release_pattern.findall(test)

[('\\xa02015', '2015'), ('\\xa02015', '2015'), ('\\xa02015', '2015')]

In [10]:
# Show the infobox column; it still holds the original text, the parsed versions live in `dicts`
dataframe["infobox"]

0        {'Directed by': ['Joseph M. Newman', '', ''], ...
1        {'Directed by': ['Charles Lamont', '', ''], 'W...
2        {'Directed by': ['Sam Wood', '', ''], 'Screenp...
3        {'Directed by': [' ', ' ', 'George Sidney', 'B...
4        {'Directed by': ['John Huston', '', ''], 'Scre...
                               ...                        
12818    {'': ['Cìkè Niè Yǐnniáng', '', ''], 'Directed ...
12819    {'Directed by': ['Han Yan', '', ''], 'Starring...
12820    {'Directed by': ['Tao Hai', '', ''], 'Starring...
12821    {'Directed by': ['Mabel Cheung', '', ''], 'Wri...
12822    {'Directed by': ['Cao Baoping', '', ''], 'Scre...
Name: infobox, Length: 11363, dtype: object

### Convert Info Box into new Tags

In [11]:
"""
Split dictionary into single data types for easier referencing
"""
# how much percent of the tag with the most occurences another tag MUST have to be included in the dataset
INCLUSION_MARGIN = 1/3
MAX_COUNT = max(category_counter.values())
LOWER_BOUNDARY = int(INCLUSION_MARGIN * MAX_COUNT)

# dicts was built row by row from the dataframe, so both must have the same length
assert len(dicts) == len(dataframe)

# (tag, count) pairs of all tags that occur often enough
valid_tags: list[tuple[str, int]] = [
    (_tag, _count) for _tag, _count in category_counter.items() if _count >= LOWER_BOUNDARY
]

# Decide the final column name up front (prevent overwriting an existing column), so that the
# keys of new_col_dict are exactly the column names that end up in the dataframe. Later cells
# iterate new_col_dict to find these columns.
existing_keys = list(dataframe.keys())
new_col_dict: dict[str, list] = {}
for tag, _ in valid_tags:
    column_name = f"{tag}_info" if tag in existing_keys else tag
    # one entry per row: the tag's value, or None if the infobox is unreadable/empty or lacks the tag
    new_col_dict[column_name] = [dic.get(tag) if dic else None for dic in dicts]

# check for any lost items
for _tag, _list in new_col_dict.items():
    assert len(_list) == len(dataframe)

# add columns to dataframe
for _tag, _list in new_col_dict.items():
    dataframe[_tag] = _list

In [12]:
# List all columns, including the tag columns that were just added from the infobox
dataframe.keys()

Index(['title', 'year', 'infobox', 'critics_count', 'critics_score',
       'audience_count', 'audience_score', 'suppliers_list',
       'rottentomatoes_year', 'rottentomatoes_genre', 'rottentomatoes_length',
       'distributed_by', 'language', 'directed_by', 'cinematography',
       'edited_by', 'production_company', 'box_office', 'produced_by',
       'budget', 'starring', 'country', 'written_by', 'release_date',
       'music_by', 'running_time', 'screenplay_by'],
      dtype='object')

### Reduce Lists within entries

In [13]:
# Inspect the row with index label 0 across all columns
dataframe.loc[0]

title                                                      711 Ocean Drive
year                                                                  1950
infobox                  {'Directed by': ['Joseph M. Newman', '', ''], ...
critics_count                                                            3
critics_score                                                          NaN
audience_count                                                         100
audience_score                                                        61.0
suppliers_list           [('vudu', 'Rent/buy'), ('amazon-prime-video-us...
rottentomatoes_year                                                   1950
rottentomatoes_genre                                           Crime/Drama
rottentomatoes_length                                                102.0
distributed_by                                     [Columbia Pictures, , ]
language                                                     [English, , ]
directed_by              

In [14]:
"""
Due to the data collection process, the infobox contains many empty elements
These need to be reduced and ideally into one element only
"""
def reduce_list(lis: list[str] | None) -> list:
    if lis:
        reduced_list = []
        for item in lis:
            item = item.replace("\n", "").strip().lower()
            if item:
                reduced_list.append(item)
        return reduced_list
    else:
        return None

# Clean every column that was created from the infobox
for _tag in new_col_dict:
    _reduced_column = dataframe[_tag].transform(reduce_list)
    dataframe[_tag] = _reduced_column

### Convert Release dates

In [15]:
# Show the release_date column (lists of cleaned strings at this point)
dataframe["release_date"]

0                          [july 1, 1950, (united states)]
1                                          [july 24, 1950]
2                                       [january 13, 1950]
3                                          [july 17, 1950]
4                          [may 12, 1950, (united states)]
                               ...                        
12818    [21 may 2015, (, ), 27 august 2015, (china & h...
12819                                    [august 13, 2015]
12820                                     [august 7, 2015]
12821    [27 august 2015, (china), 3 september 2015, (h...
12822    [june 15, 2015, (, ), august 27, 2015, (china)...
Name: release_date, Length: 11363, dtype: object

In [16]:
"""
There are some different variants for writing the date unfortunately so we will use regex to extrapolate month day and year and bring it into the following format:
<yyyy>-<mm>-<dd>
furthermore, there are different release dates. We're only gonna look at the oldest release (which came first)
"""
from datetime import datetime

from dateutil.parser import parse

# dateutil fills every field that is missing from the text with the matching field of `default`
# (today's date if no default is given, which makes results depend on the day the notebook runs).
# Parsing against two references that differ in year, month and day shows which fields were
# really present in the text.
_DATE_DEFAULT_A = datetime(2000, 1, 1)
_DATE_DEFAULT_B = datetime(2001, 2, 2)

def _parse_release_date(text):
    """Return the date written in text, or None unless it states at least a year and a month."""
    try:
        first = parse(text, default=_DATE_DEFAULT_A)
        second = parse(text, default=_DATE_DEFAULT_B)
    except Exception:
        # unrecognizable date or simply something different (e.g. countries)
        return None
    if (first.year, first.month) != (second.year, second.month):
        # year or month was taken from the reference date, i.e. it is missing in the text
        return None
    # a missing day falls back to the 1st (the day of _DATE_DEFAULT_A); dropping the time part
    # also keeps timezone-aware and naive results comparable
    return first.date()

def get_first_date(dates: list[str] | None) -> str | None:
    """
    Pick the oldest release date from the entries of one movie.

    @param dates entries of the release date tag; a single string (e.g. the result of an
                 earlier run of this cell) is treated as a one-element list
    @return the oldest date as <yyyy>-<mm>-<dd>, or None if no entry contains a usable date
    """
    if isinstance(dates, str):
        dates = [dates]
    if not isinstance(dates, (list, tuple)) or not dates:
        return None

    parsed_dates = [
        _date for _date in (_parse_release_date(line) for line in dates) if _date is not None
    ]
    # get oldest date as str
    return str(min(parsed_dates)) if parsed_dates else None

# Replace each list of release-date entries by its single oldest date
_first_release_dates = dataframe["release_date"].transform(get_first_date)
dataframe["release_date"] = _first_release_dates

In [17]:
# number of rows that ended up with a release date
len(dataframe["release_date"].dropna())

10557

### Box Office/Budget Filter

In [18]:
# Show only the columns that were created from the infobox
cols = list(new_col_dict)
dataframe[cols]

Unnamed: 0,distributed_by,language,directed_by,cinematography,edited_by,production_company,box_office,produced_by,budget,starring,country,written_by,release_date,music_by,running_time,screenplay_by
0,[columbia pictures],[english],[joseph m. newman],[franz planer],[bert jordan],[frank seltzer productions],[$1550000],[frank n. seltzer],[$300000],"[edmond o'brien, joanne dru, otto kruger]",[united states],"[richard english, francis swann]",1950-07-01,[sol kaplan],[102 minutes],
1,[universal-international],[english],[charles lamont],[george robinson],[frank gross],[universal-international],[$1250000],[robert arthur],[$679687],"[bud abbott, lou costello, patricia medina, wa...",[united states],"[john grant, martin ragaway, leonard stern]",1950-07-24,,[80 minutes],
2,[metro-goldwyn-mayer],[english],[sam wood],[harold lipstein],[ben lewis],[metro-goldwyn-mayer],[$3215000],"[, sam wood, armand deutsch]",[$1754000],"[robert taylor, john hodiak, arlene dahl]",[united states],,1950-01-13,[rudolph g. kopp],[89 minutes],[marguerite roberts]
3,"[loew's, inc.]",[english],"[george sidney, busby berkeley, charles walters]",[charles rosher],[james e. newcom],[metro-goldwyn-mayer],[$7756000],"[arthur freed, roger edens]",[$3734000],"[betty hutton, howard keel, louis calhern, kee...",[united states],,1950-07-17,"[songs: (lyrics and music by), music direction...",[107 minutes],[sidney sheldon]
4,[loew's inc.],[english],[john huston],[harold rosson],[george boemler],[metro-goldwyn-mayer],,[arthur hornblow jr.],[$1232000],"[sterling hayden, louis calhern, jean hagen]",[united states],,1950-05-12,[miklós rózsa],[112 minutes],"[john huston, ben maddow]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12818,"[well go usa, studiocanal]",[mandarin],[hou hsiao-hsien],[mark lee ping bin],[huang chih-chia],[],"[(china), (worldwide)]","[wen-ying huang, liao ching-sung]","[90 million (, 14.9 million), cn¥, us$]","[shu qi, chang chen, zhou yun, satoshi tsumabuki]","[taiwan, china, hong kong]","[hou hsiao-hsien, hsieh hai-meng, chu t’ien-we...",2015-05-21,[lim giong],[105 minutes],
12819,[china lion (u.s.)],[mandarin],[han yan],,,[wanda pictures],[us$80.12 million],,,"[bai baihe, daniel wu]",[china],,2015-08-13,,[128 minutes],
12820,"[china film, eastern mordor, tianjing yinhe me...",[mandarin],[tao hai],,,[shanghai jinse tianxia entertainment beijing ...,[],,,"[juck zhang, xiaofeng li, yao zhang, jingjing ...",[china],,2015-08-07,,[98 minutes],
12821,,[mandarin],[mabel cheung],,,,,,,"[huang jue, li jianyi, sean lau, tang wei, qin...","[china, hong kong]","[mabel cheung, alex law]",2015-08-27,,[130 minutes],


In [19]:
# Read every column as float first, then turn the year back into an integer
_inflation_csv = Path(local_path, "inflation_rate_usa.csv")
inflation_table = pd.read_csv(_inflation_csv, dtype=float)
inflation_table["Year"] = inflation_table["Year"].astype("int64")
# TODO use IMF Table for different countries
FIRST_YEAR = 1913
LAST_YEAR = 2022
MONTH_MAPPING = {
    1: "Jan",
    2: "Feb",
    3: "Mar",
    4: "Apr",
    5: "May",
    6: "June",
    7: "July",
    8: "Aug",
    9: "Sep",
   10: "Oct",
   11: "Nov",
   12: "Dec",
}

"""
Inflation Table contains CPI in Columns Jan-Dec and Avg, Dec-Dec and Avg-Avg is already given in percent
"""

def inflation_calculator(value: float, from_year: int, to_year: int = 2022, from_month: int = None, to_month: int = None) -> float:
    """
    Calculate Inflation per year or even per month if <from_month> and <to_month> are specified
    Inflation can be efficiently calculated with CPI money*(<to_cpi> / <from_cpi>)
    Rounded to 2 decimals
    """
    if from_year < FIRST_YEAR or to_year > LAST_YEAR:
        if to_year < FIRST_YEAR or from_year > LAST_YEAR:
            raise ValueError("years are out of range of dataset")
    if from_month:
        if from_month < 1 or from_month > 12:
            raise ValueError("from months are out of range")
    if to_month:
        if to_month < 1 or to_month > 12:
            raise ValueError("to months are out of range")

    _from_table = inflation_table.loc[from_year-FIRST_YEAR]
    _to_table = inflation_table.loc[to_year-FIRST_YEAR]

    _from_pci: float = _from_table[MONTH_MAPPING[from_month]] if from_month else _from_table["Avg"]
    _to_pci: float = _to_table[MONTH_MAPPING[to_month]] if to_month else _to_table["Avg"]

    return round(value * (_to_pci / _from_pci), 2)

# inflation_calculator(1.0, 1913, 1917, 1, 12)

In [180]:
"""
First we need to convert all written text into a natural number (e.g. 46.75 million = 46750000)
After that, the really difficult part starts by accounting for inflation and different currencies
We want US$ only at the end
"""

# Raw strings keep the backslash sequences (\d, \s, \W, ...) regex tokens instead of
# invalid Python string escapes.
_number_pattern = regex.compile(r"\d+")
# "<number> million/billion/crore" at the very start of the text
_reduced_mbillion_pattern = regex.compile(r"^((\d+\.?\d*)\s?(millions?|billions?|crores?))(?=.*)")
# the same, but preceded by whitespace or a non-word character
_mbillion_pattern = regex.compile(r"(?<=.*\s|\W)((\d+\.?\d*)\s?(millions?|billions?|crores?))(?=.*)")
# some entries already or only contain an equivalent for decades later than when the movie was released
# this needs to be regarded for the inflation adjustments
_equivalent_pattern = regex.compile(r".*(\D\d+\sin\s\d{4}).*")

"""
Predefined Pattern for the Value type of the return dict of money_heuristic
This is required for the inflation calculator to adjust the values more correctly
if one of the ATTRIBUTES items is None, use the original entry data to infer the used values
"""
CURRENCY = str|None
YEAR = str|None
# NOTE(review): list[...] is given two type arguments here to describe the layout [currency, year].
# That works at runtime, but static type checkers expect a single item type for list.
MONEY_ATTRIBUTES = list[CURRENCY, YEAR]
# number of slots in a MONEY_ATTRIBUTES list
AMOUNT_ATTRIBUTES = 2

NON_US_DOLLAR_SCORE = 6
# each tuple has Name of Currency/Country and associated score (loosley based on conversion factor)
# sorted with decreasing score!
_currencies_patterns: list[tuple[str, "regex.Pattern", int]] = [
    ("US_DOLLAR", regex.compile("((us)?\$)"), 10),    # score must only be applied if NONE of the other _DOLLAR patterns matched!
    ("NZ_DOLLAR", regex.compile("((au?)\$)"), NON_US_DOLLAR_SCORE),
    ("AUS_DOLLAR", regex.compile("((nz)\$)"), NON_US_DOLLAR_SCORE),
    ("HK_DOLLAR", regex.compile("((hk)\$)"), NON_US_DOLLAR_SCORE),
    ("NT_DOLLAR", regex.compile("((nt)\$)"), NON_US_DOLLAR_SCORE),
    ("MX_DOLLAR", regex.compile("((mx)\$)"), NON_US_DOLLAR_SCORE),
    ("GB_POUND", regex.compile("(£)"), 5),
    ("EURO", regex.compile("(€)"), 5),
    ("JP_YEN", regex.compile("((?<!cn)¥)"), 3),
    ("CN_YUAN", regex.compile("(cn¥|yuan)"), 3),
    ("RUBLES", regex.compile("(rub|₽)"), 3),
    ("SWED_KRON", regex.compile("(sek)"), 3),
    ("INDIAN_RUPEE", regex.compile("(₹)"), 3),
    ("TURK_LIRA", regex.compile("(₺)"), 3),
]

def money_heuristic(entries: list[str]) -> dict[str, MONEY_ATTRIBUTES]:
    """
    wikipedia often has multiple entries about the box office (global, within the country, other currencies, etc.)
    This function assigns a scoring system for these entries to estimate the most beneficial for our dataset.

    Work in progress: the scores are computed and printed, but no candidate is selected yet.

    @param entries entries of one box office / budget tag; None is treated like an empty list
    @return currently the normalized entries (text fragments merged into the entry holding the
            number, 'million'/'billion'/'crore' written out as digits), NOT yet the annotated dict
    """

    def multiplier(selector: str) -> float:
        """
        the equivalent in numbers to a word
        """
        return {
            "million": 10**6,
            "millions": 10**6,
            "crore": 10**7, # east asian for 10 million, also indian currency
            "crores": 10**7,
            "billion": 10**9,
            "billions": 10**9,
        }[selector]

    def spell_out_numbers(entry: str) -> str:
        """
        replaces '<number> million/billion/crore' by the plain number
        """
        # try the anchored pattern first in case the string starts directly with the number,
        # then the extended pattern on the result
        for pattern in (_reduced_mbillion_pattern, _mbillion_pattern):
            for capture, _float, mbil in pattern.findall(entry):
                # round() instead of int(): the float product can land just below the intended
                # value (8.2 * 10**6 == 8199999.999999999) and int() would cut it off
                replace_with = str(round(float(_float) * multiplier(mbil)))
                # crore is special case since its also the currency itself and not only a multiplier
                if "crore" in capture:
                    # indian rupees
                    replace_with = f"₹{replace_with}"
                entry = entry.replace(capture, replace_with)
        return entry

    def amount_heuristic(_heuristics: dict[int, int], _entries: list[str]) -> None:
        """
        modifies heuristic based on number: an entry gets one point for every entry whose
        biggest number is smaller than its own
        """
        # biggest number in each entry (-1 if the entry has no number)
        numbers: list[int] = [
            max((int(num) for num in _number_pattern.findall(entry)), default=-1)
            for entry in _entries
        ]

        # apply scores
        for index, own_number in enumerate(numbers):
            _heuristics[index] += sum(1 for other in numbers if own_number > other)

    def currency_heuristic(_heuristics: dict[int, int], _entries: list[str]) -> dict[int, MONEY_ATTRIBUTES]:
        """
        This method tries to find out which currencies are probably used in each entry.
        We want us$ if possible since we only need to account for inflation then and not for exchange factor as well
        """
        curr_heur: dict[int, int] = {}
        _attributes: dict[int, MONEY_ATTRIBUTES] = {}
        for index, entry in enumerate(_entries):
            curr_heur[index] = 0    # initiate default score of +0
            _attributes[index] = [None for _ in range(AMOUNT_ATTRIBUTES)]   # initiate empty attribute list

            # Walk the list backwards and stop before index 0 (US$), so that of all matching
            # currencies the one listed first wins. The former slice [-2::-1] skipped the last
            # currency and included US$, which turned every '<xx>$' entry into US_DOLLAR.
            for cur_name, pattern, score in _currencies_patterns[:0:-1]:
                if pattern.search(entry):
                    curr_heur[index] = score
                    _attributes[index][0] = cur_name
            # check if other form of dollar was already found
            cur_name, pattern, score = _currencies_patterns[0]
            if curr_heur[index] != NON_US_DOLLAR_SCORE and pattern.search(entry):
                curr_heur[index] = score
                _attributes[index][0] = cur_name    # US_DOLLAR

        # apply scores
        for index, points in curr_heur.items():
            _heuristics[index] += points
        return _attributes


    # filter unwanted characters ',' (in numbers)
    filtered_entries: list[str] = [item.replace(",", "").strip().lower() for item in (entries or [])]

    # we assume a box office to include numbers in their string (could also be year, but most of the times)
    # if there isn't a number, concatenate the string with a neighbouring entry:
    # prepend it to the next entry if the previous entry was not kept, otherwise append it to the last kept entry
    to_keep: list[int] = []
    for index, entry in enumerate(filtered_entries):
        if not _number_pattern.findall(entry):
            if (index-1) not in to_keep and index+1 < len(filtered_entries):
                filtered_entries[index+1] = f"{entry} {filtered_entries[index+1]}"
            elif to_keep:
                filtered_entries[to_keep[-1]] += f" {entry}"
        else:
            to_keep.append(index)
    # drop the entries that were concatenated into another one
    filtered_entries = [filtered_entries[keep] for keep in to_keep]

    # convert 'million' 'billion' 'crore' into numbers
    filtered_entries = [spell_out_numbers(entry) for entry in filtered_entries]

    # TODO apply heuristics, select best candidate
    # select based on index
    heuristics: dict[int, int] = {_ind: 1 for _ind in range(len(filtered_entries))}

    # the prints show the intermediate scores while the selection is not implemented yet
    amount_heuristic(heuristics, filtered_entries)
    print(heuristics)
    attributes: dict[int, MONEY_ATTRIBUTES] = currency_heuristic(heuristics, filtered_entries)
    print(heuristics)

    return filtered_entries


In [182]:
# Local Tests
# plain numbers
test_number = "asddhasdha asdhasiodhausdh 321 dasdhadsad"
test_number2 = "a da 123123123 asd 123"
for _sample in (test_number, test_number2):
    print(_number_pattern.findall(_sample))

# million / billion wording: anchored pattern first, then the extended one
test_mb = "da d, $3.12 million das 5.21 billions"
for _million_pattern in (_reduced_mbillion_pattern, _mbillion_pattern):
    print(_million_pattern.findall(test_mb))

# "<amount> in <year>" equivalents
test_ep = "d asd asdd $26000000 in 2020"
print(_equivalent_pattern.findall(test_ep))

# the pattern at index 8 must not match the chinese yuan sign
test_currency = "cn¥"
_entry_name, _entry_pattern, _entry_score = _currencies_patterns[8]
print(_entry_pattern.findall(test_currency))

money_heuristic(["da", "d, ", " 3.12 million das 5.21 billions", "$5.12 billion", "3 crore", "$12 million"])

['321']
['123123123', '123']
[]
[('3.12 million', '3.12', 'million'), ('5.21 billions', '5.21', 'billions')]
['$26000000 in 2020']
[]
{0: 4, 1: 3, 2: 2, 3: 1}
{0: 4, 1: 13, 2: 5, 3: 11}


['da d 3120000 das 5210000000', '$5120000000', '₹30000000', '$12000000']

In [22]:
# Print every box office entry to get an overview of the formats that occur
# dataframe.loc[12818]
_box_office_column = dataframe["box_office"]
for _box_office_entry in _box_office_column:
    print(_box_office_entry)

['$1550000']
['$1250000']
['$3215000']
['$7756000']
None
None
['£93,000 (by 1953)', '21,168 admissions (france)']
None
None
['$2.65 million (us rentals)']
['$3.3 million (us rentals)', '1,464,218 admissions (france)']
['£114000']
['2,538,884 admissions (france)']
['$2446000', '$1,750,000 (us/canada rentals)']
['$1.5 million']
['$4.3-4.425 million (u.s. and canada rentals)']
['$182 million']
['$1403000']
['$1.9 million (us rentals)']
['$1625000']
None
None
['$1.6 million']
['£89000']
['$2803000', '$2,250,000 (us rentals)']
['$4490000', '$2.2 million (us rentals)']
['$2211000']
['$603000']
None
['$5 million', 'or $1.3 million (us)']
None
None
['$2096000']
None
None
None
['$2.6 million (us rentals)']
None
['$1.7 million (us rentals)']
['$1,450,000 (us rentals)']
['$1.5 million']
['$2.6 million (us rentals)']
['$1604000']
['£110,000 (uk)']
None
None
None
None
None
None
None
None
['$1.4 million']
None
['$2,450,000 (us rentals)']
['$5348000']
['$15.1 million']
None
['$2973000']
None
['$1.7 m

In [23]:
# Display the CPI table that was read from inflation_rate_usa.csv
inflation_table

Unnamed: 0,Year,Jan,Feb,Mar,Apr,May,June,July,Aug,Sep,Oct,Nov,Dec,Avg,Dec-Dec,Avg-Avg
0,1913,9.800,9.800,9.800,9.800,9.700,9.800,9.900,9.900,10.000,10.000,10.100,10.000,9.900,0.0,0.0
1,1914,10.000,9.900,9.900,9.800,9.900,9.900,10.000,10.200,10.200,10.100,10.200,10.100,10.000,1.0,1.0
2,1915,10.100,10.000,9.900,10.000,10.100,10.100,10.100,10.100,10.100,10.200,10.300,10.300,10.100,2.0,1.0
3,1916,10.400,10.400,10.500,10.600,10.700,10.800,10.800,10.900,11.100,11.300,11.500,11.600,10.900,12.6,7.9
4,1917,11.700,12.000,12.000,12.600,12.800,13.000,12.800,13.000,13.300,13.500,13.500,13.700,12.800,18.1,17.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105,2018,247.867,248.991,249.554,250.546,251.588,251.989,252.006,252.146,252.439,252.885,252.038,251.233,251.107,1.9,2.4
106,2019,251.712,252.776,254.202,255.548,256.092,256.143,256.571,256.558,256.759,257.346,257.208,256.974,255.657,2.3,1.8
107,2020,257.971,258.678,258.115,256.389,256.394,257.797,259.101,259.918,260.280,260.388,260.229,260.474,258.811,1.4,1.2
108,2021,261.582,263.014,264.877,267.054,269.195,271.696,273.003,273.567,274.310,276.589,277.948,278.802,270.970,7.0,4.7
