In [1]:
import pandas as pd
import regex
from pathlib import Path

### Load Data

In [2]:
"""
Get all Files and concatenate
"""
# Path to data files (might change)
parent_path = Path(".", "src", "movies", "movies").absolute()

# Collect every "film_info_part_<digit>.csv" inside the data directory.
# - The check runs against the file name only, so it does not depend on the OS path separator.
# - The dot in front of "csv" is escaped so that it only matches a literal ".".
# - sorted() fixes the concatenation order; iterdir() yields entries in arbitrary order.
files_path: list[Path] = []
for data_path in sorted(parent_path.iterdir()):
    if regex.fullmatch(r"film_info_part_\d\.csv", data_path.name) is not None:
        files_path.append(data_path)

if not files_path:
    # pd.concat would otherwise fail with the less helpful "No objects to concatenate"
    raise FileNotFoundError(f"no film_info_part_<n>.csv files found in {parent_path}")

# read all datafiles and concatenate in pandas
dfs = []
lengths = []  # number of rows per file, in the same order as files_path
for file in files_path:
    df = pd.read_csv(file)
    lengths.append(len(df))
    dfs.append(df)

# ignore_index=True gives the combined frame one fresh, continuous index
dataframe = pd.concat(dfs, ignore_index=True)
dataframe

Unnamed: 0,title,year,infobox,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,rottentomatoes_length
0,711 Ocean Drive,1950,"{'Directed by': ['Joseph M. Newman', '', ''], ...",3 Reviews,,100+ Ratings,61.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,1h 42m
1,Abbott and Costello in the Foreign Legion,1950,"{'Directed by': ['Charles Lamont', '', ''], 'W...",3 Reviews,,500+ Ratings,59.0,,1950,Comedy,1h 20m
2,Ambush,1950,"{'Directed by': ['Sam Wood', '', ''], 'Screenp...",3 Reviews,,Fewer than 50 Ratings,62.0,,1949,Western,1h 29m
3,Annie Get Your Gun,1950,"{'Directed by': [' ', ' ', 'George Sidney', 'B...",12 Reviews,100.0,"5,000+ Ratings",67.0,,1950,Musical,1h 47m
4,The Asphalt Jungle,1950,"{'Directed by': ['John Huston', '', ''], 'Scre...",35 Reviews,97.0,"5,000+ Ratings",87.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,1h 52m
...,...,...,...,...,...,...,...,...,...,...,...
12818,The Assassin,2015,"{'': ['Cìkè Niè Yǐnniáng', '', ''], 'Directed ...",128 Reviews,80.0,"5,000+ Ratings",48.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2015,Action/Drama,1h 45m
12819,Go Away Mr. Tumor,2015,"{'Directed by': ['Han Yan', '', ''], 'Starring...",1 Reviews,,250+ Ratings,80.0,"[('amazon-prime-video-us', 'Rent/buy'), ('itun...",2015,Comedy,1h 25m
12820,"Love, At First…",2015,"{'Directed by': ['Tao Hai', '', ''], 'Starring...",0 Reviews,,0 Ratings,,"[('itunes', 'Rent/buy')]",2015,Romance,1h 36m
12821,A Tale of Three Cities,2015,"{'Directed by': ['Mabel Cheung', '', ''], 'Wri...",7 Reviews,43.0,0 Ratings,,,2015,Drama/Foreign,2h 10m


### Filter titles

In [3]:
# Keep only the rows that have a title; no rows are expected to be lost here
dataframe = dataframe[dataframe["title"].notna()]

### Filter years

In [4]:
"""
The release year on Wikipedia and on Rotten Tomatoes can differ, which possibly indicates that the two entries describe different movies altogether.
We simply remove the rows where the two years differ by more than 1 year.
"""
# True for every row whose "year" and "rottentomatoes_year" are more than one year apart
_filter = (dataframe["year"] - dataframe["rottentomatoes_year"]).abs() > 1
different_movies = dataframe.loc[_filter]

print(f"removed {len(different_movies)} titles")
# .copy() makes the filtered frame independent of the frame it was sliced from, so the
# column assignments in later cells do not trigger pandas' SettingWithCopyWarning
dataframe = dataframe.loc[~_filter].copy()
dataframe

removed 1460 titles


Unnamed: 0,title,year,infobox,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,rottentomatoes_length
0,711 Ocean Drive,1950,"{'Directed by': ['Joseph M. Newman', '', ''], ...",3 Reviews,,100+ Ratings,61.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,1h 42m
1,Abbott and Costello in the Foreign Legion,1950,"{'Directed by': ['Charles Lamont', '', ''], 'W...",3 Reviews,,500+ Ratings,59.0,,1950,Comedy,1h 20m
2,Ambush,1950,"{'Directed by': ['Sam Wood', '', ''], 'Screenp...",3 Reviews,,Fewer than 50 Ratings,62.0,,1949,Western,1h 29m
3,Annie Get Your Gun,1950,"{'Directed by': [' ', ' ', 'George Sidney', 'B...",12 Reviews,100.0,"5,000+ Ratings",67.0,,1950,Musical,1h 47m
4,The Asphalt Jungle,1950,"{'Directed by': ['John Huston', '', ''], 'Scre...",35 Reviews,97.0,"5,000+ Ratings",87.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,1h 52m
...,...,...,...,...,...,...,...,...,...,...,...
12818,The Assassin,2015,"{'': ['Cìkè Niè Yǐnniáng', '', ''], 'Directed ...",128 Reviews,80.0,"5,000+ Ratings",48.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2015,Action/Drama,1h 45m
12819,Go Away Mr. Tumor,2015,"{'Directed by': ['Han Yan', '', ''], 'Starring...",1 Reviews,,250+ Ratings,80.0,"[('amazon-prime-video-us', 'Rent/buy'), ('itun...",2015,Comedy,1h 25m
12820,"Love, At First…",2015,"{'Directed by': ['Tao Hai', '', ''], 'Starring...",0 Reviews,,0 Ratings,,"[('itunes', 'Rent/buy')]",2015,Romance,1h 36m
12821,A Tale of Three Cities,2015,"{'Directed by': ['Mabel Cheung', '', ''], 'Wri...",7 Reviews,43.0,0 Ratings,,,2015,Drama/Foreign,2h 10m


### Filter length

In [5]:
"""
We might want to include the runtime and convert it into minutes only 
"""

# Dedicated name: a later cell rebinds `_pattern`, and since globals are looked up at call
# time the function would silently start using that other pattern.
# Hours and minutes are both optional, so "1h 42m", "2h" and "45m" are all accepted.
_runtime_pattern = regex.compile(r"^(?:(\d+)h)?\s*(?:(\d+)m)?$")
def movie_runtime_transformer(x):
    """Convert a runtime string such as "1h 42m" into a number of minutes.

    Parameters:
        x: runtime text; surrounding whitespace is ignored.

    Returns:
        The runtime in minutes as int, or None when x is not a string
        (e.g. NaN for a missing value), is empty, or has an unknown format.
    """
    if not isinstance(x, str):
        return None

    match = _runtime_pattern.match(x.strip())
    # both parts are optional, so the pattern also matches the empty string -> reject it
    if not match or not match.group(0):
        return None

    hours, minutes = match.group(1), match.group(2)
    # convert into minutes only; a missing part counts as 0
    return int(hours or 0) * 60 + int(minutes or 0)

# overwrite the length column with the result of movie_runtime_transformer
_length_column = "rottentomatoes_length"
dataframe[_length_column] = dataframe[_length_column].transform(movie_runtime_transformer)

### Filter Review counts

In [6]:
# Dedicated name so this pattern cannot be confused with (or replaced by) the `_pattern`
# of another cell. It captures the first run of digits in the text.
_count_pattern = regex.compile(r"^\D*(\d+).*$")
def review_count_transformer(x: str):
    """Extract the number from a count text such as "5,000+ Ratings".

    Parameters:
        x: count text; thousands separators (",") are removed before matching.

    Returns:
        The first number found in x as int, e.g. 5000 for "5,000+ Ratings" and
        50 for "Fewer than 50 Ratings". 0 when x contains no digits or is not
        a string (e.g. NaN for a missing value).
    """
    if not isinstance(x, str):
        return 0

    # integers are divided by ',' for every 3 digits
    match = _count_pattern.match(x.replace(",", "").strip())
    return int(match.group(1)) if match else 0

# overwrite both count columns with the numbers extracted by review_count_transformer
for _count_column in ("audience_count", "critics_count"):
    dataframe[_count_column] = dataframe[_count_column].transform(review_count_transformer)
dataframe

Unnamed: 0,title,year,infobox,critics_count,critics_score,audience_count,audience_score,suppliers_list,rottentomatoes_year,rottentomatoes_genre,rottentomatoes_length
0,711 Ocean Drive,1950,"{'Directed by': ['Joseph M. Newman', '', ''], ...",3,,100,61.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,102.0
1,Abbott and Costello in the Foreign Legion,1950,"{'Directed by': ['Charles Lamont', '', ''], 'W...",3,,500,59.0,,1950,Comedy,80.0
2,Ambush,1950,"{'Directed by': ['Sam Wood', '', ''], 'Screenp...",3,,50,62.0,,1949,Western,89.0
3,Annie Get Your Gun,1950,"{'Directed by': [' ', ' ', 'George Sidney', 'B...",12,100.0,5000,67.0,,1950,Musical,107.0
4,The Asphalt Jungle,1950,"{'Directed by': ['John Huston', '', ''], 'Scre...",35,97.0,5000,87.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",1950,Crime/Drama,112.0
...,...,...,...,...,...,...,...,...,...,...,...
12818,The Assassin,2015,"{'': ['Cìkè Niè Yǐnniáng', '', ''], 'Directed ...",128,80.0,5000,48.0,"[('vudu', 'Rent/buy'), ('amazon-prime-video-us...",2015,Action/Drama,105.0
12819,Go Away Mr. Tumor,2015,"{'Directed by': ['Han Yan', '', ''], 'Starring...",1,,250,80.0,"[('amazon-prime-video-us', 'Rent/buy'), ('itun...",2015,Comedy,85.0
12820,"Love, At First…",2015,"{'Directed by': ['Tao Hai', '', ''], 'Starring...",0,,0,,"[('itunes', 'Rent/buy')]",2015,Romance,96.0
12821,A Tale of Three Cities,2015,"{'Directed by': ['Mabel Cheung', '', ''], 'Wri...",7,43.0,0,,,2015,Drama/Foreign,130.0


### Filter Infobox

In [7]:
"""
We want to preserve as much information as possible
The most important ones are budget, box office and release date (with month) which will have to be included
"""
import json

# All patterns are raw strings so that "\s", "\d" ... reach the regex engine unchanged
# (in normal strings they are invalid escape sequences and raise warnings).

# Finds a " that sits between letters, e.g. the one in Edmond O"Brian after every ' was
# turned into ". These are apostrophes and get reverted to ' again.
_quote_mark_pattern = regex.compile(r'([A-Za-z][A-Za-z\s]+"[A-Za-z]+)(?=[^"]*".*)')
# A literal backslash followed by "xa" and digits; two groups: the whole match and the digits
_release_pattern = regex.compile(r"(\\xa(\d+))") # double enclosure group
_structured_number_pattern = regex.compile(r'(?<=".?)((\d+,?)+)(?=")') # commas within natural numbers
_too_many_quotes_pattern = regex.compile(r'("{3,})')    # if 3 or more behind each other

def to_dict_transformer(x: str) -> dict | None:
    """Parse the text form of an infobox into a dict.

    The text uses ' as string delimiter, so it is rewritten into valid JSON
    step by step before it is handed to json.loads.

    Parameters:
        x: infobox text, e.g. "{'Directed by': ['John Huston', '', '']}".

    Returns:
        The parsed dict, or None when x is not a string, cannot be parsed
        even after the repairs, or does not describe a dict.
    """
    if not isinstance(x, str):
        return None  # e.g. NaN for a missing infobox

    x = x.replace("'", "\"")    # needed for json
    # revert wanted '
    for p in _quote_mark_pattern.findall(x):
        x = x.replace(p, p.replace("\"", "'"))

    # replace broken numbers with correct ones: the matched text becomes " <number>"
    for broken, nums in _release_pattern.findall(x):
        x = x.replace(broken, f" {int(nums)}")

    # drop the thousands separators inside quoted numbers
    for broken, _ in _structured_number_pattern.findall(x):
        x = x.replace(broken, broken.replace(",", ""))

    # runs of 3 or more " collapse into an empty string literal
    for p in _too_many_quotes_pattern.findall(x):
        x = x.replace(p, '""')

    try:
        ret_val = json.loads(x)
    except ValueError:
        # json.JSONDecodeError is a ValueError; most errors should be fixed by the repairs above.
        # Other exceptions are no longer swallowed, so real bugs stay visible.
        return None
    return ret_val if isinstance(ret_val, dict) else None

# _str = 'ributed by": ["Loew"s, Inc.", "", ""], "Release date": ["J'
# print(_quote_mark_pattern.findall(_str))

"""
We can't use dataframe.transform here, since pandas would then just reverse all the " and ' replacements
-> parse every infobox into a dict first and create the new tags for all categories at once
"""
# one to_dict_transformer result per row of the "infobox" column, in row order
dicts: list[dict[str, str] | None] = [to_dict_transformer(_infobox) for _infobox in dataframe["infobox"]]


# Pattern that recognizes plural (s at end, not ies) and removes it
# words shorter than 3 letters are not plurals
# How the pattern reads: an "s" that is not preceded by "i", has at least 3 characters of the
# class [A-za-z] directly before it and no character of that class directly after it.
# "(?=_?)" can match the empty string, so it never rejects a position.
# NOTE(review): the range A-z covers more than letters, it also contains "[", "\", "]", "^",
# "_" and "`". Presumably [A-Za-z] was intended. Confirm before changing: with the current
# class an "s" directly followed by "_" is kept, so a fix changes which keys are produced.
_plural_pattern = regex.compile("(?<!i)(?<=[A-za-z]{3,})s(?=_?)(?![A-za-z])")

# base term -> list of synonyms, merged from all dicts of the synonyms file
synonyms_dict = {}
local_path = Path(".", "src", "movies", "filter").absolute()
# explicit encoding: without it open() falls back to the platform default, which is not
# UTF-8 on every system
with open(Path(local_path, "synonyms.json"), encoding="utf-8") as syns:
    synonyms_list: list[dict] = json.load(syns)

# pack into one dict (a key that occurs again later overwrites the earlier entry)
for _dic in synonyms_list:
    synonyms_dict.update(_dic)

# create reverse lookup table: synonym -> base term
reverse_synonym_dict = {item: _key for _key, _list in synonyms_dict.items() for item in _list}

def replace_synonyms(in_str: str) -> str:
    """Look up in_str in reverse_synonym_dict and return the term stored for it.

    Returns in_str unchanged when there is no entry, or when the stored term
    is empty.
    """
    return reverse_synonym_dict.get(in_str) or in_str

def normalize_keys(in_str: str) -> str:
    """Bring an infobox key into a normalized form.

    Steps, in this order: lower-case, strip surrounding whitespace, replace
    spaces with "_", remove digits and round brackets, replace every "ies"
    with "y", and remove what _plural_pattern matches.

    Parameters:
        in_str: the raw key, e.g. "Production companies".

    Returns:
        The normalized key, e.g. "production_company".
    """
    # basic compression
    in_str = in_str.lower().strip().replace(" ", "_")
    # sometimes numbers are included, which we remove to get a smaller set of keys
    # round brackets (e.g. around a single letter) are removed as well
    # raw string: "\d" and "\(" are invalid escape sequences in a normal string
    in_str = regex.sub(r"\d|\(|\)", "", in_str)
    # some tags are given in plural, some in singular. We reduce that to always singular by
    # simply assuming that only plurals end with 's'
    # note that "ies" is replaced wherever it occurs in the key, not only at the end
    in_str = regex.sub("ies", "y", in_str)
    # apply regex based plural filter
    in_str = _plural_pattern.sub("", in_str)

    return in_str

# normalized tag -> number of infoboxes that contain it
category_counter: dict[str, int] = dict()
# normalize categories
for dic in dicts:
    if dic is None:
        continue  # this infobox could not be parsed

    normalized_dic = {}
    # Walk the keys in their original order. Iterating a set (as before) has no stable
    # order, so whenever two raw keys collapsed onto the same tag the surviving value was
    # arbitrary.
    for c_key, value in dic.items():
        # normalize first, then replace synonyms
        normalized_key = replace_synonyms(normalize_keys(c_key))

        # two raw keys of one infobox can end up as the same tag:
        # keep the first value and count the tag only once for this infobox
        if normalized_key in normalized_dic:
            continue
        normalized_dic[normalized_key] = value

        # add new entry or increase counter
        category_counter[normalized_key] = category_counter.get(normalized_key, 0) + 1

    # update in place, `dicts` keeps referencing the same dict objects
    dic.clear()
    dic.update(normalized_dic)

print(category_counter)

{'music_by': 9935, 'country': 10686, 'written_by': 8525, 'edited_by': 9999, 'starring': 10776, 'release_date': 10855, 'production_company': 7861, 'color_proces': 390, 'directed_by': 11017, 'budget': 5260, 'cinematography': 10164, 'language': 10752, 'produced_by': 10374, 'running_time': 10696, 'box_office': 6745, 'distributed_by': 9799, 'narrated_by': 282, 'based_on': 3528, 'screenplay_by': 3772, '': 1377, 'released': 709, 'animation_by': 9, 'author': 32, 'recorded': 287, 'illustrated_by': 5, 'original_title': 14, 'text': 1, 'published': 7, 'published_by': 38, 'length': 509, 'dialogue_by': 13, 'episode': 4, 'production_code': 2, 'original_air_date': 1, 'place_premiered': 8, 'original_language': 138, 'date_premiered': 9, 'genre': 113, 'page': 18, 'publication_date': 28, 'literally': 86, 'audio_format': 45, 'country_of_origin': 131, 'of_episode': 6, 'executive_producer': 70, 'of_season': 2, 'picture_format': 54, 'original_release': 134, 'related': 3, 'original_network': 116, 'created_by':

In [8]:
# Show the infobox column. The parsed infoboxes were collected in `dicts` and not written
# back, so this column still holds the original text.
dataframe["infobox"]

0        {'Directed by': ['Joseph M. Newman', '', ''], ...
1        {'Directed by': ['Charles Lamont', '', ''], 'W...
2        {'Directed by': ['Sam Wood', '', ''], 'Screenp...
3        {'Directed by': [' ', ' ', 'George Sidney', 'B...
4        {'Directed by': ['John Huston', '', ''], 'Scre...
                               ...                        
12818    {'': ['Cìkè Niè Yǐnniáng', '', ''], 'Directed ...
12819    {'Directed by': ['Han Yan', '', ''], 'Starring...
12820    {'Directed by': ['Tao Hai', '', ''], 'Starring...
12821    {'Directed by': ['Mabel Cheung', '', ''], 'Wri...
12822    {'Directed by': ['Cao Baoping', '', ''], 'Scre...
Name: infobox, Length: 11363, dtype: object

### Convert Info Box into new Tags

In [9]:
"""
Split dictionary into single data types for easier referencing
"""
# share of the most frequent tag's count that another tag MUST reach to be included in the dataset
INCLUSION_MARGIN = 1/3
MAX_COUNT = max(category_counter.values())
LOWER_BOUNDARY = int(INCLUSION_MARGIN * MAX_COUNT)

assert len(dicts) == len(dataframe)

# (tag, count) pairs of all tags that reach the boundary
valid_tags: list[tuple[str, int]] = [
    (_name, _count) for _name, _count in category_counter.items() if _count >= LOWER_BOUNDARY
]

# one list per valid tag with one entry per infobox;
# None where the infobox is None/empty or does not contain the tag
new_col_dict: dict[str, list] = {
    _name: [(_info.get(_name) if _info else None) for _info in dicts]
    for _name, _ in valid_tags
}

# check for any lost items
assert all(len(_values) == len(dataframe) for _values in new_col_dict.values())

# add columns to dataframe
existing_keys = list(dataframe.keys())
for _tag, _list in new_col_dict.items():
    # prevent overwriting a column that existed before this loop
    _column_name = _tag + "_info" if _tag in existing_keys else _tag
    dataframe[_column_name] = _list

In [11]:
dataframe.keys()

Index(['title', 'year', 'infobox', 'critics_count', 'critics_score',
       'audience_count', 'audience_score', 'suppliers_list',
       'rottentomatoes_year', 'rottentomatoes_genre', 'rottentomatoes_length',
       'music_by', 'country', 'written_by', 'edited_by', 'starring',
       'release_date', 'production_company', 'directed_by', 'budget',
       'cinematography', 'language', 'produced_by', 'running_time',
       'box_office', 'distributed_by', 'screenplay_by'],
      dtype='object')