# There is a problem when the to_add list contains duplicate tconst values
## only use drop_duplicates() if an index is not set!!!

In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
from datetime import date
from datetime import timedelta

# Wall-clock timestamp captured when this cell runs.
START_TIME = time.time()

# Remote dataset location and the local directories used by the notebook.
BASE_URL = "https://datasets.imdbws.com/"
PARQ_PATH = "data/imdb/parquet/"
DOWNLOAD_PATH = "data/imdb/download/"
OUTPUT_PATH = "data/generated/"


# Short key -> file name of the raw IMDb TSV dumps.
FILES_IMDB = {
    "cast_crew": "title.crew.tsv",
    "tit_bas": "title.basics.tsv",
    "tit_rate": "title.ratings.tsv",
    "name_bas": "name.basics.tsv",
    "tit_prin": "title.principals.tsv",
}

# Short key -> file name of the parquet files.
FILES_IMDB_PARQ = {
    "tit_bas": "title_basics.parquet",
    "genres": "genres.parquet",
    "tit_rate": "title_ratings.parquet",
    "directors": "directors.parquet",
    "writers": "writers.parquet",
    "prim_prof": "primary_profession.parquet",
    "known_for": "known_for_titles.parquet",
    "name_bas": "name_basics.parquet",
    "const": "ids.parquet",
    "ordering": "ordering.parquet",
    "character": "character.parquet",
    "job": "job.parquet",
}

# Hand-maintained Excel sheets (current status list and the films to add).
FILES_HAND = {
    "raw_status": "raw_status.xlsx",
    "to_add": "to_add.xlsx",
}

# File names of the generated outputs.
FILES_GENERATED = {
    "films_raw": "films_raw.pkl",
    "films_reading": "films_reading.xlsx",
}

In [2]:
def setAttr(frame):
    """Cast the status columns of ``frame`` to their working dtypes.

    The frame is modified in place and also returned, so the call can be
    chained directly onto ``pd.read_excel``.
    """
    score_cols = ['enjoyment', 'story', 'subject', 'acting', 'visual', 'action', 'comedy']
    flag_cols = ['netflix', 'prime', 'priority']

    # dates become datetimes, scores become plain floats
    frame['watched_date'] = pd.to_datetime(frame['watched_date'])
    frame[score_cols] = frame[score_cols].astype(float)

    # "watched" is a nullable integer in which 0 is stored as missing
    watched_as_int = frame['watched'].astype("Int64")
    frame['watched'] = watched_as_int.replace(0, np.nan)

    # remaining flags are nullable integers
    frame[flag_cols] = frame[flag_cols].astype("Int64")
    return frame

In [3]:
# modified!!!
def loadData():
    """Read the two handcrafted Excel sheets and prepare them for merging.

    Returns a tuple ``(to_add, raw_stat)``. Both frames are indexed by
    ``tconst`` and have missing ``priority`` / ``watched`` values set to 0.
    ``to_add`` additionally carries a ``row_index`` column holding the row
    label each entry had before ``tconst`` became the index.
    """
    handcrafted_dir = os.path.join("data", "handcrafted")
    zero_filled = ['priority', 'watched']

    # --- films to add -----------------------------------------------------
    to_add_path = os.path.join(handcrafted_dir, FILES_HAND["to_add"])
    to_add = setAttr(pd.read_excel(to_add_path))
    # the tconst is the fifth "/"-separated piece of the link
    link_parts = to_add['link'].str.split("/", expand=True)
    to_add['link'] = link_parts.loc[:, 4].astype(str)
    # drop fully identical rows while tconst is still an ordinary column
    to_add = to_add.drop_duplicates()
    to_add = to_add.rename(columns={"link": "tconst"})
    # keep the row label as a column before it is replaced by tconst
    to_add["row_index"] = to_add.index
    to_add = to_add.set_index("tconst")
    to_add.loc[:, zero_filled] = to_add.loc[:, zero_filled].fillna(0)

    # --- existing film list -----------------------------------------------
    raw_stat_path = os.path.join(handcrafted_dir, FILES_HAND["raw_status"])
    raw_stat = setAttr(pd.read_excel(raw_stat_path))
    raw_stat = raw_stat.drop_duplicates()
    raw_stat = raw_stat.set_index("tconst")
    raw_stat.loc[:, zero_filled] = raw_stat.loc[:, zero_filled].fillna(0)

    return to_add, raw_stat

In [4]:
to_add, raw_stat = loadData()

In [5]:
to_add

Unnamed: 0_level_0,watched_date,enjoyment,story,subject,acting,visual,action,comedy,watched,priority,netflix,prime,row_index
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
tt0800325,NaT,,,,,,,,1,0,,,0
tt0800325,2023-11-12,2.5,3.5,2.0,3.0,2.0,3.0,3.0,1,0,1.0,,1
tt13238346,2023-11-15,3.0,1.0,1.0,4.0,4.0,0.0,1.0,1,0,,,2
tt3381008,NaT,,,,,,,,0,0,,,3
tt0320691,NaT,,,,,,,,0,1,,,4
tt23181388,NaT,,,,,,,,0,1,,,5
tt0320691,NaT,,,,,,,,0,0,,,6
tt1448754,NaT,,,,,,,,0,1,,,7
tt1448754,NaT,,,,,,,,0,0,,,8
tt14570440,NaT,,,,,,,,1,0,,,10


In [6]:
raw_stat

Unnamed: 0_level_0,watched,watched_date,netflix,prime,story,subject,acting,visual,action,comedy,enjoyment,priority
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
tt0015324,0,NaT,,,,,,,,,,0
tt0017136,1,2023-03-04,0,0,,,,,,,3.5,0
tt0017925,0,NaT,,,,,,,,,,0
tt0022100,1,2023-05-31,0,0,,,,,,,3.0,0
tt0025316,0,NaT,,,,,,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...,...
tt9755806,0,NaT,,,,,,,,,,0
tt9764362,1,NaT,0,0,,,,,,,4.0,0
tt9783600,1,NaT,,,,,,,,,3.0,0
tt9806192,1,NaT,,,,,,,,,4.0,0


In [8]:
def removeDuplicates(dup_rows):
    """Collapse rows that share a tconst into one merged row per tconst.

    Parameters
    ----------
    dup_rows : pandas.DataFrame
        Frame indexed by ``tconst`` with the columns ``watched``,
        ``priority``, ``netflix``, ``prime``, ``watched_date`` and the seven
        score columns. A ``row_index`` column giving the original row order
        is used as a tie-breaker; when it is absent the current row order is
        used instead.

    Returns
    -------
    pandas.DataFrame
        A new frame indexed by ``tconst`` without the ``row_index`` column
        and with at most one row per (non-missing) tconst, in order of first
        appearance. The input frame is not modified.

    Merge rules per tconst
    ----------------------
    * ``watched`` / ``priority``: 1 for the whole group if the values sum to
      at least 1.
    * ``netflix`` / ``prime``: 1 for the whole group if any row holds a 1.
    * scores: the value from the most recent row that has one. Rows are
      ranked by ``watched_date`` and then ``row_index``; rows without a date
      rank lowest so they never override a dated score.
    * ``watched_date``: the most recent date in the group, if there is one.
    * any column still differing after these rules takes the value of the
      first row of the group.
    """
    score_columns = ["enjoyment", "story", "subject", "acting",
                     "visual", "action", "comedy"]

    # Work on a copy so the caller's frame (often a slice) stays untouched.
    merged = dup_rows.copy()
    if "row_index" not in merged.columns:
        merged["row_index"] = range(len(merged))

    # Visit every tconst once instead of once per row.
    for tconst in merged.index.unique():
        in_group = merged.index == tconst
        if not in_group.any():
            # a missing label never compares equal; leave such rows as-is
            continue
        group = merged.loc[in_group]

        # the easy boolean values (searching for the value 1)
        for flag in ("watched", "priority"):
            if group[flag].sum() >= 1:
                merged.loc[in_group, flag] = 1
        for flag in ("netflix", "prime"):
            if 1 in group[flag].dropna().tolist():
                merged.loc[in_group, flag] = 1

        # Oldest first, undated rows before everything else, so the last
        # non-missing value in each column is the most recent one.
        ranked = group.sort_values(["watched_date", "row_index"],
                                   na_position="first")
        for column_key in score_columns:
            known = ranked[column_key].dropna()
            if not known.empty:
                merged.loc[in_group, column_key] = known.iloc[-1]

        # set date to most current (stays missing when no row has a date)
        most_cur_date = group["watched_date"].max()
        if not pd.isna(most_cur_date):
            merged.loc[in_group, "watched_date"] = most_cur_date

    # Drop exact duplicates, then guarantee a single row per tconst even if
    # a column that is not merged above still differs within a group.
    merged = merged.drop(columns=["row_index"]).reset_index().drop_duplicates()
    tconst_values = merged["tconst"]
    keep = ~tconst_values.duplicated(keep="first") | tconst_values.isna()
    return merged[keep].set_index("tconst")

In [None]:
# Planned steps:
# 1) get duplicates in the to_add list
# 2) fix duplicates in to_add
# 3) merge to_add [fixed duplicates and non-duplicates]
# 4) find to_add tconst that are already in raw_stat
# 5) take those out of raw_stat and merge them with to_add duplicates
# 6) fix [raw_stat and to_add] duplicates
# 7) take 6 and merge them with the to_add movies that are not already in raw_stat
# 8) take 7 and add them to the movie list
#
# This cell only handles the movies that are NOT yet in raw_stat (roughly
# steps 1-3 restricted to new movies); steps 4-8 still have to be written.

# movies to add that are not yet in the movie list
check_for_duplicates = to_add[~to_add.index.isin(raw_stat.index)]

# only the rows whose tconst occurs more than once need to go through
# removeDuplicates, otherwise the loop gets a lot bigger
is_duplicated = check_for_duplicates.index.duplicated(keep=False)

# merge the duplicated rows; pass a copy so the slice itself is never written to
dup_input_rows = removeDuplicates(check_for_duplicates[is_duplicated].copy())

# rows with a unique tconst can be added as they are
can_be_added = check_for_duplicates[~is_duplicated]

# NOTE: removeDuplicates drops "row_index", so after the concat that column
# is only filled for the rows coming from can_be_added.
to_add = pd.concat([dup_input_rows, can_be_added])
to_add

# continue after the above

In [7]:
# old code, can be used for snippets
# """Adds new movies to raw_stat, and get movies that need to be updated."""
# # adding new films and creating a subset of films to update
# check_for_duplicates = to_add[~to_add.index.isin(raw_stat.index)] # direct_toevoegen changed for check_for_duplicates
# check_for_duplicates
# # door_scanner = to_add[to_add.index.isin(raw_stat.index)]
# # raw_stat = pd.concat([raw_stat, direct_toevoegen])


# # # preparing data for comparison
# # door_scanner = door_scanner.fillna(-1)
# # door_scanner.loc[:,"watched"] = door_scanner.loc[:,"watched"].replace(-1,0)
# # door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))

# # raw_stat = raw_stat.fillna(-1)
# # raw_stat.loc[:,"watched"] = raw_stat.loc[:,"watched"].replace(-1,0)
# # raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))

# # # get the movies for adding that are already in the movie list
# # identical_rows = pd.merge(door_scanner.reset_index(drop=False),
# #                         raw_stat.reset_index(drop=False),
# #                         on=door_scanner.reset_index(drop=False).columns.values.tolist(),
# #                         how='inner')['tconst']
# # changed_rows = door_scanner[~door_scanner.index.isin(identical_rows)]


Unnamed: 0_level_0,watched_date,enjoyment,story,subject,acting,visual,action,comedy,watched,priority,netflix,prime,row_index
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
tt0800325,NaT,,,,,,,,1,0,,,0
tt0800325,2023-11-12,2.5,3.5,2.0,3.0,2.0,3.0,3.0,1,0,1.0,,1
tt13238346,2023-11-15,3.0,1.0,1.0,4.0,4.0,0.0,1.0,1,0,,,2
tt3381008,NaT,,,,,,,,0,0,,,3
tt23181388,NaT,,,,,,,,0,1,,,5
tt1448754,NaT,,,,,,,,0,1,,,7
tt1448754,NaT,,,,,,,,0,0,,,8
tt14570440,NaT,,,,,,,,1,0,,,10
tt14570440,2023-10-07,,,,,3.0,,,0,0,,,13
tt14570440,2023-10-08,4.0,4.0,4.0,4.0,2.0,4.0,4.0,1,0,,,14
