In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
from datetime import date
from datetime import timedelta

START_TIME = time.time()

BASE_URL = "https://datasets.imdbws.com/"
PARQ_PATH = "data/imdb/parquet/"
DOWNLOAD_PATH = "data/imdb/download/"
OUTPUT_PATH = "data/generated/"


FILES_IMDB = {
    "cast_crew" : "title.crew.tsv",
    "tit_bas" : "title.basics.tsv",
    "tit_rate" : "title.ratings.tsv",
    "name_bas" : "name.basics.tsv",
    "tit_prin" : "title.principals.tsv",
}

FILES_IMDB_PARQ = {
    "tit_bas" : "title_basics.parquet",
    "genres" : "genres.parquet",
    "tit_rate" : "title_ratings.parquet",
    'directors' : 'directors.parquet',
    'writers' : 'writers.parquet',
    'prim_prof' : 'primary_profession.parquet',
    'known_for' : 'known_for_titles.parquet',
    'name_bas' : 'name_basics.parquet',
    'const' : 'ids.parquet',
    'ordering' : 'ordering.parquet',
    'character' : 'character.parquet',
    'job' : 'job.parquet',
}

FILES_HAND = {
    "add_seen": "add_movies_seen.txt", # old
    "add_unseen": "add_movies_unseen.txt", # old
    "add_secop": "add_movies_second_opinion.txt", # old
    "raw_status": "raw_status.xlsx",
    "to_add" : "to_add.xlsx"
}

FILES_GENERATED = {
    "films_raw": "films_raw.pkl",
    "films_reading": "films_reading.xlsx"
}

In [2]:
id_stat = os.path.join("data", "handcrafted", FILES_HAND["to_add"])
to_add = pd.read_excel(id_stat)
to_add['watched_date'] = to_add['watched_date'].dt.date
to_add['enjoyment'] = to_add['enjoyment'].astype(float)
to_add['watched'] = to_add['watched'].astype("Int64")
to_add[['netflix','prime','priority']] = to_add[['netflix','prime','priority']].astype("Int64")
to_add['link'] = to_add['link'].str.split("/",expand=True).loc[:,4].astype(str)
to_add = to_add.rename(columns={"link":"tconst"})
to_add = to_add.drop_duplicates()

In [3]:
raw_stat_link = os.path.join("data", "handcrafted", FILES_HAND["raw_status"])
raw_stat = pd.read_excel(raw_stat_link)
raw_stat['watched_date'] = raw_stat['watched_date'].dt.date
raw_stat['enjoyment'] = raw_stat['enjoyment'].astype(float)
raw_stat['watched'] = raw_stat['watched'].astype("Int64").replace(0,np.nan)
raw_stat[['netflix','prime','priority']] = raw_stat[['netflix','prime','priority']].astype("Int64")
raw_stat = raw_stat.set_index("tconst")

* tconst mag worden overschreven
* (done) watched mag alleen worden geupdate als de raw_state waarde een nan was.
* (done) watched_date moet de nieuwste waarde hebben, maar alleen als nieuwe waarde geen nan is.
* (done) netflix en prime mag worden geupdate als het geen nan is.
* enjoyment mag worden geupdate als nieuwe waarde geen 0 is.
* priority mag worden geupdate als raw_state waarde 0 was

kijken hoe de filter_func optie werkt
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html
en doe eerst een test

In [4]:
# raw_stat.update(to_add)

hetvolgende mag meteen worden toegevoegd

In [5]:
nieuw = to_add
to_add = to_add.set_index("tconst")
to_add["watched_date"] = pd.to_datetime(to_add["watched_date"]).dt.date

raw_stat = raw_stat
raw_stat["watched_date"] = pd.to_datetime(raw_stat["watched_date"]).dt.date

direct_toevoegen = to_add[~to_add.index.isin(raw_stat.index)]
door_scanner = to_add[to_add.index.isin(raw_stat.index)]

door_scanner = door_scanner.fillna(-1)
door_scanner.loc[:,"watched"] = door_scanner.loc[:,"watched"].replace(-1,0)
door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))
raw_stat.loc[:,"watched"] = raw_stat.loc[:,"watched"].replace(pd.NA,0)
raw_stat.loc[:,"priority"] = raw_stat.loc[:,"priority"].replace(pd.NA,0)

raw_stat = raw_stat.fillna(-1)
raw_stat.loc[:,"watched"] = raw_stat.loc[:,"watched"].replace(-1,0)
raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))
raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].dt.date
door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].dt.date

  door_scanner.loc[:,"watched"] = door_scanner.loc[:,"watched"].replace(-1,0)
  door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))
  raw_stat.loc[:,"watched"] = raw_stat.loc[:,"watched"].replace(pd.NA,0)
  raw_stat.loc[:,"priority"] = raw_stat.loc[:,"priority"].replace(pd.NA,0)
  raw_stat.loc[:,"watched"] = raw_stat.loc[:,"watched"].replace(-1,0)
  raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))
  raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].dt.date
  door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].dt.date


In [6]:
for index, row in door_scanner.iterrows():
    # if movie is watched, also make it watched in original list
    if door_scanner.loc[index,"watched"] == 1:
        raw_stat.loc[index,"watched"] = 1
    # if watch date of new one is bigger, replace old with new date
    if door_scanner.loc[index,"watched_date"]  > raw_stat.loc[index,"watched_date"]:
        raw_stat.loc[index,"watched_date"] = door_scanner.loc[index,"watched_date"]
    # only update neflix/prime status if status is not null (3)
    if door_scanner.loc[index, "netflix"] != -1:
       raw_stat.loc[index,"netflix"] = door_scanner.loc[index,"netflix"]
    if door_scanner.loc[index, "prime"] != -1:
       raw_stat.loc[index,"prime"] = door_scanner.loc[index,"prime"]
    # only update enjoyment is new value is not NA (-1)
    if door_scanner.loc[index, "enjoyment"] != -1:
       raw_stat.loc[index,"enjoyment"] = door_scanner.loc[index,"enjoyment"]
    # only update priority if old value is NA:
    if raw_stat.loc[index,"priority"] != -1:
        raw_stat.loc[index,"priority"] = door_scanner.loc[index,"priority"]

In [7]:
raw_stat.loc[raw_stat["watched"].isna(), "watched"] = 0 # does nothing if there are no NA's
raw_stat.loc[raw_stat["watched_date"] == pd.to_datetime("1900-1-1"), "watched_date"] = np.NaN
raw_stat["netflix"] = raw_stat["netflix"].replace(-1, pd.NA)
raw_stat["prime"] = raw_stat["prime"].replace(-1, pd.NA)
raw_stat["priority"] = raw_stat["priority"].replace(-1, pd.NA)
raw_stat["enjoyment"] = raw_stat["enjoyment"].replace(-1.0,np.NaN)

direct_toevoegen.loc[direct_toevoegen["watched"].isna(), "watched"] = 0
direct_toevoegen.loc[direct_toevoegen["priority"].isna(), "priority"] = 0
nieuw = pd.concat([raw_stat, direct_toevoegen])

  raw_stat.loc[raw_stat["watched_date"] == pd.to_datetime("1900-1-1"), "watched_date"] = np.NaN


In [8]:
nieuw

Unnamed: 0_level_0,watched,watched_date,netflix,prime,enjoyment,priority
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0015324,0,,,,,0
tt0017136,1,2023-03-04,0,0,3.5,
tt0022100,0,,0,0,,0
tt0025316,0,,,,,0
tt0031381,0,,,,,0
...,...,...,...,...,...,...
tt17024450,0,,,,,0
tt16257866,0,,,,,0
tt17044106,0,,,,,0
tt14539740,0,,,,,0


# wat je misschien beter kan doen is de duplicate waarden van to_add en raw_stat apart houden, dan update() doen op de nieuwe waarden updaten in je aparte gehouden table. en dan weer update doen.