In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
from datetime import date
from datetime import timedelta

START_TIME = time.time()

BASE_URL = "https://datasets.imdbws.com/"
PARQ_PATH = "data/imdb/parquet/"
DOWNLOAD_PATH = "data/imdb/download/"
OUTPUT_PATH = "data/generated/"


FILES_IMDB = {
    "cast_crew" : "title.crew.tsv",
    "tit_bas" : "title.basics.tsv",
    "tit_rate" : "title.ratings.tsv",
    "name_bas" : "name.basics.tsv",
    "tit_prin" : "title.principals.tsv",
}

FILES_IMDB_PARQ = {
    "tit_bas" : "title_basics.parquet",
    "genres" : "genres.parquet",
    "tit_rate" : "title_ratings.parquet",
    'directors' : 'directors.parquet',
    'writers' : 'writers.parquet',
    'prim_prof' : 'primary_profession.parquet',
    'known_for' : 'known_for_titles.parquet',
    'name_bas' : 'name_basics.parquet',
    'const' : 'ids.parquet',
    'ordering' : 'ordering.parquet',
    'character' : 'character.parquet',
    'job' : 'job.parquet',
}

FILES_HAND = {
    "add_seen": "add_movies_seen.txt", # old
    "add_unseen": "add_movies_unseen.txt", # old
    "add_secop": "add_movies_second_opinion.txt", # old
    "raw_status": "raw_status.xlsx",
    "to_add" : "to_add.xlsx"
}

FILES_GENERATED = {
    "films_raw": "films_raw.pkl",
    "films_reading": "films_reading.xlsx"
}

# wat je misschien beter kan doen is de duplicate waarden van to_add en raw_stat apart houden, dan update() doen op de nieuwe waarden updaten in je aparte gehouden table. en dan weer update doen.

In [2]:
def setAttr(frame):
    # setting column types
    frame['watched_date'] = pd.to_datetime(frame['watched_date'])
    frame['enjoyment'] = frame['enjoyment'].astype(float)
    frame['watched'] = frame['watched'].astype("Int64").replace(0,np.nan)
    frame[['netflix','prime','priority']] = frame[['netflix','prime','priority']].astype("Int64")
    frame = frame.drop_duplicates()
    return frame


# loading and preparing films to add
id_stat = os.path.join("data", "handcrafted", FILES_HAND["to_add"])
to_add = pd.read_excel(id_stat)
to_add = setAttr(to_add)
to_add['link'] = to_add['link'].str.split("/",expand=True).loc[:,4].astype(str)
to_add = to_add.rename(columns={"link":"tconst"}).set_index("tconst")

# loading and preparing film list
raw_stat_link = os.path.join("data", "handcrafted", FILES_HAND["raw_status"])
raw_stat = pd.read_excel(raw_stat_link)
raw_stat = setAttr(raw_stat)
raw_stat = raw_stat.set_index("tconst")

# raw_stat['watched_date'] = raw_stat['watched_date'].dt.date
# raw_stat['enjoyment'] = raw_stat['enjoyment'].astype(float)
# raw_stat['watched'] = raw_stat['watched'].astype("Int64").replace(0,np.nan)
# raw_stat[['netflix','prime','priority']] = raw_stat[['netflix','prime','priority']].astype("Int64")
# raw_stat = raw_stat.set_index("tconst")


# to_add['watched_date'] = to_add['watched_date'].dt.date
# to_add['enjoyment'] = to_add['enjoyment'].astype(float)
# to_add['watched'] = to_add['watched'].astype("Int64")
# to_add[['netflix','prime','priority']] = to_add[['netflix','prime','priority']].astype("Int64")
# to_add['link'] = to_add['link'].str.split("/",expand=True).loc[:,4].astype(str)
# to_add = to_add.rename(columns={"link":"tconst"})
# to_add = to_add.drop_duplicates()

In [3]:
to_add.loc["tt0066434"]

watched_date    2023-03-01 00:00:00
enjoyment                       3.0
watched                           1
priority                       <NA>
netflix                        <NA>
prime                          <NA>
Name: tt0066434, dtype: object

In [4]:
# adding new films and creating a subset of films to update
direct_toevoegen = to_add[~to_add.index.isin(raw_stat.index)]
door_scanner = to_add[to_add.index.isin(raw_stat.index)]
raw_stat = pd.concat([raw_stat, direct_toevoegen])

In [5]:
# raw_stat_link = os.path.join("data", "handcrafted", FILES_HAND["raw_status"])
# raw_stat = pd.read_excel(raw_stat_link)
# raw_stat['watched_date'] = raw_stat['watched_date'].dt.date
# raw_stat['enjoyment'] = raw_stat['enjoyment'].astype(float)
# raw_stat['watched'] = raw_stat['watched'].astype("Int64").replace(0,np.nan)
# raw_stat[['netflix','prime','priority']] = raw_stat[['netflix','prime','priority']].astype("Int64")
# raw_stat = raw_stat.set_index("tconst")

* tconst mag worden overschreven
* (done) watched mag alleen worden geupdate als de raw_state waarde een nan was.
* (done) watched_date moet de nieuwste waarde hebben, maar alleen als nieuwe waarde geen nan is.
* (done) netflix en prime mag worden geupdate als het geen nan is.
* enjoyment mag worden geupdate als nieuwe waarde geen 0 is.
* priority mag worden geupdate als raw_state waarde 0 was

kijken hoe de filter_func optie werkt
https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.update.html
en doe eerst een test

In [6]:
# preparing data for comparison
door_scanner = door_scanner.fillna(-1)
door_scanner.loc[:,"watched"] = door_scanner.loc[:,"watched"].replace(-1,0)
door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))
# door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].dt.date

  door_scanner.loc[:,"watched"] = door_scanner.loc[:,"watched"].replace(-1,0)
  door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))


In [7]:
raw_stat = raw_stat.fillna(-1)
raw_stat.loc[:,"watched"] = raw_stat.loc[:,"watched"].replace(-1,0)
raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))
# raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].dt.date

  raw_stat.loc[:,"watched"] = raw_stat.loc[:,"watched"].replace(-1,0)
  raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))


In [8]:
raw_stat

Unnamed: 0_level_0,watched,watched_date,netflix,prime,enjoyment,priority
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0015324,0,1900-01-01,-1,-1,-1.0,-1
tt0017136,1,2023-01-01,0,0,-1.0,-1
tt0022100,0,1900-01-01,0,0,-1.0,-1
tt0025316,0,1900-01-01,-1,-1,-1.0,-1
tt0031381,0,1900-01-01,-1,-1,-1.0,-1
...,...,...,...,...,...,...
tt17024450,0,1900-01-01,-1,-1,-1.0,-1
tt16257866,0,1900-01-01,-1,-1,-1.0,-1
tt17044106,0,1900-01-01,-1,-1,-1.0,-1
tt14539740,0,1900-01-01,-1,-1,-1.0,-1


In [9]:
door_scanner[door_scanner.index.isin(raw_stat.index)]

Unnamed: 0_level_0,watched_date,enjoyment,watched,priority,netflix,prime
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0017136,2023-03-04,3.5,1,-1,-1,-1
tt14209916,2023-03-11,2.5,1,-1,-1,-1
tt1016150,2023-03-12,3.5,1,-1,-1,-1
tt1921064,2023-03-11,2.5,1,-1,-1,-1
tt0795368,2023-03-12,2.0,1,-1,-1,-1
tt0094142,1900-01-01,-1.0,1,-1,-1,-1
tt1499658,2023-03-12,2.0,1,-1,-1,-1
tt0884328,1900-01-01,-1.0,1,-1,-1,-1
tt0443706,1900-01-01,-1.0,1,-1,-1,-1
tt1504320,1900-01-01,-1.0,1,-1,-1,-1


remove door_scanner rows that are identical to the ones in raw_stat

In [10]:
identical_rows = pd.merge(door_scanner.reset_index(drop=False),
                          raw_stat.reset_index(drop=False),
                          on=door_scanner.reset_index(drop=False).columns.values.tolist(),
                          how='inner')['tconst']
changed_rows = door_scanner[~door_scanner.index.isin(identical_rows)]

In [11]:
changed_rows

Unnamed: 0_level_0,watched_date,enjoyment,watched,priority,netflix,prime
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
tt0017136,2023-03-04,3.5,1,-1,-1,-1
tt14209916,2023-03-11,2.5,1,-1,-1,-1
tt1016150,2023-03-12,3.5,1,-1,-1,-1
tt0443706,1900-01-01,-1.0,1,-1,-1,-1
tt1677720,1900-01-01,-1.0,1,-1,-1,-1
tt10288566,1900-01-01,-1.0,1,-1,-1,-1
tt15398776,1900-01-01,-1.0,0,1,-1,-1


In [12]:
raw_stat.loc['tt15398776']

watched                           0
watched_date    1900-01-01 00:00:00
netflix                           0
prime                             0
enjoyment                      -1.0
priority                          1
Name: tt15398776, dtype: object

In [13]:
to_add.loc['tt15398776']

watched_date     NaT
enjoyment        NaN
watched         <NA>
priority           1
netflix         <NA>
prime           <NA>
Name: tt15398776, dtype: object

In [14]:
# to_add = to_add.set_index("tconst")
# to_add["watched_date"] = pd.to_datetime(to_add["watched_date"]).dt.date

# raw_stat = raw_stat
# raw_stat["watched_date"] = pd.to_datetime(raw_stat["watched_date"]).dt.date

# direct_toevoegen = to_add[~to_add.index.isin(raw_stat.index)]
# door_scanner = to_add[to_add.index.isin(raw_stat.index)]

# door_scanner = door_scanner.fillna(-1)
# door_scanner.loc[:,"watched"] = door_scanner.loc[:,"watched"].replace(-1,0)
# door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))
# raw_stat.loc[:,"watched"] = raw_stat.loc[:,"watched"].replace(pd.NA,0)
# raw_stat.loc[:,"priority"] = raw_stat.loc[:,"priority"].replace(pd.NA,0)

# raw_stat = raw_stat.fillna(-1)
# raw_stat.loc[:,"watched"] = raw_stat.loc[:,"watched"].replace(-1,0)
# raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].replace(-1,pd.to_datetime("1/1/1900"))
# raw_stat.loc[:,"watched_date"] = raw_stat.loc[:,"watched_date"].dt.date
# door_scanner.loc[:,"watched_date"] = door_scanner.loc[:,"watched_date"].dt.date

i removed tt0066434.
the problem could be that i updated the raw status file before determening missing movies.
It should not have to go through the loop, because it's just added fully, not a comparison.
de scores die geupdate moeten worden staan er al in???? huh?

update: scores are updated the right way. 

In [15]:
for index, row in changed_rows.iterrows():
    # if movie is watched, also make it watched in original list
    if changed_rows.loc[index,"watched"] == 1:
        raw_stat.loc[index,"watched"] = 1
    # if watch date of new one is bigger, replace old with new date
    if changed_rows.loc[index,"watched_date"]  > raw_stat.loc[index,"watched_date"]:
        raw_stat.loc[index,"watched_date"] = changed_rows.loc[index,"watched_date"]
    # only update neflix/prime status if status is not null (3)
    if changed_rows.loc[index, "netflix"] != -1:
       raw_stat.loc[index,"netflix"] = changed_rows.loc[index,"netflix"]
    if changed_rows.loc[index, "prime"] != -1:
       raw_stat.loc[index,"prime"] = changed_rows.loc[index,"prime"]
    # only update enjoyment is new value is not NA (-1)
    if changed_rows.loc[index, "enjoyment"] != -1:
        raw_stat.loc[index,"enjoyment"] = changed_rows.loc[index,"enjoyment"]
    # only update priority if old value is NA:
    if raw_stat.loc[index,"priority"] not in [-1, 1]:
        raw_stat.loc[index,"priority"] = changed_rows.loc[index,"priority"]

In [16]:
raw_stat.loc[:,['netflix','prime','priority']] = raw_stat.loc[:,['netflix','prime','priority']].replace(-1,pd.NA)
raw_stat["enjoyment"] = raw_stat["enjoyment"].replace(-1,np.NaN)
raw_stat["watched_date"] = raw_stat["watched_date"].replace(pd.to_datetime("1900-1-1"), np.NaN)

  raw_stat.loc[:,['netflix','prime','priority']] = raw_stat.loc[:,['netflix','prime','priority']].replace(-1,pd.NA)


In [17]:
# empty to._add.xlsx
new_empty = pd.DataFrame(data=None, columns=["link"]+to_add.columns.to_list())
to_add = os.path.join("data", "handcrafted", FILES_HAND["to_add"])
new_empty.to_excel(to_add, index=False)

# overwrite raw_status
output = os.path.join("data", "handcrafted", FILES_HAND["raw_status"])
raw_stat.to_excel(output)