In [1]:
import os
import sys
import time
import numpy as np
import pandas as pd
from datetime import date
from datetime import timedelta

# Wall-clock timestamp (seconds since the epoch) captured when this code runs.
# NOTE(review): presumably used later to report total run time - the consumer
# is not visible in this section.
START_TIME = time.time()

# Remote base URL and local directory prefixes; every value ends with "/".
# NOTE(review): judging by the names, BASE_URL is where the IMDb dumps are
# fetched from, DOWNLOAD_PATH / PARQ_PATH hold the downloaded and converted
# files, and OUTPUT_PATH receives generated artefacts - confirm against the
# code that uses them.
BASE_URL = "https://datasets.imdbws.com/"
PARQ_PATH = "data/imdb/parquet/"
DOWNLOAD_PATH = "data/imdb/download/"
OUTPUT_PATH = "data/generated/"


# Short dataset key -> file name of the corresponding IMDb TSV file.
FILES_IMDB = {
    "cast_crew": "title.crew.tsv",
    "tit_bas": "title.basics.tsv",
    "tit_rate": "title.ratings.tsv",
    "name_bas": "name.basics.tsv",
    "tit_prin": "title.principals.tsv",
}

# Short table key -> file name of the corresponding parquet file.
FILES_IMDB_PARQ = {
    "tit_bas": "title_basics.parquet",
    "genres": "genres.parquet",
    "tit_rate": "title_ratings.parquet",
    "directors": "directors.parquet",
    "writers": "writers.parquet",
    "prim_prof": "primary_profession.parquet",
    "known_for": "known_for_titles.parquet",
    "name_bas": "name_basics.parquet",
    "const": "ids.parquet",
    "ordering": "ordering.parquet",
    "character": "character.parquet",
    "job": "job.parquet",
}

# Short key -> file name of a manually maintained Excel workbook.
FILES_HAND = {
    "raw_status": "raw_status.xlsx",
    "to_add": "to_add.xlsx",
}

# Short key -> file name of an output file produced by this code.
FILES_GENERATED = {
    "films_raw": "films_raw.pkl",
    "films_reading": "films_reading.xlsx",
}


def setAttr(frame):
    """Coerce the tracking columns of ``frame`` to fixed dtypes, in place.

    The frame must contain ``watched_date``, ``watched``, the eight score
    columns and the three flag columns listed below; a missing column
    raises ``KeyError``.

    Conversions applied:

    * ``watched_date``            -> datetime (via ``pd.to_datetime``)
    * score columns               -> ``float``
    * ``watched``                 -> nullable ``Int64``, then every ``0`` is
                                     replaced with ``np.nan``
    * ``netflix``/``prime``/``priority`` -> nullable ``Int64``

    Returns the same ``frame`` object that was passed in.
    """
    score_cols = ['enjoyment', 'story', 'subject', 'acting',
                  'script', 'visual', 'action', 'comedy']
    flag_cols = ['netflix', 'prime', 'priority']

    frame['watched_date'] = pd.to_datetime(frame['watched_date'])
    frame[score_cols] = frame[score_cols].astype(float)

    # Cast first so the comparison with 0 happens on integers, then blank
    # out the zeros.
    watched_as_int = frame['watched'].astype("Int64")
    frame['watched'] = watched_as_int.replace(0, np.nan)

    frame[flag_cols] = frame[flag_cols].astype("Int64")
    return frame


def loadData():
    """Read the two handcrafted Excel workbooks and return them as frames.

    Both workbooks are read from ``data/handcrafted/`` and passed through
    ``setAttr`` for dtype coercion.

    * ``to_add``: the ``link`` column is reduced to its fifth
      "/"-separated segment (index 4, presumably the title id inside an
      IMDb URL - confirm against the workbook), cast to ``str`` and renamed
      to ``tconst``. Duplicate rows are dropped, the surviving row labels
      are kept in a ``row_index`` column and ``tconst`` becomes the index.
    * ``raw_status``: duplicate rows are dropped and the existing
      ``tconst`` column becomes the index.

    In both frames missing ``priority`` / ``watched`` values are set to 0.

    Returns:
        tuple: ``(to_add, raw_stat)``, both indexed by ``tconst``.
    """
    handcrafted_dir = os.path.join("data", "handcrafted")
    zero_fill_cols = ['priority', 'watched']

    def _zero_fill(table):
        # Replace missing priority/watched entries with 0, in place.
        table.loc[:, zero_fill_cols] = table.loc[:, zero_fill_cols].fillna(0)
        return table

    # --- films to add -----------------------------------------------------
    additions_file = os.path.join(handcrafted_dir, FILES_HAND["to_add"])
    additions = setAttr(pd.read_excel(additions_file))

    # Keep only the id segment of each link.
    link_parts = additions['link'].str.split("/", expand=True)
    additions['link'] = link_parts.loc[:, 4].astype(str)

    # Deduplicate, remember the original row labels, then index by id.
    additions = (
        additions.drop_duplicates()
        .rename(columns={"link": "tconst"})
        .assign(row_index=lambda df: df.index)
        .set_index("tconst")
    )
    additions = _zero_fill(additions)

    # --- existing film list -------------------------------------------------
    status_file = os.path.join(handcrafted_dir, FILES_HAND["raw_status"])
    status = setAttr(pd.read_excel(status_file))
    status = _zero_fill(status.drop_duplicates().set_index("tconst"))

    return additions, status