# todo list
- directors and writers must get column names
- optimize
- check the remaining TODOs
- remove redundancy by making functions
- sqlite vs parquet

## there is redundancy in functions:
- createGenreTables
- createTitleType
- createNameBasicsTables
- makeLinkTableAndCatTable

The following code must be made into a function

In [14]:
# # getting ready to convert to numbers
# genre_converter = title_basics_genres.loc[:,'genre'].factorize()
# nummerized = genre_converter[0]
# converter = genre_converter[1].codes
# cats = genre_converter[1].categories

# # convert genres in link table to numbers/categories
# title_basics_genres.loc[:,'genre'] = nummerized
# title_basics_genres.loc[:,'genre'] = title_basics_genres.loc[:,'genre'].astype('Int64')

# # create genres table
# # todo: maybe convert to datatype category
# genres = pd.DataFrame({"genre": [ cats[converter[x]] for x in range(len(converter))]})
# genres.index.name = "index"

In [1]:
import csv
import gzip
import os
import re
import shutil
import sys
import time
from datetime import date

import numpy as np
import pandas as pd
import requests

# Timestamp taken when this cell runs (presumably to report elapsed time later).
START_TIME = time.time()

# Base URL of the IMDb dataset files.
BASE_URL = "https://datasets.imdbws.com/"

# Short key -> file name of each IMDb TSV dump that is processed below.
FILES_IMDB = dict(
    tit_bas="title.basics.tsv",
    tit_rate="title.ratings.tsv",
    name_bas="name.basics.tsv",
    tit_prin="title.principals.tsv",
    cast_crew="title.crew.tsv",
)

# Short key -> file name of the additional input files
# (presumably maintained by hand, going by the constant's name).
FILES_HAND = dict(
    add_seen="add_movies_seen.txt",
    add_unseen="add_movies_unseen.txt",
    add_secop="add_movies_second_opinion.txt",
    raw_status="raw_status.xlsx",
)

# Short key -> file name of the files this project writes.
FILES_GENERATED = dict(
    films_raw="films_raw.pkl",
    films_reading="films_reading.xlsx",
)

# functions

In [2]:
def setTNconst(df, key, set_index, drop_col):
    """Convert an IMDb identifier column (``tconst`` / ``nconst``) to integers.

    The first two characters (the ``tt`` / ``nm`` prefix) are cut off and the
    remainder is cast to the nullable ``Int64`` dtype, so missing identifiers
    stay missing instead of breaking the cast.

    Args:
        df: DataFrame holding the identifier column as strings. The column is
            overwritten on this frame, so the caller's frame is modified.
        key: Name of the identifier column.
        set_index: When true, ``key`` becomes the index of the returned frame.
        drop_col: Forwarded as ``drop=`` to ``DataFrame.set_index``; only
            used when ``set_index`` is true.

    Returns:
        The frame with the converted identifier: ``df`` itself when
        ``set_index`` is false, otherwise the result of ``set_index``.
    """
    # Assign through df[key] instead of df.loc[:, key]: since pandas 2.0 the
    # .loc form writes the new values into the existing object column and
    # keeps its dtype, so the column would never really become Int64.
    #
    # The two steps are not chained, because chaining can cause memory
    # issues with big files.
    df[key] = df[key].str.slice(2)
    df[key] = df[key].astype('Int64')
    if set_index:
        df = df.set_index(key, drop=drop_col)
    return df

# convert title.basics

In [3]:
def cleanTitlesBasic(tit_bas):
    """Give the columns of the title.basics table their proper dtypes.

    Args:
        tit_bas: Raw title.basics DataFrame with string ``tconst`` values and
            missing values already represented as NaN.

    Returns:
        DataFrame indexed by the integer ``tconst``, with ``titleType`` as a
        category, ``startYear`` / ``endYear`` / ``runtimeMinutes`` as
        nullable ``Int64`` and the ``isAdult`` column removed. The ``genres``
        column is left untouched; it is split elsewhere.
    """
    tit_bas = setTNconst(tit_bas, 'tconst', set_index=True, drop_col=True)

    # Columns are replaced with tit_bas[col] = ... rather than
    # tit_bas.loc[:, col] = ...: since pandas 2.0 the .loc form writes into
    # the existing column and keeps the old (object) dtype.

    # titleType
    # There are a few null values (np.nan)
    # convert to category
    tit_bas["titleType"] = tit_bas["titleType"].astype('category')

    # primaryTitle & originalTitle
    # Is already either string or np.nan

    # isAdult
    # todo: don't drop this
    tit_bas = tit_bas.drop(columns='isAdult')
    # # Those movies are checked, and were not adult movies.
    # # movies that are not an 1 or '1' aren't adult movies
    # tit_bas.loc[:,"isAdult"] = tit_bas.loc[:,"isAdult"].fillna(0).astype('str')
    # tit_bas.loc[tit_bas.loc[:,'isAdult'] != '1', 'isAdult'] = '0'
    # tit_bas.loc[:,"isAdult"] = tit_bas.loc[:,"isAdult"].map({'1': True, '0': False})

    # startYear & endYear & runtimeMinutes
    # These contain a lot of np.nan, hence the nullable integer dtype.
    for column in ("startYear", "endYear", "runtimeMinutes"):
        tit_bas[column] = tit_bas[column].astype('Int64')

    # genres
    # will be split elsewhere

    return tit_bas

In [4]:
def createGenreTables(title_basics):
    """Split the comma-separated ``genres`` column into normalised tables.

    Args:
        title_basics: DataFrame indexed by ``tconst`` with a ``genres``
            column of comma-separated genre names (missing values allowed).

    Returns:
        A tuple ``(title_basics, title_basics_genres, genres)``:

        * ``title_basics`` without the ``genres`` column,
        * ``title_basics_genres``: link table indexed by ``tconst`` with one
          row per (title, genre) pair and an ``Int64`` column ``genre``
          holding the genre code,
        * ``genres``: lookup table whose row position is the genre code and
          whose column ``genre`` is the genre name; its index is named
          ``"index"``.
    """
    genres_raw = title_basics['genres']
    title_basics = title_basics.drop(columns='genres')

    # One row per (title, genre) pair, in the original row order. Titles
    # without genres produce NaN and are dropped from the link table.
    genre_names = genres_raw.str.split(",").explode().dropna()

    # pd.factorize numbers the names in order of first appearance and works
    # on plain string columns, so no categorical round-trip is needed.
    codes, uniques = pd.factorize(genre_names)

    # create link table
    title_basics_genres = pd.DataFrame(
        {"genre": pd.array(codes, dtype="Int64")},
        index=genre_names.index.rename("tconst"),
    )

    # create genres table
    # todo: maybe convert to datatype category
    genres = pd.DataFrame({"genre": list(uniques)})
    genres.index.name = "index"

    return title_basics, title_basics_genres, genres

In [5]:
def createTitleType(title_basics):
    """Replace ``titleType`` by integer codes and build the lookup table.

    Args:
        title_basics: DataFrame with a ``titleType`` column (strings or
            categories). The column is overwritten on this frame.

    Returns:
        A tuple ``(title_basics, title_type)``:

        * ``title_basics`` with ``titleType`` as ``Int64`` codes, numbered in
          order of first appearance; missing values get the code ``-1``
          (the sentinel ``pd.factorize`` uses),
        * ``title_type``: lookup table whose row position is the code; its
          index is named ``"index"``.
    """
    # pd.factorize works for plain string columns as well as categoricals.
    codes, uniques = pd.factorize(title_basics['titleType'])

    # convert to numbers
    # Assign the whole column (not .loc[:, ...]) so the dtype really changes.
    title_basics['titleType'] = pd.array(codes, dtype='Int64')

    # create titleType table
    # todo: maybe convert to datatype category
    # NOTE(review): the lookup column is called "genre", which looks like a
    # copy-paste leftover; the name is kept so existing consumers keep working.
    title_type = pd.DataFrame({"genre": list(uniques)})
    title_type.index.name = "index"

    return title_basics, title_type

In [6]:
title_basics_path = os.path.join("data/imdb/", FILES_IMDB['tit_bas'])
# Quote characters are ordinary text in this file, so quote handling is
# switched off explicitly. "\N" marks a missing value; letting read_csv map it
# to NaN avoids a second pass over the whole frame.
title_basics = pd.read_csv(
    title_basics_path,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    na_values="\\N",
    low_memory=False,
)

# Clean the dtypes, then split genres and title types off into their own tables.
title_basics = cleanTitlesBasic(title_basics)
title_basics, title_basics_genres, genres = createGenreTables(title_basics)
title_basics, title_type = createTitleType(title_basics)

# convert title rate

In [11]:
def convertTitleRate(title_rate):
    """Give the columns of the title.ratings table their proper dtypes.

    Args:
        title_rate: Raw title.ratings DataFrame with string ``tconst`` values.

    Returns:
        DataFrame indexed by the integer ``tconst`` with ``averageRating`` as
        float and ``numVotes`` as int.
    """
    title_rate = setTNconst(title_rate, 'tconst', set_index=True, drop_col=True)

    # Whole-column assignment (instead of .loc[:, col] = ...) makes sure the
    # column takes the new dtype; since pandas 2.0 the .loc form keeps the
    # dtype of the existing column.

    # convert averageRating
    title_rate['averageRating'] = title_rate['averageRating'].astype('float')

    # convert numVotes
    title_rate['numVotes'] = title_rate['numVotes'].astype('int')

    return title_rate

In [12]:
title_rate_path = os.path.join("data/imdb/", FILES_IMDB['tit_rate'])
# Quote handling is switched off explicitly and "\N" is read as a missing
# value directly, instead of replacing it in a second pass over the frame.
title_rate = pd.read_csv(
    title_rate_path,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    na_values="\\N",
    low_memory=False,
)

title_rate = convertTitleRate(title_rate)

# convert title.crew

In [14]:
def _explodeNconstList(id_lists):
    """Turn a Series of comma-separated ``nm...`` ids into one Int64 id per row."""
    ids = id_lists.dropna()
    # The prefix is a literal string, not a pattern.
    ids = ids.str.replace("nm", "", regex=False)
    ids = ids.str.split(',')
    # explode repeats the index label for every id of the same row.
    return ids.explode().astype("Int64")


def convertDirectorsAndWriters(title_crew):
    """Split the title.crew table into a directors and a writers link table.

    Args:
        title_crew: Raw title.crew DataFrame with string ``tconst`` values
            and comma-separated ``nm...`` ids in the ``directors`` and
            ``writers`` columns (missing values as NaN).

    Returns:
        A tuple ``(directors, writers)`` of Series indexed by the integer
        ``tconst``; each holds one ``Int64`` person id per row, and titles
        without directors / writers do not appear.
    """
    title_crew = setTNconst(title_crew, 'tconst', set_index=True, drop_col=True)

    # Both columns have the same layout, so they share one helper.
    directors = _explodeNconstList(title_crew['directors'])
    writers = _explodeNconstList(title_crew['writers'])

    return directors, writers

In [15]:
title_crew_path = os.path.join("data/imdb/", FILES_IMDB['cast_crew'])
# Quote handling is switched off explicitly and "\N" is read as a missing
# value directly, instead of replacing it in a second pass over the frame.
title_crew = pd.read_csv(
    title_crew_path,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    na_values="\\N",
    low_memory=True,
)

directors, writers = convertDirectorsAndWriters(title_crew)

# convert name basics

In [2]:
def createNameBasicsTables(name_bas):
    """Clean the name.basics table and split off its list-valued columns.

    Args:
        name_bas: Raw name.basics DataFrame with string ``nconst`` values,
            comma-separated ``knownForTitles`` (``tt...`` ids) and
            comma-separated ``primaryProfession`` names; missing values
            are NaN.

    Returns:
        A tuple ``(name_bas, name_bas_prim_prof, professions, knownfor)``:

        * ``name_bas`` indexed by the integer ``nconst``, with ``Int64``
          ``birthYear`` / ``deathYear`` and without the two list columns,
        * ``name_bas_prim_prof``: link table indexed by ``nconst`` with one
          row per (person, profession) pair and an ``Int64`` profession code,
        * ``professions``: lookup table whose row position is the profession
          code and whose categorical column ``profession`` is the name; its
          index is named ``"index"``,
        * ``knownfor``: Series indexed by ``nconst`` with one ``Int64`` title
          id per row.
    """
    name_bas = setTNconst(name_bas, 'nconst', set_index=True, drop_col=True)

    # primaryName
    ## Some names have quotes, and some names are missing.
    ## We don't have to modify them.

    # birthYear & deathYear have a lot of missing values.
    # Some are year 4, or 12; those are ancient writers like nm0653992 (Ovid)
    # whose work was used for the movie.
    # Whole-column assignment (not .loc[:, col] = ...) so the dtype changes.
    for column in ("birthYear", "deathYear"):
        name_bas[column] = name_bas[column].astype('Int64')

    # knownForTitles
    # turn this into a table of its own: one title id per row
    knownfor = name_bas["knownForTitles"].dropna()
    knownfor = knownfor.str.replace("tt", "", regex=False)
    knownfor = knownfor.str.split(',').explode().astype("Int64")

    # primaryProfession
    # One row per (person, profession) pair. The names are used as they are:
    # they carry no "nm" prefix, and stripping "nm" would damage any
    # profession name that happens to contain those letters.
    profession_names = name_bas["primaryProfession"].str.split(",").explode().dropna()

    name_bas = name_bas.drop(columns=["knownForTitles", "primaryProfession"])

    # Number the professions in order of first appearance.
    codes, uniques = pd.factorize(profession_names)

    # create link table
    name_bas_prim_prof = pd.DataFrame(
        {"profession": pd.array(codes, dtype="Int64")},
        index=profession_names.index.rename("nconst"),
    )

    # create professions table
    professions = pd.DataFrame({"profession": list(uniques)})
    professions.index.name = "index"
    professions['profession'] = professions['profession'].astype('category')

    return name_bas, name_bas_prim_prof, professions, knownfor

In [5]:
name_bas_path = os.path.join("data/imdb/", FILES_IMDB['name_bas'])
# Some names contain quote characters; they are ordinary text, so quote
# handling is switched off. "\N" is read as a missing value directly instead
# of being replaced in a second pass over the frame.
name_bas = pd.read_csv(
    name_bas_path,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    na_values="\\N",
    low_memory=True,
)

name_bas, name_bas_prim_prof, professions, knownfor = createNameBasicsTables(name_bas)

# convert title.principals

In [3]:
def makeLinkTableAndCatTable(single_attribute_df):
    """Split a one-column frame into a link table and a lookup table.

    Universal function for category, characters and job.

    Args:
        single_attribute_df: DataFrame whose first column holds the
            attribute values. It is not modified.

    Returns:
        A tuple ``(link, categories)``:

        * ``link``: copy of the input without the rows where the attribute
          is missing, with the attribute replaced by its integer code
          (numbered in order of first appearance),
        * ``categories``: lookup table whose row position is the code and
          whose categorical column (same name as the attribute) is the
          value; its index is named ``"index"``.
    """
    key = single_attribute_df.columns[0]

    # remove null values (dropna returns a new frame, the input stays as is)
    link = single_attribute_df.dropna(subset=[key])

    # convert table to link table with numerized categories
    # Whole-column assignment (not .loc[:, key] = ...) so the column takes
    # the integer dtype of the codes instead of staying an object column.
    codes, uniques = pd.factorize(link[key])
    link[key] = codes

    # create value table for the numerized categories
    categories = pd.DataFrame({key: list(uniques)})
    categories.index.name = "index"
    categories[key] = categories[key].astype('category')

    return link, categories

In [4]:
def prepareTitlePrin(tit_prin):
    """Index the title.principals table by integer ``(tconst, nconst)``.

    Both identifier columns are converted with ``setTNconst`` and then moved
    into a two-level index.
    """
    id_columns = ['tconst', 'nconst']

    # Convert the ids one after the other; the index is only set once both
    # columns are integers.
    for id_column in id_columns:
        tit_prin = setTNconst(tit_prin, id_column, set_index=False, drop_col=False)

    return tit_prin.set_index(id_columns, drop=True)


def makeTitlePrimTables(tit_prin):
    """Build link and lookup tables for characters, category and job.

    Args:
        tit_prin: title.principals DataFrame with the columns ``characters``
            (a bracketed, quoted, comma-separated list in one string),
            ``category`` and ``job``.

    Returns:
        ``(characters_link, characters_category, categories_link,
        categories_category, jobs_link, jobs_category)``: for each of the
        three attributes the link table and lookup table produced by
        ``makeLinkTableAndCatTable``.
    """
    # prepare data
    # Each step works on the result of the previous one. (Splitting the raw
    # column again would throw the cleaning away and leave every list as one
    # unsplit string including brackets and quotes.)
    characters = tit_prin["characters"]
    # mark the boundaries between list entries ...
    characters = characters.str.replace('","', "@", regex=False)
    # ... strip the remaining brackets and quotes ...
    characters = characters.str.replace(r'[\[\]\"]', "", regex=True)
    # ... and split into one character name per row.
    characters = characters.str.split('@')
    characters_link = characters.to_frame().explode('characters')

    categories_link = tit_prin["category"].to_frame()
    jobs_link = tit_prin["job"].to_frame()

    # make new tables
    characters_link, characters_category = makeLinkTableAndCatTable(characters_link)
    categories_link, categories_category = makeLinkTableAndCatTable(categories_link)
    jobs_link, jobs_category = makeLinkTableAndCatTable(jobs_link)

    return characters_link, characters_category, categories_link, categories_category, jobs_link, jobs_category

In [5]:
tit_prin_path = os.path.join("data/imdb/", FILES_IMDB['tit_prin'])
# The characters column is full of quote characters; they are ordinary text,
# so quote handling is switched off. "\N" is read as a missing value directly
# instead of being replaced in a second pass over the frame.
tit_prin = pd.read_csv(
    tit_prin_path,
    sep='\t',
    quoting=csv.QUOTE_NONE,
    na_values="\\N",
    low_memory=True,
)

In [6]:
# Convert the string ids and index the table by (tconst, nconst).
tit_prin = prepareTitlePrin(tit_prin)

# Split characters, category and job off into link + lookup tables.
(
    characters_link,
    characters_category,
    categories_link,
    categories_category,
    jobs_link,
    jobs_category,
) = makeTitlePrimTables(tit_prin)

# The three attributes now live in their own tables; remove the columns.
tit_prin = tit_prin.drop(columns=['characters', 'category', 'job'])