In [1]:
import os
import time
import numpy as np
import pandas as pd

In [2]:
# Wall-clock timestamp captured when this configuration cell is executed.
START_TIME = time.time()

# Base URL under which the IMDb dataset files listed in FILES_IMDB live.
BASE_URL = "https://datasets.imdbws.com/"

# Short key -> IMDb TSV file name.
FILES_IMDB = dict(
    tit_bas="title.basics.tsv",
    tit_rate="title.ratings.tsv",
    name_bas="name.basics.tsv",
    cast_crew="title.principals.tsv",
)

# Short key -> local input file name (presumably hand-maintained lists; confirm with the loader cells).
FILES_HAND = dict(
    add_seen="add_movies_seen.txt",
    add_unseen="add_movies_unseen.txt",
    add_secop="add_movies_second_opinion.txt",
    raw_status="raw_status.xlsx",
)

# Short key -> path of an artefact stored under data/generated.
FILES_GENERATED = {
    key: os.path.join("data", "generated", file_name)
    for key, file_name in (
        ("films_raw", "films_raw.pkl"),
        ("films_readable", "films_reading.xlsx"),
        ("films_mining", "films_mining.xlsx"),
    )
}

In [3]:
# Load the raw film table from the pickle registered under "films_raw".
# NOTE(review): unpickling can execute arbitrary code — only load files this project wrote itself.
films_raw_path = FILES_GENERATED["films_raw"]
raw_film_data = pd.read_pickle(films_raw_path)

In [4]:
# Show the loaded frame; the rendered table below is abbreviated to its first and last rows.
raw_film_data

Unnamed: 0,tconst,watched,watched_date,netflix,prime,enjoyment,priority,titleType,primaryTitle,originalTitle,...,averageRating,numVotes,ordering,nconst,category,job,primaryName,birthYear,deathYear,primaryProfession
0,tt0015324,False,NaT,,,,,movie,Sherlock Jr.,Sherlock Jr.,...,8.2,51136,10,nm0504380,cinematographer,,Elgin Lessley,1883,1944,"cinematographer,actor"
1,tt0015324,False,NaT,,,,,movie,Sherlock Jr.,Sherlock Jr.,...,8.2,51136,1,nm0000036,actor,,Buster Keaton,1895,1966,"actor,writer,director"
2,tt0015324,False,NaT,,,,,movie,Sherlock Jr.,Sherlock Jr.,...,8.2,51136,2,nm0570230,actress,,Kathryn McGuire,1903,1978,actress
3,tt0015324,False,NaT,,,,,movie,Sherlock Jr.,Sherlock Jr.,...,8.2,51136,3,nm0444172,actor,,Joe Keaton,1867,1946,actor
4,tt0015324,False,NaT,,,,,movie,Sherlock Jr.,Sherlock Jr.,...,8.2,51136,4,nm0175068,actor,,Erwin Connelly,1878,1931,actor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6114,tt9806192,True,NaT,,,4.0,,movie,I Lost My Body,J'ai perdu mon corps,...,7.5,33775,5,nm3021346,director,,Jérémy Clapin,1974,,"writer,director,editor"
6115,tt9806192,True,NaT,,,4.0,,movie,I Lost My Body,J'ai perdu mon corps,...,7.5,33775,6,nm0491011,writer,adaptation and dialogue,Guillaume Laurant,1961,,"writer,actor"
6116,tt9806192,True,NaT,,,4.0,,movie,I Lost My Body,J'ai perdu mon corps,...,7.5,33775,7,nm0238941,producer,producer,Marc Du Pontavice,,,"producer,production_manager,writer"
6117,tt9806192,True,NaT,,,4.0,,movie,I Lost My Body,J'ai perdu mon corps,...,7.5,33775,8,nm1776887,composer,,Dan Levy,1976,,"soundtrack,composer,music_department"


In [5]:
# Build a one-row-per-film "readable" table from the one-row-per-credit raw table.
# Every step derives a fresh frame from `raw_film_data` (no in-place edits on
# slices), so the cell can be re-run safely and raises no SettingWithCopy warnings.

# --- Crew: one column per credit category, each cell a tuple of "Name birth - death". ---
# Missing years are rendered by astype(str) (they show up as "<NA>" in the output);
# the label format is intentionally left unchanged.
staff = raw_film_data[['tconst', 'category']].assign(
    printname=(
        raw_film_data['primaryName']
        + " " + raw_film_data['birthYear'].astype(str)
        + " - " + raw_film_data['deathYear'].astype(str)
    )
)
staff = (
    staff
    .groupby(['tconst', 'category'])['printname']
    .aggregate(lambda names: tuple(names))
    .unstack()
)

# --- Genres: one indicator column per genre, indexed by tconst. ---
# The comma-separated genre string is split and exploded exactly once
# (a second explode on already-scalar values does nothing).
genres = (
    raw_film_data[['tconst', 'genres']]
    .drop_duplicates()
    .assign(genres=lambda frame: frame['genres'].str.split(','))
    .explode('genres')
    .assign(value=1)
    .pivot_table(values='value', index='tconst', columns='genres', fill_value=0)
)

# --- Film-level table: drop per-credit columns, de-duplicate, attach genres and crew. ---
readable_data = (
    raw_film_data
    .drop(columns=['isAdult', 'ordering', 'nconst', 'category', 'job', 'primaryName',
                   'birthYear', 'deathYear', 'genres', 'endYear', 'primaryProfession'])
    .drop_duplicates()
    .merge(genres, on="tconst", how="left")
    .merge(staff, on="tconst", how="left")
    # Encode the watched flag as 1/0 by replacing the whole column, rather than
    # writing through `.loc[:, col] = ...`, which may keep the old dtype.
    # Missing flags stay missing.
    .assign(watched=lambda frame: frame['watched'].map({True: 1, False: 0}))
)
readable_data.head()

Unnamed: 0,tconst,watched,watched_date,netflix,prime,enjoyment,priority,titleType,primaryTitle,originalTitle,...,actress,archive_footage,cinematographer,composer,director,editor,producer,production_designer,self,writer
0,tt0015324,0,NaT,,,,,movie,Sherlock Jr.,Sherlock Jr.,...,"(Kathryn McGuire 1903 - 1978,)",,"(Elgin Lessley 1883 - 1944, Byron Houck 1891 -...","(Club Foot Orchestra <NA> - <NA>,)",,,,,,"(Jean C. Havez 1872 - 1925, Joseph A. Mitchell..."
1,tt0017136,0,NaT,0.0,0.0,,,movie,Metropolis,Metropolis,...,"(Brigitte Helm 1906 - 1996,)",,,"(Sandro Forte 1970 - 2020, Maximianno Cobra 19...","(Fritz Lang 1890 - 1976,)",,"(Erich Pommer 1889 - 1966,)",,,"(Thea von Harbou 1888 - 1954,)"
2,tt0022100,0,NaT,0.0,0.0,,,movie,M,M - Eine Stadt sucht einen Mörder,...,"(Ellen Widmann 1894 - 1985, Inge Landgut 1922 ...",,"(Fritz Arno Wagner 1894 - 1958,)",,"(Fritz Lang 1890 - 1976,)","(Paul Falkenberg 1903 - 1986,)",,,,"(Thea von Harbou 1888 - 1954, Egon Jacobsohn 1..."
3,tt0025316,0,NaT,,,,,movie,It Happened One Night,It Happened One Night,...,"(Claudette Colbert 1903 - 1996,)",,"(Joseph Walker 1892 - 1985,)",,"(Frank Capra 1897 - 1991,)","(Gene Havlick 1894 - 1959,)",,,,"(Robert Riskin 1897 - 1955, Samuel Hopkins Ada..."
4,tt0031381,0,NaT,,,,,movie,Gone with the Wind,Gone with the Wind,...,"(Vivien Leigh 1913 - 1967, Barbara O'Neil 1910...",,,,"(Victor Fleming 1889 - 1949, George Cukor 1899...",,,,,"(Oliver H.P. Garrett 1894 - 1952, Margaret Mit..."
