# to test compile movie list
Er is een index error, komt denk ik omdat vi2668606489 niet voorkomt in de ids.parquet.
Misschien verstandig om toch een string als index te hebben.
Of je je maakt een hele nieuwe id.

Dus dat je alle film en cast id's bij elkaar gooit en ze een unieke integer id geeft.
Misschien is dat het verstandigst.

In [1]:
"""
Takes the movie list and appends IMDb data to it.
Made as a class for use with other programs by inheritance.
Output is an ordered excel file.
"""
import os
import sys
import time
import numpy as np
import pandas as pd
from datetime import date
from datetime import timedelta

START_TIME = time.time()

BASE_URL = "https://datasets.imdbws.com/"
PARQ_PATH = "data/imdb/parquet/"
DOWNLOAD_PATH = "data/imdb/download/"
OUTPUT_PATH = "data/generated/"


FILES_IMDB = {
    "cast_crew" : "title.crew.tsv",
    "tit_bas" : "title.basics.tsv",
    "tit_rate" : "title.ratings.tsv",
    "name_bas" : "name.basics.tsv",
    "tit_prin" : "title.principals.tsv",
}

FILES_IMDB_PARQ = {
    "tit_bas" : "title_basics.parquet",
    "genres" : "genres.parquet",
    "tit_rate" : "title_ratings.parquet",
    'directors' : 'directors.parquet',
    'writers' : 'writers.parquet',
    'prim_prof' : 'primary_profession.parquet',
    'known_for' : 'known_for_titles.parquet',
    'name_bas' : 'name_basics.parquet',
    'const' : 'ids.parquet',
    'ordering' : 'ordering.parquet',
    'character' : 'character.parquet',
    'job' : 'job.parquet',
}

FILES_HAND = {
    "add_seen": "add_movies_seen.txt",
    "add_unseen": "add_movies_unseen.txt",
    "add_secop": "add_movies_second_opinion.txt",
    "raw_status": "raw_status.xlsx"
}

FILES_GENERATED = {
    "films_raw": "films_raw.pkl",
    "films_reading": "films_reading.xlsx"
}

def main():
    # Make a instance
    test = AppendedMovieList(
        url_movie_list = os.path.join("data", "handcrafted", FILES_HAND["raw_status"]),
        url_imdb_ids = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["const"]),
        url_title_basics = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["tit_bas"]),
        url_title_rate = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["tit_rate"]),
        url_genre = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["genres"]),
        url_directors = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["directors"]),
        url_writers = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["writers"]),
        url_personnel =  os.path.join(PARQ_PATH, FILES_IMDB_PARQ["ordering"]),
        url_name_basics = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["name_bas"])
        )

    # Get created movie list and write to excel.
    # output_excel =   os.path.join(OUTPUT_PATH, FILES_GENERATED["films_reading"])
    output_excel =   os.path.join(OUTPUT_PATH, "test.xlsx")
    movies = test.getMovieList()
    movies.to_excel(output_excel, index=False)

    END_TIME = time.time()
    time_format = time.strftime("%H:%M:%S", time.gmtime(END_TIME-START_TIME))
    print("Execution time: ",time_format)



class AppendedMovieList():
    BASE_URL = "https://datasets.imdbws.com/"
    PARQ_PATH = "data/imdb/parquet/"
    DOWNLOAD_PATH = "data/imdb/download/"

    def __init__(self,
                 url_movie_list,
                 url_imdb_ids,
                 url_title_basics,
                 url_title_rate,
                 url_genre,
                 url_directors,
                 url_writers,
                 url_personnel,
                 url_name_basics
                 ):
        
        self.__movie_list = pd.DataFrame()
        self.url_movie_list = url_movie_list
        self.url_imdb_ids = url_imdb_ids
        self.url_title_basics = url_title_basics
        self.url_title_rate = url_title_rate
        self.url_genre = url_genre
        self.url_directors = url_directors
        self.url_writers = url_writers
        self.url_personnel = url_personnel
        self.url_name_basics = url_name_basics

        self._loadList()
        self._addMovieIds()
        self._addTitleBasics()
        self._addRating()
        self._addGenre()
        self._addPersonnel()


    def _loadList(self):
        # load the raw movie list
        self.__movie_list = pd.read_excel(self.url_movie_list)

        # convert date
        self.__movie_list['watched_date'] = self.__movie_list['watched_date'].dt.date


    def _addMovieIds(self):
        # load movieID's
        const = pd.read_parquet(self.url_imdb_ids)
        
        # add movieID's
        self.__movie_list = pd.merge(
            self.__movie_list, const[const["type"]=="tconst"]["intid"],
            how='left', left_on="tconst", right_index=True)
        
        # set index
        self.__movie_list = self.__movie_list.set_index('intid', drop=True)
    

    def _addTitleBasics(self):
        # load title basics
        tit_bas = pd.read_parquet(self.url_title_basics)

        # delete adult column
        tit_bas = tit_bas.drop(columns='isAdult')

        # add title basics
        self.__movie_list = pd.merge(self.__movie_list, tit_bas,
                                     how='left', left_index=True, right_index=True)
    

    def _addRating(self):
        # load ratings
        tit_rate = pd.read_parquet(self.url_title_rate)

        # add title rate
        self.__movie_list = pd.merge(self.__movie_list, tit_rate, how='left',
                                  left_index=True, right_index=True)
        
        # convert rate
        self.__movie_list["averageRating"] = self.__movie_list["averageRating"].astype('float64')
        self.__movie_list["numVotes"] = self.__movie_list["numVotes"].astype('Int64')


    def _addGenre(self):
        # load genres
        genre = pd.read_parquet(self.url_genre)

        # get needed ones
        genre = genre[genre.index.isin(self.__movie_list.index)]
        genre['genres'] = genre['genres'].astype('category')

        # convert to multi one-hot
        genre = pd.crosstab(genre.index, genre['genres'])

        # NOTE: 
        # Column values of a this particular crosstab are categorical.
        # This does not play nice with the merge function
        # So we have to change change it back to a normal list
        genre.columns = genre.columns.values.tolist()
        
        # add genre
        self.__movie_list = pd.merge(self.__movie_list, genre, how='left', left_index=True, right_index=True)

        print(genre.index)
        print(self.__movie_list.index)


    def _loadPersonnel(self):
        # load neaded directors
        directors = pd.read_parquet(self.url_directors)
        directors = directors[directors.index.isin(self.__movie_list.index)]
        directors['category'] = 'director'
        directors = directors.rename(columns={'directors':'nconst'})

        # load needed writers
        writers = pd.read_parquet(self.url_writers)
        writers = writers[writers.index.isin(self.__movie_list.index)]
        writers['category'] = 'writer'
        writers = writers.rename(columns={'writers':'nconst'})

        # load needed ordering
        personnel = pd.read_parquet(self.url_personnel)
        personnel = personnel[personnel.index.isin(self.__movie_list.index)].drop(columns='ordering')

        # combine personell id's
        all_personnel = pd.concat([writers, directors, personnel])
        all_personnel = all_personnel.drop_duplicates()
        all_personnel.loc[:,'category'] = all_personnel.loc[:,'category'].astype('category')

        return all_personnel
    
    
    def _addPersonnel(self):
        # prepare personell ids
        all_personnel = self._loadPersonnel()

        # get needed personell info
        name_bas = pd.read_parquet(self.url_name_basics)
        names = name_bas[name_bas.index.isin(all_personnel['nconst'])]

        # convert personell info
        names = names["primaryName"] + " (" + \
            names["birthYear"].astype(str).replace("<NA>","") + \
                "-" + names["deathYear"].astype(str).replace("<NA>","") + ")"
        names.name = "info"

        # add personell info to movie indices
        personell = pd.merge(all_personnel, names, how='left', left_on='nconst', right_index=True).drop(columns='nconst')

        # convert personell categories to columns
        personell = pd.pivot_table(personell, values='info', index=['tconst'], columns=['category'], aggfunc=list)

        # add personell to watched movies (can probably be replaced by a normal add)
        self.__movie_list = pd.merge(self.__movie_list, personell, how='left', left_index=True, right_index=True)


    def getMovieList(self):
        # probably better to group and sort the groups, but whatever.
        # Order first by watched date; 
        # then order the wachted movies without date on score; 
        # then put unwatched movies at the bottom with highest score on top.
        watched = self.__movie_list.query('watched==1')
        watched_no_date = watched[watched['watched_date'].isna()].sort_values('averageRating', ascending=False)
        watched_date = watched[~watched['watched_date'].isna()].sort_values('watched_date', ascending=False)
        not_watched = self.__movie_list.query('watched==0').sort_values('averageRating', ascending=False)
        
        return pd.concat([not_watched, watched_date, watched_no_date])


main()


Index([   15324,    17136,    17925,    22100,    25316,    31381,    31679,
          32138,    32551,    33467,
       ...
       22687790, 22688572, 23289160, 23329452, 23724682, 24083908, 24216998,
       25289836, 27503384, 27695005],
      dtype='uint32', name='row_0', length=896)
Index([   15324,    17136,    17925,    22100,    25316,    31381,    31679,
          32138,    32551,    33467,
       ...
       22687790, 22688572, 23289160, 23329452, 23724682, 24083908, 24216998,
       25289836, 27503384, 27695005],
      dtype='int64', length=899)
Execution time:  00:03:32


Ik krijg weer deze error, ook al heb ik de vi-id eruit gehaald.
InvalidIndexError: slice(None, None, None)
weer bij addGenre(), bij regel 179 als ik de merge doe.

In [1]:
import pandas as pd
import os

BASE_URL = "https://datasets.imdbws.com/"
PARQ_PATH = "data/imdb/parquet/"
DOWNLOAD_PATH = "data/imdb/download/"
OUTPUT_PATH = "data/generated/"


FILES_IMDB = {
    "cast_crew" : "title.crew.tsv",
    "tit_bas" : "title.basics.tsv",
    "tit_rate" : "title.ratings.tsv",
    "name_bas" : "name.basics.tsv",
    "tit_prin" : "title.principals.tsv",
}

FILES_IMDB_PARQ = {
    "tit_bas" : "title_basics.parquet",
    "genres" : "genres.parquet",
    "tit_rate" : "title_ratings.parquet",
    'directors' : 'directors.parquet',
    'writers' : 'writers.parquet',
    'prim_prof' : 'primary_profession.parquet',
    'known_for' : 'known_for_titles.parquet',
    'name_bas' : 'name_basics.parquet',
    'const' : 'ids.parquet',
    'ordering' : 'ordering.parquet',
    'character' : 'character.parquet',
    'job' : 'job.parquet',
}

FILES_HAND = {
    "add_seen": "add_movies_seen.txt",
    "add_unseen": "add_movies_unseen.txt",
    "add_secop": "add_movies_second_opinion.txt",
    "raw_status": "raw_status.xlsx"
}

FILES_GENERATED = {
    "films_raw": "films_raw.pkl",
    "films_reading": "films_reading.xlsx"
}

status = os.path.join("data", "handcrafted", FILES_HAND["raw_status"])
genres = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["genres"])
ids = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["const"])
url_title_basics = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["tit_bas"])

s = pd.read_excel(status)
g = pd.read_parquet(genres)
i = pd.read_parquet(ids)
b = pd.read_parquet(url_title_basics)


er zijn geen nan id's. dat is vreemd

In [20]:
# convert date
s['watched_date'] = s['watched_date'].dt.date

In [21]:
# add movieID's
s = pd.merge(
    s,
    i[i["type"]=="tconst"]["intid"],
    how='left',
    left_on="tconst",
    right_index=True)

In [23]:
# title basics
# delete adult column
b = b.drop(columns='isAdult')

# add title basics
s = pd.merge(s,
             b,
             how='left',
             left_index=True,
             right_index=True)

In [26]:
url_title_rate = os.path.join(PARQ_PATH, FILES_IMDB_PARQ["tit_rate"])
tit_rate = pd.read_parquet(url_title_rate)

# add title rate
s = pd.merge(
    s,
    tit_rate,
    how='left',
    left_index=True,
    right_index=True)

# convert rate
s["averageRating"] = s["averageRating"].astype('float64')
s["numVotes"] = s["numVotes"].astype('Int64')

In [2]:
s

NameError: name 's' is not defined

In [40]:
# get needed ones
g = g[g.index.isin(s.intid)]
g.loc[:,'genres'] = g.loc[:,'genres'].astype('category')

In [44]:
# convert to multi one-hot
g = pd.crosstab(g.index, g['genres'])
g

genres,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15324,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
17136,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
17925,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22100,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
25316,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24083908,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
24216998,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
25289836,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
27503384,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
s.index

Index([  15324,   17136,   17925,   22100,   25316,   31381,   31679,   32138,
         32551,   33467,
       ...
       9603212, 9646240, 9686708, 9691136, 9731598, 9737876, 9755806, 9764362,
       9783600, 9806192],
      dtype='int64', name='intid', length=899)

In [77]:
g.index

Index([   15324,    17136,    17925,    22100,    25316,    31381,    31679,
          32138,    32551,    33467,
       ...
       22687790, 22688572, 23289160, 23329452, 23724682, 24083908, 24216998,
       25289836, 27503384, 27695005],
      dtype='uint32', name='row_0', length=896)

In [86]:
s.columns.values

array(['tconst', 'watched', 'watched_date', 'netflix', 'prime', 'story',
       'subject', 'acting', 'visual', 'action', 'comedy', 'enjoyment',
       'priority', 'titleType', 'primaryTitle', 'originalTitle',
       'startYear', 'endYear', 'runtimeMinutes', 'averageRating',
       'numVotes'], dtype=object)

In [100]:
# g.columns.values
g2 = g.copy()
g.columns = g.columns.values.tolist()

In [101]:

s.merge(g, how='right', left_index=True, right_index=True)

Unnamed: 0_level_0,tconst,watched,watched_date,netflix,prime,story,subject,acting,visual,action,...,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
intid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15324,tt0015324,0,NaT,,,,,,,,...,0,0,1,0,0,0,0,0,0,0
17136,tt0017136,1,2023-03-04,0.0,0.0,,,,,,...,0,0,0,1,0,0,0,0,0,0
17925,tt0017925,0,NaT,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
22100,tt0022100,1,2023-05-31,0.0,0.0,,,,,,...,0,1,0,0,0,0,0,1,0,0
25316,tt0025316,0,NaT,,,,,,,,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24083908,tt24083908,0,,,,,,,,,...,0,0,0,0,0,0,0,1,0,0
24216998,tt24216998,0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
25289836,tt25289836,0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
27503384,tt27503384,0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [89]:
g.unstack()

genres   row_0   
Action   15324       1
         17136       0
         17925       1
         22100       0
         25316       0
                    ..
Western  24083908    0
         24216998    0
         25289836    0
         27503384    0
         27695005    0
Length: 21504, dtype: int64

In [65]:
s = s.set_index("intid")
s

Unnamed: 0_level_0,tconst,watched,watched_date,netflix,prime,story,subject,acting,visual,action,...,enjoyment,priority,titleType,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,averageRating,numVotes
intid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15324,tt0015324,0,NaT,,,,,,,,...,,,,,,,,,,
17136,tt0017136,1,2023-03-04,0.0,0.0,,,,,,...,3.5,,short,Carmencita,Carmencita,1894,,1,5.7,2007
17925,tt0017925,0,NaT,,,,,,,,...,,,short,Le clown et ses chiens,Le clown et ses chiens,1892,,5,5.8,269
22100,tt0022100,1,2023-05-31,0.0,0.0,,,,,,...,3.0,,short,Pauvre Pierrot,Pauvre Pierrot,1892,,4,6.5,1912
25316,tt0025316,0,NaT,,,,,,,,...,,,short,Un bon bock,Un bon bock,1892,,12,5.5,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737876,tt9737876,0,,,,,,,,,...,,,short,The Farmer's Grandson,Herremandens Barnebarn,1909,,,,
9755806,tt9755806,0,,,,,,,,,...,,,short,The Hessian Renegades,The Hessian Renegades,1909,,10,5.5,209
9764362,tt9764362,1,,0.0,0.0,,,,,,...,4.0,,short,The Hindoo Dagger,The Hindoo Dagger,1909,,10,3.2,24
9783600,tt9783600,1,,,,,,,,,...,3.0,,short,His Duty,His Duty,1909,,5,5.3,65


In [71]:
# add genre
g.rename({"genres":"intid"})
# s2 = pd.merge(s, g, how='left', left_on="intid", right_index=True)

genres,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Talk-Show,Thriller,War,Western
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15324,1,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
17136,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,0,0
17925,1,1,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
22100,0,0,0,0,0,1,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
25316,0,0,0,0,1,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24083908,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
24216998,0,0,0,0,1,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,0
25289836,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
27503384,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [22]:
s

Unnamed: 0,tconst,watched,watched_date,netflix,prime,story,subject,acting,visual,action,comedy,enjoyment,priority,intid
0,tt0015324,0,NaT,,,,,,,,,,,15324
1,tt0017136,1,2023-03-04,0.0,0.0,,,,,,,3.5,,17136
2,tt0017925,0,NaT,,,,,,,,,,,17925
3,tt0022100,1,2023-05-31,0.0,0.0,,,,,,,3.0,,22100
4,tt0025316,0,NaT,,,,,,,,,,,25316
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,tt9737876,0,,,,,,,,,,,,9737876
895,tt9755806,0,,,,,,,,,,,,9755806
896,tt9764362,1,,0.0,0.0,,,,,,,4.0,,9764362
897,tt9783600,1,,,,,,,,,,3.0,,9783600


In [25]:
s

Unnamed: 0,tconst,watched,watched_date,netflix,prime,story,subject,acting,visual,action,comedy,enjoyment,priority,intid,titleType,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes
0,tt0015324,0,NaT,,,,,,,,,,,15324,,,,,,
1,tt0017136,1,2023-03-04,0.0,0.0,,,,,,,3.5,,17136,short,Carmencita,Carmencita,1894,,1
2,tt0017925,0,NaT,,,,,,,,,,,17925,short,Le clown et ses chiens,Le clown et ses chiens,1892,,5
3,tt0022100,1,2023-05-31,0.0,0.0,,,,,,,3.0,,22100,short,Pauvre Pierrot,Pauvre Pierrot,1892,,4
4,tt0025316,0,NaT,,,,,,,,,,,25316,short,Un bon bock,Un bon bock,1892,,12
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,tt9737876,0,,,,,,,,,,,,9737876,short,The Farmer's Grandson,Herremandens Barnebarn,1909,,
895,tt9755806,0,,,,,,,,,,,,9755806,short,The Hessian Renegades,The Hessian Renegades,1909,,10
896,tt9764362,1,,0.0,0.0,,,,,,,4.0,,9764362,short,The Hindoo Dagger,The Hindoo Dagger,1909,,10
897,tt9783600,1,,,,,,,,,,3.0,,9783600,short,His Duty,His Duty,1909,,5


In [27]:
s

Unnamed: 0,tconst,watched,watched_date,netflix,prime,story,subject,acting,visual,action,...,priority,intid,titleType,primaryTitle,originalTitle,startYear,endYear,runtimeMinutes,averageRating,numVotes
0,tt0015324,0,NaT,,,,,,,,...,,15324,,,,,,,,
1,tt0017136,1,2023-03-04,0.0,0.0,,,,,,...,,17136,short,Carmencita,Carmencita,1894,,1,5.7,2007
2,tt0017925,0,NaT,,,,,,,,...,,17925,short,Le clown et ses chiens,Le clown et ses chiens,1892,,5,5.8,269
3,tt0022100,1,2023-05-31,0.0,0.0,,,,,,...,,22100,short,Pauvre Pierrot,Pauvre Pierrot,1892,,4,6.5,1912
4,tt0025316,0,NaT,,,,,,,,...,,25316,short,Un bon bock,Un bon bock,1892,,12,5.5,178
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
894,tt9737876,0,,,,,,,,,...,,9737876,short,The Farmer's Grandson,Herremandens Barnebarn,1909,,,,
895,tt9755806,0,,,,,,,,,...,,9755806,short,The Hessian Renegades,The Hessian Renegades,1909,,10,5.5,209
896,tt9764362,1,,0.0,0.0,,,,,,...,,9764362,short,The Hindoo Dagger,The Hindoo Dagger,1909,,10,3.2,24
897,tt9783600,1,,,,,,,,,...,,9783600,short,His Duty,His Duty,1909,,5,5.3,65


In [38]:
s.intid
g.index

Index([      1,       1,       2,       2,       3,       3,       3,       4,
             4,       5,
       ...
       9916850, 9916850, 9916850, 9916852, 9916852, 9916852, 9916856, 9916880,
       9916880, 9916880],
      dtype='uint32', name='tconst', length=15977519)

In [41]:
g

Unnamed: 0_level_0,genres
tconst,Unnamed: 1_level_1
15324,Action
15324,Comedy
15324,Romance
17136,Drama
17136,Sci-Fi
...,...
9783600,Crime
9783600,Drama
9806192,Animation
9806192,Drama
