# setup

In [1]:
import os
import time
import numpy as np
import pandas as pd

In [2]:
START_TIME = time.time()

BASE_URL = "https://datasets.imdbws.com/"

FILES_IMDB = {
    "tit_bas": "title.basics.tsv",
    "tit_rate": "title.ratings.tsv",
    "name_bas": "name.basics.tsv",
    "cast_crew": "title.principals.tsv",
}

FILES_HAND = {
    "add_seen": "add_movies_seen.txt",
    "add_unseen": "add_movies_unseen.txt",
    "add_secop": "add_movies_second_opinion.txt",
    "raw_status": "raw_status.xlsx"
}

FILES_GENERATED = {
    "films_raw": os.path.join("data", "generated", "films_raw.pkl"),
    "films_readable": os.path.join("data", "generated", "films_reading.xlsx"),
    "films_mining": os.path.join("data", "generated", "films_mining.xlsx")
}

## Get film data

In [3]:
raw_film_data = pd.read_pickle(FILES_GENERATED["films_raw"])

#view DataFrame
print(raw_film_data)

         tconst  watched  netflix  prime  enjoyment  priority titleType  \
0     tt0015324    False     <NA>   <NA>       <NA>       NaN     movie   
1     tt0015324    False     <NA>   <NA>       <NA>       NaN     movie   
2     tt0015324    False     <NA>   <NA>       <NA>       NaN     movie   
3     tt0015324    False     <NA>   <NA>       <NA>       NaN     movie   
4     tt0015324    False     <NA>   <NA>       <NA>       NaN     movie   
...         ...      ...      ...    ...        ...       ...       ...   
5945  tt9806192     True     <NA>   <NA>          4       NaN     movie   
5946  tt9806192     True     <NA>   <NA>          4       NaN     movie   
5947  tt9806192     True     <NA>   <NA>          4       NaN     movie   
5948  tt9806192     True     <NA>   <NA>          4       NaN     movie   
5949  tt9806192     True     <NA>   <NA>          4       NaN     movie   

        primaryTitle         originalTitle isAdult  ... averageRating  \
0       Sherlock Jr.      

# Create a human readable file

## summarizing people data per movie

In [4]:
raw_film_data.drop(['titleType','isAdult'], axis=1, inplace=True)

In [5]:
list(raw_film_data.loc[:,"category"].unique())

['cinematographer',
 'actor',
 'actress',
 'writer',
 'composer',
 'director',
 'producer',
 'editor',
 'production_designer',
 'archive_footage',
 'self']

I keep this code, bacause it is a common issue if stumble upon.

In [6]:
# raw_film_data.loc[:,['tconst','nconst','category']].pivot(index="tconst", columns="category",values='nconst')
raw_film_data.loc[:,['tconst','nconst','category']].groupby(['tconst','category']).nconst.apply(list)

tconst     category           
tt0015324  actor                  [nm0000036, nm0444172, nm0175068]
           actress                                      [nm0570230]
           cinematographer                   [nm0504380, nm0396327]
           composer                                     [nm3816287]
           writer                 [nm0369841, nm0593477, nm0115669]
                                                ...                
tt9806192  director                                     [nm3021346]
           editor                                       [nm1928263]
           producer                                     [nm0238941]
           production_designer                         [nm10909656]
           writer                                       [nm0491011]
Name: nconst, Length: 3376, dtype: object

## sumarise personel

In [7]:
# staff = raw_film_data.loc[:,['tconst','nconst','category','primaryName','birthYear','deathYear']]
# staff.loc[:,'printname'] = staff.loc[:,'primaryName'] + " " + staff.loc[:,'birthYear'].astype('str') + ' - ' + staff.loc[:,'deathYear'].astype(str)
# staff.drop(['nconst','primaryName','birthYear','deathYear','primaryProfession'], axis=1, inplace=True)
# staff = staff.groupby(['tconst','category']).printname.apply(list).reset_index().set_index(['tconst','category']).unstack()
# staff.head()

In [8]:
staff = raw_film_data.loc[:,['tconst','nconst','category','primaryName','birthYear','deathYear']]
staff.loc[:,'printname'] = staff.loc[:,'primaryName'] + " " + staff.loc[:,'birthYear'].astype('str') + ' - ' + staff.loc[:,'deathYear'].astype(str)
staff.drop(['nconst','primaryName','birthYear','deathYear'], axis=1, inplace=True)
staff = staff.groupby(['tconst','category'])['printname'].aggregate(lambda x: tuple(x)).unstack()
staff

category,actor,actress,archive_footage,cinematographer,composer,director,editor,producer,production_designer,self,writer
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
tt0015324,"(Buster Keaton 1895 - 1966, Joe Keaton 1867 - ...","(Kathryn McGuire 1903 - 1978,)",,"(Elgin Lessley 1883 - 1944, Byron Houck 1891 -...","(Club Foot Orchestra <NA> - <NA>,)",,,,,,"(Jean C. Havez 1872 - 1925, Joseph A. Mitchell..."
tt0017136,"(Alfred Abel 1879 - 1937, Gustav Fröhlich 1902...","(Brigitte Helm 1906 - 1996,)",,,"(Sandro Forte 1970 - 2020, Maximianno Cobra 19...","(Fritz Lang 1890 - 1976,)",,"(Erich Pommer 1889 - 1966,)",,,"(Thea von Harbou 1888 - 1954,)"
tt0022100,"(Peter Lorre 1904 - 1964, Otto Wernicke 1893 -...","(Ellen Widmann 1894 - 1985, Inge Landgut 1922 ...",,"(Fritz Arno Wagner 1894 - 1958,)",,"(Fritz Lang 1890 - 1976,)","(Paul Falkenberg 1903 - 1986,)",,,,"(Thea von Harbou 1888 - 1954, Egon Jacobsohn 1..."
tt0025316,"(Jameson Thomas 1888 - 1939, Clark Gable 1901 ...","(Claudette Colbert 1903 - 1996,)",,"(Joseph Walker 1892 - 1985,)",,"(Frank Capra 1897 - 1991,)","(Gene Havlick 1894 - 1959,)",,,,"(Robert Riskin 1897 - 1955, Samuel Hopkins Ada..."
tt0031381,"(Clark Gable 1901 - 1960, Thomas Mitchell 1892...","(Vivien Leigh 1913 - 1967, Barbara O'Neil 1910...",,,,"(Victor Fleming 1889 - 1949, George Cukor 1899...",,,,,"(Oliver H.P. Garrett 1894 - 1952, Margaret Mit..."
...,...,...,...,...,...,...,...,...,...,...,...
tt9691136,"(Nick Robinson 1995 - <NA>, Beulah Koale <NA> ...","(Chloë Grace Moretz 1997 - <NA>,)",,,,"(Roseanne Liang <NA> - <NA>,)",,"(Kelly McCormick 1977 - <NA>, Fred Berger 1981...",,,"(Max Landis 1985 - <NA>,)"
tt9731598,"(Billy Eichner 1978 - <NA>, Luke Macfarlane 19...",,,"(Brandon Trost 1981 - <NA>,)","(Marc Shaiman 1959 - <NA>,)","(Nicholas Stoller 1976 - <NA>,)","(Daniel Gabbe <NA> - <NA>,)","(Judd Apatow 1967 - <NA>, Josh Church <NA> - <...",,,
tt9764362,"(Ralph Fiennes 1962 - <NA>, Nicholas Hoult 198...","(Anya Taylor-Joy 1996 - <NA>, Hong Chau 1979 -...",,,,"(Mark Mylod <NA> - <NA>,)",,"(Betsy Koch <NA> - <NA>, Will Ferrell 1967 - <...",,,"(Seth Reiss <NA> - <NA>, Will Tracy <NA> - <NA>)"
tt9783600,"(Chris Hemsworth 1983 - <NA>, Miles Teller 198...","(Jurnee Smollett 1986 - <NA>,)",,,,"(Joseph Kosinski 1974 - <NA>,)",,"(Tommy Harper <NA> - <NA>, Agnes Chu <NA> - <NA>)",,,"(George Saunders <NA> - <NA>, Rhett Reese <NA>..."


## sumarise genres

In [9]:
genres = raw_film_data.loc[:,['tconst','genres']].drop_duplicates()
genres.genres = genres.genres.str.split(',')
genres = genres.explode('genres')
genres['value'] = 1
genres = pd.pivot_table(genres.explode('genres'), values='value', index='tconst', columns='genres', fill_value=0)
genres.head()

genres,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,Fantasy,...,Music,Musical,Mystery,Romance,Sci-Fi,Short,Sport,Thriller,War,Western
tconst,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
tt0015324,1,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
tt0017136,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
tt0022100,0,0,0,0,0,1,0,0,0,0,...,0,0,1,0,0,0,0,1,0,0
tt0025316,0,0,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
tt0031381,0,0,0,0,0,0,0,1,0,0,...,0,0,0,1,0,0,0,0,1,0


## add them to unique data and write out

In [10]:
readable_data = raw_film_data.copy()
readable_data.drop(['ordering', 'nconst','category','job','primaryName','birthYear','deathYear', 'genres','endYear','primaryProfession'], axis=1, inplace=True)
readable_data.drop_duplicates(inplace=True)
readable_data = pd.merge(readable_data, genres, on="tconst", how="left")
readable_data = pd.merge(readable_data, staff, on="tconst", how="left")
readable_data.loc[:,'numVotes'] = np.log10(readable_data.loc[:,'numVotes'])
readable_data.loc[:,'watched'] = readable_data.loc[:,'watched'].replace(True,1).replace(False,0)
readable_data.head()

Unnamed: 0,tconst,watched,netflix,prime,enjoyment,priority,primaryTitle,originalTitle,startYear,runtimeMinutes,...,actress,archive_footage,cinematographer,composer,director,editor,producer,production_designer,self,writer
0,tt0015324,0,,,,,Sherlock Jr.,Sherlock Jr.,1924,45,...,"(Kathryn McGuire 1903 - 1978,)",,"(Elgin Lessley 1883 - 1944, Byron Houck 1891 -...","(Club Foot Orchestra <NA> - <NA>,)",,,,,,"(Jean C. Havez 1872 - 1925, Joseph A. Mitchell..."
1,tt0017136,0,0.0,0.0,,,Metropolis,Metropolis,1927,153,...,"(Brigitte Helm 1906 - 1996,)",,,"(Sandro Forte 1970 - 2020, Maximianno Cobra 19...","(Fritz Lang 1890 - 1976,)",,"(Erich Pommer 1889 - 1966,)",,,"(Thea von Harbou 1888 - 1954,)"
2,tt0022100,0,0.0,0.0,,,M,M - Eine Stadt sucht einen Mörder,1931,117,...,"(Ellen Widmann 1894 - 1985, Inge Landgut 1922 ...",,"(Fritz Arno Wagner 1894 - 1958,)",,"(Fritz Lang 1890 - 1976,)","(Paul Falkenberg 1903 - 1986,)",,,,"(Thea von Harbou 1888 - 1954, Egon Jacobsohn 1..."
3,tt0025316,0,,,,,It Happened One Night,It Happened One Night,1934,105,...,"(Claudette Colbert 1903 - 1996,)",,"(Joseph Walker 1892 - 1985,)",,"(Frank Capra 1897 - 1991,)","(Gene Havlick 1894 - 1959,)",,,,"(Robert Riskin 1897 - 1955, Samuel Hopkins Ada..."
4,tt0031381,0,,,,,Gone with the Wind,Gone with the Wind,1939,238,...,"(Vivien Leigh 1913 - 1967, Barbara O'Neil 1910...",,,,"(Victor Fleming 1889 - 1949, George Cukor 1899...",,,,,"(Oliver H.P. Garrett 1894 - 1952, Margaret Mit..."


In [11]:
readable_data.sort_values(by=['watched', 'averageRating'], ascending=False).to_excel(FILES_GENERATED["films_readable"],index=False)

# creating a file for datamining

### transforming the genres, this could be redundant, because it was already done for the readable dataset

In [12]:
genres = raw_film_data.loc[:,['tconst','genres']].drop_duplicates()
genres.genres = genres.genres.str.split(',')
genres = genres.explode('genres')
genres['value'] = 1
genres = pd.pivot_table(genres.explode('genres'), values='value', index='tconst', columns='genres', fill_value=0)
genres.head()
raw_film_data = pd.merge(raw_film_data, genres, on="tconst", how="left")

In [13]:
raw_film_data.drop(['priority','endYear','genres'],axis=1,inplace=True)
raw_film_data.loc[:,'primaryProfession'] = raw_film_data.loc[:,'primaryProfession'].str.split(',')
raw_film_data = raw_film_data.explode('primaryProfession')

TypeError: unhashable type: 'DataFrame'

In [None]:
raw_film_data

Unnamed: 0,tconst,watched,netflix,prime,enjoyment,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes,ordering,nconst,category,job,primaryName,birthYear,deathYear,primaryProfession
0,tt0015324,False,,,,Sherlock Jr.,Sherlock Jr.,1924,45,"Action,Comedy,Romance",8.2,51136,10,nm0504380,cinematographer,,Elgin Lessley,1883,1944,cinematographer
0,tt0015324,False,,,,Sherlock Jr.,Sherlock Jr.,1924,45,"Action,Comedy,Romance",8.2,51136,10,nm0504380,cinematographer,,Elgin Lessley,1883,1944,actor
1,tt0015324,False,,,,Sherlock Jr.,Sherlock Jr.,1924,45,"Action,Comedy,Romance",8.2,51136,1,nm0000036,actor,,Buster Keaton,1895,1966,actor
1,tt0015324,False,,,,Sherlock Jr.,Sherlock Jr.,1924,45,"Action,Comedy,Romance",8.2,51136,1,nm0000036,actor,,Buster Keaton,1895,1966,writer
1,tt0015324,False,,,,Sherlock Jr.,Sherlock Jr.,1924,45,"Action,Comedy,Romance",8.2,51136,1,nm0000036,actor,,Buster Keaton,1895,1966,director
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5948,tt9806192,True,,,4,I Lost My Body,J'ai perdu mon corps,2019,81,"Animation,Drama,Fantasy",7.5,33775,8,nm1776887,composer,,Dan Levy,1976,,composer
5948,tt9806192,True,,,4,I Lost My Body,J'ai perdu mon corps,2019,81,"Animation,Drama,Fantasy",7.5,33775,8,nm1776887,composer,,Dan Levy,1976,,music_department
5949,tt9806192,True,,,4,I Lost My Body,J'ai perdu mon corps,2019,81,"Animation,Drama,Fantasy",7.5,33775,9,nm1928263,editor,,Benjamin Massoubre,,,editor
5949,tt9806192,True,,,4,I Lost My Body,J'ai perdu mon corps,2019,81,"Animation,Drama,Fantasy",7.5,33775,9,nm1928263,editor,,Benjamin Massoubre,,,editorial_department


can be deleted for ML: tconst, nconst, netflix, prime, primaryTitle, originalTitle

In [None]:
raw_film_data.sort_values(by=['watched', 'averageRating'], ascending=False).to_excel(FILES_GENERATED["films_mining"],index=False)