In [1]:
"""
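Important lesson worth recording here:
if a field can contain literal quote characters, set the quote character to the same value as the field delimiter, so that pandas does not interpret those quotes as field quoting.
Like this: pd.read_csv(tit_bas_path,sep='\t', quotechar='\t', low_memory=False)
Disabling quote handling altogether with quoting=csv.QUOTE_NONE also works.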
"""

import csv
import gzip
import os
import re
import shutil
import time
from datetime import date

import numpy as np
import pandas as pd
import requests

# Timestamp (seconds since the epoch) taken when this cell starts running.
START_TIME = time.time()

# Base URL of the IMDb dataset host.
BASE_URL = "https://datasets.imdbws.com/"

# Short alias -> file name of each IMDb TSV file.
FILES_IMDB = dict(
    tit_bas="title.basics.tsv",
    tit_rate="title.ratings.tsv",
    name_bas="name.basics.tsv",
    tit_prin="title.principals.tsv",
    cast_crew="title.crew.tsv",
)

# Short alias -> file name of the additional input files
# (presumably maintained by hand, going by the constant's name).
FILES_HAND = dict(
    add_seen="add_movies_seen.txt",
    add_unseen="add_movies_unseen.txt",
    add_secop="add_movies_second_opinion.txt",
    raw_status="raw_status.xlsx",
)

# Short alias -> file name of the files this project generates.
FILES_GENERATED = dict(
    films_raw="films_raw.pkl",
    films_reading="films_reading.xlsx",
)

tit_bas_path = os.path.join("data/imdb/", FILES_IMDB["tit_bas"])

# Fields in this file can contain literal quote characters (see the lesson
# noted at the top of the notebook). Instead of reusing the tab delimiter as
# the quote character, quote handling is switched off explicitly, so quote
# characters are always read as ordinary data.
test = pd.read_csv(tit_bas_path, sep="\t", quoting=csv.QUOTE_NONE, low_memory=False)

def cleanTitlesBasic(tit_bas):
    """Return a cleaned, typed copy of the raw ``title.basics`` table.

    The input frame is not modified: ``replace`` produces a new frame and all
    conversions are applied to that copy.

    Conversions:
      * the literal string ``"\\N"`` becomes ``NaN`` everywhere;
      * ``tconst``: the two-character ``tt`` prefix is dropped and the rest is
        converted to ``int``;
      * ``titleType``: ``category`` dtype (missing values stay missing);
      * ``isAdult``: ``bool`` - ``True`` only where the value is numerically 1;
      * ``startYear``, ``endYear``, ``runtimeMinutes``: nullable ``Int64``;
      * ``genres``: comma-separated string -> list of strings.

    ``primaryTitle`` and ``originalTitle`` are left untouched.

    Parameters
    ----------
    tit_bas : pandas.DataFrame
        Raw table with the columns listed above.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If an id, year or runtime value cannot be parsed as a number.
    """
    tit_bas = tit_bas.replace(to_replace="\\N", value=np.nan)

    # NOTE: columns are assigned with df[col] = ... on purpose. Since pandas 2.0
    # a full-slice `df.loc[:, col] = values` writes into the existing (object)
    # column in place, so the new dtype would silently be lost.

    # tconst
    # Drop the leading "tt" and convert the remaining digits to int.
    tit_bas["tconst"] = tit_bas["tconst"].str.slice(2).astype(int)

    # titleType
    # There are a few null values (np.nan); they stay missing in the category.
    tit_bas["titleType"] = tit_bas["titleType"].astype("category")

    # primaryTitle & originalTitle
    # Already either string or np.nan.

    # isAdult
    # Some rows carry a year (or other junk) as adult status. Those titles were
    # checked and are not adult titles, so everything that is not numerically 1
    # - including missing values - counts as False.
    tit_bas["isAdult"] = pd.to_numeric(tit_bas["isAdult"], errors="coerce").eq(1)

    # startYear & endYear & runtimeMinutes
    # All contain a lot of np.nan, hence the nullable integer dtype.
    # to_numeric parses the digit strings first so the Int64 cast does not
    # depend on how a given pandas version converts strings.
    for column in ("startYear", "endYear", "runtimeMinutes"):
        tit_bas[column] = pd.to_numeric(tit_bas[column]).astype("Int64")

    # genres
    # Kept as a plain list of genre names per title for now. How to model it
    # further depends on how the genres relate to each other:
    #   * correlated genres  -> one boolean column per genre;
    #   * independent genres -> one row per (title, genre) pair.
    tit_bas["genres"] = tit_bas["genres"].str.split(",")

    return tit_bas

test = cleanTitlesBasic(test)
# test.to_parquet()


tit_rate_path = os.path.join("data/imdb/", FILES_IMDB["tit_rate"])

# Quote handling is switched off explicitly instead of reusing the tab
# delimiter as the quote character; quote characters are then always read as
# ordinary data.
# NOTE(review): tit_rate is loaded here, but cleanTitlesRate is not applied to
# it in the visible cells - confirm whether that call is missing.
tit_rate = pd.read_csv(tit_rate_path, sep="\t", quoting=csv.QUOTE_NONE, low_memory=False)

def cleanTitlesRate(tit_rate):
    """Return a cleaned, typed copy of the raw ``title.ratings`` table.

    The input frame is not modified: ``replace`` produces a new frame and all
    conversions are applied to that copy.

    Conversions:
      * the literal string ``"\\N"`` becomes ``NaN`` everywhere;
      * ``tconst``: the two-character ``tt`` prefix is dropped and the rest is
        converted to ``int``;
      * ``averageRating``: ``float``;
      * ``numVotes``: ``int``.

    Parameters
    ----------
    tit_rate : pandas.DataFrame
        Raw table with the columns ``tconst``, ``averageRating``, ``numVotes``.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If an id, rating or vote count cannot be parsed as a number (the
        plain ``int`` / ``float`` casts also reject missing values).
    """
    tit_rate = tit_rate.replace(to_replace="\\N", value=np.nan)

    # NOTE: columns are assigned with df[col] = ... on purpose. Since pandas 2.0
    # a full-slice `df.loc[:, col] = values` writes into the existing column in
    # place, so the new dtype would silently be lost.

    # tconst
    # Drop the leading "tt" and convert the remaining digits to int.
    tit_rate["tconst"] = tit_rate["tconst"].str.slice(2).astype(int)

    # averageRating & numVotes
    # There are no missing values, so plain (non-nullable) dtypes are enough.
    tit_rate["averageRating"] = tit_rate["averageRating"].astype(float)
    tit_rate["numVotes"] = tit_rate["numVotes"].astype(int)

    return tit_rate

name_bas_path = os.path.join("data/imdb/", FILES_IMDB["name_bas"])

# Quote handling is switched off so that a field starting with a quote
# character is read verbatim instead of being treated as a quoted field
# (same issue as described in the lesson at the top of the notebook).
name_bas = pd.read_csv(name_bas_path, sep="\t", quoting=csv.QUOTE_NONE, low_memory=False)

def cleanNameBasics(name_bas):
    """Return a cleaned, typed copy of the raw ``name.basics`` table.

    The input frame is not modified: ``replace`` produces a new frame and all
    conversions are applied to that copy.

    Conversions:
      * the literal string ``"\\N"`` becomes ``NaN`` everywhere;
      * ``nconst``: the two-character ``nm`` prefix is dropped and the rest is
        converted to ``int``;
      * ``birthYear``, ``deathYear``: nullable ``Int64``;
      * ``primaryProfession``: comma-separated string -> list of strings;
      * ``knownForTitles``: comma-separated ``tt...`` ids -> list of ints; the
        column ends up as the last column of the result.

    ``primaryName`` is left untouched.

    Parameters
    ----------
    name_bas : pandas.DataFrame
        Raw table with the columns listed above.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy; rows keep their original order and index.

    Raises
    ------
    ValueError
        If an id or year cannot be parsed as a number.
    """
    name_bas = name_bas.replace(to_replace="\\N", value=np.nan)

    # NOTE: columns are assigned with df[col] = ... on purpose. Since pandas 2.0
    # a full-slice `df.loc[:, col] = values` writes into the existing (object)
    # column in place, so the new dtype would silently be lost.

    # nconst
    # Drop the leading "nm" and convert the remaining digits to int.
    name_bas["nconst"] = name_bas["nconst"].str.slice(2).astype(int)

    # primaryName
    # Does not need modification; has 3 missing values.

    # birthYear & deathYear
    # Both have a lot of missing values. Very small years (4, 12, ...) are
    # real: they belong to ancient authors such as nm0653992 (Ovid) whose
    # work was used for a film.
    for column in ("birthYear", "deathYear"):
        name_bas[column] = pd.to_numeric(name_bas[column]).astype("Int64")

    # primaryProfession
    # There are 43 unique values and the column contains NaN.
    # If the values are independent of each other they should eventually
    # become one row per value; if they depend on each other, one column per
    # value. For now they are kept as a list.
    name_bas["primaryProfession"] = name_bas["primaryProfession"].str.split(",")

    # knownForTitles
    # Interesting info, but probably better to remove it later.
    # Each row's id list is converted directly instead of going through an
    # explode / groupby / merge round-trip. pop() + assignment moves the
    # column to the end, which is where the former merge placed it.
    known_titles = name_bas.pop("knownForTitles")
    name_bas["knownForTitles"] = (
        known_titles
        .str.replace("tt", "", regex=False)
        .str.split(",")
        # na_action="ignore" leaves rows without known titles as NaN.
        .map(lambda ids: [int(title_id) for title_id in ids], na_action="ignore")
    )

    return name_bas

name_bas = cleanNameBasics(name_bas)

cast_crew_path = os.path.join("data/imdb/", FILES_IMDB["cast_crew"])
cast_crew = pd.read_csv(cast_crew_path, sep='\t', low_memory=False)

cast_crew = cast_crew.replace(to_replace="\\N", value=np.nan)

cast_crew

# Step-by-step version of the crew cleaning; the same steps are wrapped in
# the CleanCrew function defined further down.
# NOTE: columns are assigned with df[col] = ... because, since pandas 2.0, a
# full-slice `df.loc[:, col] = values` writes into the existing (object)
# column in place and the new dtype would silently be lost.

# tconst
# Drop the leading "tt" and convert the remaining digits to int.
cast_crew["tconst"] = cast_crew["tconst"].str.slice(2).astype(int)

# directors
# there are no missing values
# Strip the "nm" prefixes, split into lists, then turn every id into an
# integer by exploding to one row per id and regrouping per title.
cast_crew["directors"] = cast_crew["directors"].str.replace("nm", "", regex=False).str.split(',')
tempcastcrew = cast_crew.loc[cast_crew["directors"].notna(), ['tconst', 'directors']].explode("directors").astype("Int64")
tempcastcrew = tempcastcrew.groupby('tconst')['directors'].apply(list)
cast_crew = cast_crew.drop(columns='directors')
cast_crew = pd.merge(cast_crew, tempcastcrew, how='left', on="tconst")

# writers
# there are missing values
# Same treatment as directors; titles without writers keep NaN after the
# left merge.
cast_crew["writers"] = cast_crew["writers"].str.replace("nm", "", regex=False).str.split(',')
tempcastcrew = cast_crew.loc[cast_crew["writers"].notna(), ['tconst', 'writers']].explode("writers").astype("Int64")
tempcastcrew = tempcastcrew.groupby('tconst')['writers'].apply(list)
cast_crew = cast_crew.drop(columns='writers')
cast_crew = pd.merge(cast_crew, tempcastcrew, how='left', on="tconst")

cast_crew

# Re-read the raw file so the function-based cleaning below starts from
# untouched data.
cast_crew_path = os.path.join("data/imdb/", FILES_IMDB["cast_crew"])
cast_crew = pd.read_csv(cast_crew_path, sep='\t', low_memory=False)

def _person_ids_to_int_lists(raw_ids):
    """Turn comma-separated ``nm...`` id strings into lists of ints (NaN stays NaN)."""
    return (
        raw_ids
        .str.replace("nm", "", regex=False)
        .str.split(",")
        .map(lambda ids: [int(person_id) for person_id in ids], na_action="ignore")
    )


def CleanCrew(cast_crew):
    """Return a cleaned, typed copy of the raw ``title.crew`` table.

    The input frame is not modified: ``replace`` produces a new frame and all
    conversions are applied to that copy.

    Conversions:
      * the literal string ``"\\N"`` becomes ``NaN`` everywhere;
      * ``tconst``: the two-character ``tt`` prefix is dropped and the rest is
        converted to ``int``;
      * ``directors`` and ``writers``: comma-separated ``nm...`` ids -> list
        of ints; rows without a value keep ``NaN``.

    Parameters
    ----------
    cast_crew : pandas.DataFrame
        Raw table with the columns ``tconst``, ``directors``, ``writers``.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy; rows keep their original order and index, and
        ``directors`` / ``writers`` are the last two columns (in that order).

    Raises
    ------
    ValueError
        If an id cannot be parsed as a number.
    """
    cast_crew = cast_crew.replace(to_replace="\\N", value=np.nan)

    # tconst
    # Drop the leading "tt" and convert the remaining digits to int.
    # Assigned with df[col] = ... because, since pandas 2.0, a full-slice
    # `df.loc[:, col] = values` writes into the existing (object) column in
    # place and the int dtype would silently be lost.
    cast_crew["tconst"] = cast_crew["tconst"].str.slice(2).astype(int)

    # directors: there are NO missing values.
    # writers:   there ARE missing values.
    # Both get the same treatment. pop() + assignment moves each column to
    # the end, matching the column order the former drop + merge produced.
    for column in ("directors", "writers"):
        cast_crew[column] = _person_ids_to_int_lists(cast_crew.pop(column))

    return cast_crew

# Run the function-based cleaning on the freshly re-read title.crew table.
cast_crew = CleanCrew(cast_crew)

# Show the cleaned table.
cast_crew

In [2]:
# NOTE: despite the variable names, this loads title.principals
# (FILES_IMDB["tit_prin"]), not title.crew.
cast_crew_path = os.path.join("data/imdb/", FILES_IMDB["tit_prin"])
# Quote handling is switched off so that a field starting with a quote
# character is read verbatim instead of being treated as a quoted field.
cast_crew = pd.read_csv(cast_crew_path, sep='\t', quoting=csv.QUOTE_NONE, low_memory=True)

In [3]:
# Preview the first rows of the raw title.principals table.
cast_crew.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,\N,"[""Self""]"
1,tt0000001,2,nm0005690,director,\N,\N
2,tt0000001,3,nm0374658,cinematographer,director of photography,\N
3,tt0000002,1,nm0721526,director,\N,\N
4,tt0000002,2,nm1335271,composer,\N,\N


In [4]:
# Missing values appear as the literal string "\N" (see the preview above); turn them into NaN.
cast_crew = cast_crew.replace(to_replace = "\\N", value = np.nan)

In [5]:
# Show the table after the "\N" -> NaN replacement.
cast_crew

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,tt0000001,1,nm1588970,self,,"[""Self""]"
1,tt0000001,2,nm0005690,director,,
2,tt0000001,3,nm0374658,cinematographer,director of photography,
3,tt0000002,1,nm0721526,director,,
4,tt0000002,2,nm1335271,composer,,
...,...,...,...,...,...,...
54718134,tt9916880,4,nm10535738,actress,,"[""Horrid Henry""]"
54718135,tt9916880,5,nm0996406,director,principal director,
54718136,tt9916880,6,nm1482639,writer,,
54718137,tt9916880,7,nm2586970,writer,books,


In [6]:
# tconst
# Drop the leading "tt", then convert the remaining digits to a nullable integer.
# Assigned with df[col] = ... because, since pandas 2.0, a full-slice
# `df.loc[:, col] = values` writes into the existing (object) column in place
# and the Int64 dtype would silently be lost.
cast_crew["tconst"] = cast_crew["tconst"].str.slice(2)
cast_crew["tconst"] = pd.to_numeric(cast_crew["tconst"]).astype("Int64")

In [7]:
# ordering
# Data is clean; only the dtype is changed to nullable integer.
# Assigned with df[col] = ... so the new dtype actually replaces the column
# (a full-slice `.loc` assignment writes in place since pandas 2.0).
cast_crew["ordering"] = cast_crew["ordering"].astype("Int64")

In [8]:
# nconst
# Drop the leading "nm" (all values start with it), then convert the
# remaining digits to a nullable integer.
# Assigned with df[col] = ... because, since pandas 2.0, a full-slice
# `df.loc[:, col] = values` writes into the existing (object) column in place
# and the Int64 dtype would silently be lost.
cast_crew["nconst"] = cast_crew["nconst"].str.slice(2)
cast_crew["nconst"] = pd.to_numeric(cast_crew["nconst"]).astype("Int64")

In [9]:
# category
# Data is clean; store it as a categorical.
# Assigned with df[col] = ... so the category dtype actually replaces the
# object column (a full-slice `.loc` assignment writes in place since pandas 2.0).
cast_crew["category"] = cast_crew["category"].astype("category")

In [10]:
# job
# Many distinct labels describe the same role, e.g.
# 'writer' and 'written by', or 'creator' and 'created by'.
# Count how often each job label occurs.
cast_crew.loc[:,"job"].value_counts()

producer                                                                                   3491822
writer                                                                                     1001848
written by                                                                                  515370
creator                                                                                     414798
created by                                                                                  320201
                                                                                            ...   
additional story element                                                                         1
based on "The Old Dark House" by                                                                 1
inspired by some segments from the plays "Operación Embajada" and "Cartas credenciales"          1
novel "Gold for the Caesars"                                                                     1
short stor

In [24]:
# Count the rows for every combination of missing (True) / present (False) values across the columns.
cast_crew.isna().value_counts()

tconst  ordering  nconst  category  job    characters
False   False     False   False     True   False         26593814
                                           True          19147346
                                    False  True           8976979
dtype: int64

As you can see, only the job and characters columns have missing data.

In [11]:
# characters
# has a lot of missing values.

In [2]:
tit_prin_path = os.path.join("data/imdb/", FILES_IMDB["tit_prin"])
# Quote handling is switched off so that a field starting with a quote
# character is read verbatim instead of being treated as a quoted field.
tit_prin = pd.read_csv(tit_prin_path, sep='\t', quoting=csv.QUOTE_NONE, low_memory=True)

def cleantitlePrinciples(tit_prin):
    """Return a cleaned, typed copy of the raw ``title.principals`` table.

    The input frame is not modified: ``replace`` produces a new frame and all
    conversions are applied to that copy.

    Conversions:
      * the literal string ``"\\N"`` becomes ``NaN`` everywhere;
      * ``tconst``: the two-character ``tt`` prefix is dropped and the rest is
        converted to nullable ``Int64``;
      * ``ordering``: nullable ``Int64``;
      * ``nconst``: the two-character ``nm`` prefix is dropped and the rest is
        converted to nullable ``Int64``;
      * ``category``: ``category`` dtype.

    ``job`` and ``characters`` are left untouched.

    Parameters
    ----------
    tit_prin : pandas.DataFrame
        Raw table with the columns listed above.

    Returns
    -------
    pandas.DataFrame
        The cleaned copy.

    Raises
    ------
    ValueError
        If an id or ordering value cannot be parsed as a number.
    """
    tit_prin = tit_prin.replace(to_replace="\\N", value=np.nan)

    # NOTE: columns are assigned with df[col] = ... on purpose. Since pandas 2.0
    # a full-slice `df.loc[:, col] = values` writes into the existing (object)
    # column in place, so the new dtype would silently be lost.

    # tconst
    # Drop the leading "tt" and convert the remaining digits to an integer.
    # Done in two steps because the file is too big to do it in one go.
    tit_prin["tconst"] = tit_prin["tconst"].str.slice(2)
    tit_prin["tconst"] = pd.to_numeric(tit_prin["tconst"]).astype("Int64")

    # ordering
    # Data is clean; only the dtype changes.
    tit_prin["ordering"] = pd.to_numeric(tit_prin["ordering"]).astype("Int64")

    # nconst
    # Drop the leading "nm" (all values start with it) and convert the
    # remaining digits to an integer, again in two steps.
    tit_prin["nconst"] = tit_prin["nconst"].str.slice(2)
    tit_prin["nconst"] = pd.to_numeric(tit_prin["nconst"]).astype("Int64")

    # category
    # Data is clean; store it as a categorical.
    tit_prin["category"] = tit_prin["category"].astype("category")

    # job
    # Left as is. Many distinct labels describe the same role, e.g.
    # 'writer' and 'written by', or 'creator' and 'created by'.

    # characters
    # Left as is. There are a lot of missing values.

    return tit_prin

# Apply the cleaning function to the raw title.principals table.
tit_prin = cleantitlePrinciples(tit_prin)

In [3]:
# Preview the first rows of the cleaned title.principals table.
tit_prin.head()

Unnamed: 0,tconst,ordering,nconst,category,job,characters
0,1,1,1588970,self,,"[""Self""]"
1,1,2,5690,director,,
2,1,3,374658,cinematographer,director of photography,
3,2,1,721526,director,,
4,2,2,1335271,composer,,
