# setup

In [1]:
import os
import re
import time
import gzip
import shutil
import requests
import numpy as np
import pandas as pd

## Set constants

In [2]:
START_TIME = time.time()

BASE_URL = "https://datasets.imdbws.com/"

FILES_IMDB = {
    "tit_bas": "title.basics.tsv",
    "tit_rate": "title.ratings.tsv",
    "name_bas": "name.basics.tsv",
    "cast_crew": "title.principals.tsv",
}

FILES_HAND = {
    "add_seen": "add_movies_seen.txt",
    "add_unseen": "add_movies_unseen.txt",
    "add_secop": "add_movies_second_opinion.txt",
    "raw_status": "raw_status.xlsx"
}

FILES_GENERATED = {
    "films_raw": "films_raw.pkl"
}

## Downloading files

In [3]:
# Change to True is you want to update the imdb files
DOWNLOAD = False

In [4]:
# removes old files if already excist
if DOWNLOAD:

    if not os.path.exists("data/imdb"):
        os.makedirs("data/imdb")
    
    for file in FILES_IMDB.values():
        file_name = os.path.join("data/imdb/",file)
        file_zip = file_name+".gz"
        file_url = BASE_URL+file+".gz"
        
        # remove old files
        if os.path.exists(file_name):
            os.remove(file_name)
        if os.path.exists(file_zip):
            os.remove(file_zip)
        
        # download files
        response = requests.get(file_url)
        open(file_zip , "wb").write(response.content)

        # unzip files
        with gzip.open(file_zip, 'rb') as f_in:
            with open(file_name, 'wb') as f_out:
                shutil.copyfileobj(f_in, f_out)
        
        # remove zips
        os.remove(file_zip)

# Step 1: Generating a raw dataset for mining

## Adding new movies to raw_watched

In [5]:
# getting the new seen movies
seen_path = os.path.join("data", "handcrafted", FILES_HAND["add_seen"])
seen_raw_f = open(seen_path,'r')
seen_raw = seen_raw_f.readlines()

# transforming the new seen movie data
for linei in range(len(seen_raw)):
    seen_raw[linei] = seen_raw[linei].strip().split(" ")
    if len(seen_raw[linei]) == 1:
        temp = seen_raw[linei][0].split("/")
    elif len(seen_raw[linei]) == 2:
        temp = seen_raw[linei][0].split("/")
    tofind = re.compile("^tt\d+\d$")
    ttcode = ""
    for x in temp:
        y = tofind.findall(x)
        if len(list(y)) != 0:
            ttcode = list(y)[0]
    if len(seen_raw[linei]) == 1:
        seen_raw[linei] = [ttcode, None]
    elif len(seen_raw[linei]) == 2:
        seen_raw[linei] = [ttcode, int(seen_raw[linei][1])]
seen_raw


[['tt0054135', None],
 ['tt0903624', None],
 ['tt1170358', None],
 ['tt2310332', None],
 ['tt0120363', None],
 ['tt1630029', None],
 ['tt0050083', 4],
 ['tt8579674', 4]]

In [6]:
# getting the new unseen movies
unseen_path = os.path.join("data", "handcrafted", FILES_HAND["add_unseen"])
unseen_raw_f = open(unseen_path,'r')
unseen_raw = unseen_raw_f.readlines()

# transforming the new unseen movie data
for linei in range(len(unseen_raw)):
    unseen_raw[linei] = unseen_raw[linei].strip()
    temp = unseen_raw[linei].split("/")
    
    tofind = re.compile("^tt\d+\d$")
    ttcode = ""
    for x in temp:
        y = tofind.findall(x)
        if len(list(y)) != 0:
            ttcode = list(y)[0]
    unseen_raw[linei] = ttcode
unseen_raw

['tt1462764',
 'tt6791350',
 'tt15239678',
 'tt9362722',
 'tt11210390',
 'tt15843316',
 'tt10366206',
 'tt2584384']

Adding the 2d list to the raw_watched.xlsx (if not already added)

In [7]:
ids_and_status = os.path.join("data", "handcrafted", FILES_HAND["raw_status"])
movie_list_raw = pd.read_excel(ids_and_status)
movie_list_raw

Unnamed: 0,tconst,watched,netflix,prime,enjoyment,priority
0,tt0015324,0,,,,
1,tt0017136,0,0.0,0.0,,
2,tt0022100,0,0.0,0.0,,
3,tt0025316,0,,,,
4,tt0031381,0,,,,
...,...,...,...,...,...,...
595,tt9691136,0,0.0,0.0,,
596,tt9731598,0,0.0,0.0,,
597,tt9764362,1,0.0,0.0,4.0,
598,tt9783600,1,,,4.0,


In [8]:
for seen in seen_raw:
    movieid = seen[0]
    score = seen[1]
    if movieid in movie_list_raw["tconst"].values:
        # if watched movie already in list
        found_index = movie_list_raw.loc[movie_list_raw.loc[:,"tconst"]==movieid].index.tolist()[0]
        movie_list_raw.at[found_index,"enjoyment"]
        enjoyment = movie_list_raw.at[found_index,"enjoyment"]
        watched = int(movie_list_raw.at[found_index,"watched"])
        if watched==1 and pd.isnull(enjoyment):
            # update the score only if null
            movie_list_raw.at[found_index,"enjoyment"] = score
        elif watched==0:
            # updated watched and add score (which can be nan)
            movie_list_raw.at[found_index,"enjoyment"] = score
            movie_list_raw.at[found_index,"watched"] = 1
    else:
        # if watched movie not in list
        to_add = pd.Series({
            'tconst':movieid, 'watched':1, 'netflix':np.nan,
            'prime':np.nan, "enjoyment":score , "priority": np.nan})

        movie_list_raw = pd.concat([movie_list_raw, to_add.to_frame().T], ignore_index=True)

    # movie_list_raw.loc[movie_list_raw.loc[:,"tconst"]==movieid]    

In [9]:
for movieid in unseen_raw:
    if not movieid in movie_list_raw["tconst"].values:
        # if watched movie not already in list
        to_add = pd.Series({
            'tconst':movieid, 'watched':0, 'netflix':np.nan,
            'prime':np.nan, "enjoyment":np.nan , "priority": np.nan})

        movie_list_raw = pd.concat([movie_list_raw, to_add.to_frame().T], ignore_index=True)

In [10]:
movie_list_raw.sort_values(["tconst"]).to_excel(ids_and_status, index=False)
del movie_list_raw

## Clean watched data

enjoyment score: 0=no; 1=mweh; 2=fun; 3=good; 4=great

In [11]:
watched = pd.read_excel(ids_and_status)
watched["watched"] = watched["watched"].astype('Int64').astype(bool)
watched["prime"] = watched["prime"].astype('Int64').replace(0, False).replace(1, True)
watched["netflix"] = watched["netflix"].astype('Int64').replace(0, False).replace(1, True)
watched["enjoyment"] = watched["enjoyment"].astype('Int64').replace(0, False).replace(1, True)
watched["tconst"] = watched["tconst"].str.strip()
watched

Unnamed: 0,tconst,watched,netflix,prime,enjoyment,priority
0,tt0015324,False,,,,
1,tt0017136,False,0,0,,
2,tt0022100,False,0,0,,
3,tt0025316,False,,,,
4,tt0031381,False,,,,
...,...,...,...,...,...,...
595,tt9691136,False,0,0,,
596,tt9731598,False,0,0,,
597,tt9764362,True,0,0,4,
598,tt9783600,True,,,4,


## Add imdb data

### Add basic title data

In [12]:
title_basics_file = os.path.join("data", "imdb", FILES_IMDB["tit_bas"])
title_basics = pd.read_csv(title_basics_file, sep="\t")
title_basics = title_basics.replace(to_replace = "\\N", value = np.nan)
# title_watched = pd.merge(watched, title_basics, on="tconst", how="left") # new merge, keeps wrong stuff
watched_title = pd.merge(watched, title_basics, on="tconst", how="left") # new merge, keeps wrong stuff
del title_basics # cleanup memory by force
watched_title

  title_basics = pd.read_csv(title_basics_file, sep="\t")


Unnamed: 0,tconst,watched,netflix,prime,enjoyment,priority,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres
0,tt0015324,False,,,,,movie,Sherlock Jr.,Sherlock Jr.,0,1924,,45,"Action,Comedy,Romance"
1,tt0017136,False,0,0,,,movie,Metropolis,Metropolis,0,1927,,153,"Drama,Sci-Fi"
2,tt0022100,False,0,0,,,movie,M,M - Eine Stadt sucht einen Mörder,0,1931,,117,"Crime,Mystery,Thriller"
3,tt0025316,False,,,,,movie,It Happened One Night,It Happened One Night,0,1934,,105,"Comedy,Romance"
4,tt0031381,False,,,,,movie,Gone with the Wind,Gone with the Wind,0,1939,,238,"Drama,Romance,War"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,tt9691136,False,0,0,,,movie,Shadow in the Cloud,Shadow in the Cloud,0,2020,,83,"Action,Horror,War"
596,tt9731598,False,0,0,,,movie,Bros,Bros,0,2022,,115,"Comedy,Romance"
597,tt9764362,True,0,0,4,,movie,The Menu,The Menu,0,2022,,107,"Comedy,Horror,Thriller"
598,tt9783600,True,,,4,,movie,Spiderhead,Spiderhead,0,2022,,106,"Action,Crime,Drama"


### Add ratings


In [13]:
title_rate_file = os.path.join("data", "imdb", FILES_IMDB["tit_rate"])
title_rate = pd.read_csv(title_rate_file, sep="\t")
title_rate = title_rate.replace(to_replace = "\\N", value = np.nan)
title_rate.loc[:,"numVotes"] = title_rate.loc[:,"numVotes"].astype('Int64')
# watched_film_fin = pd.merge(watched_title,title_rate, on="tconst", how="left")
watched_title_rate = pd.merge(watched_title,title_rate, on="tconst", how="left")
del watched_title # cleanup memory by force
watched_title_rate

Unnamed: 0,tconst,watched,netflix,prime,enjoyment,priority,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes
0,tt0015324,False,,,,,movie,Sherlock Jr.,Sherlock Jr.,0,1924,,45,"Action,Comedy,Romance",8.2,51128
1,tt0017136,False,0,0,,,movie,Metropolis,Metropolis,0,1927,,153,"Drama,Sci-Fi",8.3,175950
2,tt0022100,False,0,0,,,movie,M,M - Eine Stadt sucht einen Mörder,0,1931,,117,"Crime,Mystery,Thriller",8.3,159522
3,tt0025316,False,,,,,movie,It Happened One Night,It Happened One Night,0,1934,,105,"Comedy,Romance",8.1,105338
4,tt0031381,False,,,,,movie,Gone with the Wind,Gone with the Wind,0,1939,,238,"Drama,Romance,War",8.2,317795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
595,tt9691136,False,0,0,,,movie,Shadow in the Cloud,Shadow in the Cloud,0,2020,,83,"Action,Horror,War",5.0,27782
596,tt9731598,False,0,0,,,movie,Bros,Bros,0,2022,,115,"Comedy,Romance",6.4,24786
597,tt9764362,True,0,0,4,,movie,The Menu,The Menu,0,2022,,107,"Comedy,Horror,Thriller",7.3,133744
598,tt9783600,True,,,4,,movie,Spiderhead,Spiderhead,0,2022,,106,"Action,Crime,Drama",5.4,57923


### Add cast and crew

#### retrieve only needed cast and crew members from massive file

I tried all joining variations.
Concat takes more than an hour.
Merge takes less than 5 minutes.
Join does not give the needed result.

In [14]:
cast_crew_mega_file = os.path.join("data", "imdb", FILES_IMDB["cast_crew"])
watched_films_cast  = pd.DataFrame(columns=["tconst", "ordering","nconst", "category", "job", "characters"])

for chunk in pd.read_csv(cast_crew_mega_file, sep="\t", chunksize=1000):
    rows = pd.merge(watched_title_rate.loc[:,"tconst"], chunk, on="tconst", how="inner")
    watched_films_cast = pd.concat([rows,watched_films_cast], ignore_index = True)

watched_films_cast.replace(to_replace = "\\N", value = np.nan, inplace=True)
watched_films_cast.drop(['characters'], axis=1, inplace=True)

In [15]:
watched_films_cast

Unnamed: 0,tconst,ordering,nconst,category,job
0,tt9806192,10,nm10909656,production_designer,
1,tt9806192,1,nm7079932,actor,
2,tt9806192,2,nm4973460,actress,
3,tt9806192,3,nm1253936,actor,
4,tt9806192,4,nm11127862,actor,
...,...,...,...,...,...
5945,tt0015324,5,nm0369841,writer,story
5946,tt0015324,6,nm0593477,writer,story
5947,tt0015324,7,nm0115669,writer,story
5948,tt0015324,8,nm3816287,composer,


#### Add cast and crew memebers to the movie list

In [16]:
watched_title_rate_personel = pd.merge(watched_title_rate, watched_films_cast, how='inner', on='tconst')
del watched_title_rate
del watched_films_cast

### Add info about personell

In [17]:
names_file = os.path.join("data", "imdb", FILES_IMDB["name_bas"])
names_basics = pd.read_csv(names_file, sep="\t")
names_basics = names_basics.replace(to_replace = "\\N", value = np.nan)
names_basics.loc[:,"birthYear"] = names_basics.loc[:,"birthYear"].astype('Int64')
names_basics.loc[:,"deathYear"] = names_basics.loc[:,"deathYear"].astype('Int64')
col_delete = ["knownForTitles"]
names_basics = names_basics.drop(col_delete, axis=1)
names_basics

Unnamed: 0,nconst,primaryName,birthYear,deathYear,primaryProfession
0,nm0000001,Fred Astaire,1899,1987,"soundtrack,actor,miscellaneous"
1,nm0000002,Lauren Bacall,1924,2014,"actress,soundtrack"
2,nm0000003,Brigitte Bardot,1934,,"actress,soundtrack,music_department"
3,nm0000004,John Belushi,1949,1982,"actor,soundtrack,writer"
4,nm0000005,Ingmar Bergman,1918,2007,"writer,director,actor"
...,...,...,...,...,...
12229758,nm9993714,Romeo del Rosario,,,"animation_department,art_department"
12229759,nm9993716,Essias Loberg,,,
12229760,nm9993717,Harikrishnan Rajan,,,cinematographer
12229761,nm9993718,Aayush Nair,,,cinematographer


In [18]:
watched_title_rate_personel_names = pd.merge(watched_title_rate_personel, names_basics, how='left', on="nconst")
del watched_title_rate_personel

In [19]:
watched_title_rate_personel_names

Unnamed: 0,tconst,watched,netflix,prime,enjoyment,priority,titleType,primaryTitle,originalTitle,isAdult,...,averageRating,numVotes,ordering,nconst,category,job,primaryName,birthYear,deathYear,primaryProfession
0,tt0015324,False,,,,,movie,Sherlock Jr.,Sherlock Jr.,0,...,8.2,51128,10,nm0504380,cinematographer,,Elgin Lessley,1883,1944,"cinematographer,actor"
1,tt0015324,False,,,,,movie,Sherlock Jr.,Sherlock Jr.,0,...,8.2,51128,1,nm0000036,actor,,Buster Keaton,1895,1966,"actor,writer,director"
2,tt0015324,False,,,,,movie,Sherlock Jr.,Sherlock Jr.,0,...,8.2,51128,2,nm0570230,actress,,Kathryn McGuire,1903,1978,actress
3,tt0015324,False,,,,,movie,Sherlock Jr.,Sherlock Jr.,0,...,8.2,51128,3,nm0444172,actor,,Joe Keaton,1867,1946,actor
4,tt0015324,False,,,,,movie,Sherlock Jr.,Sherlock Jr.,0,...,8.2,51128,4,nm0175068,actor,,Erwin Connelly,1878,1931,actor
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5945,tt9806192,True,,,4,,movie,I Lost My Body,J'ai perdu mon corps,0,...,7.5,33767,5,nm3021346,director,,Jérémy Clapin,1974,,"writer,director,editor"
5946,tt9806192,True,,,4,,movie,I Lost My Body,J'ai perdu mon corps,0,...,7.5,33767,6,nm0491011,writer,adaptation and dialogue,Guillaume Laurant,1961,,"writer,actor"
5947,tt9806192,True,,,4,,movie,I Lost My Body,J'ai perdu mon corps,0,...,7.5,33767,7,nm0238941,producer,producer,Marc Du Pontavice,,,"producer,production_manager,writer"
5948,tt9806192,True,,,4,,movie,I Lost My Body,J'ai perdu mon corps,0,...,7.5,33767,8,nm1776887,composer,,Dan Levy,1976,,"soundtrack,composer,music_department"


# save the data as a pickle for step 2: processing data

In [26]:
output = os.path.join("data", "generated", FILES_GENERATED["films_raw"])
watched_title_rate_personel_names
watched_title_rate_personel_names.to_pickle(output)

END_TIME = time.time()
time_format = time.strftime("%H:%M:%S", time.gmtime(END_TIME-START_TIME))
print("Execution time: ",time_format)

Execution time:  00:15:07
