In [1]:
import pandas as pd

In [2]:
# Root folder that holds every raw dataset used below. It is kept absolute
# because the same paths are later embedded in server-side COPY statements.
DATA_DIR = '/Users/antoniam/Desktop/personal/netflix-ratings/data'

# IMDb dumps (tab-separated files).
imdb_titles_path = f'{DATA_DIR}/imdb/title.basics.tsv'
imdb_ratings_path = f'{DATA_DIR}/imdb/title.ratings.tsv'
imdb_principals_path = f'{DATA_DIR}/imdb/title.principals.tsv'
imdb_crew_path = f'{DATA_DIR}/imdb/title.crew.tsv'
imdb_names_path = f'{DATA_DIR}/imdb/name.basics.tsv'

# Rotten Tomatoes and Netflix exports (comma-separated files).
rotten_tomatoes_movies_path = f'{DATA_DIR}/rotten_tomatoes/rotten_tomatoes_movies.csv'
netflix_titles_path = f'{DATA_DIR}/netflix/netflix_titles.csv'

## IMDb Dataset

In [None]:
# Load the IMDb title basics TSV and preview its first two rows.
# `sep` must be passed by keyword: pandas >= 2.0 accepts only the path positionally.
imdb_titles_df = pd.read_csv(imdb_titles_path, sep='\t', low_memory=False)
imdb_titles_df.head(2)

In [None]:
# Load the IMDb ratings TSV and preview its first two rows.
# `sep` must be passed by keyword: pandas >= 2.0 accepts only the path positionally.
imdb_ratings_df = pd.read_csv(imdb_ratings_path, sep='\t')
imdb_ratings_df.head(2)

In [None]:
# Load the IMDb principals TSV and preview its first two rows.
# `sep` must be passed by keyword: pandas >= 2.0 accepts only the path positionally.
imdb_principals_df = pd.read_csv(imdb_principals_path, sep='\t')
imdb_principals_df.head(2)

In [None]:
# Load the IMDb names TSV and preview its first two rows.
# `sep` must be passed by keyword: pandas >= 2.0 accepts only the path positionally.
imdb_names_df = pd.read_csv(imdb_names_path, sep='\t')
imdb_names_df.head(2)

## Rotten Tomatoes Dataset

In [None]:
# Load the Rotten Tomatoes movies CSV (comma is read_csv's default separator)
# and preview its first two rows.
rt_df = pd.read_csv(rotten_tomatoes_movies_path, sep=',')
rt_df.head(n=2)

## Netflix Dataset

In [None]:
# Load the Netflix titles CSV (comma is read_csv's default separator)
# and preview its first two rows.
netflix_df = pd.read_csv(netflix_titles_path, sep=',')
netflix_df.head(n=2)

## Table Create Statements

In [3]:
# PostgreSQL driver used by every database cell below.
import psycopg2


In [4]:
def create_database():
    """Recreate the ``movies_db`` database and return a handle to it.

    Connects to the default ``postgres`` database, terminates every backend
    still attached to ``movies_db``, drops ``movies_db`` if it exists and
    creates it again with UTF-8 encoding from ``template0``. A new autocommit
    connection to the fresh database is then opened.

    Returns:
        tuple: ``(cur, conn)`` -- a cursor and the autocommit connection it
        belongs to, both bound to ``movies_db``. Closing them is left to the
        caller.
    """
    # connect to the default database; autocommit is required because
    # DROP/CREATE DATABASE cannot run inside a transaction block
    admin_conn = psycopg2.connect("host=127.0.0.1 dbname=postgres user=antoniam")
    try:
        admin_conn.set_session(autocommit=True)
        admin_cur = admin_conn.cursor()

        # kick out any session still using movies_db, otherwise DROP DATABASE fails
        admin_cur.execute("""
    SELECT pg_terminate_backend(pg_stat_activity.pid)
    FROM pg_stat_activity
    WHERE pg_stat_activity.datname = 'movies_db';""")

        # recreate movies_db with UTF8 encoding
        admin_cur.execute("DROP DATABASE IF EXISTS movies_db")
        admin_cur.execute("CREATE DATABASE movies_db WITH ENCODING 'utf8' TEMPLATE template0")
    finally:
        # always release the admin connection, even when a statement above fails
        admin_conn.close()

    # connect to the freshly created movies_db database
    conn = psycopg2.connect("host=127.0.0.1 dbname=movies_db")
    conn.set_session(autocommit=True)
    cur = conn.cursor()

    return cur, conn

In [5]:
# Rebuild movies_db from scratch and keep its cursor/connection for the cells below.
cur, conn = create_database()

In [6]:
# DDL for the staging tables. Each statement is CREATE TABLE IF NOT EXISTS,
# so executing it again is harmless.

# IMDb titles staging table: keyed by tconst, every other column kept as raw VARCHAR.
create_staging_imdb_titles ="""CREATE TABLE IF NOT EXISTS staging_imdb_titles(
tconst VARCHAR NOT NULL PRIMARY KEY,
titleType VARCHAR,
primaryTitle VARCHAR,
originalTitle VARCHAR,
isAdult VARCHAR,
startYear VARCHAR,
endYear VARCHAR,
runtimeMinutes VARCHAR,
genres VARCHAR);
"""

# IMDb ratings staging table: average rating and vote count per tconst (no primary key declared).
create_staging_imdb_ratings ="""CREATE TABLE IF NOT EXISTS staging_imdb_ratings(
tconst VARCHAR NOT NULL,
averageRating FLOAT,
numVotes INT);
"""

# IMDb principals staging table: links a title (tconst) to a person (nconst)
# with a category; every column is declared NOT NULL.
create_staging_imdb_principals = """CREATE TABLE IF NOT EXISTS staging_imdb_principals(
tconst VARCHAR NOT NULL,
ordering INTEGER NOT NULL,
nconst VARCHAR NOT NULL, 
category VARCHAR NOT NULL,
job VARCHAR NOT NULL,
characters VARCHAR NOT NULL);
"""

# IMDb crew staging table: directors and writers stored as single VARCHAR
# fields per title (later split on ',' when roles are built).
create_staging_imdb_crew = """CREATE TABLE IF NOT EXISTS staging_imdb_crew(
tconst VARCHAR NOT NULL,
directors VARCHAR, 
writers VARCHAR);
"""

# IMDb names staging table: keyed by nconst; birth/death years kept as raw VARCHAR.
create_staging_imdb_names = """CREATE TABLE IF NOT EXISTS staging_imdb_names(
nconst VARCHAR NOT NULL PRIMARY KEY,
primaryName VARCHAR NOT NULL,
birthYear VARCHAR,
deathYear VARCHAR,
primaryProfession VARCHAR,
knownForTitles VARCHAR);
"""

# Rotten Tomatoes staging table: typed dates, scores and counts; no key or NOT NULL constraints.
create_staging_rotten_tomatoes_titles = """CREATE TABLE IF NOT EXISTS staging_rotten_tomatoes_titles(
rotten_tomatoes_link VARCHAR, 
movie_title VARCHAR,
movie_info VARCHAR,
critics_consensus VARCHAR, 
content_rating VARCHAR, 
genres VARCHAR, 
directors VARCHAR, 
authors VARCHAR,
actors VARCHAR, 
original_release_date DATE, 
streaming_release_date DATE, 
runtime FLOAT,
production_company VARCHAR, 
tomatometer_status VARCHAR, 
tomatometer_rating FLOAT,
tomatometer_count INT, 
audience_status VARCHAR, 
audience_rating FLOAT,
audience_count INT, 
tomatometer_top_critics_count INT,
tomatometer_fresh_critics_count INT, 
tomatometer_rotten_critics_count INT);
"""
# Netflix staging table: keyed by show_id; "type" and "cast" are double-quoted identifiers.
create_staging_netflix_titles = """CREATE TABLE IF NOT EXISTS staging_netflix_titles(
show_id VARCHAR NOT NULL PRIMARY KEY,
"type" VARCHAR,
title VARCHAR,
director VARCHAR,
"cast" VARCHAR,
country VARCHAR,
date_added DATE,
release_year INT4,
rating VARCHAR,
duration VARCHAR,
listed_in VARCHAR,
description VARCHAR);
"""

In [7]:
# Mapping table pairing an IMDb title id with a Netflix title id.
# Neither column carries a key or NOT NULL constraint.
create_temp_table = \
"""CREATE TABLE IF NOT EXISTS temp_netflix_imdb(
    imdb_title_id VARCHAR,
    netflix_title_id VARCHAR);
"""

In [8]:
# DDL for the final tables. Each statement is CREATE TABLE IF NOT EXISTS.

# titles: one row per title with descriptive fields plus IMDb and Rotten Tomatoes scores.
# NOTE(review): id is SERIAL here, while roles.title_id / genres.title_id are VARCHAR
# and are filled with IMDb tconst values by the inserts in this notebook -- confirm
# how these are meant to join.
create_titles = \
"""CREATE TABLE IF NOT EXISTS titles(
  id SERIAL PRIMARY KEY,
  name VARCHAR,
  description VARCHAR,
  year INT4,
  runtime_minutes INT,
  country VARCHAR,
  isAdult BOOLEAN,
  type VARCHAR,
  imdb_avg_score FLOAT,
  imdb_n_ratings INT,
  rt_critics_score FLOAT,
  rt_n_critics INT,
  rt_audience_score FLOAT,
  rt_n_audience INT);
"""

# roles: (title, person, role name) triples with a surrogate SERIAL key.
create_roles = \
"""CREATE TABLE IF NOT EXISTS roles(
id SERIAL PRIMARY KEY,
title_id VARCHAR,
person_id VARCHAR,
role_name VARCHAR NOT NULL
);
"""

# persons: keyed by a VARCHAR id, with integer birth/death years.
create_persons = \
"""CREATE TABLE IF NOT EXISTS persons(
id VARCHAR NOT NULL PRIMARY KEY,
full_name VARCHAR NOT NULL,
birth_year INT4,
death_year INT4
);
"""

# genres: one row per (title, genre name) pair with a surrogate SERIAL key.
create_genres = \
"""CREATE TABLE IF NOT EXISTS genres(
id SERIAL PRIMARY KEY,
title_id VARCHAR NOT NULL,
genre_name VARCHAR NOT NULL);"""


In [9]:
# CREATE statements for the staging tables and the Netflix/IMDb mapping table,
# in the order they will be executed.
staging_tables_sql = [
    create_staging_imdb_titles,
    create_staging_imdb_ratings,
    create_staging_imdb_principals,
    create_staging_imdb_names,
    create_staging_rotten_tomatoes_titles,
    create_staging_netflix_titles,
    create_staging_imdb_crew,
    create_temp_table,
]

# CREATE statements for the final tables.
final_tables = [
    create_titles,
    create_roles,
    create_persons,
    create_genres,
]

In [10]:
# Create every table: staging tables first, then the final ones.
for ddl_statement in staging_tables_sql + final_tables:
    cur.execute(ddl_statement)

In [11]:
def _copy_statement(table, path, delimiter):
    """Build the server-side COPY command that loads the file at `path` into `table`."""
    return f"COPY {table} FROM '{path}' delimiter '{delimiter}' CSV HEADER"


# IMDb files are tab-delimited; the Rotten Tomatoes and Netflix files are comma-delimited.
copy_staging_titles = _copy_statement('staging_imdb_titles', imdb_titles_path, '\t')
copy_staging_ratings = _copy_statement('staging_imdb_ratings', imdb_ratings_path, '\t')
copy_staging_principals = _copy_statement('staging_imdb_principals', imdb_principals_path, '\t')
copy_staging_crew = _copy_statement('staging_imdb_crew', imdb_crew_path, '\t')
copy_staging_names = _copy_statement('staging_imdb_names', imdb_names_path, '\t')
copy_staging_rt = _copy_statement('staging_rotten_tomatoes_titles', rotten_tomatoes_movies_path, ',')
copy_staging_netflix = _copy_statement('staging_netflix_titles', netflix_titles_path, ',')

# Execution order of the loads.
copy_staging = [
    copy_staging_titles,
    copy_staging_ratings,
    copy_staging_principals,
    copy_staging_names,
    copy_staging_crew,
    copy_staging_rt,
    copy_staging_netflix,
]

# Echo each statement before running it so progress is visible in the cell output.
for copy_sql in copy_staging:
    print(copy_sql)
    cur.execute(copy_sql)

COPY staging_imdb_titles FROM '/Users/antoniam/Desktop/personal/netflix-ratings/data/imdb/title.basics.tsv' delimiter '	' CSV HEADER
COPY staging_imdb_ratings FROM '/Users/antoniam/Desktop/personal/netflix-ratings/data/imdb/title.ratings.tsv' delimiter '	' CSV HEADER
COPY staging_imdb_principals FROM '/Users/antoniam/Desktop/personal/netflix-ratings/data/imdb/title.principals.tsv' delimiter '	' CSV HEADER
COPY staging_imdb_names FROM '/Users/antoniam/Desktop/personal/netflix-ratings/data/imdb/name.basics.tsv' delimiter '	' CSV HEADER
COPY staging_imdb_crew FROM '/Users/antoniam/Desktop/personal/netflix-ratings/data/imdb/title.crew.tsv' delimiter '	' CSV HEADER
COPY staging_rotten_tomatoes_titles FROM '/Users/antoniam/Desktop/personal/netflix-ratings/data/rotten_tomatoes/rotten_tomatoes_movies.csv' delimiter ',' CSV HEADER
COPY staging_netflix_titles FROM '/Users/antoniam/Desktop/personal/netflix-ratings/data/netflix/netflix_titles.csv' delimiter ',' CSV HEADER


## Final tables

### Temp table

In [None]:
# IF EXISTS keeps this cell re-runnable: a plain DROP TABLE errors out when
# temp_netflix_imdb has already been dropped.
cur.execute('DROP TABLE IF EXISTS temp_netflix_imdb')

In [None]:
# Create temp_netflix_imdb from the create_temp_table DDL string.
cur.execute(create_temp_table)

In [None]:
from time import time

In [None]:
# Pair Netflix and IMDb titles whose first 8 characters match (case-insensitive)
# and store the distinct id pairs in temp_netflix_imdb.
# The WHERE clause filters on imdb.titleType, so Netflix rows without an IMDb
# match are discarded and the LEFT JOIN behaves like an inner join.
insert_temp_sql = """INSERT INTO temp_netflix_imdb(imdb_title_id, netflix_title_id)
SELECT distinct imdb.tconst, netflix.show_id
FROM staging_netflix_titles netflix 
LEFT JOIN staging_imdb_titles imdb
ON lower(substr(netflix.title, 1, 8)) = lower(substr(imdb.primaryTitle,1, 8))
WHERE imdb.titleType in ('tvSpecial', 'tvSeries', 'tvShort', 'movie', 'tvMovie', 'short', 'tvMiniSeries')
"""

started_at = time()
cur.execute(insert_temp_sql)
# Elapsed wall-clock seconds for the insert.
print(time() - started_at)

In [None]:
# An INSERT without RETURNING yields no result set, so calling fetchall()
# directly after it raises psycopg2.ProgrammingError. Read the mapping
# table back explicitly before fetching.
cur.execute("SELECT imdb_title_id, netflix_title_id FROM temp_netflix_imdb")
r = cur.fetchall()

In [None]:
# Number of rows fetched into r.
len(r)

In [None]:
# NOTE(review): same expression as the previous cell; looks like a leftover duplicate.
len(r)

### Genres table

In [12]:
# The DDL is CREATE TABLE IF NOT EXISTS, so this is a no-op when genres already exists.
cur.execute(create_genres)

In [13]:
# Explode the comma-separated genres column of the IMDb titles staging table
# into one genres row per (title, genre). Rows whose genres value is the
# literal \N placeholder are skipped, and only the listed title types are kept.
insert_genres_sql = """INSERT INTO genres(title_id, genre_name)
   SELECT tconst, unnest(string_to_array(genres, ',')) as genre
   FROM staging_imdb_titles 
   WHERE genres!='\\N' 
   AND titleType in ('tvSpecial', 'tvSeries', 'tvShort', 'movie', 'tvMovie', 'short', 'tvMiniSeries')
   """

started_at = time()
cur.execute(insert_genres_sql)
# Elapsed wall-clock seconds for the insert.
print(time() - started_at)

### Persons table

In [14]:
%% time
st = \
"""INSERT INTO persons(id, full_name, birth_year, death_year)
SELECT nconst, primaryName, 
NULLIF(regexp_replace(birthYear, '\D','','g'), '')::numeric AS birth_year,
NULLIF(regexp_replace(deathYear, '\D','','g'), '')::numeric AS death_year
from staging_imdb_names
"""
cur.execute(st)

### Roles table

In [18]:
%%time
st = \
"""INSERT INTO roles(title_id, person_id, role_name)
SELECT titles.tconst, nconst, category
FROM staging_imdb_titles titles
JOIN staging_imdb_principals principals
ON titles.tconst = principals.tconst
WHERE principals.category in ('director', 'writer', 'actor', 'actress') 
AND titles.titleType in ('tvSpecial', 'tvSeries', 'tvShort', 'movie', 'tvMovie', 'short', 'tvMiniSeries')

UNION
SELECT titles.tconst, unnest(string_to_array(directors, ',')) as nconst, 'director'
FROM staging_imdb_titles titles
JOIN staging_imdb_crew crew
ON titles.tconst = crew.tconst
WHERE directors!='\\N'
AND titles.titleType in ('tvSpecial', 'tvSeries', 'tvShort', 'movie', 'tvMovie', 'short', 'tvMiniSeries')

UNION
SELECT titles.tconst, unnest(string_to_array(writers, ',')) as nconst, 'writer'
FROM staging_imdb_titles titles
JOIN staging_imdb_crew crew
ON titles.tconst = crew.tconst
WHERE writers!='\\N'
AND titles.titleType in ('tvSpecial', 'tvSeries', 'tvShort', 'movie', 'tvMovie', 'short', 'tvMiniSeries')
"""
cur.execute(st)

CPU times: user 5.39 ms, sys: 3.14 ms, total: 8.53 ms
Wall time: 3min 59s


### Titles table

In [51]:
%%time
st = """
SELECT staging_netflix_titles.show_id, staging_netflix_titles.type, staging_netflix_titles.title, 
staging_netflix_titles.country, staging_netflix_titles.release_year, 
staging_netflix_titles.description, with_imdb.tconst, staging_netflix_titles.rating, with_imdb.runtimeMinutes, 
averageRating, numVotes, with_rt.tomatometer_rating, with_rt.tomatometer_count, with_rt.audience_rating, 
with_rt.audience_count, with_rt.critics_consensus

FROM staging_netflix_titles 
LEFT JOIN (
        SELECT distinct netflix.show_id, netflix.type, netflix.title, netflix.country, netflix.release_year, 
                netflix.description, imdb.tconst, runtimeMinutes, averageRating, numVotes
                FROM(
                    SELECT show_id, type, title, country, release_year, description,
                    unnest(string_to_array(staging_netflix_titles.director, ',')) as director,
                    unnest(string_to_array(staging_netflix_titles.cast, ',')) as actor
                    FROM staging_netflix_titles) as netflix
                JOIN(
                    SELECT *
                    FROM staging_imdb_titles 
                    JOIN roles
                    ON roles.title_id = staging_imdb_titles.tconst
                    JOIN persons 
                    ON persons.id = roles.person_id
                    WHERE titleType in ('tvSpecial', 'tvSeries', 'tvShort', 'movie', 'tvMovie', 'short', 'tvMiniSeries')) as imdb
                JOIN staging_imdb_ratings 
                    ON staging_imdb_ratings.tconst = imdb.tconst
            ON lower(substr(netflix.title, 1, 5)) = lower(substr(imdb.OriginalTitle, 1, 5))
            AND netflix.release_year::varchar = imdb.startYear
            AND (netflix.actor = imdb.full_name or netflix.director = imdb.full_name)
        ) as with_imdb
ON with_imdb.show_id = staging_netflix_titles.show_id

LEFT JOIN (
        SELECT distinct netflix.show_id, netflix.type, netflix.title, netflix.country, netflix.release_year, 
                netflix.description, rt.tomatometer_rating, rt.tomatometer_count, rt.audience_rating, rt.audience_count,
                rt.critics_consensus
                FROM(
                    SELECT show_id, type, title, country, release_year, description,
                    unnest(string_to_array(staging_netflix_titles.director, ',')) as director,
                    unnest(string_to_array(staging_netflix_titles.cast, ',')) as actor
                    FROM staging_netflix_titles) as netflix
                JOIN(
                    SELECT rt.tomatometer_rating, rt.tomatometer_count, rt.audience_rating, rt.audience_count,
                    rt.critics_consensus, rt.movie_title, rt.original_release_date,
                    unnest(string_to_array(rt.directors, ',')) as director,
                    unnest(string_to_array(rt.actors, ',')) as actor
                    FROM staging_rotten_tomatoes_titles rt) rt
                ON lower(substr(netflix.title, 1, 5)) = lower(substr(rt.movie_title, 1, 5))
                AND netflix.release_year = extract(year from rt.original_release_date)
                AND (netflix.director = rt.director or netflix.actor = rt.actor) 
                ) as with_rt
ON with_rt.show_id = staging_netflix_titles.show_id
"""
cur.execute(st)

CPU times: user 4.17 ms, sys: 3.67 ms, total: 7.84 ms
Wall time: 22.7 s
