# Introduction to Data Warehouses

Pagila is a sample database that is used to demonstrate the use of PostgreSQL. It is a port of the Sakila sample database for MySQL. The Pagila database is a DVD rental store database. It contains information about the store's inventory, staff, and customers. It was provided by Udacity as part of the Data Engineering course.  

Originally, Pagila was in third normal form (3NF), like this:  

<img src="../../images/pagila-3nf.png"  width="600">  

As always, we start by importing the necessary libraries and setting some variables.

In [None]:
# Load extension
%load_ext sql

# Import libraries
from dotenv import dotenv_values
import pandas as pd
import psycopg2 as pg
from sqlalchemy import create_engine
from sqlalchemy.engine.base import Engine
from sqlalchemy.sql import text

In [None]:
# Read the key/value pairs defined in the local .env file
config = dotenv_values()

# DB_INFO: connection settings, each looked up under its .env variable name.
# A missing variable raises KeyError right here instead of failing later.
DB_INFO = {
    setting: config[env_name]
    for setting, env_name in (
        ('path', 'DB_PATH'),
        ('host', 'DB_HOST'),
        ('port', 'DB_PORT'),
        ('database', 'DB_NAME'),
        ('user', 'DB_USER'),
        ('password', 'DB_PASS'),
    )
}

In [24]:
# Shell commands (IPython "!" syntax); values in {braces} are filled in from DB_INFO.
# 1) createdb creates the database; it reports an error if the database already exists.
# 2) psql loads the Pagila schema, then 3) the Pagila data (-q runs psql in quiet mode).
!PGPASSWORD={DB_INFO["password"]} {DB_INFO["path"]}/createdb -h {DB_INFO["host"]} -U {DB_INFO["user"]} {DB_INFO["database"]}
!PGPASSWORD={DB_INFO["password"]} {DB_INFO["path"]}/psql -q -h {DB_INFO["host"]} -U {DB_INFO["user"]} -d {DB_INFO["database"]} -f ../../data/pagila-0.10.1/pagila-schema.sql
!PGPASSWORD={DB_INFO["password"]} {DB_INFO["path"]}/psql -q -h {DB_INFO["host"]} -U {DB_INFO["user"]} -d {DB_INFO["database"]} -f ../../data/pagila-0.10.1/pagila-data.sql



createdb: error: database creation failed: ERROR:  database "pagila" already exists
 setval 
--------
    200
(1 row)

 setval 
--------
    605
(1 row)

 setval 
--------
     16
(1 row)

 setval 
--------
    600
(1 row)

 setval 
--------
    109
(1 row)

 setval 
--------
    599
(1 row)

 setval 
--------
   1000
(1 row)

 setval 
--------
   4581
(1 row)

 setval 
--------
      6
(1 row)

 setval 
--------
  32098
(1 row)

 setval 
--------
  16049
(1 row)

 setval 
--------
      2
(1 row)

 setval 
--------
      2
(1 row)



In [27]:
# Re-read the .env file and rebuild the connection settings.
# NOTE: this version of DB_INFO has no 'path' entry, only the values
# needed to connect to the database.
config = dotenv_values()

# DB_INFO
DB_INFO = dict(
    host=config['DB_HOST'],
    port=config['DB_PORT'],
    database=config['DB_NAME'],
    user=config['DB_USER'],
    password=config['DB_PASS'],
)

In [29]:
# Create connection strings for SQL magic and SQLAlchemy
from urllib.parse import quote

# Percent-encode the credentials: a user name or password containing
# characters such as '@', ':' or '/' would otherwise produce a malformed URL.
# safe="" makes quote() escape '/' as well.
_credentials = f"{quote(DB_INFO['user'], safe='')}:{quote(DB_INFO['password'], safe='')}"
_location = f"{DB_INFO['host']}:{DB_INFO['port']}/{DB_INFO['database']}"

# URL for the %sql magic
CONNECTION_STRING = f"postgresql://{_credentials}@{_location}"
# Same target, but naming the psycopg2 driver explicitly for SQLAlchemy
ENGINE_STRING = f"postgresql+psycopg2://{_credentials}@{_location}"

### Connection and Check
#### With SQL

In [30]:
# Open the connection used by the %sql magic ($NAME expands a Python variable)
%sql $CONNECTION_STRING

# Count the tables that live in the 'public' schema
query = """
SELECT COUNT(*) AS count
FROM information_schema.tables
WHERE table_schema = 'public';
"""

# Run the query through the %sql magic
%sql $query

 * postgresql://postgres:***@localhost:5432/pagila
1 rows affected.


count
22


#### With psycopg2

In [31]:
# Connect to database
connection = pg.connect(
    host=DB_INFO['host'],
    port=DB_INFO['port'],
    database=DB_INFO['database'],
    user=DB_INFO['user'],
    password=DB_INFO['password']
)

# try/finally guarantees the connection is closed even if the query fails;
# the cursor context manager closes the cursor when the block exits.
try:
    with connection.cursor() as cursor:
        # Check number of tables
        cursor.execute(
"""
SELECT COUNT(*) AS count
FROM information_schema.tables
WHERE table_schema = 'public';
""")

        # Get table count (single row, single column)
        table_count = cursor.fetchone()[0]
finally:
    # Close connection
    connection.close()

print("table_count", table_count)

table_count 22


#### With sqlalchemy

In [32]:
# SQLAlchemy engine built from the psycopg2 connection URL
ENGINE = create_engine(ENGINE_STRING)

def get_df_from_query(engine: Engine, query: str) -> pd.DataFrame:
    """Run ``query`` on ``engine`` and return the result as a DataFrame.

    Args:
        engine: SQLAlchemy engine that provides the connection.
        query: SQL text to execute.

    Returns:
        The query result loaded with ``pandas.read_sql``.
    """
    statement = text(query)
    # The connection is only needed while pandas reads the result;
    # leaving the ``with`` block releases it.
    with engine.connect() as db_connection:
        return pd.read_sql(sql=statement, con=db_connection)

def get_string_from_query(engine: Engine, query: str) -> str:
    """Run ``query`` and return the result rendered as a plain-text table.

    Args:
        engine: SQLAlchemy engine used to run the query.
        query: SQL text to execute.

    Returns:
        The result formatted by ``DataFrame.to_string`` without the
        row-index column.
    """
    df = get_df_from_query(engine, query)
    # ``index`` is a boolean flag; False states the intent explicitly
    # instead of relying on None being falsy.
    return df.to_string(index=False)

# Count the tables in the 'public' schema, this time through SQLAlchemy
query = """
SELECT COUNT(*) AS table_count
FROM information_schema.tables
WHERE table_schema = 'public';
"""

# Print the single-row result as plain text
print(get_string_from_query(ENGINE, query))

 table_count
          22


### Exploration
#### How many rows are there in selected tables?

In [33]:
def get_table_count(engine: Engine, table_name: str):
    """Return ``"<table_name>: <row count>"`` for the given table.

    The table name is interpolated directly into the SQL text, so it must
    be a trusted identifier.
    """
    count_sql = f"""
    SELECT COUNT(*) AS count
    FROM {table_name};
    """
    result = get_df_from_query(engine, count_sql)
    # COUNT(*) yields exactly one row with one column
    first_row = result.values[0]
    row_count = first_row[0]
    return f"{table_name}: {row_count}"

# Print one "<table>: <row count>" line per selected table
for table_name in (
    "film",
    "customer",
    "rental",
    "payment",
    "staff",
    "store",
    "city",
    "country",
):
    print(get_table_count(ENGINE, table_name))

film: 1000
customer: 599
rental: 16044
payment: 16049
staff: 2
store: 2
city: 600
country: 109


#### What time period is covered by the data?

In [34]:
# Earliest and latest payment timestamps found in the payment table
query = """
SELECT min(payment_date) as start, max(payment_date) as end from payment;
"""

print(get_string_from_query(ENGINE, query))

                           start                              end
2017-01-24 20:21:56.996577+00:00 2017-05-14 11:44:29.996577+00:00


#### Where do events occur?

In [35]:
# The ten districts with the most address rows, most frequent first
query = """
SELECT district, count(district) as n
FROM address
GROUP BY district
ORDER BY n DESC LIMIT 10;
"""

print(get_string_from_query(ENGINE, query))

         district   n
     Buenos Aires  10
         Shandong   9
       California   9
     West Bengali   9
    Uttar Pradesh   8
         So Paulo   8
          England   7
      Maharashtra   7
 Southern Tagalog   6
             Gois   5


### Create Star Schema

Starting from the 3NF schema, we aim to transform the database into a star schema like this:

<img src="../../images/pagila-star.png"  width="600">

In [36]:
# Calendar attributes derived from payment.payment_date
# (presumably the source for the date dimension of the star schema).
# - date_key is the date rendered as an integer, e.g. 20170430.
# - DISTINCT applies to the whole select list, not only to date_key; every
#   column is derived from the payment date, so the result has one row per date.
# - is_weekend is true for ISO day-of-week 6 (Saturday) and 7 (Sunday).
query = """
SELECT 
    DISTINCT(TO_CHAR(payment_date :: DATE, 'yyyyMMDD')::integer) AS date_key,
    DATE(payment_date)                                           AS date,
    EXTRACT(year FROM payment_date)                              AS year,
    EXTRACT(quarter FROM payment_date)                           AS quarter,
    EXTRACT(month FROM payment_date)                             AS month,
    EXTRACT(day FROM payment_date)                               AS day,
    EXTRACT(week FROM payment_date)                              AS week,
    CASE 
        WHEN EXTRACT(ISODOW FROM payment_date) IN (6, 7) 
        THEN true 
        ELSE false END 
    AS is_weekend
FROM payment;
"""

# Preview the first rows
get_df_from_query(ENGINE, query).head()

Unnamed: 0,date_key,date,year,quarter,month,day,week,is_weekend
0,20170430,2017-04-30,2017.0,2.0,4.0,30.0,17.0,True
1,20170412,2017-04-12,2017.0,2.0,4.0,12.0,15.0,False
2,20170302,2017-03-02,2017.0,1.0,3.0,2.0,9.0,False
3,20170131,2017-01-31,2017.0,1.0,1.0,31.0,5.0,False
4,20170126,2017-01-26,2017.0,1.0,1.0,26.0,4.0,False


In [37]:
# Denormalized customer rows: each customer joined to its address, city and
# country (presumably the source for the customer dimension of the star schema).
query = """
SELECT
    customer.customer_id,
    customer.first_name,
    customer.last_name,
    customer.email,
    address.address,
    address.address2,
    address.district,
    city.city,
    country.country,
    address.postal_code,
    address.phone,
    customer.active,
    customer.create_date
FROM 
    customer
    JOIN address ON (customer.address_id = address.address_id)
    JOIN city ON (address.city_id = city.city_id)
    JOIN country ON (city.country_id = country.country_id)
"""

# Preview the first rows
get_df_from_query(ENGINE, query).head()

Unnamed: 0,customer_id,first_name,last_name,email,address,address2,district,city,country,postal_code,phone,active,create_date
0,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,1913 Hanoi Way,,Nagasaki,Sasebo,Japan,35200,28303384290,1,2017-02-14
1,2,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,1121 Loja Avenue,,California,San Bernardino,United States,17886,838635286649,1,2017-02-14
2,3,LINDA,WILLIAMS,LINDA.WILLIAMS@sakilacustomer.org,692 Joliet Street,,Attika,Athenai,Greece,83579,448477190408,1,2017-02-14
3,4,BARBARA,JONES,BARBARA.JONES@sakilacustomer.org,1566 Inegl Manor,,Mandalay,Myingyan,Myanmar,53561,705814003527,1,2017-02-14
4,5,ELIZABETH,BROWN,ELIZABETH.BROWN@sakilacustomer.org,53 Idfu Parkway,,Nantou,Nantou,Taiwan,42399,10655648674,1,2017-02-14


In [38]:
# Denormalized store rows: each store joined to its address, city, country and
# the staff member referenced by manager_staff_id
# (presumably the source for the store dimension of the star schema).
query = """
SELECT
    store.store_id,
    address.address,
    address.address2,
    address.district,
    city.city,
    country.country,
    address.postal_code,
    staff.first_name AS manager_first_name,
    staff.last_name AS manager_last_name
FROM 
    store
    JOIN address ON (store.address_id = address.address_id)
    JOIN city ON (address.city_id = city.city_id)
    JOIN country ON (city.country_id = country.country_id)
    JOIN staff ON (store.manager_staff_id = staff.staff_id)
"""

# Preview the first rows
get_df_from_query(ENGINE, query).head()

Unnamed: 0,store_id,address,address2,district,city,country,postal_code,manager_first_name,manager_last_name
0,1,47 MySakila Drive,,Alberta,Lethbridge,Canada,,Mike,Hillyer
1,2,28 MySQL Boulevard,,QLD,Woodridge,Australia,,Jon,Stephens


In [47]:
# Denormalized film rows with language names resolved
# (presumably the source for the film dimension of the star schema).
# The language table is joined twice under different aliases: an inner join for
# the film's language, and a LEFT JOIN for the original language so that films
# without an original_language_id are still returned.
query = """
SELECT
    film.film_id,
    film.title,
    film.description,
    film.release_year,
    this_language.name AS language,
    orig_language.name AS original_language,
    film.rental_duration,
    film.length,
    film.rating,
    film.special_features
FROM
    film
    JOIN language AS this_language ON (film.language_id = this_language.language_id)
    LEFT JOIN language AS orig_language ON (film.original_language_id = orig_language.language_id)
"""

# Preview the first rows
get_df_from_query(ENGINE, query).head()

Unnamed: 0,film_id,title,description,release_year,language,original_language,rental_duration,length,rating,special_features
0,1,ACADEMY DINOSAUR,A Epic Drama of a Feminist And a Mad Scientist...,2006,English,,6,86,PG,"[Deleted Scenes, Behind the Scenes]"
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006,English,,3,48,G,"[Trailers, Deleted Scenes]"
2,3,ADAPTATION HOLES,A Astounding Reflection of a Lumberjack And a ...,2006,English,,7,50,NC-17,"[Trailers, Deleted Scenes]"
3,4,AFFAIR PREJUDICE,A Fanciful Documentary of a Frisbee And a Lumb...,2006,English,,5,117,G,"[Commentaries, Behind the Scenes]"
4,5,AFRICAN EGG,A Fast-Paced Documentary of a Pastry Chef And ...,2006,English,,6,130,G,[Deleted Scenes]
