**Libraries**

In [1]:
import os
import numpy
import pandas as pd
from sqlalchemy import create_engine

**Declaring & Assigning Connection Variables for my MySQL Server & Sakila Database**

In [2]:
host_name = "localhost"
port = "3306"
user_id = "root"
pwd = "Passw0rd123"

src_dbname = "sakila"
dst_dbname = "sakila_dw"

**Defining Functions for Getting Data From and Setting Data Into Database**

In [3]:
def get_dataframe(user_id, pwd, host_name, db_name, sql_query):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    dframe = pd.read_sql(sql_query, connection);
    connection.close()
    
    return dframe


def set_dataframe(user_id, pwd, host_name, db_name, df, table_name, pk_column, db_operation):
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)
    connection = sqlEngine.connect()
    
    if db_operation == "insert":
        df.to_sql(table_name, con=connection, index=False, if_exists='replace')
        sqlEngine.execute(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});")
            
    elif db_operation == "update":
        df.to_sql(table_name, con=connection, index=False, if_exists='append')
    
    connection.close()

**Creates the New Data Warehouse database & Switches the Connection Context**

In [4]:
conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}"
sqlEngine = create_engine(conn_str, pool_recycle=3600)

sqlEngine.execute(f"DROP DATABASE IF EXISTS `{dst_dbname}`;")
sqlEngine.execute(f"CREATE DATABASE `{dst_dbname}`;")
sqlEngine.execute(f"USE {dst_dbname};")

<sqlalchemy.engine.cursor.LegacyCursorResult at 0x21ffc9d8d60>

### Creates & Populates the New Dimension Tables
**Extracts Data from the Source Database Tables**

In [5]:
sql_customers = "SELECT * FROM sakila.customer;"
df_customers = get_dataframe(user_id, pwd, host_name, src_dbname, sql_customers)
df_customers.head(2)

Unnamed: 0,customer_id,store_id,first_name,last_name,email,address_id,active,create_date,last_update
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2006-02-14 22:04:36,2006-02-15 04:57:20
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6,1,2006-02-14 22:04:36,2006-02-15 04:57:20


In [6]:
sql_payments = "SELECT * FROM sakila.payment;"
df_payments = get_dataframe(user_id, pwd, host_name, src_dbname, sql_payments)
df_payments.head(2)

Unnamed: 0,payment_id,customer_id,staff_id,rental_id,amount,payment_date,last_update
0,1,1,1,76,2.99,2005-05-25 11:30:37,2006-02-15 22:12:30
1,2,1,1,573,0.99,2005-05-28 10:35:23,2006-02-15 22:12:30


In [7]:
sql_rentals = "SELECT * FROM sakila.rental;"
df_rentals = get_dataframe(user_id, pwd, host_name, src_dbname, sql_rentals)
df_rentals.head(2)

Unnamed: 0,rental_id,rental_date,inventory_id,customer_id,return_date,staff_id,last_update
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30,1,2006-02-15 21:30:53
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33,1,2006-02-15 21:30:53


In [8]:
sql_stores = "SELECT * FROM sakila.store;"
df_stores = get_dataframe(user_id, pwd, host_name, src_dbname, sql_stores)
df_stores.head(2)

Unnamed: 0,store_id,manager_staff_id,address_id,last_update
0,1,1,1,2006-02-15 04:57:12
1,2,2,2,2006-02-15 04:57:12


**Performs Necessary Transformations**

In [9]:
drop_cols = ['last_update', 'active', 'create_date']
df_customers.drop(drop_cols, axis=1, inplace=True)
df_customers.rename(columns={"customer_id":"customer_key", "store_id":"store_key"},
                    inplace=True)

df_customers.head(2)

Unnamed: 0,customer_key,store_key,first_name,last_name,email,address_id
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5
1,2,1,PATRICIA,JOHNSON,PATRICIA.JOHNSON@sakilacustomer.org,6


In [10]:
drop_cols = ['staff_id', 'rental_id', 'last_update']
df_payments.drop(drop_cols, axis=1, inplace=True)
df_payments.rename(columns={"payment_id":"payment_key", "customer_id":"customer_key",
                         "staff_id":"staff_key", "rental_id":"rental_key"}, inplace=True)

df_payments.head(2)

Unnamed: 0,payment_key,customer_key,amount,payment_date
0,1,1,2.99,2005-05-25 11:30:37
1,2,1,0.99,2005-05-28 10:35:23


In [11]:
drop_cols = ['staff_id', 'last_update']
df_rentals.drop(drop_cols, axis=1, inplace=True)
df_rentals.rename(columns={"rental_id":"rental_key", "inventory_id":"inventory_key",
                           "customer_id":"customer_key", "staff_id":"staff_key"}, inplace=True)

df_rentals.head(2)

Unnamed: 0,rental_key,rental_date,inventory_key,customer_key,return_date
0,1,2005-05-24 22:53:30,367,130,2005-05-26 22:04:30
1,2,2005-05-24 22:54:33,1525,459,2005-05-28 19:40:33


In [12]:
drop_cols = ['last_update']
df_stores.drop(drop_cols, axis=1, inplace=True)
df_stores.rename(columns={"store_id":"store_key", "manager_staff_id":"manager_key", "address_id":"address_key"},
                inplace=True)

df_stores.head(2)

Unnamed: 0,store_key,manager_key,address_key
0,1,1,1
1,2,2,2


**Loads Transformed Dataframes into the New Data Warehouse by Creating New Tables**

In [13]:
db_operation = "insert"

tables = [('dim_customers', df_customers, 'customer_key'),
          ('dim_payments', df_payments, 'payment_key'),
          ('dim_stores', df_stores, 'store_key'),
          ('dim_rentals', df_rentals, 'rental_key')]

for table_name, dataframe, primary_key in tables:
    set_dataframe(user_id, pwd, host_name, dst_dbname, dataframe, table_name, primary_key, db_operation)

### Creates & Populates the Fact Table

In [14]:
df_fact_orders = pd.merge(df_customers, df_payments, on='customer_key', how='inner')
df_fact_orders.head(2)

Unnamed: 0,customer_key,store_key,first_name,last_name,email,address_id,payment_key,amount,payment_date
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2.99,2005-05-25 11:30:37
1,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,2,0.99,2005-05-28 10:35:23


In [15]:
df_fact_orders = pd.merge(df_fact_orders, df_rentals, on='customer_key', how='inner')
df_fact_orders.head(2)

Unnamed: 0,customer_key,store_key,first_name,last_name,email,address_id,payment_key,amount,payment_date,rental_key,rental_date,inventory_key,return_date
0,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2.99,2005-05-25 11:30:37,76,2005-05-25 11:30:37,3021,2005-06-03 12:00:37
1,1,1,MARY,SMITH,MARY.SMITH@sakilacustomer.org,5,1,2.99,2005-05-25 11:30:37,573,2005-05-28 10:35:23,4020,2005-06-03 06:32:23


In [16]:
ordered_cols = ['customer_key', 'store_key', 'payment_key', 'rental_key', 'inventory_key',
                'first_name', 'last_name', 'email', 'amount', 'payment_date', 'rental_date', 'return_date']

df_fact_orders = df_fact_orders[ordered_cols]

df_fact_orders.payment_date = pd.to_datetime(df_fact_orders.payment_date).dt.date
df_fact_orders.rental_date = pd.to_datetime(df_fact_orders.rental_date).dt.date
df_fact_orders.return_date = pd.to_datetime(df_fact_orders.return_date).dt.date

df_fact_orders.payment_date = df_fact_orders.payment_date.astype('datetime64')
df_fact_orders.rental_date = df_fact_orders.rental_date.astype('datetime64')
df_fact_orders.return_date = df_fact_orders.return_date.astype('datetime64')

df_fact_orders.head(2)

Unnamed: 0,customer_key,store_key,payment_key,rental_key,inventory_key,first_name,last_name,email,amount,payment_date,rental_date,return_date
0,1,1,1,76,3021,MARY,SMITH,MARY.SMITH@sakilacustomer.org,2.99,2005-05-25,2005-05-25,2005-06-03
1,1,1,1,573,4020,MARY,SMITH,MARY.SMITH@sakilacustomer.org,2.99,2005-05-25,2005-05-28,2005-06-03


**Get the Data from the Date Dimension Table (Go to sakila-date-dimension.sql to create sakila.dim_date)**

In [17]:
sql_dim_date = "SELECT date_key, full_date FROM sakila_dw.dim_date;"
df_dim_date = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_dim_date)
df_dim_date.full_date = df_dim_date.full_date.astype('datetime64')
df_dim_date.head(2)

Unnamed: 0,date_key,full_date
0,20050101,2005-01-01
1,20050102,2005-01-02


**Lookup the DateKeys from the Date Dimension Table**

In [18]:
df_dim_payment_date = df_dim_date.rename(columns={"date_key" : "payment_date_key", "full_date" : "payment_date"})
df_fact_orders = pd.merge(df_fact_orders, df_dim_payment_date, on='payment_date', how='inner')
df_fact_orders.drop(['payment_date'], axis=1, inplace=True)

df_fact_orders.head(2)

Unnamed: 0,customer_key,store_key,payment_key,rental_key,inventory_key,first_name,last_name,email,amount,rental_date,return_date,payment_date_key
0,1,1,1,76,3021,MARY,SMITH,MARY.SMITH@sakilacustomer.org,2.99,2005-05-25,2005-06-03,20050525
1,1,1,1,573,4020,MARY,SMITH,MARY.SMITH@sakilacustomer.org,2.99,2005-05-28,2005-06-03,20050525


In [19]:
df_dim_rental_date = df_dim_date.rename(columns={"date_key" : "rental_date_key", "full_date" : "rental_date"})
df_fact_orders = pd.merge(df_fact_orders, df_dim_rental_date, on='rental_date', how='inner')
df_fact_orders.drop(['rental_date'], axis=1, inplace=True)

df_fact_orders.head(2)

Unnamed: 0,customer_key,store_key,payment_key,rental_key,inventory_key,first_name,last_name,email,amount,return_date,payment_date_key,rental_date_key
0,1,1,1,76,3021,MARY,SMITH,MARY.SMITH@sakilacustomer.org,2.99,2005-06-03,20050525,20050525
1,6,2,146,57,3938,JENNIFER,DAVIS,JENNIFER.DAVIS@sakilacustomer.org,4.99,2005-05-29,20050525,20050525


In [20]:
df_dim_return_date = df_dim_date.rename(columns={"date_key" : "return_date_key", "full_date" : "return_date"})
df_fact_orders = pd.merge(df_fact_orders, df_dim_return_date, on='return_date', how='inner')
df_fact_orders.drop(['return_date'], axis=1, inplace=True)

df_fact_orders.head(2)

Unnamed: 0,customer_key,store_key,payment_key,rental_key,inventory_key,first_name,last_name,email,amount,payment_date_key,rental_date_key,return_date_key
0,1,1,1,76,3021,MARY,SMITH,MARY.SMITH@sakilacustomer.org,2.99,20050525,20050525,20050603
1,19,1,490,110,4108,RUTH,MARTINEZ,RUTH.MARTINEZ@sakilacustomer.org,0.99,20050525,20050525,20050603


**Performs any Necessary Transformations**

In [21]:
ordered_cols = ['customer_key', 'store_key', 'payment_key', 'rental_key', 'inventory_key',
                'payment_date_key', 'rental_date_key', 'return_date_key',
                'first_name', 'last_name', 'email', 'amount']
df_fact_orders = df_fact_orders[ordered_cols]

df_fact_orders.insert(0, "order_key", range(1, df_fact_orders.shape[0]+1))

df_fact_orders.head(2)

Unnamed: 0,order_key,customer_key,store_key,payment_key,rental_key,inventory_key,payment_date_key,rental_date_key,return_date_key,first_name,last_name,email,amount
0,1,1,1,1,76,3021,20050525,20050525,20050603,MARY,SMITH,MARY.SMITH@sakilacustomer.org,2.99
1,2,19,1,490,110,4108,20050525,20050525,20050603,RUTH,MARTINEZ,RUTH.MARTINEZ@sakilacustomer.org,0.99


**Writes Dataframe back to the Database**

In [22]:
table_name = "fact_orders"
primary_key = "order_key"
db_operation = "insert"

set_dataframe(user_id, pwd, host_name, dst_dbname, df_fact_orders, table_name, primary_key, db_operation)

### Demonstrates that the New Data Warehouse Exists and Contains the Correct Data

In [23]:
sql_finance = """
    SELECT stores.`store_key`,
        COUNT(stores.`store_key`) AS `number_of_customers`,
        MIN(orders.`amount`) AS `lowest_payment`,
        MAX(orders.`amount`) AS `highest_payment`,
        SUM(orders.`amount`) AS `total_payment`
    FROM `{0}`.`fact_orders` AS orders
    INNER JOIN `{0}`.`dim_stores` AS stores
    ON orders.store_key = stores.store_key
    GROUP BY stores.`store_key`;
""".format(dst_dbname)

df_finance = get_dataframe(user_id, pwd, host_name, dst_dbname, sql_finance)
df_finance.head()

Unnamed: 0,store_key,number_of_customers,lowest_payment,highest_payment,total_payment
0,1,240022,0.0,11.99,1015855.0
1,2,200225,0.0,11.99,836212.7
