**Libraries**

In [1]:
import os
import json
import numpy
import datetime
import pandas as pd

import pymongo
from sqlalchemy import create_engine
from sqlalchemy import text

**Declaring & Assigning Connection Variables for the MySQL Server, MongoDB & the Sakila Databases**

In [2]:
# Connection settings. Each credential can be overridden through an environment
# variable so real secrets need not be committed with the notebook; the fallback
# values are the ones this notebook has always used.
mysql_uid = os.environ.get("MYSQL_UID", "root")
mysql_pwd = os.environ.get("MYSQL_PWD", "Passw0rd123")
mysql_host = os.environ.get("MYSQL_HOST", "localhost")

atlas_cluster_name = os.environ.get("ATLAS_CLUSTER_NAME", "sandbox.zibbf")
atlas_user_name = os.environ.get("ATLAS_USER_NAME", "m001-student")
atlas_password = os.environ.get("ATLAS_PASSWORD", "m001-mongodb-basics")

# MongoDB connection strings keyed by deployment target.
conn_str = {
    "local": "mongodb://localhost:27017/",
    "atlas": f"mongodb+srv://{atlas_user_name}:{atlas_password}@{atlas_cluster_name}.mongodb.net",
}

# Source (MongoDB) and destination (MySQL) database names.
src_dbname = "sakila_data"
dst_dbname = "sakila_dw"

# NOTE(review): the Atlas connection string embeds the password, so this print
# writes it into the saved cell output - clear outputs before sharing.
print(f"Local Connection String: {conn_str['local']}")
print(f"Atlas Connection String: {conn_str['atlas']}")

Local Connection String: mongodb://localhost:27017/
Atlas Connection String: mongodb+srv://m001-student:m001-mongodb-basics@sandbox.zibbf.mongodb.net


**Defining Functions for Getting Data From and Setting Data Into Database**

In [3]:
def get_sql_dataframe(user_id, pwd, host_name, db_name, sql_query):
    """Run a SQL query against a MySQL database and return the rows as a DataFrame.

    Args:
        user_id: MySQL user name.
        pwd: MySQL password.
        host_name: MySQL server host.
        db_name: Database (schema) to connect to.
        sql_query: SQL text passed to ``pd.read_sql``.

    Returns:
        pandas.DataFrame holding the query result.
    """
    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)

    try:
        # The context manager closes the connection even if the query raises.
        with sqlEngine.connect() as conn:
            dframe = pd.read_sql(sql_query, conn)
    finally:
        # A new engine is built on every call, so release its pool here.
        sqlEngine.dispose()

    return dframe


def get_mongo_dataframe(connect_str, db_name, collection, query):
    """Query a MongoDB collection and return the matching documents as a DataFrame.

    Args:
        connect_str: MongoDB connection string.
        db_name: Name of the database to query.
        collection: Name of the collection to query.
        query: Filter document passed to ``find``.

    Returns:
        pandas.DataFrame with one row per matching document and the ``_id``
        column removed. The frame is empty when nothing matches.
    """
    client = pymongo.MongoClient(connect_str)
    try:
        # Materialise the cursor before the client is closed.
        documents = list(client[db_name][collection].find(query))
    finally:
        client.close()

    dframe = pd.DataFrame(documents)
    # errors="ignore": a query with no matches yields a frame without an
    # '_id' column, which would otherwise raise KeyError here.
    return dframe.drop(columns=['_id'], errors='ignore')


def set_dataframe(user_id, pwd, db_name, host_name, df, table_name, pk_column, db_operation):
    """Write a DataFrame to a MySQL table.

    Args:
        user_id: MySQL user name.
        pwd: MySQL password.
        db_name: Destination database (schema).
        host_name: MySQL server host.
        df: DataFrame to write (its index is not written).
        table_name: Destination table name.
        pk_column: Column made the primary key; used only for "insert".
        db_operation: "insert" replaces the table with ``df`` and then adds
            the primary key; "update" appends ``df`` to the existing table.

    Raises:
        ValueError: if ``db_operation`` is neither "insert" nor "update".
    """
    # Fail loudly on a typo instead of silently writing nothing.
    if db_operation not in ("insert", "update"):
        raise ValueError(
            f"Unsupported db_operation {db_operation!r}; expected 'insert' or 'update'."
        )

    conn_str = f"mysql+pymysql://{user_id}:{pwd}@{host_name}/{db_name}"
    sqlEngine = create_engine(conn_str, pool_recycle=3600)

    try:
        # engine.begin() commits on success, rolls back on error, and always
        # closes the connection.
        with sqlEngine.begin() as connection:
            if db_operation == "insert":
                df.to_sql(table_name, con=connection, index=False, if_exists='replace')
                # Run the DDL on the same connection; Engine.execute() no longer
                # exists in SQLAlchemy 2.0 and raw strings must be wrapped in text().
                connection.execute(text(f"ALTER TABLE {table_name} ADD PRIMARY KEY ({pk_column});"))
            else:
                df.to_sql(table_name, con=connection, index=False, if_exists='append')
    finally:
        # A new engine is built on every call, so release its pool here.
        sqlEngine.dispose()

**Populates MongoDB with Source Data**

In [4]:
client = pymongo.MongoClient(conn_str["local"])
db = client[src_dbname]

# Source files are looked up in the 'data' directory under the notebook's
# current working directory first; if a file is not there, fall back to the
# bare file name (relative to the working directory), which is where this
# cell previously read from.
data_dir = os.path.join(os.getcwd(), 'data')

# Collection name -> JSON file holding that collection's documents.
json_files = {"actor" : 'sakila_actor.json',
              "film" : 'sakila_film.json',
              "film_actor" : 'sakila_film_actor.json',
              "film_category" : 'sakila_film_category.json'
             }

try:
    for collection_name, file_name in json_files.items():
        json_path = os.path.join(data_dir, file_name)
        if not os.path.exists(json_path):
            json_path = file_name

        # Read the file before dropping anything, so an unreadable file does
        # not leave the collection emptied.
        with open(json_path, 'r') as openfile:
            json_object = json.load(openfile)

        db.drop_collection(collection_name)
        db[collection_name].insert_many(json_object)
finally:
    # Close the client even if a file is missing or an insert fails.
    client.close()

### Creates & Populates the New Dimension Tables
**Extracts Data from the Source Database Tables**

In [5]:
# Fetch every document (empty filter) from the 'actor' collection.
collection = "actor"
query = {}

df_actors = get_mongo_dataframe(
    connect_str=conn_str['local'],
    db_name=src_dbname,
    collection=collection,
    query=query,
)
df_actors.head(n=2)

Unnamed: 0,actor_id,first_name,last_name,last_update
0,1,PENELOPE,GUINESS,2006-02-15 09:34:33
1,2,NICK,WAHLBERG,2006-02-15 09:34:33


In [6]:
# Fetch every document (empty filter) from the 'film' collection.
collection = "film"
query = {}

df_films = get_mongo_dataframe(
    connect_str=conn_str['local'],
    db_name=src_dbname,
    collection=collection,
    query=query,
)
df_films.head(n=2)

Unnamed: 0,film_id,title,description,release_year,language_id,original_language_id,rental_duration,rental_rate,length,replacement_cost,rating,last_update,special_features,fulltext
0,1,ACADEMY DINOSAUR,An Epic Drama of a Feminist And a Mad Scientis...,2006.0,1,,6,0.99,86.0,20.99,PG,2007-09-10 17:46:03.905795,"{""Deleted Scenes"",""Behind the Scenes""}",'academi':1 'battl':15 'canadian':20 'dinosaur...
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006.0,1,,3,4.99,48.0,12.99,G,2007-09-10 17:46:03.905795,"{Trailers,""Deleted Scenes""}",'ace':1 'administr':9 'ancient':19 'astound':4...


In [7]:
# Fetch every document (empty filter) from the 'film_actor' collection.
collection = "film_actor"
query = {}

df_film_actors = get_mongo_dataframe(
    connect_str=conn_str['local'],
    db_name=src_dbname,
    collection=collection,
    query=query,
)
df_film_actors.head(n=2)

Unnamed: 0,actor_id,film_id,last_update
0,1,1,2006-02-15 10:05:03
1,1,23,2006-02-15 10:05:03


In [8]:
# Fetch every document (empty filter) from the 'film_category' collection.
collection = "film_category"
query = {}

df_film_categories = get_mongo_dataframe(
    connect_str=conn_str['local'],
    db_name=src_dbname,
    collection=collection,
    query=query,
)
df_film_categories.head(n=2)

Unnamed: 0,film_id,category_id,last_update
0,1,6,2006-02-15 10:07:09
1,2,11,2006-02-15 10:07:09


**Performs Necessary Transformations**

In [9]:
# Re-assign instead of mutating in place so this cell can be re-run safely:
# errors="ignore" makes the drop a no-op once 'last_update' is already gone,
# and rename() skips labels that are no longer present.
drop_cols = ['last_update']
df_actors = (
    df_actors
    .drop(columns=drop_cols, errors="ignore")
    .rename(columns={"actor_id": "actor_key"})
)

df_actors.head(2)

Unnamed: 0,actor_key,first_name,last_name
0,1,PENELOPE,GUINESS
1,2,NICK,WAHLBERG


In [10]:
# Re-assign instead of mutating in place so this cell can be re-run safely:
# errors="ignore" makes the drop a no-op for columns that are already gone,
# and rename() skips labels that are no longer present.
drop_cols = ['language_id', 'original_language_id', 'last_update', 'fulltext']
df_films = (
    df_films
    .drop(columns=drop_cols, errors="ignore")
    .rename(columns={"film_id": "film_key"})
)

df_films.head(2)

Unnamed: 0,film_key,title,description,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features
0,1,ACADEMY DINOSAUR,An Epic Drama of a Feminist And a Mad Scientis...,2006.0,6,0.99,86.0,20.99,PG,"{""Deleted Scenes"",""Behind the Scenes""}"
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006.0,3,4.99,48.0,12.99,G,"{Trailers,""Deleted Scenes""}"


In [11]:
# Re-assign instead of mutating in place so this cell can be re-run safely:
# errors="ignore" makes the drop a no-op once 'last_update' is already gone,
# and rename() skips labels that are no longer present.
drop_cols = ['last_update']
df_film_actors = (
    df_film_actors
    .drop(columns=drop_cols, errors="ignore")
    .rename(columns={"actor_id": "actor_key", "film_id": "film_key"})
)

# Add a sequential surrogate key (1..N) as the first column. The guard keeps a
# re-run from failing because the column already exists.
if "film_actor_key" not in df_film_actors.columns:
    df_film_actors.insert(0, "film_actor_key", range(1, df_film_actors.shape[0] + 1))

df_film_actors.head(2)

Unnamed: 0,film_actor_key,actor_key,film_key
0,1,1,1
1,2,1,23


In [12]:
# Re-assign instead of mutating in place so this cell can be re-run safely:
# errors="ignore" makes the drop a no-op once 'last_update' is already gone,
# and rename() skips labels that are no longer present.
drop_cols = ['last_update']
df_film_categories = (
    df_film_categories
    .drop(columns=drop_cols, errors="ignore")
    .rename(columns={"film_id": "film_key", "category_id": "category_key"})
)

# Add a sequential surrogate key (1..N) as the first column. The guard keeps a
# re-run from failing because the column already exists.
if "film_category_key" not in df_film_categories.columns:
    df_film_categories.insert(0, "film_category_key", range(1, df_film_categories.shape[0] + 1))

df_film_categories.head(2)

Unnamed: 0,film_category_key,film_key,category_key
0,1,1,6
1,2,2,11


**Loads Transformed Dataframes into the New Data Warehouse by Creating New Tables**

In [13]:
# Write the transformed actor frame to dim_actors ("insert" replaces the table
# and then adds the primary key).
dataframe, table_name, primary_key, db_operation = df_actors, 'dim_actors', 'actor_key', "insert"

set_dataframe(
    user_id=mysql_uid,
    pwd=mysql_pwd,
    db_name=dst_dbname,
    host_name=mysql_host,
    df=dataframe,
    table_name=table_name,
    pk_column=primary_key,
    db_operation=db_operation,
)

In [14]:
# Write the transformed film frame to dim_films ("insert" replaces the table
# and then adds the primary key).
dataframe, table_name, primary_key, db_operation = df_films, 'dim_films', 'film_key', "insert"

set_dataframe(
    user_id=mysql_uid,
    pwd=mysql_pwd,
    db_name=dst_dbname,
    host_name=mysql_host,
    df=dataframe,
    table_name=table_name,
    pk_column=primary_key,
    db_operation=db_operation,
)

In [15]:
# Write the transformed film/actor frame to dim_film_actors ("insert" replaces
# the table and then adds the primary key).
dataframe, table_name, primary_key, db_operation = df_film_actors, 'dim_film_actors', 'film_actor_key', "insert"

set_dataframe(
    user_id=mysql_uid,
    pwd=mysql_pwd,
    db_name=dst_dbname,
    host_name=mysql_host,
    df=dataframe,
    table_name=table_name,
    pk_column=primary_key,
    db_operation=db_operation,
)

In [17]:
# Write the transformed film/category frame to dim_film_categories ("insert"
# replaces the table and then adds the primary key).
dataframe, table_name, primary_key, db_operation = df_film_categories, 'dim_film_categories', 'film_category_key', "insert"

set_dataframe(
    user_id=mysql_uid,
    pwd=mysql_pwd,
    db_name=dst_dbname,
    host_name=mysql_host,
    df=dataframe,
    table_name=table_name,
    pk_column=primary_key,
    db_operation=db_operation,
)

**Validates that Dimension Tables were Created**

In [18]:
# Read the table back from the configured destination database (dst_dbname)
# rather than a hardcoded schema name, so the check follows the config cell.
sql_actors = f"SELECT * FROM {dst_dbname}.dim_actors;"
df_dim_actors = get_sql_dataframe(mysql_uid, mysql_pwd, mysql_host, dst_dbname, sql_actors)
df_dim_actors.head(2)

Unnamed: 0,actor_key,first_name,last_name
0,1,PENELOPE,GUINESS
1,2,NICK,WAHLBERG


In [19]:
# Read the table back from the configured destination database (dst_dbname)
# rather than a hardcoded schema name, so the check follows the config cell.
sql_films = f"SELECT * FROM {dst_dbname}.dim_films;"
df_dim_films = get_sql_dataframe(mysql_uid, mysql_pwd, mysql_host, dst_dbname, sql_films)
df_dim_films.head(2)

Unnamed: 0,film_key,title,description,release_year,rental_duration,rental_rate,length,replacement_cost,rating,special_features
0,1,ACADEMY DINOSAUR,An Epic Drama of a Feminist And a Mad Scientis...,2006.0,6,0.99,86.0,20.99,PG,"{""Deleted Scenes"",""Behind the Scenes""}"
1,2,ACE GOLDFINGER,A Astounding Epistle of a Database Administrat...,2006.0,3,4.99,48.0,12.99,G,"{Trailers,""Deleted Scenes""}"


In [20]:
# Read the table back from the configured destination database (dst_dbname)
# rather than a hardcoded schema name, so the check follows the config cell.
sql_film_actors = f"SELECT * FROM {dst_dbname}.dim_film_actors;"
df_dim_film_actors = get_sql_dataframe(mysql_uid, mysql_pwd, mysql_host, dst_dbname, sql_film_actors)
df_dim_film_actors.head(2)

Unnamed: 0,film_actor_key,actor_key,film_key
0,1,1,1
1,2,1,23


In [21]:
# Read the table back from the configured destination database (dst_dbname)
# rather than a hardcoded schema name, so the check follows the config cell.
sql_film_categories = f"SELECT * FROM {dst_dbname}.dim_film_categories;"
df_dim_film_categories = get_sql_dataframe(mysql_uid, mysql_pwd, mysql_host, dst_dbname, sql_film_categories)
df_dim_film_categories.head(2)

Unnamed: 0,film_category_key,film_key,category_key
0,1,1,6
1,2,2,11
