In [1]:
# Import necessary packages
import pandas as pd
from pymongo import MongoClient
import time
import psutil
import cudf
from extract_load_functions_mongodb import extraction, loading, performance

#set export location
exportLocation = r'/home/jeff/'

#set number of test runs to prefrom
iterations = 31

# Connect to MongoDb
myclient = MongoClient("mongodb://localhost:27017/")
db = myclient.ds7330 
item = db.item
order_line = db.order_line
orderHeader = db.orderHeader


etlTimerStart = time.perf_counter()    
# HEAVY EXTRACTION #
###################################################################################################################
### Pull item table and capture CPU, RAM and elapsed time to perform operations

extractionTimerStart = time.perf_counter()
### Run 30 iterations to collect a df of performance metrics
### light extraction performance metrics
column_names = ["base_CPU","base_RAM","CPU_utilization", "RAM_utilization", "CPU_d","RAM_d","elapsed_time"]
heavyExtractionPrfm = pd.DataFrame(columns = column_names)

### base metrics
print("STARTING EXTRACTION...")
print("RUNNING...")

### Run n iterations to collect a df of performance metrics
for sampleNo in range(iterations):
    
    time.sleep(5)
    #collect base settings
    baseCPU = psutil.cpu_percent()
    baseRAM = psutil.virtual_memory().percent
    
    #Initiate timer for query
    start = time.perf_counter()

    #insert data into dataFrame
    itemdf = pd.DataFrame(list(item.find()))
    order_linedf = pd.DataFrame(list(order_line.find()))
    orderHeaderdf = pd.DataFrame(list(orderHeader.find()))
    sampleCPU = psutil.cpu_percent()
    sampleRAM = psutil.virtual_memory().percent
    
    #Stop timer  
    stop = time.perf_counter()

    #load df with performance metrics    
    heavyExtractionPrfm = heavyExtractionPrfm.append(pd.DataFrame({'base_CPU': baseCPU,
                                                         'base_RAM': baseRAM,
                                                         'CPU_utilization': sampleCPU,
                                                         'RAM_utilization': sampleRAM,
                                                         'CPU_d': sampleCPU-baseCPU,
                                                         'RAM_d': sampleRAM - baseRAM,
                                                         'elapsed_time':stop - start},
                                                          index=[1]), ignore_index=True)
extractionTimerEnd= time.perf_counter()   

print("Data frame loading complete.\n")
performance(heavyExtractionPrfm, iterations, extractionTimerStart,extractionTimerEnd)

#Export DF to csv
heavyExtractionPrfm.to_csv (r'/home/jeff/heavyExtractionPrfm_MongoDB.csv', index = False, header=True)

#### Heavy TRANSFORMATION ####
###################################################################################################################
# Preform medium transformation workload.  In this case, join orderline and item tables, identify all item descriptions with "Blue" in them 
# and change them to "Navy"

#Cast datatypes to objects
order_linedf[['Line', 'eaches_qty']] = order_linedf[['Line', 'eaches_qty']].apply(pd.to_numeric) 
order_linedf[['Order', 'orderedItem','Ponum']] = order_linedf[['Order', 'orderedItem','Ponum']].astype(str) 

itemdf[['selling_price']] = itemdf[['selling_price']].apply(pd.to_numeric)
itemdf[['orderedItem']] = itemdf[['orderedItem']].astype(str)

orderHeaderdf[['site_num']] = orderHeaderdf[['site_num']].apply(pd.to_numeric)
orderHeaderdf[['Ponum']] = orderHeaderdf[['Ponum']].astype(str)

column_names = ["base_CPU","base_RAM","CPU_utilization", "RAM_utilization", "CPU_d","RAM_d","elapsed_time"]
heavyTransPrfm = pd.DataFrame(columns = column_names)
#lightTrans.item_desc.str.contains("^Blue")

print("STARTING TRANSFORMATION...")
print("RUNNING...")

transTimerStart = time.perf_counter()
#Run 30 iterations to collect transformation df of performance metrics

for sampleNoTransform in range(iterations):
    
    time.sleep(5)
    #collect base settings
    baseCPU = psutil.cpu_percent()
    baseRAM = psutil.virtual_memory().percent
    
    # Start Timer and progress tracker
    start = time.perf_counter()
    
    #Working Code
    itemtrans = itemdf.drop(index=itemdf.index[[-1]])
    order_linetrans = order_linedf.drop(index=order_linedf.index[[-1]])
    orderHeadertrans = order_linedf.drop(index=orderHeaderdf.index[[-1]])
    
    itemtrans =itemdf.drop(itemdf.columns[[0]], axis = 1) 
    order_linetrans = order_linedf.drop(order_linedf.columns[[0]], axis = 1) 
    orderHeadertrans = orderHeaderdf.drop(orderHeaderdf.columns[[0]], axis = 1) 
    
    medium = order_linetrans.merge(right=itemtrans, on="orderedItem")
    heavy = orderHeadertrans.merge(right=medium, on="Ponum")
    
    heavy.item_desc = heavy.item_desc.str.replace('Blue', 'Navy', regex=True)
    heavy.item_desc = heavy.item_desc.str.replace('Tee', 'CottonTee', regex=True)
    heavy['extended_price'] = heavy.selling_price * medium.eaches_qty
        
    sampleCPU = psutil.cpu_percent()
    sampleRAM = psutil.virtual_memory().percent
    
    #Stop timer  
    stop = time.perf_counter()
    
    
    heavyTransPrfm = heavyTransPrfm.append(pd.DataFrame({'base_CPU': baseCPU,
                                                         'base_RAM': baseRAM,
                                                         'CPU_utilization': sampleCPU,
                                                         'RAM_utilization':  sampleRAM,
                                                         'CPU_d': sampleCPU-baseCPU,
                                                         'RAM_d': sampleRAM - baseRAM,
                                                         'elapsed_time':stop - start},
                                                          index=[1]), ignore_index=True)
transTimerEnd= time.perf_counter() 

print("Pandas transformation metrics captured.\n")
performance(heavyTransPrfm,iterations,transTimerStart,transTimerEnd)


# Export Light Transformation Performance to local
heavyTransPrfm.to_csv (r'/home/jeff/heavyTransPrfm_mongoDB.csv', index = False, header=True)


#### cuDF TRANSFORMATION ####
###################################################################################################################

column_names = ["base_CPU","base_RAM","CPU_utilization", "RAM_utilization", "CPU_d","RAM_d","elapsed_time"]
heavyTransPrfmCU = pd.DataFrame(columns = column_names)

print("Starting cuDF TRANSFORMATION...")
print("RUNNING...")

#Run 30 iterations to collect transformation df of performance metrics
transCuTimerStart = time.perf_counter()
for sampleNoTransform in range(iterations):
    time.sleep(5)
    #collect base settings
    baseCPU = psutil.cpu_percent()
    baseRAM = psutil.virtual_memory().percent
    
    ## Transform pandas df to cuDF
    itemCU = cudf.DataFrame.from_pandas(itemtrans)
    order_lineCU = cudf.DataFrame.from_pandas(order_linetrans)
    orderheaderCU = cudf.DataFrame.from_pandas(orderHeadertrans)
    
    # Start Timer and progress tracker
    start = time.perf_counter()
    
    #Working Code
    joined_dataCU = order_lineCU.merge(right=itemCU, on="orderedItem")
    heavyCU = orderheaderCU.merge(right=joined_dataCU, on="Ponum")
    
    heavyCU.item_desc = heavyCU.item_desc.str.replace('Blue', 'Navy', regex=True)
    heavyCU.item_desc = heavyCU.item_desc.str.replace('Tee', 'CottonTee', regex=True)
    heavyCU['extended_price'] = heavyCU.selling_price * heavyCU.eaches_qty
    sampleCPU = psutil.cpu_percent()
    sampleRAM = psutil.virtual_memory().percent
    
    #Stop timer  
    stop = time.perf_counter()
    
    heavyTransPrfmCU = heavyTransPrfmCU.append(pd.DataFrame({'base_CPU': baseCPU,
                                                         'base_RAM': baseRAM,
                                                         'CPU_utilization': sampleCPU,
                                                         'RAM_utilization':  sampleRAM,
                                                         'CPU_d': sampleCPU-baseCPU,
                                                         'RAM_d': sampleRAM - baseRAM,
                                                         'elapsed_time':stop - start},
                                                          index=[1]), ignore_index=True)
transCuTimerEnd = time.perf_counter() 

print("cuDF performance metrics captured loading complete.\n")
performance(heavyTransPrfmCU,iterations,transCuTimerStart,transCuTimerEnd)

# Export medium cuDF Transformation Performance to local
heavyTransPrfmCU.to_csv (r'/home/jeff/heavyTransPrfmCU_mongoDB.csv', index = False, header=True)

#### Load data ####
###################################################################################################################
loading(myclient,'heavy', heavy, iterations, exportLocation, "heavyMongoDBLoad.csv")

etlTimerEnd = time.perf_counter() 
print('ETL is complete')
print('Elapsed ETL time is: ', (etlTimerEnd-etlTimerStart)/60, ' minutes')

STARTING EXTRACTION...
RUNNING...
Data frame loading complete.

Iterations performed:  31
TOTAL Process time:  202.79465493099997 s
Average iteration time:  1.5359190660974784 s
Average BASE CPU:  1.5806451612903225
Average BASE RAM:  29.34193548387095
Average CPU Performance  4.1580645161290315
Average RAM Performance  0.03225806451612915
Average CPU utilization:  5.738709677419354
Average RAM utilization:  29.37419354838708 


STARTING TRANSFORMATION...
RUNNING...
Pandas transformation metrics captured.

Iterations performed:  31
TOTAL Process time:  170.94979942499776 s
Average iteration time:  0.5089123751616083 s
Average BASE CPU:  1.6741935483870967
Average BASE RAM:  29.777419354838692
Average CPU Performance  4.199999999999999
Average RAM Performance  0.01935483870967735
Average CPU utilization:  5.874193548387098
Average RAM utilization:  29.796774193548373 


Starting cuDF TRANSFORMATION...
RUNNING...
cuDF performance metrics captured loading complete.

Iterations performed: 