In [3]:
# Import necessary packages
import pandas as pd
from pymongo import MongoClient
import time
import psutil
import cudf
from extract_load_functions_mongodb import extraction, loading, performance

# Directory that receives the exported performance CSV files
exportLocation = r'/home/jeff/'

# Number of test runs to perform for every benchmarked stage
iterations = 31

# Connect to the local MongoDB server and select the `item` collection
# of the `ds7330` database
myclient = MongoClient("mongodb://localhost:27017/")
db = myclient["ds7330"]
item = db["item"]


# Start the clock for the whole ETL run (reported once loading has finished)
etlTimerStart = time.perf_counter()

# LIGHT EXTRACTION #
### Pull the item collection; extraction() is expected to capture CPU, RAM and
### elapsed time per run and to export them under the given file name
lightExtraction = extraction(
    item,
    iterations,
    exportLocation,
    'lightExtractionPrfm_Mongo.csv',
)
  

#### LIGHT TRANSFORMATION ####
###################################################################################################################
# Perform the light transformation workload with pandas: replace "Blue" with
# "Navy" in every item description, and record CPU, RAM and elapsed time for
# each of the `iterations` runs.

column_names = ["base_CPU","base_RAM","CPU_utilization", "RAM_utilization", "CPU_d","RAM_d","elapsed_time"]

# Target dtypes applied to the extracted frame before each run.
convert_dict = {'_id': str,
                'orderedItem': str,
                'item_desc': str,
                'selling_price': float
               }

# One dict of metrics per run. The DataFrame is built once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every call.
lightTransRows = []

print("STARTING TRANSFORMATION...")
print("RUNNING...")

transTimerStart = time.perf_counter()
# Run `iterations` passes to collect the transformation performance metrics
for sampleNoTransform in range(iterations):

    # Let the machine settle so the baseline reading reflects an idle system
    time.sleep(5)
    # Collect baseline readings
    baseCPU = psutil.cpu_percent()
    baseRAM = psutil.virtual_memory().percent

    # Re-cast and re-wrap the extracted data so every run starts from a frame
    # that still has its first column and row 406. This happens before the
    # timer starts, so it is excluded from elapsed_time.
    lightExtraction = lightExtraction.astype(convert_dict)
    lightTrans = pd.DataFrame(lightExtraction)

    # Start timer
    start = time.perf_counter()

    # Timed workload: drop the first column (presumably `_id` - confirm against
    # the extracted schema), drop the row labelled 406, then rewrite descriptions
    lightTrans.drop(lightTrans.columns[[0]], axis = 1, inplace = True)
    lightTrans.drop([406], axis = 0, inplace = True)
    lightTrans.item_desc = lightTrans.item_desc.str.replace('Blue', 'Navy', regex=True)
    # NOTE(review): the two psutil samples below sit inside the timed window,
    # so elapsed_time includes their overhead. Kept as-is so results remain
    # comparable with previously captured runs.
    sampleCPU = psutil.cpu_percent()
    sampleRAM = psutil.virtual_memory().percent

    # Stop timer
    stop = time.perf_counter()

    lightTransRows.append({'base_CPU': baseCPU,
                           'base_RAM': baseRAM,
                           'CPU_utilization': sampleCPU,
                           'RAM_utilization': sampleRAM,
                           'CPU_d': sampleCPU - baseCPU,
                           'RAM_d': sampleRAM - baseRAM,
                           'elapsed_time': stop - start})
transTimerEnd = time.perf_counter()

# Passing `columns` keeps the expected header even when no run was performed
lightTransPrfm = pd.DataFrame(lightTransRows, columns=column_names)

print("Pandas transformation metrics captured.\n")
performance(lightTransPrfm, iterations, transTimerStart,transTimerEnd)

# Export Light Transformation Performance to the configured export location
lightTransPrfm.to_csv(exportLocation + 'lightTransPrfm_mongodb.csv', index = False, header=True)


#### cuDF TRANSFORMATION ####
###################################################################################################################
# Repeat the light transformation workload on the GPU with cuDF, recording
# CPU, RAM and elapsed time for each of the `iterations` runs.

lightTrans2 = lightExtraction
column_names = ["base_CPU","base_RAM","CPU_utilization", "RAM_utilization", "CPU_d","RAM_d","elapsed_time"]

# One dict of metrics per run. The DataFrame is built once after the loop:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole frame
# on every call.
lightTransCURows = []

print("Starting cuDF TRANSFORMATION...")
print("RUNNING...")

# Run `iterations` passes to collect the transformation performance metrics
transCuTimerStart = time.perf_counter()
for sampleNoTransform in range(iterations):
    # Let the machine settle so the baseline reading reflects an idle system
    time.sleep(5)
    # Collect baseline readings
    baseCPU = psutil.cpu_percent()
    baseRAM = psutil.virtual_memory().percent

    ## Convert the pandas frame to cuDF; done before the timer starts, so the
    ## conversion is excluded from elapsed_time
    lightTransCU = cudf.DataFrame.from_pandas(lightTrans2)

    # Start timer
    start = time.perf_counter()

    # Timed workload. DataFrame.drop(columns=...) replaces the legacy
    # drop_column() method, which current cuDF releases no longer provide.
    lightTransCU = lightTransCU.drop(columns=["_id"])
    # NOTE(review): this keeps rows 0-405 only, whereas the pandas path drops
    # just the row labelled 406. The two agree only if the frame has exactly
    # 407 rows - confirm against the extracted data.
    lightTransCU = lightTransCU[0:406]
    lightTransCU.item_desc = lightTransCU.item_desc.str.replace('Blue', 'Navy', regex=True)
    # NOTE(review): the two psutil samples below sit inside the timed window,
    # so elapsed_time includes their overhead. Kept as-is so results remain
    # comparable with previously captured runs.
    sampleCPU = psutil.cpu_percent()
    sampleRAM = psutil.virtual_memory().percent

    # Stop timer
    stop = time.perf_counter()

    lightTransCURows.append({'base_CPU': baseCPU,
                             'base_RAM': baseRAM,
                             'CPU_utilization': sampleCPU,
                             'RAM_utilization': sampleRAM,
                             'CPU_d': sampleCPU - baseCPU,
                             'RAM_d': sampleRAM - baseRAM,
                             'elapsed_time': stop - start})
transCuTimerEnd = time.perf_counter()

# Passing `columns` keeps the expected header even when no run was performed
lightTransPrfmCU = pd.DataFrame(lightTransCURows, columns=column_names)

print("cuDF performance metrics captured loading complete.\n")
performance(lightTransPrfmCU,iterations,transCuTimerStart,transCuTimerEnd)

# Export Light Transformation cuDF Performance to the configured export location
lightTransPrfmCU.to_csv(exportLocation + 'lightTransPrfmCU_mongodb.csv', index = False, header=True)

#### Load data ####
###################################################################################################################
# Hand the pandas-transformed frame to loading(), which receives the Mongo
# client, a workload label, the run count and the export target for its metrics.
loading(
    myclient,
    'light',
    lightTrans,
    iterations,
    exportLocation,
    "lightMongoDBLoad.csv",
)

# Report the wall-clock duration of the whole ETL run in minutes
etlTimerEnd = time.perf_counter()
etlElapsedMinutes = (etlTimerEnd - etlTimerStart) / 60
print('ETL is complete')
print('Elapsed ETL time is: ', etlElapsedMinutes, ' minutes')

STARTING EXTRACTION...
RUNNING...
Data frame loading complete.

Iterations performed:  31
TOTAL Process time:  155.31631806500081 s
Average iteration time:  0.004293357514994099 s
Average BASE CPU:  1.7322580645161287
Average BASE RAM:  29.74838709677419
Average CPU Performance  3.1580645161290315
Average RAM Performance  0.0
Average CPU utilization:  4.890322580645161
Average RAM utilization:  29.74838709677419 


STARTING TRANSFORMATION...
RUNNING...
Pandas transformation metrics captured.

Iterations performed:  31
TOTAL Process time:  155.29630137699132 s
Average iteration time:  0.002211148063408121 s
Average BASE CPU:  1.5935483870967744
Average BASE RAM:  29.79354838709676
Average CPU Performance  4.903225806451613
Average RAM Performance  0.0
Average CPU utilization:  6.496774193548386
Average RAM utilization:  29.79354838709676 


Starting cuDF TRANSFORMATION...
RUNNING...
cuDF performance metrics captured loading complete.

Iterations performed:  31
TOTAL Process time:  155.4