In [1]:
## Reset only specific variables (replace <regular_expression> with a pattern matching the names of interest)
#%reset_selective <regular_expression>

# Reset all variables: clears the interactive namespace; -f forces the reset without asking for confirmation
%reset -f

In [2]:
## Importing libraries

from dask import dataframe as dd
from datetime import datetime, date, timedelta
from pathlib import Path
import matplotlib.pyplot as plt
from pylab import savefig
import seaborn as sns
import pandas as pd
import csv
from pymongo import MongoClient
from mongoengine import *

%matplotlib inline

In [3]:
## Creating/Connecting Mongo DB instances

# Provide the mongodb atlas url to connect python to mongodb using pymongo
#CONNECTION_STRING = "mongodb+srv://<jgu>:<123>@<cluster-jgu>.mongodb.net/SMARTAttributesFilter"

# Register one mongoengine connection per database. The document classes
# select their connection through the alias given here.
for db_name, db_alias in (
    ('SMARTAttributesFilter', 'SMARTAttributesFilter_alias'),
    ('FailuresAppsLocation', 'FailuresAppsLocation_alias'),
    ('SMARTAtt_FailuresAppsLocation', 'SMARTAtt_FailuresAppsLocation_alias'),
):
    connect(db=db_name, alias=db_alias)

MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary(), uuidrepresentation=3)

In [4]:
## Setting document schema

class SMARTAtt(Document):
    """One SMART-attribute sample of a disk, stored through 'SMARTAttributesFilter_alias'.

    All fields are optional. Numeric fields fall back to 0.0 and model_x to
    the string '0' when not set. The previous string default '0' on the
    FloatField/DateTimeField attributes was not a valid value for those field
    types, so a document relying on a default could not pass validation on
    save. The timestamp now has no default and is simply omitted when unknown.

    The attribute names match the renamed AlibabaOvertime smartlogs columns
    (n_5, n_187, n_171, n_181, n_172, n_182, n_241, n_242) used by the loading
    cell of this notebook.
    """

    disk_id = FloatField(required=False, default=0.0)
    # No default: there is no meaningful placeholder date.
    timestamp = DateTimeField(required=False)
    model_x = StringField(required=False, default='0')
    # NOTE(review): presumably reallocated sectors / uncorrectable errors - confirm.
    r_sectors = FloatField(required=False, default=0.0)
    u_errors = FloatField(required=False, default=0.0)
    # NOTE(review): presumably program / erase failure counters - confirm.
    p_failedA = FloatField(required=False, default=0.0)
    p_failedB = FloatField(required=False, default=0.0)
    e_failedA = FloatField(required=False, default=0.0)
    e_failedB = FloatField(required=False, default=0.0)
    # NOTE(review): presumably number of blocks written / read - confirm.
    n_b_written = FloatField(required=False, default=0.0)
    n_b_read = FloatField(required=False, default=0.0)
    meta = {'db_alias': 'SMARTAttributesFilter_alias'}

class FailuresAppsLocation(Document):
    """One disk-failure record with its application and location, stored through 'FailuresAppsLocation_alias'.

    All fields are optional. Numeric fields fall back to 0.0 and string fields
    to '0' when not set. The previous string default '0' on the
    FloatField/DateTimeField attributes was not a valid value for those field
    types, so a document relying on a default could not pass validation on
    save. failure_time now has no default and is omitted when unknown.
    """

    disk_id = FloatField(required=False, default=0.0)
    # No default: there is no meaningful placeholder date.
    failure_time = DateTimeField(required=False)
    model_x = StringField(required=False, default='0')
    model_y = StringField(required=False, default='0')
    app = StringField(required=False, default='0')
    node_id = FloatField(required=False, default=0.0)
    rack_id = FloatField(required=False, default=0.0)
    machine_room_id = FloatField(required=False, default=0.0)
    meta = {'db_alias': 'FailuresAppsLocation_alias'}

class SMARTAtt_FailuresAppsLocation(Document):
    """Link document that references one SMARTAtt and one FailuresAppsLocation document.

    Stored through the 'SMARTAtt_FailuresAppsLocation_alias' connection.
    """

    smart_att = ReferenceField(document_type=SMARTAtt)
    failures_app_location = ReferenceField(document_type=FailuresAppsLocation)
    meta = dict(db_alias='SMARTAtt_FailuresAppsLocation_alias')

In [None]:
## Deleting DB content (for the case when the goal is to test the code from zero - otherwise the db will contain several replicas)

# Creating the object related to the whole collection
#failuresAppsLocationTeste = FailuresAppsLocation.objects() 

# Deleting all collection
#failuresAppsLocationTeste.delete() 

In [12]:
# Closing the connections to the DB.
#
# The aliases must match the ones registered with connect(). The previous
# 'FailuresAppsLocation_alias_alias' was never registered, so that
# connection was left open.
# NOTE(review): this cell sits above the insert/query cells; run it only after
# all database work is finished.
for connection_alias in (
    'SMARTAttributesFilter_alias',
    'FailuresAppsLocation_alias',
    'SMARTAtt_FailuresAppsLocation_alias',
):
    disconnect(alias=connection_alias)

In [5]:
## Loading dataset - AlibabaOverTime (failure logs)

df_AlibabaOver_Failurelogs = pd.read_csv(
    filepath_or_buffer='/media/erb/hdd1/DataSet/alibabaOvertime/ssd_failure_label/ssd_failure_label.csv',
)

In [6]:
## Loading dataset - Alibaba Snapshot (timestamps of failed SSDs, SMART attributes in 39 columns,
## SSD locations, applications, SSD models and disk IDs)

df_AlibabaSnapShot_FailuresAppsLocation = pd.read_csv(
    filepath_or_buffer='/media/erb/hdd1/DataSet/alibabaSnapShot/ssd_failure_tag/ssd_failure_tag.csv',
)

In [7]:
# Build the failure table that carries location, failure time and model data:
#   1. inner-join the failure log with the snapshot on (disk_id, failure_time)
#   2. convert failure_time to datetime
#   3. sort chronologically
#   4. keep only the columns of interest
# (De-duplicating on disk_id with keep="first" was tried and is intentionally not applied.)
df_Failurelogs_FailuresAppsLocation = (
    df_AlibabaOver_Failurelogs
    .merge(
        df_AlibabaSnapShot_FailuresAppsLocation,
        how='inner',
        on=['disk_id', 'failure_time'],
    )
    .assign(failure_time=lambda merged: pd.to_datetime(merged['failure_time']))
    .sort_values(by=['failure_time'], ascending=True)
    .loc[:, ['disk_id', 'failure_time', 'model_x', 'model_y', 'app', 'node_id', 'rack_id', 'machine_room_id']]
)

# Changing data type
#df_Failurelogs_FailuresAppsLocation = df_Failurelogs_FailuresAppsLocation.astype(datatype)

# Testing
#df_Failurelogs_FailuresAppsLocation.head(20)
#df_Failurelogs_FailuresAppsLocation.query(('rack_id == 17596 & app == "RM"'))
#df_Failurelogs_FailuresAppsLocation.query(('app == "RM"'))
#time1 = df_Failurelogs_FailuresAppsLocation.query(('rack_id == 17596 & app == "RM" & disk_id==39876'))
#time2 = df_Failurelogs_FailuresAppsLocation.query(('rack_id == 17596 & app == "RM" & disk_id==22968'))

In [8]:
## Inserting FailuresAppsLocation into the DB

# Each document field is filled from the same-named column of the merged frame,
# in this order; one document is saved per row.
failure_field_names = (
    'disk_id',
    'failure_time',
    'model_x',
    'model_y',
    'app',
    'node_id',
    'rack_id',
    'machine_room_id',
)

for row in df_Failurelogs_FailuresAppsLocation.itertuples():
    insert_FailuresAppsLocation = FailuresAppsLocation()
    for field_name in failure_field_names:
        setattr(insert_FailuresAppsLocation, field_name, getattr(row, field_name))
    insert_FailuresAppsLocation.save()

In [11]:
## Some functions in Mongoengine

# Creating the object related to the whole collection
#failuresAppsLocationTeste = FailuresAppsLocation.objects()  

# Query if you know something about the document
#testando = FailuresAppsLocation.objects(disk_id="33722").get()

# Printing
#testando.disk_id

# Deleting all collection
#failuresAppsLocationTeste.delete()

In [None]:
# Grid of bar charts of total IOPS per storage device: one row per access
# pattern, one column per work_outstd value, bars coloured by object size.
# NOTE(review): dataset_write_cenUm is only created in a later cell, so this
# cell fails on a fresh top-to-bottom run.
fig = sns.catplot(
    data=dataset_write_cenUm,
    kind="bar",
    x="capacity",
    y="iops_total",
    hue="object_size",
    row="pattern",
    col="work_outstd",
    ci=90,
    palette="Blues_d",
    height=4.5,
    aspect=0.9,
    legend_out=True,
    margin_titles=True,
)

fig.set_axis_labels("Capacity", "IOPS")
device_tick_labels = ["80GB HDD", "500GB HDD", "1TB HDD", "1TB WDHDD", "120GB SSD", "Hybrid"]
fig.set_xticklabels(device_tick_labels)

plt.savefig('write_iops.pdf', dpi=1200)

In [None]:
# Loading the AlibabaOvertime SMART logs with pandas, one CSV per day, and
# storing every row as a SMARTAtt document.

start_date = date(2018, 2, 1)
end_date = date(2018, 3, 20)
delta = timedelta(days=1)
df_AlibabaOver_SMARTlogs = pd.DataFrame()
df_AlibabaOver_SMARTlogs_Filtered = pd.DataFrame()
# Counters initialised here; nothing in this cell updates them.
reallocated_Sectors_Count = uncorrectable_errors = 0
program_count = erase_count = 0
nblocks_written = nblocks_read = 0

while start_date <= end_date:
    path = Path('/media/erb/hdd1/DataSet/alibabaOvertime/smartlogs/' + start_date.strftime("%Y%m%d") + '.csv')

    if path.is_file():  # days without a log file are skipped
        df_AlibabaOver_SMARTlogs = (
            pd.read_csv(path)
            # 'ds' holds the day as YYYYMMDD
            .assign(ds=lambda raw: pd.to_datetime(raw['ds'], format='%Y%m%d'))
            # keep only the columns of interest
            .loc[:, ['disk_id', 'ds', 'model', 'n_5', 'n_187', 'n_171', 'n_181', 'n_172', 'n_182', 'n_241', 'n_242']]
            # rename to the SMARTAtt field names
            .rename(columns={
                'ds': 'timestamp',
                'model': 'model_x',
                'n_5': 'r_sectors',
                'n_187': 'u_errors',
                'n_171': 'p_failedA',
                'n_181': 'p_failedB',
                'n_172': 'e_failedA',
                'n_182': 'e_failedB',
                'n_241': 'n_b_written',
                'n_242': 'n_b_read',
            })
        )

        # After the rename every column name is also a SMARTAtt field name.
        for row in df_AlibabaOver_SMARTlogs.itertuples():
            insert_SmartAttributes = SMARTAtt()
            for column_name in df_AlibabaOver_SMARTlogs.columns:
                setattr(insert_SmartAttributes, column_name, getattr(row, column_name))
            insert_SmartAttributes.save()

        #df_AlibabaOver_SMARTlogs_Filtered = pd.concat([df_AlibabaOver_SMARTlogs_Filtered, df_AlibabaOver_SMARTlogs], ignore_index=True)
    start_date += delta

# Changing the name of some columns to clarify their meaning
#df_AlibabaOver_SMARTlogs_Filtered.rename(columns = {'ds':'timestamp', 'model':'model_x', 'n_5':'r_sectors','n_187':'u_errors','n_171':'p_failedA','n_181':'p_failedB','n_172':'e_failedA','n_182':'e_failedB','n_241':'n_b_written','n_242':'n_b_read'}, inplace=True)

In [None]:
# Loading AlibabaOvertime dataset using Dask
#
# The daily frames are collected in a list and concatenated once. The previous
# version seeded the result only when the very first date had a file; if that
# file was missing, the first concat hit an undefined name (or a stale pandas
# frame left over from the cell above).

start_date = date(2018, 2, 1)
start_dateAux = start_date  # first day of the range, kept for reference
end_date = date(2018, 5, 31)
delta = timedelta(days=1)

daily_smartlogs = []
while start_date <= end_date:
    path = Path('/media/erb/hdd1/DataSet/alibabaOvertime/smartlogs/' + start_date.strftime("%Y%m%d") + '.csv')

    if path.is_file():  # days without a log file are skipped
        daily_smartlogs.append(dd.read_csv(path))
    start_date += delta

if not daily_smartlogs:
    raise FileNotFoundError(
        'No AlibabaOvertime smartlog CSV found between '
        + start_dateAux.isoformat() + ' and ' + end_date.isoformat()
    )

# A single day needs no concatenation.
if len(daily_smartlogs) == 1:
    df_AlibabaOver_SMARTlogs = daily_smartlogs[0]
else:
    df_AlibabaOver_SMARTlogs = dd.concat(daily_smartlogs)

In [19]:
# Testing the Alibaba SMART-attribute documents stored in the DB

# settings to display all columns
#pd.set_option("display.max_columns", None)

#df_AlibabaOver_SMARTlogs_Filtered.query('disk_id == 4711')
#df_AlibabaOver_Failurelogs.head(10)
#df_Failurelogs_FailuresAppsLocation.dtypes

# Query every SMARTAtt document of one disk. (SMARTAtt.objects() without
# arguments would select the whole collection.)
SMARTAttributesTest = SMARTAtt.objects(disk_id=4711)

# Print the disk id of each matching document
for smart_sample in SMARTAttributesTest:
    print(smart_sample.disk_id)
  
# Deleting all collection
#for i in SMARTAttributesTest:
 # i.delete()


In [None]:
# Loading Backblaze dataset (pandas), one CSV per day.
#
# The daily frames are collected in a list and concatenated once. The previous
# version seeded the result only when the very first date had a file (which
# broke when that file was missing) and re-copied the accumulated frame on
# every iteration.

start_dateBB = date(2021, 12, 29)
start_dateAuxBB = start_dateBB  # first day of the range, kept for reference
end_dateBB = date(2021, 12, 31)
deltaBB = timedelta(days=1)

daily_backblaze_logs = []
while start_dateBB <= end_dateBB:
    pathBB = Path('/media/erb/hdd1/DataSet/backblaze/smartlogs/' + start_dateBB.strftime("%Y-%m-%d") + '.csv')

    if pathBB.is_file():  # days without a log file are skipped
        daily_backblaze_logs.append(pd.read_csv(pathBB))
    start_dateBB += deltaBB

if not daily_backblaze_logs:
    raise FileNotFoundError(
        'No Backblaze smartlog CSV found between '
        + start_dateAuxBB.isoformat() + ' and ' + end_dateBB.isoformat()
    )

# Row indexes of the daily files are kept as they are, as before.
df_BackBlaze_SMARTlogs = pd.concat(daily_backblaze_logs)

In [None]:
# Quick look at the first rows of the Backblaze frame

df_BackBlaze_SMARTlogs.head(n=5)

In [None]:
# Renaming the columns of the bench file and converting the timestamp column to datetime

# For the 80GB HDD

bench_column_names = {
    'Time (s)': 'timestamp',
    'Maximum(1) (V)': 'maximum_1',
    'Maximum(2) (V)': 'maximum_2',
    'Average - Full Screen(1) (V)': 'average_1',
    'Average - Full Screen(2) (V)': 'average_2',
}
# The frame is modified in place, so any other reference to it sees the new names.
df_80HDDBenchCenUm4KB.rename(columns=bench_column_names, inplace=True)
df_80HDDBenchCenUm4KB['timestamp'] = pd.to_datetime(df_80HDDBenchCenUm4KB['timestamp'])

In [None]:
# Combining all IOMeter data sets

# Scenario 1: six devices x four object sizes, in the order the export cell
# relies on (device blocks of four: 4KB, 128KB, 512KB, 1MB).

df_IOMeterCenUm_Total = [
    # 80GB HDD
    dfTemp_80HDDMeterCenUm4KB,
    dfTemp_80HDDMeterCenUm128KB,
    dfTemp_80HDDMeterCenUm512KB,
    dfTemp_80HDDMeterCenUm1MB,
    # 500GB HDD
    dfTemp_500HDDMeterCenUm4KB,
    dfTemp_500HDDMeterCenUm128KB,
    dfTemp_500HDDMeterCenUm512KB,
    dfTemp_500HDDMeterCenUm1MB,
    # 1TB HDD
    dfTemp_1HDDMeterCenUm4KB,
    dfTemp_1HDDMeterCenUm128KB,
    dfTemp_1HDDMeterCenUm512KB,
    dfTemp_1HDDMeterCenUm1MB,
    # 1TB WD HDD
    dfTemp_1WDHDDMeterCenUm4KB,
    dfTemp_1WDHDDMeterCenUm128KB,
    dfTemp_1WDHDDMeterCenUm512KB,
    dfTemp_1WDHDDMeterCenUm1MB,
    # 120GB SSD
    dfTemp_120SSDMeterCenUm4KB,
    dfTemp_120SSDMeterCenUm128KB,
    dfTemp_120SSDMeterCenUm512KB,
    dfTemp_120SSDMeterCenUm1MB,
    # Hybrid
    dfTemp_HybridMeterCenUm4KB,
    dfTemp_HybridMeterCenUm128KB,
    dfTemp_HybridMeterCenUm512KB,
    dfTemp_HybridMeterCenUm1MB,
]

# Scenario 1 entries first, then the scenario 2 entries.
df_IOMeter_Total = df_IOMeterCenUm_Total + df_IOMeterCenDois_Total



In [None]:
# Export the IOMeter measurements to 'dataset.csv', one CSV row per 'ALL' result row.
#
# NOTE(review): as written, `encontrou` is never set to True and
# `df_IOMeter_index` is never advanced inside this cell, so only the header is
# written. The cell looks truncated - confirm against the full notebook.
with open('dataset.csv', 'w', newline='') as f:
    dataset = csv.DictWriter(
        f,
        fieldnames=[
            'timestamp', 'capacity', 'object_size', 'pattern', 'operation', 'scenario',
            'work_outstd', 'repetition', 'sample_count', 'individual_total',
            'iops_total', 'iops_read', 'iops_write',
            'avg_response_time_total', 'avg_response_time_read', 'avg_response_time_write',
            'maximum_response_time_total', 'maximum_response_time_read', 'maximum_response_time_write',
            'errors_total', 'errors_read', 'errors_write',
            'cpu_utilization',
            'power_maximum1_hdd', 'power_maximum2_hdd', 'power_average1_hdd', 'power_average2_hdd',
            'power_maximum1_ssd', 'power_maximum2_ssd', 'power_average1_ssd', 'power_average2_ssd',
            'power_total',
        ],
        delimiter=',',
    )
    dataset.writeheader()

    # Parser state
    row_index = 0
    workers_outstanding = 1
    encontrou = False  # "found" flag; rows are only processed while it is True
    sample_count = 0
    repetition = 1
    pattern = 1
    kind = 'hdd'
    aux_voltage_current = [0, 0, 0, 0]
    aux_voltage_total = [0, 0, 0, 0]
    aux_voltage_current_series = [0, 0, 0, 0]
    aux_voltage_current_hybrid = [0, 0, 0, 0]
    aux_voltage_total_hybrid = [0, 0, 0, 0]
    aux_voltage_current_series_hybrid = [0, 0, 0, 0]
    cenario_atual = 1
    df_IOMeter_index = 0

    # Positions 0-23 and 24-47 of df_IOMeter_Total share one layout: blocks of
    # four entries per device, and inside a block one entry per object size.
    storage_groups = (
        ('80GBHDD', 'hdd'),
        ('500GBHDD', 'hdd'),
        ('1TBHDD', 'hdd'),
        ('1TBWDHDD', 'hdd'),
        ('120GBSSD', 'ssd'),
        ('Hybrid', 'hybrid'),
    )
    object_sizes = ('4KB', '128KB', '512KB', '1MB')
    power_columns = ('power_maximum1', 'power_maximum2', 'power_average1', 'power_average2')

    for df_IOMeter in df_IOMeter_Total:

        # Outside 0..47 the previous capacity/kind/object_size stay in effect.
        if 0 <= df_IOMeter_index < 48:
            group_number, size_number = divmod(df_IOMeter_index % 24, 4)
            capacity, kind = storage_groups[group_number]
            object_size = object_sizes[size_number]

        for rows in df_IOMeter:
            if not encontrou:
                continue

            aux_date = datetime.strptime(rows[0], '%Y-%m-%d %H:%M:%S:%f')
            if workers_outstanding != 1:
                continue

            sample_count = sample_count + 1
            if not (rows[1] == 'ALL' and pattern == 1):
                continue

            record = {
                'timestamp': aux_date,
                'capacity': capacity,
                'object_size': object_size,
                'pattern': 'R',
                'operation': 'W',
                'scenario': cenario_atual,
                'work_outstd': workers_outstanding,
                'repetition': repetition,
                'sample_count': sample_count,
                'individual_total': 'T',
                'iops_total': rows[7],
                'iops_read': rows[8],
                'iops_write': rows[9],
                'avg_response_time_total': rows[18],
                'avg_response_time_read': rows[19],
                'avg_response_time_write': rows[20],
                'maximum_response_time_total': rows[23],
                'maximum_response_time_read': rows[24],
                'maximum_response_time_write': rows[25],
                'errors_total': rows[28],
                'errors_read': rows[29],
                'errors_write': rows[30],
                'cpu_utilization': rows[49],
            }

            # Which accumulated totals feed which power columns. Any kind that
            # is neither 'hdd' nor 'ssd' is treated as hybrid: SSD columns
            # from the regular totals, HDD columns from the hybrid totals.
            if kind == 'hdd':
                power_sources = (('hdd', aux_voltage_total),)
            elif kind == 'ssd':
                power_sources = (('ssd', aux_voltage_total),)
            else:
                power_sources = (('ssd', aux_voltage_total), ('hdd', aux_voltage_total_hybrid))

            # NOTE(review): divides by sample_count - 1, which is 0 on the
            # first counted sample - confirm this is intended.
            for suffix, totals in power_sources:
                for position, column in enumerate(power_columns):
                    record[column + '_' + suffix] = totals[position] / (sample_count - 1)

            dataset.writerow(record)
  

In [None]:
# Loading the filtered data into a dataframe for the performance plots

dataset = pd.read_csv("dataset.csv")

# Splitting by scenario

## Scenario 1
dataset_cenUm = dataset.loc[dataset['scenario'] == 1]

### Writes: rows with operation "W" and individual_total "T"
is_write_row = dataset_cenUm['operation'] == "W"
is_total_row = dataset_cenUm['individual_total'] == "T"
dataset_write_cenUm = dataset_cenUm.loc[is_write_row & is_total_row]



In [None]:
# Inverting the total-IOPS column (for the performance analysis)

## Scenario 1

# Every non-zero iops_total becomes its reciprocal; zeros are left untouched.
# The column is rebuilt as a whole instead of using `frame.col[mask] = ...`:
# that chained assignment writes through a temporary Series, triggers
# SettingWithCopyWarning and changes nothing under pandas Copy-on-Write.
# NOTE: not idempotent - running this cell twice restores the original values.
dataset_write_cenUm = dataset_write_cenUm.assign(
    iops_total=dataset_write_cenUm['iops_total'].where(
        dataset_write_cenUm['iops_total'] == 0,
        1 / dataset_write_cenUm['iops_total'],
    )
)


In [None]:
## Removing noise from the voltage measurements (for the performance analysis)

## Scenario 1

### Writes

# The value is rendered as text, everything from the first "e" on is cut off,
# and the remainder is parsed back to float.
# NOTE(review): this drops the exponent of values printed in scientific
# notation, which changes their magnitude - confirm that is the intent.
dataset_write_cenUm['power_average1_hdd'] = (
    dataset_write_cenUm['power_average1_hdd']
    .astype(str)
    .map(lambda text: text.split("e")[0])
    .astype(float)
)

In [None]:
# Scratch cell: look at the scenario 2 write rows of the 80GB HDD
#dataset_write_cenUm.power_average1_hdd
#dataset_teste = dataset_write_cenUm.query('capacity == "120GBSSD"')
#dataset_teste.power_average2_ssd
is_80gb_hdd = dataset_write_cenDois['capacity'] == "80GBHDD"
dataset_teste = dataset_write_cenDois.loc[is_80gb_hdd]
#dataset_teste.power_average1_hdd
dataset_teste.head(n=5)

In [None]:
## For the performance analysis
## Computing the total power consumption of the 80GB HDD

# For the 80GBHDD rows of each frame:
#   power_total = ((6.8  - v1) / 1.08) * v1
#               + ((12.7 - v2) / 0.68) * v2
# with v1 = power_average1_hdd and v2 = power_average2_hdd.
# NOTE(review): 6.8/12.7 and 1.08/0.68 are presumably the rail voltages and
# shunt resistances of the measurement setup - confirm.
#
# Fixes over the previous version:
#   * the write frame multiplied the second term by power_average1_hdd while
#     the read and mix frames used power_average2_hdd; all three now use v2;
#   * values are written with .loc instead of `frame.power_total[mask] = ...`,
#     a chained assignment that is not guaranteed to reach the frame.
for power_frame in (dataset_write_cenUm, dataset_read_cenUm, dataset_mix_cenUm):
    is_80gb_hdd = power_frame['capacity'] == '80GBHDD'
    v1 = power_frame.loc[is_80gb_hdd, 'power_average1_hdd']
    v2 = power_frame.loc[is_80gb_hdd, 'power_average2_hdd']
    power_frame.loc[is_80gb_hdd, 'power_total'] = ((6.8 - v1) / 1.08) * v1 + ((12.7 - v2) / 0.68) * v2


In [None]:
# Applying the absolute value (for the performance analysis)

## 80GB HDD

# power_total of the 80GBHDD rows is replaced by its absolute value in each
# frame. Written with .loc instead of `frame.power_total[mask] = ...`, a
# chained assignment that is not guaranteed to reach the frame.
for power_frame in (dataset_write_cenUm, dataset_read_cenUm, dataset_mix_cenUm):
    is_80gb_hdd = power_frame['capacity'] == '80GBHDD'
    power_frame.loc[is_80gb_hdd, 'power_total'] = power_frame.loc[is_80gb_hdd, 'power_total'].abs()



In [None]:
## Data filter for the performance DOE in Minitab (with 20 replicates) - scenario 2
#
# NOTE(review): `dataset_filter` is computed for every factor combination but
# never added to `dataset_metric2`, so each FilteredData.csv is written from an
# empty frame. All factor values are strings as well - confirm they match the
# dtypes of the queried columns.

scenario = '2'
storages = ['80GBHDD', '500GBHDD', '1TBHDD', '1TBWDHDD', '120GBSSD']
metrics = ['iops_total', 'avg_response_time_total', 'power_total', 'cpu_utilization']
objects_size = ['4KB', '128KB', '512KB', '1MB']
patterns = ['80R', 'R', 'S']
replicates = [str(replicate_number) for replicate_number in range(1, 21)]
work_outstd = ['1', '4', '16', '64']
operations = ['Write', 'Read', 'Mix']
dataset_metric2 = pd.DataFrame()

# The same filter is applied whichever frame is queried; the @names refer to
# the loop variables below.
doe_row_filter = 'repetition == @run & scenario == @scenario & capacity == @storage & object_size == @object_size & pattern == @pattern & work_outstd == @work'

for metric in metrics:
    for run in replicates:
        for operation in operations:
            for storage in storages:
                for object_size in objects_size:
                    for pattern in patterns:
                        for work in work_outstd:
                            # Pick the scenario 2 frame of the current operation
                            if operation == 'Write':
                                operation_frame = dataset_write_cenDois
                            elif operation == 'Read':
                                operation_frame = dataset_read_cenDois
                            else:
                                operation_frame = dataset_mix_cenDois
                            dataset_filter = operation_frame.query(doe_row_filter)

    # One output file per metric; the accumulator is reset afterwards.
    filename = "/".join(["Performance", "DOE", scenario, metric, "FilteredData.csv"])
    dataset_metric2.to_csv(filename, index=False)
    dataset_metric2 = pd.DataFrame()

In [None]:
# SCENARIO 1

## WRITE IOPS

# One bar chart per (pattern, work_outstd) pair; bars show iops_total per
# capacity, coloured by object size.
write_iops_plot_options = dict(
    x="capacity",
    y="iops_total",
    hue="object_size",
    data=dataset_write_cenUm,
    row="pattern",
    col="work_outstd",
    kind="bar",
    ci=90,
    palette="Blues_d",
    aspect=0.9,
    height=4.5,
    legend_out=True,
    margin_titles=True,
)
fig = sns.catplot(**write_iops_plot_options)

fig.set_axis_labels("Capacity", "IOPS")
fig.set_xticklabels(["80GB HDD", "500GB HDD", "1TB HDD", "1TB WDHDD", "120GB SSD", "Hybrid"])

plt.savefig('write_iops.pdf', dpi=1200)

In [None]:
# Scratch check: look up one specific individual write measurement of scenario 1.
teste = dataset_cenUm.query('repetition==2 & individual_total == "I" & operation == "W" & pattern == "R" & work_outstd == 4 & capacity == "1TBHDD" & object_size == "4KB"')
# The previous version had the test inverted: it read element [0] only when the
# result was empty, which always fails. It also used Series.get_values(), which
# is no longer available in pandas.
if teste.empty:
    print ('eh')
else:
    x = teste.iops_total.iloc[0]
    print (x)
