<a href="https://colab.research.google.com/github/AdriiTrujillo/Fault_Tolerance_Colabs/blob/main/Dataset_RF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# CREATE DATASET (BANCO DE REGISTROS)
_Adrián Trujillo López_

**Descripción:**
Creación de los datasets con los valores de FT únicamente del banco de registros.

¡Correr primero las funciones de lectura de datos!

In [None]:
#Celda para poder probar lo que se quiera (o utilizar comandos del bash con !)


In [1]:
# Importación de librerias
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import re
import sys
import os
import shutil

In [2]:
# Mount Google Drive at /content/drive; the dataset paths used by the other
# cells (e.g. /content/drive/Shareddrives/NN4FT_DB/) live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Lectura de las carpetas de individuos

Los datasets que se utilizan para entrenamiento son: 
* quicksort (000)
* tarai (000, 101, 110, 111)
* insertsort (000, 101, 110, 111)
* heapsort (000, 110, 111)
* mmult (No está)
* mmultvector (100, 110)
* maddconst (100, 110)
* bitcount1 (111)
* bitcount2 (111)
* crc_32_short (no está)


Los datasets que se utilizan para test son:
* Bubblesort (000 110)
* Dijkstra (Antiguo)
* Ndes (000)
* Fir short (111 falta info)
* aes (no está)

In [12]:
# Folder names follow the pattern '<benchmark>_<configuration>'; each list below
# holds the configurations available for one benchmark.

# Benchmarks used for training
quicksort_folders = ['quicksort_' + cfg for cfg in ('000',)]
tarai_folders = ['tarai_' + cfg for cfg in ('000', '101', '110', '111')]
insertsort_folders = ['insertsort_' + cfg for cfg in ('000', '101', '110', '111')]
heapsort_folders = ['heapsort_' + cfg for cfg in ('000', '110', '111')]
mmultvector_folders = ['mmultvector_' + cfg for cfg in ('100', '110')]
maddconst_folders = ['maddconst_' + cfg for cfg in ('100', '110')]
bitcount1_folders = ['bitcount1_' + cfg for cfg in ('111',)]
bitcount2_folders = ['bitcount2_' + cfg for cfg in ('111',)]

# Benchmarks used for test
bubblesort_folders = ['bubblesort_' + cfg for cfg in ('000', '110')]
ndes_folders = ['ndes_' + cfg for cfg in ('000',)]

# Benchmarks processed by the dataset-building cell on this run
benchmarks = [bubblesort_folders]

# TODO : bubblesort_folders (failed)
# DONE : heapsort_folders, quicksort_folders, tarai_folders, insertsort_folders, mmultvector_folders, maddconst_folders, bitcount1_folders, bitcount2_folders, ndes_folders

In [None]:
# Data treatment: build one ';'-separated CSV per benchmark, with one row per
# individual (register lifetimes, memory counters, section sizes, RF coverage).
# NOTE: the cells under "Funciones de lectura de datos" must have been run first;
# this cell uses createDataframe, getInformationProgram, getSizePrograms,
# getCoverage and the `registers` list defined there.
csv_extension = '_db_info.csv'
database_extension = '_db'
global_path = '/content/drive/Shareddrives/NN4FT_DB/'
all_csv = []

for bench in benchmarks:
  # Folder names look like '<benchmark>_<3-char configuration>'; dropping the last
  # four characters of the first entry leaves the bare benchmark name.
  bench_name = bench[0][0:-4]
  print("In " + bench_name + ": ")
  # Create each csv file (written in the current working directory)
  csv = bench_name + '.csv'
  all_csv.append(csv)
  # Start the csv file for this benchmark with the header row
  with open(csv, "w") as f:
    f.write('ind;r0;r1;r2;r3;r4;r5;r6;r7;r8;r9;r10;r11;r12;pc;sp;lr;totalInstructions;memoryRead;memoryWrite;memoryAccess;text;data;bss;stack;rf_sdc;rf_hang\n')

  for code in bench:
    print("Analyzing " + code + " ...")
    bench_path = os.path.join(global_path, code)
    info_path = os.path.join(bench_path, code + csv_extension) # File with individuals information
    info_db = pd.read_csv(info_path, sep=",")
    database_path = bench_path + database_extension # Folder with all individuals
    # Sorted so the row order of the generated CSV is reproducible between runs
    # (os.listdir returns entries in arbitrary order).
    folder_list = sorted(os.listdir(database_path))
    total_ind = len(folder_list)

    for i, folder in enumerate(folder_list, start=1):
      trace_file = os.path.join(database_path, folder, code + '.ARM.elf.trace')
      # Get program information.
      # The trace must be bound to the notebook-level name `df`:
      # getInformationProgram() is called without arguments and reads it from there.
      df = createDataframe(trace_file)
      lifetime, memoryAccess, memoryRead, memoryWrite = getInformationProgram()
      # Get program size
      text_size, data_size, bss_size, stack_size = getSizePrograms(folder, info_db)
      # Get FT of RF
      rf_sdc, rf_hang = getCoverage(folder, info_db)

      # Same column order as the header written above
      fields = [folder]
      fields.extend(lifetime[reg] for reg in registers)
      fields.extend([df.shape[0], memoryRead, memoryWrite, memoryAccess,
                     text_size, data_size, bss_size, stack_size, rf_sdc, rf_hang])

      # Append row by row so that a partially processed benchmark still leaves
      # its finished rows on disk if a later individual fails.
      with open(csv, "a") as f:
        f.write(";".join(str(value) for value in fields) + "\n")

      print("Ind: ", i, "/", total_ind)




In [None]:
# Move all the created datasets (the names collected in `all_csv`) to the destination folder
save_path = '/content/drive/Shareddrives/NN4FT_DB/Exp_Adrian/Data Frames/TAREA_3'
# Make sure the destination exists as a directory: when it does not,
# shutil.move renames the first CSV to 'TAREA_3' instead of moving it inside.
os.makedirs(save_path, exist_ok=True)
for file in all_csv:
  shutil.move(file, save_path)

## Funciones de lectura de datos

In [5]:
# Regular expressions used to pick lines out of the trace file and to classify
# instruction mnemonics.

# Trace lines starting with "Info", one whitespace character and a digit:
# createDataframe treats these as instruction lines.
instrLine = r"^Info\s[0-9]"
# Trace lines starting with eight whitespace characters followed by "cpsr":
# createDataframe reads the cpsr value from these.
cpsrLine = r"^\s{8}cpsr"

# Optional mnemonic suffixes. Note that arm_cond already ends with '$', so every
# pattern built from it is anchored at the end of the mnemonic.
arm_cond = r"(eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le){0,1}$"
arm_type = r"(d|b|sb|h|sh){0,1}"
addr_mode = r"(ia|ib|da|db|fd|fa|ed|ea){0,1}"
sflag = r"s{0,1}"

# "Read Write" instructions: getInformationProgram treats operand A as read and
# operand B as written.
ldm = r"^ldm"+addr_mode+arm_cond

# "Write Read" instructions: getInformationProgram treats operand A as written
# and operand B as read.
# Groups: mov, op (add/sub/mul/mla/rsb), ldr, lsr (shifts/rotates), uxtb, logical
mov = r"^(mov|movw|movt|mvn)"+sflag+arm_cond
op = r"^(add|sub|mul|mla|rsb)"+sflag+arm_cond
ldr = r"^ldr"+arm_type+arm_cond
lsr = r"^(asr|lsl|lsr|ror|rrx)"+sflag+arm_cond
uxtb = r"^(uxtb|sbfx|ubfx)"+arm_cond
logical = r"^(and|orr|eor|bic|orn)"+sflag+arm_cond

writeReadInstr = r"^("+mov+"|"+op+"|"+ldr+"|"+lsr+"|"+uxtb+"|"+logical+")$"

# "Read Read" instructions: getInformationProgram treats both operands as read.
# Groups: str1, stm, branch, testop, compare, pld
str1 = r"^str"+arm_type+arm_cond
stm = r"^stm"+addr_mode+arm_cond
branch = r"^(b|bl|bx|blx)"+arm_cond
testop = r"^(tst|teq)"+arm_cond
compare = r"^(cmp|cmn)"+arm_cond
pld = r"^(pld|pldw|pli)"+arm_cond

readReadInstr = r"^("+str1+"|"+stm+"|"+branch+"|"+testop+"|"+compare+"|"+pld+")$"


# doInstr matches any mnemonic that ends in a condition code. exInstr lists
# mnemonics whose last two letters merely look like a condition code ("teq" -> eq,
# "movs" -> vs, ...); getInformationProgram skips the condition check for them.
doInstr = r"(eq|ne|cs|cc|mi|pl|vs|vc|hi|ls|ge|lt|gt|le)$"
exInstr = ["teq","movs","svc","lsls","bics","muls"]

# Memory instructions, counted by getInformationProgram as accesses / reads / writes.
memoryAccesInstr = r"^("+str1+"|"+stm+"|"+ldr+"|"+ldm+"|"+pld+")$"
memoryReadInstr = r"^("+ldr+"|"+ldm+"|"+pld+")$"
memoryWriteInstr = r"^("+str1+"|"+stm+")$"

# Manual check kept for reference:
#test = "lsl"
#if re.search(doInstr,test) and (test not in exInstr):
#  print("True")

# Only the RF (register file): the register names searched for in the operands.
# The order of this list is also the order of the lifetime columns in the CSV rows.
registers = ["r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","r11","r12","pc","sp","lr"]

# Debugging aids: getInformationProgram adds every mnemonic it sees to the set of
# the category it was classified into. They accumulate across calls.
setCond = set()
setWR = set()
setRR = set()
setRW = set()
setMemory = set()
setMemoryRead = set()
setMemoryWrite = set()
setIgnoredInstr = set()
setInstructions = set()

In [6]:
def executedInstr(cond, cpsr):
  """Tell whether an ARM conditional instruction executes for a given CPSR value.

  Args:
    cond: two-letter ARM condition code ("eq", "ne", "cs", "cc", "mi", "pl",
      "vs", "vc", "hi", "ls", "ge", "lt", "gt", "le").
    cpsr: integer value of the CPSR register; the flags are read from
      bit 31 (N), bit 30 (Z), bit 29 (C) and bit 28 (V).

  Returns:
    True when the condition holds for the flags in `cpsr`, False otherwise
    (including when `cond` is not one of the codes listed above).
  """
  cpsr = int(cpsr)
  # Extract the flags by bit position. Indexing the string produced by bin() does
  # not work for this: that string is most-significant-bit first and has no
  # leading zeros, so character 31 is not bit 31.
  N = (cpsr >> 31) & 1
  Z = (cpsr >> 30) & 1
  C = (cpsr >> 29) & 1
  V = (cpsr >> 28) & 1

  conditions = {
    "eq": Z == 1,
    "ne": Z == 0,
    "cs": C == 1,
    "cc": C == 0,
    "mi": N == 1,
    "pl": N == 0,
    "vs": V == 1,
    "vc": V == 0,
    "hi": (C == 1) and (Z == 0),
    # LS is the complement of HI: C clear OR Z set
    "ls": (C == 0) or (Z == 1),
    "ge": N == V,
    "lt": N != V,
    "gt": (Z == 0) and (N == V),
    # LE is the complement of GT: Z set OR N != V
    "le": (Z == 1) or (N != V),
  }
  return conditions.get(cond, False)

def createDataframe(trace_file):
  """Convert an execution trace file into a DataFrame with one row per instruction.

  The trace is first filtered into the intermediate file "traceFile.csv" (in the
  current working directory, overwritten on every call) and then loaded with pandas.

  Args:
    trace_file: path of the *.ARM.elf.trace file to read.

  Returns:
    DataFrame with columns "Instruction" (mnemonic), "A" and "B" (the operand
    field split at its first comma, "-" when a part is missing) and "cpsr"
    (integer; the value written on the PREVIOUS row, 0 for the first row).
  """
  # NOTE(review): assumes every instruction line is followed by exactly one cpsr
  # line before the next instruction line, and that the cpsr value is written as
  # a decimal integer — confirm against the trace format.
  with open(trace_file) as file, open("traceFile.csv", "w") as fileToWrite:
    for line in file:
      if re.match(instrLine,line):
        columns = line.split()
        # Operand field split into "first operand" and "everything else"
        operands = columns[6].split(",",1) if len(columns)>6 else []
        regA = operands[0] if len(operands)>0 else "-"
        regB = operands[1] if len(operands)>1 else "-"
        # Always write the three columns; otherwise the cpsr value of an
        # operand-less instruction would end up in column "A".
        fileToWrite.write('%s;%s;%s;' % (columns[5], regA, regB))
      elif re.match(cpsrLine,line):
        columns = line.split()
        fileToWrite.write('%s\n' % (columns[2]))

  #Read the created csv
  df = pd.read_csv("traceFile.csv", sep=";", names=["Instruction","A","B","cpsr"])
  # Shift by one row so each instruction carries the cpsr value recorded for the
  # instruction before it; the first row has none and gets 0.
  df.cpsr = df.cpsr.shift(1).fillna(0)
  df.cpsr = df.cpsr.astype(int)

  return df
      
def getInformationProgram(df=None):
  """Compute register lifetimes and memory-instruction counters for a trace.

  Args:
    df: trace DataFrame as returned by createDataframe (columns Instruction, A,
      B, cpsr, in that order). When omitted, the notebook-level variable `df`
      is used, which is how existing cells call this function.

  Returns:
    Tuple (lifetime, memoryAccess, memoryRead, memoryWrite):
      lifetime: dict register name -> accumulated number of rows between each
        read of the register and its previous read or write.
      memoryAccess / memoryRead / memoryWrite: number of executed instructions
        matching memoryAccesInstr / memoryReadInstr / memoryWriteInstr.

  Side effects:
    Adds the mnemonics it sees to the module-level sets (setInstructions,
    setCond, setWR, setRR, setRW, setIgnoredInstr, setMemory, setMemoryRead,
    setMemoryWrite).
  """
  if df is None:
    # Backward compatibility with callers that rely on the global trace
    try:
      df = globals()["df"]
    except KeyError:
      raise NameError("name 'df' is not defined")

  # Whole-word patterns: a plain substring test would report "r1" for an
  # operand that only mentions "r10", "r11" or "r12".
  regPatterns = {reg: re.compile(r"\b" + reg + r"\b") for reg in registers}

  def mentionedRegisters(operand):
    # str() so that a non-string cell (e.g. NaN or a number) matches nothing
    # instead of raising
    text = str(operand)
    return [reg for reg in registers if regPatterns[reg].search(text)]

  def calculateLTRegRead(read):
    # A read closes the interval opened by the previous access of the register
    for reg in mentionedRegisters(read):
      lifetime[reg]+=index-values[reg]
      values[reg]=index

  def calculateLTRegWritten(written):
    # A write only restarts the interval; the old value was not needed
    for reg in mentionedRegisters(written):
      values[reg]=index

  memoryAccess = 0
  memoryRead = 0
  memoryWrite = 0

  lifetime = {}
  values = {}
  for i in registers:
    values[i] = 0
    lifetime[i] = 0

  index = 0
  for row in df.itertuples(index=True, name="line"):
    # row = (index, Instruction, A, B, cpsr)
    instr = row[1]
    index = row[0]

    setInstructions.add(instr)
    condMatch = re.search(doInstr,instr)
    if(condMatch and (instr not in exInstr)):
      setCond.add(instr)
      # A conditional instruction whose condition fails touches nothing
      if not executedInstr(condMatch.group(0), row[4]):
        continue
    if(re.match(writeReadInstr,instr)): # Write Read
      setWR.add(instr)
      calculateLTRegRead(row[3])
      calculateLTRegWritten(row[2])
    elif(re.match(readReadInstr,instr)): # Read Read
      setRR.add(instr)
      calculateLTRegRead(row[2])
      calculateLTRegRead(row[3])
    elif(re.match(ldm,instr)): # Read Write (ldm)
      setRW.add(instr)
      calculateLTRegRead(row[2])
      calculateLTRegWritten(row[3])
    else:
      setIgnoredInstr.add(instr)

    if(re.match(memoryAccesInstr,instr)):
      memoryAccess += 1
      setMemory.add(instr)
      if(re.match(memoryReadInstr,instr)):
        memoryRead += 1
        setMemoryRead.add(instr)
      if(re.match(memoryWriteInstr,instr)):
        memoryWrite += 1
        setMemoryWrite.add(instr)

  return lifetime, memoryAccess, memoryRead, memoryWrite

In [9]:
def getCoverage(folder, info_db):
  """Average the per-register SDC and hang values of one individual.

  Args:
    folder: identifier of the individual, matched against the 'ind' column.
    info_db: DataFrame with an 'ind' column plus '<reg>_sdc' and '<reg>_hang'
      columns for r0..r12, sp, lr and pc.

  Returns:
    Tuple (rf_sdc, rf_hang): the sum of the 16 '<reg>_sdc' (respectively
    '<reg>_hang') values of the first matching row, divided by 16.
  """
  reg_order = ["r0","r1","r2","r3","r4","r5","r6","r7","r8","r9","r10","r11","r12","sp","lr","pc"]

  # Rows of the info table that describe this individual
  ind_df = info_db.loc[info_db['ind'] == folder]

  totals = {"sdc": 0.0, "hang": 0.0}
  for kind in ("sdc", "hang"):
    for reg in reg_order:
      column = reg + "_" + kind
      totals[kind] = totals[kind] + ind_df[column].values[0]

  return totals["sdc"]/16, totals["hang"]/16


In [None]:
# ruta = "/content/drive/Shareddrives/NN4FT_DB/heapsort_110/heapsort_110_db_info.csv"
# df = pd.read_csv(ruta, sep=",")
# # print(df.shape)
# # ind = "d3ea90595e0ebcdc306ccacc1f2c455c01b4691b"
# # ind = "05148b1a539450830030fffdd56c026909c45855"
# ind = "188f92a279216a73742764e0cbcd6b3a35d5e3c4"
# row_df = df.loc[df['ind'] == ind]
# # row_df["r0_sdc"].values[0]
# row_df["text_size"].values


In [10]:
def getSizePrograms(folder, info_db):
  """Look up the section sizes recorded for one individual.

  Args:
    folder: identifier of the individual, matched against the 'ind' column.
    info_db: DataFrame with columns 'ind', 'text_size', 'data_size',
      'bss_size' and 'stack_size'.

  Returns:
    Tuple (text_size, data_size, bss_size, stack_size) taken from the first
    row whose 'ind' equals `folder`.
  """
  # Rows of the info table that describe this individual
  matches = info_db.loc[info_db['ind'] == folder]

  size_columns = ('text_size', 'data_size', 'bss_size', 'stack_size')
  return tuple(matches[column].values[0] for column in size_columns)