In [1]:
from tracr.rasp import rasp
from tracr.compiler import compiling
from tracr.compiler import lib

import sys
import os

module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.model import Model
from src.functions import *

In [168]:
#Prints the overleaf table body from a pandas dataframe
def dataFrameToOverleafTableBody(df: pd.DataFrame, decimalNumber = 2):
    columnCount = len(df.columns.values)
    tableBody = "\\begin{tabular}{|"+ "".join(["c|" for i in range(columnCount)]) +"} \\hline \n"
    tableBody += " & ".join(map(lambda x: "\\textbf{"+str(x)+"}", df.columns.values)) + "\\\\ \\hline \n"
    for _, row in df.iterrows():
        if decimalNumber:
            newRow = row.values[0] + " & "
            newRow += " & ".join(map(lambda x: "%.2f"%x, row.values[1:])) + "\\\\ \\hline \n"
        else:
            newRow = " & ".join(map(str, row.values)) + "\\\\ \\hline \n"
        tableBody += newRow
    tableBody += "\\end{tabular} \n"
    return tableBody

#Merge the results from the files in list. The files need to have the same dimension
def loadArray(fileNames="temp", baseDirectory = None):
    if baseDirectory:
        if type(fileNames) is list:
            fileNames = [baseDirectory + name for name in fileNames]
        else:
            fileNames = baseDirectory + fileNames

    if type(fileNames) is list and len(fileNames)==1:
        fileNames = fileNames[0]

    if type(fileNames) is list:
        file = open(fileNames[0], "rb")
        array = np.load(file).reshape((-1,1))
        file.close()
        for fn in fileNames[1:]:
            file = open(fn, "rb")
            conAcc = np.load(file).reshape((-1,1))
            array = np.concatenate((array, conAcc), axis=1)
            file.close()
        
    else:
        file = open(fileNames, "rb")
        array = np.load(file)
        file.close()

    return array

#Returns all possible comnination of 'baseExpression' split at "{}" and filed with 'modificationsLists'
def getListOfNames(baseExpression: str, modificationsLists: list):
    listOfNames = []
    splitBase = baseExpression.split("{}")
    if len(splitBase) != len(modificationsLists) + 1:
        print("Error: Possible modifications ("+str(len(splitBase)-1)+") does not match given modifications ("+str(len(modificationsLists))+")")
        raise RuntimeError
    
    if len(splitBase) == 2:
        for modification in modificationsLists[0]:
            listOfNames.append(str(modification).join(splitBase))
        return listOfNames
    
    for modification in modificationsLists[0]:
        listOfNames += getListOfNames(splitBase[0] + str(modification) + "{}".join(splitBase[1:]), modificationsLists[1:])

    return listOfNames

#Creates and returns a list of names based on standard for each model test
def createListOfNames(testName, baseDirectory = "", paramNames = None):
    match testName:
        case "overTraining":
            if paramNames is None:
                paramNames = ["v1", "v2", "v3"]
            baseExpression = "{}{}_{}_{}_{}"
            modificationsLists = [["random_", ""],
                                  ["train", "val"],
                                  ["sort", "reverse", "hist", "most-freq", "shuffle_dyck1", "shuffle_dyck2"],
                                  ["loss","acc"],
                                  paramNames]
            nameList = getListOfNames(baseExpression, modificationsLists)
            
        case "bitFlip":
            if paramNames is None:
                paramNames = ["0.0", "0.02", "0.04", "0.06", "0.08", "0.1"]
            baseExpression = "{}_{}_{}_bitflip_fliprate_{}_{}"
            modificationsLists = [["sort", "reverse", "hist", "most-freq", "shuffle_dyck1", "shuffle_dyck2"],
                                 ["train", "val"],
                                 ["loss", "acc"],
                                 paramNames,
                                 ["1", "2", "3", "4", "5"]
                                 ]
            nameList = getListOfNames(baseExpression, modificationsLists)

        case "gaussian":
            if paramNames is None:
                paramNames = ["0.01", "0.25", "0.50", "0.75", "1.0", "1.25"]
            baseExpression = "{}_{}_{}_gaussian_std{}_{}"
            modificationsLists = [["sort", "reverse", "hist", "most-freq", "shuffle_dyck1", "shuffle_dyck2"],
                                 ["train", "val"],
                                 ["loss", "acc"],
                                 paramNames,
                                 ["1", "2", "3", "4", "5"]
                                 ]
            nameList = getListOfNames(baseExpression, modificationsLists)

        case "mutated":
            if paramNames is None:
                paramNames = ["0", "1", "2", "3", "4"]
            baseExpression = "{}_{}_{}_{}"
            modificationsLists = [["sort", "reverse", "hist", "most-freq", "shuffle_dyck1", "shuffle_dyck2"],
                                 ["train", "val"],
                                 ["loss", "acc"],
                                 paramNames,
                                 ]
            nameList = getListOfNames(baseExpression, modificationsLists)

        case _:
            print("Error:", testName, "is not a valid test name")
        
    nameList = [baseDirectory + name for name in nameList]

    return nameList

#Classify fileName according to model base and loss/acc
def tagFileName(fileName):
    tags = {}

def dictAppend(dic, id, value):
    if id in dic:
        dic[id].append(value)
    else:
        dic[id]=[value]
        
def dataFrameHelper(dfDict, prefix, includedParameters, relevantFileNames, useAverage, useMean, useStd, baseDirectory = None):
    for param in includedParameters:
        paramFileNames = [name for name in relevantFileNames if str(param) in name]

        #Start adding to the fields
        print(param)
        print(paramFileNames)
        data = loadArray(paramFileNames, baseDirectory)

        if useAverage:
            if useMean:
                dictAppend(dfDict, prefix+"mean"+str(param), data.mean(axis=1)[-1])
            if useStd:
                dictAppend(dfDict, prefix+"std"+str(param), data.std(axis=1)[-1])
        else:
            dictAppend(dfDict, prefix+str(param), data[-1])

def createDataFrameFromFileNames(fileNames, includedParameters, useAverage = True, useMean = True, useStd = True, includeTrain = False, includeValidation = True, includeLoss = False,
                                  includeAcc = True, rowIsModel = True, includeRandom = False, includeNonRandom = True, baseDirectory = None):
    #Possible columns (All at certain checkpoints e.g. end, start, certain epoch)
        #Mean, std

    #Problem: Some experiments use mean others not (regular, grokking and mutations)
        #Solution. If experiment uses mean it is consistent throughout, add as a parameter flag
        #using average implies that we want to include mean and std. I'll add flag for min/max as well

    #Problem: Currently only gives fileNames, I need model names extracted as well as loss/acc and train/validation
    #This requires linking a model name, train/validation and loss/acc with corresponding fileNames
        #Solution should be able to build a parser function which can classify each fileName by model name, loss/acc before extracting data 
        #Not good enough. I need to differentiate between model name, train/val, loss/acc and parameter value
        #This suggests it should be easier to parse these from the baseExpression directly 

    dfDict = {}

    #Sort results with row as model
    if rowIsModel:
        dfDict["Model"]=["Sort","Reverse","Hist","Most-Freq","Dyck1","Dyck2"]

        #Itterate trough all models
        for modelName in ["sort", "reverse", "hist", "most-freq", "dyck1", "dyck2"]:
            modelFileNames = [name for name in fileNames if modelName in name]

            #Non random
            if includeNonRandom:
                nonRandomFileNames = [name for name in modelFileNames if "random" not in name]

                if includeTrain:
                    trainFileNames = [name for name in nonRandomFileNames if "train" in name]

                    if includeAcc:
                        accFileNames = [name for name in trainFileNames if "acc" in name]

                        dataFrameHelper(dfDict, "ta", includedParameters, accFileNames, useAverage, useMean, useStd, baseDirectory)

                    if includeLoss:
                        lossFileNames = [name for name in trainFileNames if "loss" in name]

                        dataFrameHelper(dfDict, "tl", includedParameters, lossFileNames, useAverage, useMean, useStd, baseDirectory)

                if includeValidation:
                    valFileNames = [name for name in nonRandomFileNames if "val" in name]

                    if includeAcc:
                        accFileNames = [name for name in valFileNames if "acc" in name]

                        dataFrameHelper(dfDict, "va", includedParameters, accFileNames, useAverage, useMean, useStd, baseDirectory)

                    if includeLoss:
                        lossFileNames = [name for name in valFileNames if "loss" in name]

                        dataFrameHelper(dfDict, "vl", includedParameters, lossFileNames, useAverage, useMean, useStd, baseDirectory)

            #Random
            if includeRandom:
                randomFileNames = [name for name in modelFileNames if "random" in name]

                if includeTrain:
                    trainFileNames = [name for name in randomFileNames if "train" in name]

                    if includeAcc:
                        accFileNames = [name for name in trainFileNames if "acc" in name]

                        dataFrameHelper(dfDict, "randta", includedParameters, accFileNames, useAverage, useMean, useStd, baseDirectory)

                    if includeLoss:
                        lossFileNames = [name for name in trainFileNames if "loss" in name]

                        dataFrameHelper(dfDict, "randtl", includedParameters, lossFileNames, useAverage, useMean, useStd, baseDirectory)

                if includeValidation:
                    valFileNames = [name for name in randomFileNames if "val" in name]

                    if includeAcc:
                        accFileNames = [name for name in valFileNames if "acc" in name]

                        dataFrameHelper(dfDict, "randva", includedParameters, accFileNames, useAverage, useMean, useStd, baseDirectory)

                    if includeLoss:
                        lossFileNames = [name for name in valFileNames if "loss" in name]

                        dataFrameHelper(dfDict, "randvl", includedParameters, lossFileNames, useAverage, useMean, useStd, baseDirectory)
            
            
                    
    else:
        print("Only model rows are implemented")            

    #Specially clause for regular overtraining results
    if not (includeTrain or includeValidation):
        print("Not yet implemented aka kinda useless at the moment")
        raise RuntimeError

    return pd.DataFrame(dfDict)



In [35]:
baseExpression = "{}_{}_{}"
modificationsLists = [["sort", "reverse", "hist", "most-freq", "dyck1", "dyck2"],
                      ["loss", "val"],
                      ["v1", "v2", "v3"]]

fileNames = getListOfNames(baseExpression, modificationsLists)

trainNames = [name for name in fileNames if "loss" in name]
print(trainNames)

['sort_loss_v1', 'sort_loss_v2', 'sort_loss_v3', 'reverse_loss_v1', 'reverse_loss_v2', 'reverse_loss_v3', 'hist_loss_v1', 'hist_loss_v2', 'hist_loss_v3', 'most-freq_loss_v1', 'most-freq_loss_v2', 'most-freq_loss_v3', 'dyck1_loss_v1', 'dyck1_loss_v2', 'dyck1_loss_v3', 'dyck2_loss_v1', 'dyck2_loss_v2', 'dyck2_loss_v3']


#### Base Gaussian model

In [162]:
baseDirectory = os.path.abspath(os.path.join('../..')) + "/PerformanceTesting/savedData/noiseTrainingGaussian2/"
fileNames = createListOfNames("gaussian")
df = createDataFrameFromFileNames(fileNames, includeTrain=False, useStd=False, useMean=True, includedParameters = ["0.01", "0.25", "0.50", "0.75", "1.0", "1.25"], baseDirectory=baseDirectory)
print(dataFrameToOverleafTableBody(df))

\begin{tabular}{|c|c|c|c|c|c|c|} \hline 
\textbf{Model} & \textbf{vamean0.01} & \textbf{vamean0.25} & \textbf{vamean0.50} & \textbf{vamean0.75} & \textbf{vamean1.0} & \textbf{vamean1.25}\\ \hline 
Sort & 0.95 & 0.99 & 0.99 & 0.97 & 1.00 & 0.99\\ \hline 
Reverse & 1.00 & 0.91 & 1.00 & 0.93 & 0.88 & 0.68\\ \hline 
Hist & 1.00 & 1.00 & 1.00 & 1.00 & 1.00 & 1.00\\ \hline 
Most-Freq & 0.36 & 0.38 & 0.36 & 0.37 & 0.36 & 0.36\\ \hline 
Dyck1 & 0.91 & 0.84 & 0.85 & 0.91 & 0.87 & 0.88\\ \hline 
Dyck2 & 0.93 & 0.89 & 0.91 & 0.90 & 0.94 & 0.88\\ \hline 
\end{tabular} 



#### Base Overtraining model

In [165]:
baseDirectory = os.path.abspath(os.path.join('../..')) + "/PerformanceTesting/savedData/overTrainingV2/"
fileNames = createListOfNames("overTraining")
df = createDataFrameFromFileNames(fileNames, includeTrain=False, useAverage=False, includeRandom=True, includedParameters = ["v1","v2","v3"], baseDirectory=baseDirectory)
print(dataFrameToOverleafTableBody(df))

\begin{tabular}{|c|c|c|c|c|c|c|} \hline 
\textbf{Model} & \textbf{vav1} & \textbf{vav2} & \textbf{vav3} & \textbf{randvav1} & \textbf{randvav2} & \textbf{randvav3}\\ \hline 
Sort & 1.00 & 1.00 & 0.07 & 0.01 & 0.01 & 0.00\\ \hline 
Reverse & 1.00 & 1.00 & 0.01 & 0.00 & 0.00 & 0.00\\ \hline 
Hist & 1.00 & 1.00 & 1.00 & 0.05 & 0.01 & 0.00\\ \hline 
Most-Freq & 0.82 & 0.32 & 0.21 & 0.00 & 0.00 & 0.00\\ \hline 
Dyck1 & 1.00 & 0.98 & 0.98 & 1.00 & 0.73 & 0.84\\ \hline 
Dyck2 & 0.95 & 0.75 & 0.91 & 0.95 & 0.84 & 0.81\\ \hline 
\end{tabular} 



#### Base bitFlip model

In [166]:
baseDirectory = os.path.abspath(os.path.join('../..')) + "/PerformanceTesting/savedData/noiseTrainingBitFlip/"
fileNames = createListOfNames("bitFlip")
df = createDataFrameFromFileNames(fileNames, includeTrain=False, useStd=False, useMean=True, includedParameters = ["0.0", "0.02", "0.04", "0.06", "0.08", "0.1"], baseDirectory=baseDirectory)
print(dataFrameToOverleafTableBody(df))

\begin{tabular}{|c|c|c|c|c|c|c|} \hline 
\textbf{Model} & \textbf{vamean0.0} & \textbf{vamean0.02} & \textbf{vamean0.04} & \textbf{vamean0.06} & \textbf{vamean0.08} & \textbf{vamean0.1}\\ \hline 
Sort & 0.82 & 0.99 & 0.83 & 0.65 & 0.65 & 0.12\\ \hline 
Reverse & 0.34 & 0.21 & 0.45 & 0.17 & 0.06 & 0.15\\ \hline 
Hist & 0.26 & 0.25 & 0.03 & 0.02 & 0.02 & 0.01\\ \hline 
Most-Freq & 0.14 & 0.20 & 0.04 & 0.04 & 0.05 & 0.06\\ \hline 
Dyck1 & 0.88 & 0.85 & 0.93 & 0.84 & 0.83 & 0.84\\ \hline 
Dyck2 & 0.90 & 0.94 & 0.96 & 0.87 & 0.83 & 0.86\\ \hline 
\end{tabular} 



#### Base mutated model

In [169]:
baseDirectory = os.path.abspath(os.path.join('../..')) + "/PerformanceTesting/savedData/mutatedModels1/"
fileNames = createListOfNames("mutated")
df = createDataFrameFromFileNames(fileNames, includeTrain=False, useAverage=False, includedParameters = ["0","1","2","3","4"], baseDirectory=baseDirectory)
print(dataFrameToOverleafTableBody(df))

0
['sort_val_acc_0']
1
['sort_val_acc_1']
2
['sort_val_acc_2']
3
['sort_val_acc_3']
4
['sort_val_acc_4']
0
['reverse_val_acc_0']
1
['reverse_val_acc_1']
2
['reverse_val_acc_2']
3
['reverse_val_acc_3']
4
['reverse_val_acc_4']
0
['hist_val_acc_0']
1
['hist_val_acc_1']
2
['hist_val_acc_2']
3
['hist_val_acc_3']
4
['hist_val_acc_4']
0
['most-freq_val_acc_0']
1
['most-freq_val_acc_1']
2
['most-freq_val_acc_2']
3
['most-freq_val_acc_3']
4
['most-freq_val_acc_4']
0
['shuffle_dyck1_val_acc_0']
1
['shuffle_dyck1_val_acc_0', 'shuffle_dyck1_val_acc_1', 'shuffle_dyck1_val_acc_2', 'shuffle_dyck1_val_acc_3', 'shuffle_dyck1_val_acc_4']
2
['shuffle_dyck1_val_acc_2']
3
['shuffle_dyck1_val_acc_3']
4
['shuffle_dyck1_val_acc_4']
0
['shuffle_dyck2_val_acc_0']
1
['shuffle_dyck2_val_acc_1']
2
['shuffle_dyck2_val_acc_0', 'shuffle_dyck2_val_acc_1', 'shuffle_dyck2_val_acc_2', 'shuffle_dyck2_val_acc_3', 'shuffle_dyck2_val_acc_4']
3
['shuffle_dyck2_val_acc_3']
4
['shuffle_dyck2_val_acc_4']


TypeError: only length-1 arrays can be converted to Python scalars