In [1]:
from tracr.rasp import rasp
from tracr.compiler import compiling
from tracr.compiler import lib

import sys
import os

# Put the directory two levels above the current working directory on sys.path
# so that the project-local `src` package imported below can be resolved.
# NOTE(review): '../..' is resolved against the kernel's working directory,
# presumably the folder containing this notebook -- confirm when running from elsewhere.
module_path = os.path.abspath(os.path.join('../..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from src.model import Model
from src.functions import *

In [92]:
#Builds and returns the Overleaf (LaTeX) tabular body for a pandas DataFrame
def dataFrameToOverleafTableBody(df: pd.DataFrame, decimalNumber = 2):
    columnCount = len(df.columns.values)
    tableBody = "\\begin{tabular}{|"+ "".join(["c|" for i in range(columnCount)]) +"} \\hline \n"
    tableBody += " & ".join(map(lambda x: "\\textbf{"+str(x)+"}", df.columns.values)) + "\\\\ \\hline \n"
    for _, row in df.iterrows():
        if decimalNumber:
            newRow = row.values[0] + " & "
            newRow += " & ".join(map(lambda x: "%.2f"%x, row.values[1:])) + "\\\\ \\hline \n"
        else:
            newRow = " & ".join(map(str, row.values)) + "\\\\ \\hline \n"
        tableBody += newRow
    tableBody += "\\end{tabular} \n"
    return tableBody

#Merge the results from the files in list. The files need to have the same dimension
def loadArray(fileNames="temp"):
    """Load one saved NumPy array, or merge several into a 2-D array.

    Args:
        fileNames: Either a single path, or a list/tuple of paths. A sequence
            with exactly one path is treated like that single path.

    Returns:
        * single path: the array exactly as ``np.load`` returns it;
        * several paths: every array flattened to a column and the columns
          concatenated along axis 1, so column ``i`` holds the content of
          ``fileNames[i]``. All files must therefore contain the same number
          of elements.

    Raises:
        ValueError: If ``fileNames`` is an empty sequence, or the arrays cannot
            be concatenated because their sizes differ.
    """
    if isinstance(fileNames, (list, tuple)):
        if len(fileNames) == 0:
            raise ValueError("loadArray needs at least one file name, got an empty sequence")
        if len(fileNames) == 1:
            fileNames = fileNames[0]

    if isinstance(fileNames, (list, tuple)):
        columns = []
        for fn in fileNames:
            # The context manager closes the file even if np.load/reshape raises.
            with open(fn, "rb") as file:
                columns.append(np.load(file).reshape((-1, 1)))
        return np.concatenate(columns, axis=1)

    with open(fileNames, "rb") as file:
        return np.load(file)

#Returns all possible combinations of 'baseExpression' split at "{}" and filled with 'modificationsLists'
def getListOfNames(baseExpression: str, modificationsLists: list):
    """Expand a ``"{}"`` template into every combination of the given substitutions.

    The i-th ``"{}"`` in ``baseExpression`` is replaced by each entry of
    ``modificationsLists[i]`` in turn. Combinations are produced with the first
    list varying slowest and the last list varying fastest.

    Args:
        baseExpression: Template containing one ``"{}"`` per modification list.
        modificationsLists: One iterable of substitutions per placeholder.
            Entries are converted with ``str``.

    Returns:
        List of expanded names. A template without placeholders (and an empty
        ``modificationsLists``) yields ``[baseExpression]``.

    Raises:
        RuntimeError: If the number of placeholders differs from the number of
            modification lists.
    """
    # Local import keeps this definition self-contained within the notebook cell.
    from itertools import product

    splitBase = baseExpression.split("{}")
    if len(splitBase) != len(modificationsLists) + 1:
        raise RuntimeError("Error: Possible modifications ("+str(len(splitBase)-1)+") does not match given modifications ("+str(len(modificationsLists))+")")

    listOfNames = []
    for combination in product(*modificationsLists):
        # Interleave the literal fragments with the chosen substitutions.
        pieces = [splitBase[0]]
        for modification, literal in zip(combination, splitBase[1:]):
            pieces.append(str(modification))
            pieces.append(literal)
        listOfNames.append("".join(pieces))

    return listOfNames

#Creates and returns a list of names based on standard for each model test
def createListOfNames(testName, baseDirectory = "", paramNames = None):
    match testName:
        case "overTraining":
            if paramNames is None:
                paramNames = ["v1", "v2", "v3"]
            baseExpression = "{}{}_{}_{}"
            modificationsLists = [["random_", ""],
                                  ["train", "val"],
                                  ["sort", "reverse", "hist", "most-freq", "shuffle_dyck1", "shuffle_dyck2"],
                                  paramNames]
            nameList = getListOfNames(baseExpression, modificationsLists)
        case "bitFlip":
            pass

        case "gaussian":
            if paramNames is None:
                paramNames = ["0.01", "0.25", "0.50", "0.75", "1.0", "1.25"]
            baseExpression = "{}_{}_{}_gaussian_std{}_{}"
            modificationsLists = [["sort", "reverse", "hist", "most-freq", "shuffle_dyck1", "shuffle_dyck2"],
                                 ["train", "val"],
                                 ["loss", "acc"],
                                 paramNames,
                                 ["1", "2", "3", "4", "5"]
                                 ]
            nameList = getListOfNames(baseExpression, modificationsLists)

        case "mutated":
            pass
        case _:
            print("Error:", testName, "is not a valid test name")
        
    nameList = [baseDirectory + name for name in nameList]

    return nameList

#Classify fileName according to model base and loss/acc
def tagFileName(fileName):
    """Unfinished stub intended to classify ``fileName`` (see the comment above).

    Currently it only creates an empty ``tags`` dict and implicitly returns
    ``None``. ``fileName`` is not inspected.
    """
    # TODO: fill `tags` from fileName and return it; nothing is classified yet.
    tags = {}

def dictAppend(dic, id, value):
    """Append ``value`` to the list stored under ``dic[id]``, creating the list if absent.

    Mutates ``dic`` in place and returns ``None``.
    """
    dic.setdefault(id, []).append(value)
        
def dataFrameHelper(dfDict, prefix, includedParameters, relevantFileNames, useAverage, useMean, useStd):
    """Append one value per parameter (final entry of the loaded data) to ``dfDict``.

    For every parameter the files whose name contains ``str(param)`` are loaded
    with ``loadArray`` and reduced to the value at the last index:

    * ``useAverage`` true: the per-row mean and/or standard deviation across
      the loaded files is taken and the last row is stored under
      ``prefix + "mean" + param`` / ``prefix + "std" + param``.
    * ``useAverage`` false: the last entry of the loaded array is stored under
      ``prefix + param``.

    Args:
        dfDict: Column dictionary that is extended in place via ``dictAppend``.
        prefix: Column-name prefix identifying the data kind.
        includedParameters: Parameter values to create columns for.
        relevantFileNames: Candidate file names, already narrowed down by the caller.
        useAverage: Whether to aggregate across the matching files.
        useMean: With ``useAverage``, add the mean column.
        useStd: With ``useAverage``, add the std column.

    Raises:
        ValueError: If no file name matches one of the parameters.
    """
    for param in includedParameters:
        paramTag = str(param)
        # NOTE(review): plain substring match on the whole name, including any
        # directory prefix -- a parameter that is a substring of another one
        # would select too many files.
        paramFileNames = [name for name in relevantFileNames if paramTag in name]
        if not paramFileNames:
            raise ValueError("No file name for column prefix '" + str(prefix) + "' contains parameter '" + paramTag + "'")

        #Start adding to the fields
        data = loadArray(paramFileNames)

        if useAverage:
            # loadArray returns a single matching file unreshaped. Promote 1-D
            # data to one column so the axis=1 reductions below stay valid.
            if data.ndim == 1:
                data = data.reshape((-1, 1))
            if useMean:
                dictAppend(dfDict, prefix+"mean"+paramTag, data.mean(axis=1)[-1])
            if useStd:
                dictAppend(dfDict, prefix+"std"+paramTag, data.std(axis=1)[-1])
        else:
            dictAppend(dfDict, prefix+paramTag, data[-1])

def createDataFrameFromFileNames(fileNames, includedParameters, useAverage = True, useMean = True, useStd = True, includeTrain = False, includeValidation = True, includeLoss = False,
                                  includeAcc = True, rowIsModel = True, includeRandom = False):
    """Collect final results from result files into a DataFrame with one row per model.

    File names are classified by substring: model name, ``"train"``/``"val"``,
    ``"acc"``/``"loss"`` and, optionally, ``"random"``. For each selected
    combination ``dataFrameHelper`` adds one column per entry of
    ``includedParameters`` (or a mean and/or std column when ``useAverage`` is set).
    Column prefixes are built as ``["rand"] + "t"|"v" + "a"|"l"``, e.g. ``"va"``
    for validation accuracy.

    Args:
        fileNames: All candidate result-file names.
        includedParameters: Parameter values to create columns for.
        useAverage: Aggregate over the files that share a parameter value.
        useMean: With ``useAverage``, include the mean column.
        useStd: With ``useAverage``, include the std column.
        includeTrain: Include columns built from names containing ``"train"``.
        includeValidation: Include columns built from names containing ``"val"``.
        includeLoss: Include columns built from names containing ``"loss"``.
        includeAcc: Include columns built from names containing ``"acc"``.
        rowIsModel: Only ``True`` is implemented. Otherwise a message is printed
            and an empty DataFrame is returned.
        includeRandom: Use only the files whose name contains ``"random"``.

    Returns:
        ``pd.DataFrame`` with a ``"Model"`` column followed by the requested columns.

    Raises:
        RuntimeError: If neither ``includeTrain`` nor ``includeValidation`` is set.
    """
    #Possible columns (All at certain checkpoints e.g. end, start, certain epoch)
        #Mean, std

    #Problem: Some experiments use mean others not (regular, grokking and mutations)
        #Solution. If experiment uses mean it is consistent throughout, add as a parameter flag
        #using average implies that we want to include mean and std. I'll add flag for min/max as well

    #Problem: Currently only gives fileNames, I need model names extracted as well as loss/acc and train/validation
    #This requires linking a model name, train/validation and loss/acc with corresponding fileNames
        #Solution should be able to build a parser function which can classify each fileName by model name, loss/acc before extracting data
        #Not good enough. I need to differentiate between model name, train/val, loss/acc and parameter value
        #This suggests it should be easier to parse these from the baseExpression directly

    #Special clause for regular overtraining results. Checked up front so the
    #failure carries its reason and happens before anything else runs.
    if not (includeTrain or includeValidation):
        raise RuntimeError("Not yet implemented aka kinda useless at the moment")

    dfDict = {}

    if not rowIsModel:
        print("Only model rows are implemented")
        return pd.DataFrame(dfDict)

    #Sort results with row as model
    dfDict["Model"]=["Sort","Reverse","Hist","Most-Freq","Dyck1","Dyck2"]

    # (substring looked for in the file name, one-letter tag used in the column prefix)
    splits = []
    if includeTrain:
        splits.append(("train", "t"))
    if includeValidation:
        splits.append(("val", "v"))

    metrics = []
    if includeAcc:
        metrics.append(("acc", "a"))
    if includeLoss:
        metrics.append(("loss", "l"))

    prefixBase = "rand" if includeRandom else ""

    #Iterate through all models
    for modelName in ["sort", "reverse", "hist", "most-freq", "dyck1", "dyck2"]:
        modelFileNames = [name for name in fileNames if modelName in name]

        if includeRandom:
            # Narrow down this model's files. Filtering the full fileNames list
            # here would give every model row the same (all-model) random data.
            modelFileNames = [name for name in modelFileNames if "random" in name]
        # NOTE(review): without includeRandom, names containing "random" are not
        # excluded (unchanged behaviour) -- confirm that is intended.

        for splitToken, splitTag in splits:
            splitFileNames = [name for name in modelFileNames if splitToken in name]

            for metricToken, metricTag in metrics:
                metricFileNames = [name for name in splitFileNames if metricToken in name]

                dataFrameHelper(dfDict, prefixBase + splitTag + metricTag, includedParameters, metricFileNames, useAverage, useMean, useStd)

    return pd.DataFrame(dfDict)



In [35]:
# Sanity check of getListOfNames: expand every model / tag / version combination
# of the "{}_{}_{}" template, then keep only the names that contain "loss".
baseExpression = "{}_{}_{}"
modificationsLists = [["sort", "reverse", "hist", "most-freq", "dyck1", "dyck2"],
                      ["loss", "val"],
                      ["v1", "v2", "v3"]]

fileNames = getListOfNames(baseExpression, modificationsLists)

# NOTE(review): despite its name, `trainNames` holds the "loss" names.
trainNames = [name for name in fileNames if "loss" in name]
print(trainNames)

['sort_loss_v1', 'sort_loss_v2', 'sort_loss_v3', 'reverse_loss_v1', 'reverse_loss_v2', 'reverse_loss_v3', 'hist_loss_v1', 'hist_loss_v2', 'hist_loss_v3', 'most-freq_loss_v1', 'most-freq_loss_v2', 'most-freq_loss_v3', 'dyck1_loss_v1', 'dyck1_loss_v2', 'dyck1_loss_v3', 'dyck2_loss_v1', 'dyck2_loss_v2', 'dyck2_loss_v3']


#### Base Gaussian model

In [91]:
# Build the LaTeX table of mean validation accuracy per Gaussian-noise std
# from the files under savedData/noiseTrainingGaussian2.
# NOTE(review): the output saved with this cell is a raw dict repr, not the LaTeX
# string this code prints -- it looks stale; re-run the cell to refresh it.
baseDirectory = os.path.abspath(os.path.join('../..')) + "/PerformanceTesting/savedData/noiseTrainingGaussian2/"
fileNames = createListOfNames("gaussian", baseDirectory)
df = createDataFrameFromFileNames(fileNames, includeTrain=False, useStd=False, useMean=True, includedParameters = ["0.01", "0.25", "0.50", "0.75", "1.0", "1.25"])
print(dataFrameToOverleafTableBody(df))

{'Model': ['Sort', 'Reverse', 'Hist', 'Most-Freq', 'Dyck1', 'Dyck2'], 'vamean0.01': [np.float64(0.9540453635694188), np.float64(0.9989399293286219), np.float64(1.0), np.float64(0.36214993118759575), np.float64(0.9122077922077922), np.float64(0.9297688791829039)], 'vamean0.25': [np.float64(0.9926956694855734), np.float64(0.9134318785882991), np.float64(1.0), np.float64(0.3772673303681378), np.float64(0.8449639249639249), np.float64(0.8932495121866124)], 'vamean0.50': [np.float64(0.9860010190514107), np.float64(1.0), np.float64(1.0), np.float64(0.35776090066823507), np.float64(0.852914862914863), np.float64(0.9116499233369891)], 'vamean0.75': [np.float64(0.9721664377635657), np.float64(0.9342949216224932), np.float64(1.0), np.float64(0.3726626781197031), np.float64(0.9127417027417029), np.float64(0.9001052778568539)], 'vamean1.0': [np.float64(0.9970043249742379), np.float64(0.8753108920711673), np.float64(1.0), np.float64(0.36418346276806324), np.float64(0.8668831168831168), np.float64(0

#### Base Gaussian model

In [None]:
# NOTE(review): this cell is an exact copy of the "Base Gaussian model" cell
# executed as In [91] (same directory, same arguments); it has no execution count.
# Builds the LaTeX table of mean validation accuracy per Gaussian-noise std.
baseDirectory = os.path.abspath(os.path.join('../..')) + "/PerformanceTesting/savedData/noiseTrainingGaussian2/"
fileNames = createListOfNames("gaussian", baseDirectory)
df = createDataFrameFromFileNames(fileNames, includeTrain=False, useStd=False, useMean=True, includedParameters = ["0.01", "0.25", "0.50", "0.75", "1.0", "1.25"])
print(dataFrameToOverleafTableBody(df))

{'Model': ['Sort', 'Reverse', 'Hist', 'Most-Freq', 'Dyck1', 'Dyck2'], 'vamean0.01': [np.float64(0.9540453635694188), np.float64(0.9989399293286219), np.float64(1.0), np.float64(0.36214993118759575), np.float64(0.9122077922077922), np.float64(0.9297688791829039)], 'vamean0.25': [np.float64(0.9926956694855734), np.float64(0.9134318785882991), np.float64(1.0), np.float64(0.3772673303681378), np.float64(0.8449639249639249), np.float64(0.8932495121866124)], 'vamean0.50': [np.float64(0.9860010190514107), np.float64(1.0), np.float64(1.0), np.float64(0.35776090066823507), np.float64(0.852914862914863), np.float64(0.9116499233369891)], 'vamean0.75': [np.float64(0.9721664377635657), np.float64(0.9342949216224932), np.float64(1.0), np.float64(0.3726626781197031), np.float64(0.9127417027417029), np.float64(0.9001052778568539)], 'vamean1.0': [np.float64(0.9970043249742379), np.float64(0.8753108920711673), np.float64(1.0), np.float64(0.36418346276806324), np.float64(0.8668831168831168), np.float64(0

#### Base Gaussian model

In [None]:
# NOTE(review): second exact copy of the "Base Gaussian model" cell executed as
# In [91] (same directory, same arguments); it has no execution count.
# Builds the LaTeX table of mean validation accuracy per Gaussian-noise std.
baseDirectory = os.path.abspath(os.path.join('../..')) + "/PerformanceTesting/savedData/noiseTrainingGaussian2/"
fileNames = createListOfNames("gaussian", baseDirectory)
df = createDataFrameFromFileNames(fileNames, includeTrain=False, useStd=False, useMean=True, includedParameters = ["0.01", "0.25", "0.50", "0.75", "1.0", "1.25"])
print(dataFrameToOverleafTableBody(df))

{'Model': ['Sort', 'Reverse', 'Hist', 'Most-Freq', 'Dyck1', 'Dyck2'], 'vamean0.01': [np.float64(0.9540453635694188), np.float64(0.9989399293286219), np.float64(1.0), np.float64(0.36214993118759575), np.float64(0.9122077922077922), np.float64(0.9297688791829039)], 'vamean0.25': [np.float64(0.9926956694855734), np.float64(0.9134318785882991), np.float64(1.0), np.float64(0.3772673303681378), np.float64(0.8449639249639249), np.float64(0.8932495121866124)], 'vamean0.50': [np.float64(0.9860010190514107), np.float64(1.0), np.float64(1.0), np.float64(0.35776090066823507), np.float64(0.852914862914863), np.float64(0.9116499233369891)], 'vamean0.75': [np.float64(0.9721664377635657), np.float64(0.9342949216224932), np.float64(1.0), np.float64(0.3726626781197031), np.float64(0.9127417027417029), np.float64(0.9001052778568539)], 'vamean1.0': [np.float64(0.9970043249742379), np.float64(0.8753108920711673), np.float64(1.0), np.float64(0.36418346276806324), np.float64(0.8668831168831168), np.float64(0