# Libraries

In [2]:
import cobra as cb
import logging
logging.basicConfig(filename="log.txt"  , level=logging.INFO)
import os
import pandas as pd
import numpy as np
import itertools
from itertools import combinations_with_replacement
import scipy.stats 

# Set up

In [3]:
# Names of the models to analyse; loadModels matches each one
# against the file names (without extension) in the models folder.
modelNames = [
    "ENGRO 1",
    "ENGRO 2",
]

In [4]:
##############################################
# Create a folder if it doesn't already exist.
# Missing parent folders are created as well.
# Parameters
# - path --> new folder path
##############################################
def createFolder(path):
    # Nothing to do when something already lives at this path.
    if not os.path.exists(path):
        # makedirs builds the intermediate folders too; exist_ok keeps the
        # call safe if the folder appears between the check and the creation.
        os.makedirs(path, exist_ok=True)

# Loading models

In [5]:
##############################################
# Load the models files (.xml)
# Parameters
# - modelNames --> list of models names that must
# match the file names (without extension)
# - modelFolder --> the folder containing the
# models files
# Returns
# - dictionary mapping each model name to the model
# read by cobra from "<modelFolder>/<modelName>.xml"
# Raises
# - FileNotFoundError --> no file in modelFolder has
# the requested name
# - ImportError --> files with the requested name exist,
# but none of them has the .xml extension
##############################################
def loadModels(modelNames, modelFolder):
    models = {}
    if not modelNames:
        return models

    # The folder content does not change between models: list it once.
    files = os.listdir(modelFolder)
    for modelName in modelNames:
        # Collect the extensions of every file sharing the model name, so a
        # sibling file (e.g. "<name>.json") cannot hide the .xml one.
        extensions = [extension
                      for filename, extension in map(os.path.splitext, files)
                      if filename == modelName]
        if not extensions:
            raise FileNotFoundError('File not found')
        if ".xml" not in extensions:
            raise ImportError('Model file extension not supported')
        # os.path.join works whether or not modelFolder ends with a separator.
        models[modelName] = cb.io.read_sbml_model(os.path.join(modelFolder, modelName + ".xml"))
    return models

In [6]:
# Load every model listed in modelNames, then record for each model
# the ids of its reactions, in the order the model exposes them.
modelsDict = loadModels(modelNames, "../../models/")
modelReactionsDict = {
    name: [rxn.id for rxn in modelsDict[name].reactions]
    for name in modelNames
}

# KLD analysis

In [12]:
##############################################
# It creates a pairs list for statistical tests with
# the files created by the sampling notebook. Each pair
# refers to two dataset with equal number 
# of samples, but from different executions.
# Parameters
# - names --> list of file names as ("nsamples_executionIndex_algorithm.csv")
##############################################
def testPairsCreator(algorithms, samples, executionsPerSamples, executionsCbs3):
    
    testNames = {}
    
    for algorithm in algorithms:
        testNames[algorithm] = []
        names = []
        if(algorithm == "cbs3"):
            for j in range(0, executionsCbs3):
                    names.append(str(j) + "_" + "0_"+ algorithm + ".csv/")
            temp_list = list(itertools.combinations_with_replacement(names,2))
            temp_list=[(el[0],el[1]) for el in temp_list if el[0]!=el[1]]
            testNames[algorithm].extend(temp_list)
        else:
            for i in samples:
                for j in range(0, executionsPerSamples):
                    names.append(str(i) + "_" + str(j) + "_"+ algorithm + ".csv")
            temp = []
            last_nsample = names[0].split("_")[0]
            for name in names:
                nsample = name.split("_")[0]
                if(nsample == last_nsample):
                    temp.append(name + "/")
                    last_nsample = nsample
                else:
                    temp_list = list(itertools.combinations_with_replacement(temp,2))
                    temp_list=[(el[0],el[1]) for el in temp_list if el[0]!=el[1]]
                    testNames[algorithm].extend(temp_list)
                    temp = []
                    temp.append(name + "/")
                    last_nsample = nsample
            temp_list = list(itertools.combinations_with_replacement(temp,2))
            temp_list=[(el[0],el[1]) for el in temp_list if el[0]!=el[1]]
            testNames[algorithm].extend(temp_list)
    return testNames

##############################################
# Average the values of consecutive, non-overlapping
# chunks of chunkSize elements. A trailing incomplete
# chunk is ignored.
# Parameters
# - values --> list of numbers
# - chunkSize --> number of elements of each chunk
##############################################
def _chunkMeans(values, chunkSize):
    if chunkSize <= 0:
        return []
    return [sum(values[start:start + chunkSize]) / chunkSize
            for start in range(0, len(values) - chunkSize + 1, chunkSize)]

##############################################
# It analyses the KLD coefficient produced by kld.r
# to create a dataframe in which each row contains the
# average KLD coefficients for each model reaction:
# one row per sample size (a single row labelled 0 for
# "cbs3"), averaged over the tested couples of samples.
# The last column ("Total") of the dataframe contains the
# cross-reaction mean of the row.
# For every model, algorithm and thinning the input is read from
# <kldFolder>/<model>/<algorithm>Thinning<thinning>/kld.csv
# ("groupedBy" replaces "Thinning" for "cbs3"; the second CSV
# column is used as index and "unnamed" columns are dropped) and
# the result is written to analysis/kld.csv in the same folder.
# The i-th column of kld.csv is taken as the i-th reaction of
# modelReactionsDict[modelName].
# Parameters
# - modelNames --> list of modelNames
# - modelsDict --> models dictionary
# - modelReactionsDict --> models reactions dictionary
# - samplesNList --> samples list
# - pairList --> tested pairs dictionary (algorithm --> pairs list)
# - thinningsDict --> thinnings dictionary
# - executionsPerSampleSize --> executions per sample size
# (not used here, kept for interface compatibility)
# - executionsCbs3 --> executions of the CBS3 algorithm
# (not used here, kept for interface compatibility)
# - kldFolder --> folder of the KLD coefficients
# - avoidBlockedReactions --> if True, the cross-reaction mean
# does not take into account the coefficients of blocked reactions;
# if False it is the mean over all the model reactions
# Raises
# - ValueError --> the tested pairs of an algorithm cannot be
# split evenly among the sample sizes
##############################################
def kldAnalysis(modelNames, modelsDict,
               modelReactionsDict,
               samplesNList,
               pairList,
               thinningsDict,
               executionsPerSampleSize, executionsCbs3, kldFolder, avoidBlockedReactions):

    for modelName in modelNames:
        reactionIds = modelReactionsDict[modelName]

        # The blocked reactions depend on the model only, so they are
        # searched once per model (and only when they are needed).
        if avoidBlockedReactions:
            blockedR = set(cb.flux_analysis.find_blocked_reactions(modelsDict[modelName]))
            keptPositions = [position for position, reactionId in enumerate(reactionIds)
                             if reactionId not in blockedR]
        else:
            keptPositions = list(range(len(reactionIds)))

        for algorithm in pairList:
            isCbs3 = (algorithm == "cbs3")

            if not isCbs3:
                nPairs = len(pairList[algorithm])
                if not samplesNList or nPairs % len(samplesNList) != 0:
                    raise ValueError("The tested pairs of '" + algorithm
                                     + "' cannot be split evenly among the sample sizes")
                # Rows of kld.csv belonging to each sample size.
                numberOfTestPerNSample = nPairs // len(samplesNList)

            for thinning in thinningsDict[algorithm]:
                runFolder = os.path.join(kldFolder, modelName,
                                         algorithm + ("groupedBy" if isCbs3 else "Thinning") + str(thinning))
                createFolder(os.path.join(runFolder, "analysis"))
                df = pd.read_csv(os.path.join(runFolder, "kld.csv"), index_col = 1)
                df = df.drop(columns = df.columns[df.columns.str.contains('unnamed', case = False)])

                # Per-reaction means: one value per sample size, or a single
                # value over all the rows for "cbs3".
                columnMeans = {}
                for column in df.columns:
                    values = df[column].tolist()
                    if isCbs3:
                        # Divide by the number of rows (not by the length
                        # of the column name).
                        columnMeans[column] = [sum(values) / len(values)]
                    else:
                        columnMeans[column] = _chunkMeans(values, numberOfTestPerNSample)

                sampleList = [0] if isCbs3 else list(samplesNList)
                dfResult = pd.DataFrame(columnMeans, index = pd.Index(sampleList, name = "Samples"))

                # Cross-reaction mean: the divisor is the number of
                # reactions actually summed. Columns are accessed by
                # position because they follow the model reactions order.
                dfResult["Total"] = [
                    sum(row.iloc[position] for position in keptPositions) / len(keptPositions)
                    for _, row in dfResult.iterrows()
                ]

                dfResult.to_csv(os.path.join(runFolder, "analysis", "kld.csv"))

In [13]:
# Sampling algorithms whose KLD coefficients are analysed.
algorithms = ["achr", "optgp", "chrr", "cbs3"]

# Sample sizes: 1000, 2000, ..., 30000.
samplesNList = list(range(1000, 30001, 1000))

executionsPerSampleSize = 20
executionsCbs3 = 20

# Values used to build the results folder name of each algorithm
# ("<algorithm>Thinning<value>", or "cbs3groupedBy<value>").
thinningsDict = {
    "achr": [1, 10, 100],
    "optgp": [1, 10, 100],
    "chrr": [1, 10, 100],
    "cbs3": [1000],
}

# Couples of sample files compared by the statistical tests.
pairList = testPairsCreator(algorithms, samplesNList, executionsPerSampleSize, executionsCbs3)

kldFolder = "../../results/KLD/"
avoidBlockedReactions = True

kldAnalysis(
    modelNames,
    modelsDict,
    modelReactionsDict,
    samplesNList,
    pairList,
    thinningsDict,
    executionsPerSampleSize,
    executionsCbs3,
    kldFolder,
    avoidBlockedReactions,
)
