# Libraries

In [69]:
import cobra as cb
import logging
logging.basicConfig(filename="log.txt"  , level=logging.INFO)
import os
import pandas as pd
import numpy as np
import itertools
from itertools import combinations_with_replacement
import scipy.stats as stats
from statsmodels.stats.multitest import fdrcorrection

# Set up

In [52]:
# Names of the models to analyse. loadModels compares each name with the
# file names (extension stripped) found in the models folder.
modelNames = ["ENGRO 1", "ENGRO 2"]

In [53]:
##############################################
# Create a folder, together with any missing
# parent folder, if it doesn't already exist
# Parameters
# - path --> new folder path
##############################################
def createFolder(path):
    if not os.path.exists(path):
        # makedirs also creates missing parents; exist_ok covers the case
        # where the folder appears between the check above and this call.
        os.makedirs(path, exist_ok=True)

# Loading models

In [54]:
##############################################
# Load the models files (.xml)
# Parameters
# - modelNames --> list of models names that must
# match the file names (without extension)
# - modelFolder --> the folder containing the 
# models files
# Returns a dict modelName --> loaded model
# Raises
# - FileNotFoundError --> no file in modelFolder has
# the model name as file name
# - ImportError --> files with the model name exist,
# but none of them has the .xml extension
##############################################
def loadModels(modelNames, modelFolder):
    models = {}
    if not modelNames:
        return models

    # List the folder once and remember every extension found for each
    # file name, so a sibling file (e.g. name.json) cannot hide name.xml.
    extensionsByName = {}
    for file in os.listdir(modelFolder):
        filename, extension = os.path.splitext(file)
        extensionsByName.setdefault(filename, set()).add(extension)

    for modelName in modelNames:
        if modelName not in extensionsByName:
            raise FileNotFoundError('File not found')
        if ".xml" not in extensionsByName[modelName]:
            raise ImportError('Model file extension not supported')
        # os.path.join works whether or not modelFolder ends with a separator
        models[modelName] = cb.io.read_sbml_model(os.path.join(modelFolder, modelName + ".xml"))
    return models

In [55]:
# Load every model, then store for each model name the ids of its
# reactions, in the order in which the model lists them.
modelsDict = loadModels(modelNames, "../../models/")
modelReactionsDict = {
    modelName: [reaction.id for reaction in modelsDict[modelName].reactions]
    for modelName in modelNames
}

# Similarity tests

In [64]:
##############################################
# Two-sample Kolmogorov-Smirnov test
# (wrapper around scipy.stats.ks_2samp)
# Parameters
# - s1 --> first sample
# - s2 --> second sample
# Returns the scipy result object (similarityTest
# reads its pvalue attribute)
##############################################
def kstest(s1, s2):
    result = stats.ks_2samp(s1, s2)
    return result

##############################################
# Mann-Whitney U test
# (wrapper around scipy.stats.mannwhitneyu)
# Parameters
# - s1 --> first sample
# - s2 --> second sample
# Returns the scipy result object (similarityTest
# reads its pvalue attribute)
##############################################
def mannwhitney(s1, s2):
    result = stats.mannwhitneyu(s1, s2)
    return result

##############################################
# It creates a pairs list for statistical tests with
# the files created by the sampling notebook. Each pair
# refers to two dataset with equal number 
# of samples, but from different executions.
# Parameters
# - names --> list of file names as ("nsamples_executionIndex_algorithm.csv")
##############################################
def testPairsCreator(algorithms, samples, executionsPerSamples, executionsCbs3):
    
    testNames = {}
    
    for algorithm in algorithms:
        testNames[algorithm] = []
        names = []
        if(algorithm == "cbs3"):
            for j in range(0, executionsCbs3):
                    names.append(str(j) + "_" + "0_"+ algorithm + ".csv/")
            temp_list = list(itertools.combinations_with_replacement(names,2))
            temp_list=[(el[0],el[1]) for el in temp_list if el[0]!=el[1]]
            testNames[algorithm].extend(temp_list)
        else:
            for i in samples:
                for j in range(0, executionsPerSamples):
                    names.append(str(i) + "_" + str(j) + "_"+ algorithm + ".csv")
            temp = []
            last_nsample = names[0].split("_")[0]
            for name in names:
                nsample = name.split("_")[0]
                if(nsample == last_nsample):
                    temp.append(name + "/")
                    last_nsample = nsample
                else:
                    temp_list = list(itertools.combinations_with_replacement(temp,2))
                    temp_list=[(el[0],el[1]) for el in temp_list if el[0]!=el[1]]
                    testNames[algorithm].extend(temp_list)
                    temp = []
                    temp.append(name + "/")
                    last_nsample = nsample
            temp_list = list(itertools.combinations_with_replacement(temp,2))
            temp_list=[(el[0],el[1]) for el in temp_list if el[0]!=el[1]]
            testNames[algorithm].extend(temp_list)
    return testNames


##############################################
# Run the similarity tests on every pair of sample
# files and write, for each model / test / algorithm /
# thinning, a CSV with one row per pair and one
# p-value per reaction.
# Parameters
# - modelNames --> list of models names
# - modelsDict --> dict of the loaded models (not used here)
# - modelReactionsDict --> dict modelName --> list of reaction ids
# - pairList --> dict algorithm --> list of file name pairs
# (as returned by testPairsCreator)
# - tests --> list of test names ("kstest" or "mannwhitney")
# - thinningsDict --> dict algorithm --> list of thinning values
# - elementPath --> folder containing the samples
# - resultPath --> folder where the results are written
# Raises
# - NotImplementedError --> a test name is not supported
##############################################
def similarityTest(modelNames, modelsDict, modelReactionsDict, pairList, tests, thinningsDict,
                   elementPath, resultPath):

    testFunctions = {"kstest": kstest, "mannwhitney": mannwhitney}
    # Reject unsupported tests before any folder is created or file is read
    for test in tests:
        if test not in testFunctions:
            raise NotImplementedError('Algorithm not supported')

    createFolder(resultPath)

    for modelName in modelNames:
        createFolder(os.path.join(resultPath, modelName))
        reactions = modelReactionsDict[modelName]
        nReactions = len(reactions)
        dfColumns = ['testName']
        dfColumns.extend(reactions)
        for algorithm in pairList:
            # "cbs3" samples live in "...groupedByN" folders, the others in "...ThinningN"
            if(algorithm == "cbs3"):
                folderKind = "groupedBy"
            else:
                folderKind = "Thinning"
            for thinning in thinningsDict[algorithm]:
                runName = algorithm + folderKind + str(thinning)
                samplePath = os.path.join(elementPath, modelName, runName)
                for test in tests:
                    testFunction = testFunctions[test]
                    createFolder(os.path.join(resultPath, modelName, test))
                    rows = []
                    for pair in pairList[algorithm]:
                        # File names in a pair carry a trailing "/"
                        fileName1 = pair[0].rstrip("/")
                        fileName2 = pair[1].rstrip("/")
                        nameParts1 = fileName1.split('_')
                        nameParts2 = fileName2.split('_')
                        label = (nameParts1[0] + "_" + nameParts1[1] +
                                 "_" + nameParts2[0] + "_" + nameParts2[1])
                        filePath1 = os.path.join(samplePath, fileName1)
                        filePath2 = os.path.join(samplePath, fileName2)
                        logging.info("Executing test: " + label)
                        logging.info("Files: " + filePath1 +
                              " and " + filePath2)
                        s1 = pd.read_csv(filePath1, index_col = 0)
                        s2 = pd.read_csv(filePath2, index_col = 0)
                        # Columns are compared by position: column h of both
                        # files is taken as reaction h of the model.
                        row = [label]
                        for h in range(nReactions):
                            row.append(testFunction(s1.iloc[:, h], s2.iloc[:, h]).pvalue)
                        rows.append(row)
                    # Build the frame once instead of growing it row by row
                    resultDf = pd.DataFrame(rows, columns = dfColumns)
                    resultDf.set_index('testName').to_csv(os.path.join(resultPath, modelName, test, runName + ".csv"))

In [10]:
# Full-scale run: sample sizes 1000, 2000, ..., 30000 with
# 20 executions each, and 20 executions of cbs3.
algorithms = ["achr", "optgp", "chrr", "cbs3"]

samplesNList = list(range(1000, 30001, 1000))

executionsPerSampleSize = 20
executionsCbs3 = 20

thinningsDict = {
    "achr": [1, 10, 100],
    "optgp": [1, 10, 100],
    "chrr": [1, 10, 100],
    "cbs3": [1000],
}

pairList = testPairsCreator(algorithms, samplesNList, executionsPerSampleSize, executionsCbs3)

tests = ["kstest", "mannwhitney"]

samplesFolder = "../../samples/"
resultFolder = "../../results/FDR/"

similarityTest(
    modelNames,
    modelsDict,
    modelReactionsDict,
    pairList,
    tests,
    thinningsDict,
    samplesFolder,
    resultFolder,
)


In [57]:
# Reduced run: sample sizes 1000, 2000, 3000, 4000 with
# 3 executions each, and 3 executions of cbs3.
algorithms = ["achr", "optgp", "chrr", "cbs3"]

samplesNList = list(range(1000, 5000, 1000))

executionsPerSampleSize = 3
executionsCbs3 = 3

thinningsDict = {
    "achr": [1, 2, 3],
    "optgp": [1, 2, 3],
    "chrr": [1, 2, 3],
    "cbs3": [1000],
}

pairList = testPairsCreator(algorithms, samplesNList, executionsPerSampleSize, executionsCbs3)

tests = ["kstest", "mannwhitney"]

samplesFolder = "../../samples/"
resultFolder = "../../results/FDR/"

similarityTest(
    modelNames,
    modelsDict,
    modelReactionsDict,
    pairList,
    tests,
    thinningsDict,
    samplesFolder,
    resultFolder,
)


# Fold change and fluxes mean

In [58]:
##############################################
# For every file in samplesFolder, compute the mean
# of each column and save it, under the same file
# name, in resultFolder (one row per column, single
# column named "mean"). Sub-folders are skipped.
# Parameters
# - samplesFolder --> folder containing the samples files (.csv)
# - resultFolder --> existing folder where the means are written
##############################################
def mean(samplesFolder, resultFolder):
    for entry in os.listdir(samplesFolder):
        sourceFile = os.path.join(samplesFolder, entry)
        if not os.path.isfile(sourceFile):
            continue
        samples = pd.read_csv(sourceFile, index_col = 0)
        columnMeans = pd.DataFrame(samples.mean(), columns = [ "mean"])
        columnMeans.to_csv(os.path.join(resultFolder, entry))

# Return the first nReactions values of the "mean" column of a file
# written by mean(); cache keeps each file from being read once per pair.
def _readMeanColumn(folder, fileName, nReactions, cache):
    if fileName not in cache:
        values = pd.read_csv(os.path.join(folder, fileName), index_col = 0)["mean"].to_numpy(dtype = float)
        if len(values) < nReactions:
            raise IndexError("mean file " + fileName + " has fewer rows than the model has reactions")
        cache[fileName] = values[:nReactions]
    return cache[fileName]


# Element-wise |a - b| / |b|; where b is 0 the result is |a| instead.
def _relativeChange(a, b):
    zeroReference = (b == 0)
    # Replace zero denominators with 1 so the division never warns; those
    # positions are overwritten with |a| by the outer np.where anyway.
    safeB = np.where(zeroReference, 1.0, b)
    return np.where(zeroReference, np.abs(a), np.abs((a - b) / safeB))


##############################################
# Compute the mean flux of every sample file and, for
# every pair of files, the fold change between the two
# means of each reaction: |a - b| / |b|, or |a| when b is 0
# (a and b being the means in the first and second file).
# One CSV of means is written per sample file, and one CSV
# of fold changes (a row per pair, a column per reaction)
# per model / algorithm / thinning.
# Parameters
# - modelNames --> list of models names
# - modelsDict --> dict of the loaded models (not used here)
# - modelReactionsDict --> dict modelName --> list of reaction ids
# - pairList --> dict algorithm --> list of file name pairs
# (as returned by testPairsCreator)
# - thinningsDict --> dict algorithm --> list of thinning values
# - samplesFolder --> folder containing the samples
# - resultPathMean --> folder where the means are written
# - resultPathFc --> folder where the fold changes are written
# Raises
# - IndexError --> a mean file has fewer rows than the
# model has reactions
##############################################
def foldChange(modelNames, modelsDict, 
               modelReactionsDict, pairList,
               thinningsDict,
               samplesFolder, resultPathMean, resultPathFc):
    
    createFolder(resultPathMean)
    createFolder(resultPathFc)
    
    for modelName in modelNames:
        logging.info("Computing fold change: " + modelName)
        createFolder(os.path.join(resultPathMean, modelName))
        createFolder(os.path.join(resultPathFc, modelName))
        reactions = modelReactionsDict[modelName]
        nReactions = len(reactions)
        dfColumns = ['testName']
        dfColumns.extend(reactions)
        for algorithm in pairList:
            for thinning in thinningsDict[algorithm]:
                # "cbs3" samples live in "...groupedByN" folders, the others in "...ThinningN"
                if(algorithm == "cbs3"):
                    logging.info("Computing means: " + algorithm + " - groupedBy" + str(thinning))
                    runName = algorithm + "groupedBy" + str(thinning)
                else:
                    logging.info("Computing means: " + algorithm + " - thinning" + str(thinning))
                    runName = algorithm + "Thinning" + str(thinning)
                meanFolder = os.path.join(resultPathMean, modelName, runName)
                createFolder(meanFolder)
                mean(os.path.join(samplesFolder, modelName, runName), meanFolder)

                # The cache is created after mean() has (re)written the files
                # and lives for this run only, so it never serves stale data.
                meansCache = {}
                rows = []
                for pair in pairList[algorithm]:
                    # File names in a pair carry a trailing "/"
                    fileName1 = pair[0].rstrip("/")
                    fileName2 = pair[1].rstrip("/")
                    nameParts1 = fileName1.split('_')
                    nameParts2 = fileName2.split('_')
                    label = (nameParts1[0] + "_" + nameParts1[1] + "_" + 
                             nameParts2[0] + "_" + nameParts2[1])
                    a = _readMeanColumn(meanFolder, fileName1, nReactions, meansCache)
                    b = _readMeanColumn(meanFolder, fileName2, nReactions, meansCache)
                    # Rows are matched by position: row i is taken as reaction i
                    rows.append([label] + _relativeChange(a, b).tolist())
                # Build the frame once instead of growing it row by row
                resultDf = pd.DataFrame(rows, columns = dfColumns)
                resultDf.set_index('testName').to_csv(os.path.join(resultPathFc, modelName, 
                                                                   runName + ".csv"))

In [59]:
algorithms = ["achr", "optgp", "chrr", "cbs3"]

# Full-scale configuration, kept as an inert string literal: it is
# never executed, only the reduced configuration below is used.
'''samplesNList = []
for i in range(1000, 30001, 1000):
    samplesNList.append(i)
    
executionsPerSampleSize = 20
executionsCbs3 = 20

thinningsDict["achr"] = [1, 10, 100]
thinningsDict["optgp"] = [1, 10, 100]
thinningsDict["chrr"] = [1, 10, 100]
thinningsDict["cbs3"] = [1000]'''

# Reduced configuration: sample sizes 1000, 2000, 3000, 4000 with
# 3 executions each, and 3 executions of cbs3.
samplesNList = list(range(1000, 4001, 1000))

executionsPerSampleSize = 3
executionsCbs3 = 3

thinningsDict = {
    "achr": [1, 2, 3],
    "optgp": [1, 2, 3],
    "chrr": [1, 2, 3],
    "cbs3": [1000],
}

pairList = testPairsCreator(algorithms, samplesNList, executionsPerSampleSize, executionsCbs3)

samplesFolder = "../../samples/"
resultFolderMean = "../../results/mean/"
resultFolderFc = "../../results/foldChange/"

foldChange(
    modelNames,
    modelsDict,
    modelReactionsDict,
    pairList,
    thinningsDict,
    samplesFolder,
    resultFolderMean,
    resultFolderFc,
)


# False Discovery Rate (FDR)

In [85]:
##############################################
# For every model / test / algorithm / thinning, read the
# p-values written by similarityTest and compute, for each
# reaction, the fraction of tests that are significant
# (p-value < alpha and, if foldChange is set, fold change > 0.2).
# For "cbs3" one fraction over all the tests is computed;
# for the other algorithms one fraction per sample size,
# taking the tests in consecutive groups of
# len(pairList[algorithm]) / len(samplesNList).
# A "Total" column holds the sum of the fractions of the
# considered reactions divided by their number.
# Results are written in
# pvalueFolder/modelName/test/analysis/filterName.
# Parameters
# - modelNames --> list of models names
# - modelsDict --> dict of the loaded models
# - modelReactionsDict --> dict modelName --> list of reaction ids
# - tests --> list of test names
# - samplesNList --> list of sample sizes
# - pairList --> dict algorithm --> list of file name pairs
# - thinningsDict --> dict algorithm --> list of thinning values
# - executionsPerSampleSize, executionsCbs3 --> not used here
# - pvalueFolder --> folder containing the p-values
# - foldChangeFolder --> folder containing the fold changes
# - filterName --> name of the output sub-folder
# - avoidBlockedReactions --> if True, blocked reactions
# are left out of "Total"
# - alpha --> significance threshold
# - toAdjust --> if True, p-values are corrected with
# fdrcorrection before the comparison with alpha
# - foldChange --> if True, the fold change filter is applied
# (inside this function the name hides the foldChange function)
##############################################
def FDR(modelNames, modelsDict, modelReactionsDict, tests, samplesNList, pairList, thinningsDict, executionsPerSampleSize, executionsCbs3 ,
        pvalueFolder, foldChangeFolder, filterName, avoidBlockedReactions, alpha=0.01, toAdjust=False, foldChange=False):
    
    for modelName in modelNames:
        reactions = modelReactionsDict[modelName]
        nReaction = len(reactions)

        # Blocked reactions depend only on the model: compute them once per
        # model. When they are not excluded the count is 0, so the "Total"
        # denominator is always defined.
        if(avoidBlockedReactions):
            blockedList = cb.flux_analysis.find_blocked_reactions(modelsDict[modelName])
            nBlockedR = len(blockedList)
            blockedR = set(blockedList)
        else:
            nBlockedR = 0
            blockedR = set()
        # Positions (within the model reaction list) that contribute to "Total"
        keptPositions = [i for i, reactionId in enumerate(reactions) if reactionId not in blockedR]

        columns = ["Samples"]
        columns.extend(reactions)
        columns.append("Total")

        for test in tests:
            analysisFolder = os.path.join(pvalueFolder, modelName, test, "analysis")
            createFolder(analysisFolder)
            createFolder(os.path.join(analysisFolder, filterName))
            for algorithm in pairList:
                isCbs3 = (algorithm == "cbs3")
                numberOfTestPerNSample = len(pairList[algorithm])/len(samplesNList)
                for thinning in thinningsDict[algorithm]:
                    if(isCbs3):
                        fileName = algorithm + "groupedBy" + str(thinning) + ".csv"
                    else:
                        fileName = algorithm + "Thinning" + str(thinning) + ".csv"
                    dfPvalues = pd.read_csv(os.path.join(pvalueFolder, modelName, test, fileName), index_col = 0)
                    if(foldChange):
                        # Read without index_col: rows keep a 0..n-1 index and
                        # are matched by position with the p-values rows.
                        dfFoldChange = pd.read_csv(os.path.join(foldChangeFolder, modelName, fileName))
                    df = pd.DataFrame(columns = columns)

                    for columnName in dfPvalues.columns:
                        if(toAdjust):
                            column = fdrcorrection(dfPvalues[columnName], alpha)[1]
                        else:
                            column = dfPvalues[columnName]
                        if(foldChange):
                            foldChangeColumn = dfFoldChange[columnName]
                        res = []
                        cont = 0
                        i = 0
                        for cell_index, cell in enumerate(column):
                            significant = cell < alpha
                            if(foldChange):
                                significant = significant and foldChangeColumn.iloc[cell_index] > 0.2
                            if(significant):
                                cont = cont + 1
                            i = i + 1
                            # Close a group every numberOfTestPerNSample tests
                            if(not isCbs3 and i == numberOfTestPerNSample):
                                res.append(cont/numberOfTestPerNSample)
                                cont = 0
                                i = 0
                        if(isCbs3):
                            res.append(cont/len(column))
                        df[columnName] = res

                    if(isCbs3):
                        df["Samples"] = [0]
                    else:
                        df["Samples"] = list(samplesNList)
                    df = df.set_index("Samples")

                    totals = []
                    for index, row in df.iterrows():
                        summ = 0
                        for i in keptPositions:
                            # .iloc: explicit positional access (plain row[i] on a
                            # string-labelled row is deprecated in recent pandas)
                            summ = summ + row.iloc[i]
                        totals.append(summ/(nReaction - nBlockedR))
                    df["Total"] = totals
                    df.to_csv(os.path.join(analysisFolder, filterName, fileName))

In [86]:
algorithms = ["achr", "optgp", "chrr", "cbs3"]

# Full-scale configuration, kept as an inert string literal: it is
# never executed, only the reduced configuration below is used.
'''samplesNList = []
for i in range(1000, 30001, 1000):
    samplesNList.append(i)
    
executionsPerSampleSize = 20
executionsCbs3 = 20

thinningsDict["achr"] = [1, 10, 100]
thinningsDict["optgp"] = [1, 10, 100]
thinningsDict["chrr"] = [1, 10, 100]
thinningsDict["cbs3"] = [1000]'''

tests = ["kstest", "mannwhitney"]

# Reduced configuration: sample sizes 1000, 2000, 3000, 4000 with
# 3 executions each, and 3 executions of cbs3.
samplesNList = list(range(1000, 4001, 1000))

executionsPerSampleSize = 3
executionsCbs3 = 3

thinningsDict = {
    "achr": [1, 2, 3],
    "optgp": [1, 2, 3],
    "chrr": [1, 2, 3],
    "cbs3": [1000],
}

pairList = testPairsCreator(algorithms, samplesNList, executionsPerSampleSize, executionsCbs3)

pvalueFolder = "../../results/FDR/"
foldChangeFolder = "../../results/foldChange/"

avoidBlockedReactions = True

filters = ["adjusted/", "adjustedFoldChange/",  "notAdjusted/" ,"notAdjustedFoldChange/"]
# (adjusted, fold change) flags of each entry of filters, in the same order
filterFlags = [(True, False), (True, True), (False, False), (False, True)]

# Run the analysis once for each filter
for i, filterName in enumerate(filters):
    bool0, bool1 = filterFlags[i]  # bool0: adjusted, bool1: fc

    FDR(
        modelNames,
        modelsDict,
        modelReactionsDict,
        tests,
        samplesNList,
        pairList,
        thinningsDict,
        executionsPerSampleSize,
        executionsCbs3,
        pvalueFolder,
        foldChangeFolder,
        filterName,
        avoidBlockedReactions,
        0.01,
        bool0,
        bool1,
    )
