In [95]:
#This method obtains superclasses of attributes passed as lists
import csv

def inferredData(theGene):
    """Expand a list of attributes with the subsumers listed for each one.

    Looks attributes up in the tab-separated file ``GO_AllSubsumers.tsv``
    (opened relative to the current working directory). The first row of the
    file is skipped as a header. For every data row, the first column is
    compared with the attribute and the second column is the value added.

    Args:
        theGene: Iterable of attribute identifiers. They are compared with
            the strings read from the first column of the file, so they are
            expected to be (hashable) strings.

    Returns:
        A new list holding, for each attribute in order, the attribute itself
        followed by the second-column value of every row whose first column
        equals it (in file order). An empty input yields an empty list
        without touching the file.
    """
    attributes = list(theGene)
    if not attributes:
        # Nothing to look up, so there is no reason to open the file.
        return []

    # Only rows for the requested attributes are kept, so memory use does not
    # grow with the size of the whole file.
    wanted = set(attributes)
    subsumers = {}

    # The file is parsed once per call instead of once per attribute.
    with open('GO_AllSubsumers.tsv', newline='') as tsvfile:
        tsvReader = csv.reader(tsvfile, delimiter='\t')

        # Skip the header row; tolerate a completely empty file.
        next(tsvReader, None)

        for line in tsvReader:
            # Blank or one-column rows carry no (term, subsumer) pair.
            if len(line) < 2:
                continue
            if line[0] in wanted:
                subsumers.setdefault(line[0], []).append(line[1])

    newGene = []
    for x in attributes:
        newGene.append(x)
        newGene.extend(subsumers.get(x, ()))
    return newGene

In [96]:
#This method returns jaccard similarity number
def jaccard(term1, term2):
    """Return the Jaccard similarity of two terms based on their subsumers.

    Each term is expanded with ``inferredData`` (the term itself plus every
    subsumer listed for it) and the similarity is the size of the
    intersection of the two expansions divided by the size of their union.

    The expansions are treated as sets, so a value that appears more than
    once in an expansion is counted only once in both the intersection and
    the union.

    Args:
        term1: First attribute identifier.
        term2: Second attribute identifier.

    Returns:
        The intersection-over-union ratio as a float between 0 and 1.
    """
    # Obtain all superclasses of each attribute (the attribute is included).
    ancestors1 = set(inferredData([term1]))
    ancestors2 = set(inferredData([term2]))

    shared = ancestors1 & ancestors2
    combined = ancestors1 | ancestors2

    # Each expansion contains at least its own term, so the union is never
    # empty and the division is safe.
    return len(shared) / len(combined)

In [97]:
#This method calculates information content number
import math

def informationContent(number):
    """Return the information content of ``number``.

    Args:
        number: The value to score; ``resnik`` passes a frequency between
            0 and 1.

    Returns:
        ``0`` when ``number`` is 0, otherwise the negated base-2 logarithm of
        ``number``.
    """
    # Zero is mapped to 0 instead of being handed to the logarithm.
    return -(math.log(number, 2)) if number != 0 else 0

In [98]:
#This method computes the Resnik similarity
def resnik(term1, term2, geneA, geneB, geneC):
    """Return the Resnik similarity of two terms over a three-gene data set.

    Both terms and all three genes are expanded with ``inferredData``. For
    every entry of the first term's expansion that also occurs in the second
    term's expansion, the number of genes whose inferred data contains it is
    divided by 3 and scored with ``informationContent``. The largest score is
    returned.

    Args:
        term1: First attribute identifier.
        term2: Second attribute identifier.
        geneA: Attribute list of the first gene in the data set.
        geneB: Attribute list of the second gene in the data set.
        geneC: Attribute list of the third gene in the data set.

    Returns:
        The highest information content found, or ``0`` when no score is
        greater than zero.
    """
    # Inferred data of every gene in the data set.
    corpus = [inferredData(geneA), inferredData(geneB), inferredData(geneC)]

    # Each term together with all of its superclasses.
    ancestors1 = inferredData([term1])
    ancestors2 = inferredData([term2])

    best = 0
    for ancestor in ancestors1:
        # Entries not shared by both terms keep a gene count of zero.
        hits = 0
        if ancestor in ancestors2:
            hits = sum(ancestor in annotated for annotated in corpus)

        # Frequency over the three genes, turned into information content.
        score = informationContent(hits / 3)

        # Keep only the most informative score seen so far.
        if score > best:
            best = score

    return best

In [99]:
#This method finds all the pairs of the genes
#and calls the respective method either Jaccard or Resnik
def allPairs(gene1, gene2, Jaccard, gene3):
    """Return the mean similarity over every attribute pair of two genes.

    Every attribute of ``gene1`` is paired with every attribute of ``gene2``.
    Each pair is scored with ``jaccard`` or ``resnik`` and the scores are
    averaged over ``len(gene1) * len(gene2)`` pairs.

    Args:
        gene1: Attribute list of the first gene.
        gene2: Attribute list of the second gene.
        Jaccard: Truthy to score pairs with ``jaccard``; falsy to score them
            with ``resnik``.
        gene3: Attribute list of the third gene; only used by ``resnik``,
            which receives ``gene1``, ``gene2`` and ``gene3`` as its data set.

    Returns:
        The mean pair similarity, or ``0.0`` when either gene has no
        attributes (there are no pairs to average).
    """
    # Denominator is the length of gene1 times the length of gene2.
    denominator = len(gene1) * len(gene2)
    if denominator == 0:
        # No pairs exist; avoid dividing by zero.
        return 0.0

    total = 0
    for x in gene1:
        for y in gene2:
            if Jaccard:
                total += jaccard(x, y)
            else:
                # The function defined in this file is spelled "resnik".
                total += resnik(x, y, gene1, gene2, gene3)

    return total / denominator

In [100]:
#This method finds the Best Pairs of the genes
#and calls the respective method either Jaccard or Resnik
def bestPairs(gene1, gene2, jacc, gene3):
    """Return the mean best-match similarity of ``gene1`` against ``gene2``.

    For each attribute of ``gene1`` the highest similarity to any attribute
    of ``gene2`` is taken. These per-attribute best scores are summed and
    divided by ``len(gene1)``.

    Args:
        gene1: Attribute list of the gene whose attributes are matched.
        gene2: Attribute list of the gene searched for the best match.
        jacc: Truthy to score pairs with ``jaccard``; falsy to score them
            with ``resnik``.
        gene3: Attribute list of the third gene; only used by ``resnik``,
            which receives ``gene1``, ``gene2`` and ``gene3`` as its data set.

    Returns:
        The average of the per-attribute best scores, or ``0.0`` when
        ``gene1`` has no attributes.
    """
    if len(gene1) == 0:
        # Nothing to match; avoid dividing by zero.
        return 0.0

    def score(x, y):
        # Dispatch to the similarity measure selected by the caller.
        if jacc:
            return jaccard(x, y)
        return resnik(x, y, gene1, gene2, gene3)

    total = 0
    for x in gene1:
        # The best match is tracked separately for every gene1 attribute; an
        # attribute with nothing to match against contributes 0.
        total += max((score(x, y) for y in gene2), default=0)

    return total / len(gene1)

In [101]:
# Example genes, each written as a list of its attributes
geneA = ["GO_0016020", "GO_0003677"]
geneB = ["GO_0016021"]
geneC = ["GO_0003677"]

# Question 1: Jaccard, all pairs, geneA against geneC
result1 = allPairs(gene1=geneA, gene2=geneC, Jaccard=True, gene3=geneB)
print("The Jaccard All Pairs between GeneA and Gene C is: ", result1)

# Question 2: Jaccard, best pairs, geneA against geneC
result2 = bestPairs(gene1=geneA, gene2=geneC, jacc=True, gene3=geneB)
print("The Jaccard Best Pairs between GeneA and Gene C is: ", result2)

# Question 3: Resnik, best pairs, geneA against geneB (geneC is the third gene)
result3 = bestPairs(gene1=geneA, gene2=geneB, jacc=False, gene3=geneC)
print("The Reznick Best Pairs between GeneA and Gene B is: ", result3)

The Jaccard All Pairs between GeneA and Gene C is:  0.5
The Jaccard Best Pairs between GeneA and Gene C is:  0.5
The Reznick Best Pairs between GeneA and Gene B is:  0.29248125036057815
