### Introduction

In [None]:
# Aim: to produce the resources to facilitate research on gene pleiotropy

In [None]:
# Delivarable #1: Produce lists of Gene Ontologies and their corrected semantic similarity measures

# step #1: download the latest gene ontology annotations from here (make note of the date of download):
# http://current.geneontology.org/products/pages/downloads.html
# when multiple options exist (for example in human), there apply the following rules:
# (a) no isoform,
# (b) yes protein,
# (c) the most abundant annotations
# I suggest to study 6-7 model species, namely: humans, mouse, zebrafish, chicken, C. elegans, D. melanogaster, other?

### Import the data

In [None]:
import pandas as pd

# info on column_labels at: http://geneontology.org/docs/go-annotation-file-gaf-format-2.0/
column_labels = ['DB', 'DB Object ID', 'DB Object Symbol',
                 'Qualifier', 'GO ID', 'DB:Reference (|DB:Reference)',
                 'Evidence Code', 'With (or) From', 'Aspect', 'DB Object Name',
                 'DB Object Synonym (|Synonym)', 'DB Object Type', 'Taxon(|taxon)',
                 'Date', 'Assigned By', 'Annotation Extension', 'Gene Product Form ID']

df = pd.read_csv('goa_human.gaf', comment='!', 
                 sep="\t", header=None, low_memory=False, names=column_labels)

In [None]:
df.head(5)

In [None]:
df.shape

### Filtering

In [None]:
# for each gene (column: DB Object ID), retrieve the gene ontology annotations, as follows:
# (a) separately for P (biological process), F (molecular function) or C (cellular component)
# (column: Aspect),
# (b) end lists should not have duplicates,
# (c) do not incluce "NOT" qualifiers

# Filter for rows with qualifiers not containing "NOT"
df_filtered = df[~df["Qualifier"].str.contains("NOT")]


In [None]:
df_filtered.shape

In [None]:
df_filtered.head(5)

### Create the dictionaries and populate them

In [None]:
# Create dictionaries for each aspect (P, F, and C)
aspect_P = {}
aspect_F = {}
aspect_C = {}

In [None]:
# Iterate through the filtered DataFrame and populate the dictionaries
for index, row in df_filtered.iterrows():
    aspect = row["Aspect"]
    gene_id = row["DB Object ID"]
    go_id = row["GO ID"]

    if aspect == "P":
        aspect_P.setdefault(gene_id, []).append(go_id)
    elif aspect == "F":
        aspect_F.setdefault(gene_id, []).append(go_id)
    elif aspect == "C":
        aspect_C.setdefault(gene_id, []).append(go_id)

### Useful stuff about the dictionaries

In [None]:
# Get the size of each dict (genes in each dict)
print(f"The size of dictionary aspect_P is: {len(aspect_P)}")

In [None]:
print(f"The size of dictionary aspect_F is: {len(aspect_F)}")

In [None]:
print(f"The size of dictionary aspect_C is: {len(aspect_C)}")

In [None]:
# Print the first three key-value pairs of the P dictionary as an example
for gene_id, go_ids in list(aspect_P.items())[:100]:
    print(f"{gene_id}: {go_ids}")

In [None]:
# search if a key (a gene) exists in the P dictionary
if "A0A024RBG1" in aspect_P:
    print("Key 'A0A024RBG1' exists in the dictionary.")
else:
    print("Key 'A0A024RBG1' does not exist in the dictionary.")

In [83]:
# get the value of a specific key (the GO terms associated with a gene)
aspect_P["A0A024RBG1"] 

['GO:1901907', 'GO:1901909', 'GO:0071543', 'GO:1901911']

In [None]:
# sort the aspect_P dictionary based on keys (alphabetically)
sorted_Pdict = dict(sorted(aspect_P.items(), key=lambda item: item[0]))

# print the first 3 rows of the sorted Pdict
for gene_id, go_ids in list(sorted_Pdict.items())[:10]:
    print(f"{gene_id}: {go_ids}")

### Semantic similarity correction

In [None]:
# now we will focus only on the "P" dictionary (biological process) and we will try to correct 
# by a semantic similarity measure the list of Gene Ontologies associated with each gene


# To correct the list of GO IDs associated with each gene ID using a semantic similarity measure, 
# we can use various approaches based on the Gene Ontology structure, such as Resnik, Lin, 
# or Jiang-Conrath measures, among others.
# Here we perform a basic example using the goatools library to calculate semantic 
# similarity with the Resnik measure and correct the list of GO IDs associated with each gene ID.

In [None]:
# since we have already goatools installed, we just check the goatools version
from importlib.metadata import version
version('goatools')


In [None]:
# import specific tools
from goatools.semantic import semantic_similarity
from goatools import obo_parser
from goatools.obo_parser import GODag

godag = GODag("go-basic.obo")

In [None]:
# Calculate semantic similarity using Resnik measure and update the lists 
# (add updated lists to new dict, do not alter the original dict)
aspect_P_corrected = {}
for gene_id, go_ids in aspect_P.items():
    new_go_ids = []
    for go_id in go_ids:
        max_sim = 0.0
        for other_go_id in go_ids:
            if go_id != other_go_id:
               sim = semantic_similarity(go_id, other_go_id, godag)
               max_sim = max(max_sim, sim)
        if max_sim >= 0.5:  # Set the threshold as needed
            new_go_ids.append(go_id)
    aspect_P_corrected[gene_id] = new_go_ids


In [None]:
# Print the first three key, value pairs of the corrected "P" dictionary as an example
for gene_id, go_ids in list(aspect_P_corrected.items())[:100]:
    print(f"{gene_id}: {go_ids}")

### Assessment of our method

In [None]:
# The gene codenamed A0A024RBG1 previously had 4 GO terms associated with it 
# ['GO:1901907', 'GO:1901909', 'GO:0071543', 'GO:1901911']
# Let's see how many there are now (after semantic similarity correction using Resnik distance)
if "A0A024RBG1" in aspect_P_corrected:
    print("Key 'A0A024RBG1' exists in the dictionary.")
else:
    print("Key 'A0A024RBG1' does not exist in the dictionary.")

In [None]:
aspect_P_corrected["A0A024RBG1"] 

In [None]:
# For some reason all the terms for said gene are deleted (should look into it)

In [None]:
# The gene codenamed A0A075B6H5 previously had only 1 GO term associated with it 
# ['GO:0007166']
# Let's see how many there are now (after semantic similarity correction using Resnik measure)
if "A0A075B6H5" in aspect_P_corrected:
    print("Key 'A0A075B6H5' exists in the dictionary.")
else:
    print("Key 'A0A075B6H5' does not exist in the dictionary.")

In [82]:
aspect_P_corrected["A0A075B6H5"] 

[]

In [None]:
# So we notice another fault in our code: 
# Genes with only one term associated with them get that term deleted 
# during our similarity calculation (perhaps because there is no other term to compute similarity)

In [None]:
# There are 4 things that we should investigate
# 1) Why did a specific term get deleted and others not?
# 2) Why in cases of only 1 term, this term gets deleted?
# 3) Why in some cases of multiple terms, all terms get deleted?
# 4) What value should the similarity threshold take?
# Extra notes
# While investigating the first 100 genes (before and after correction)
# I notice that two terms 'GO:0002250', 'GO:0006955' always survive the correction 
# and seem to almost be the only ones to do so


In [None]:
# count genes with empty values after correction

# Initialize a counter for empty values
empty_values_count = 0

# Loop through the values in the dictionary and count the empty lists
for value in aspect_P_corrected.values():
    if not value:
        empty_values_count += 1

print("Number of keys with empty values:", empty_values_count)

In [None]:
len(aspect_P_corrected)

In [None]:
# We see that most of the keys (genes) got all their values (GO terms) deleted after our "correction".

In [None]:
# Now let's check how many genes in the original dataset had only 1 go term associated with them

# Initialize a counter for keys with one GO term in their value list
keys_with_one_go_term_count = 0

# Loop through the values in the dictionary and count the keys with only one GO term
for value in aspect_P.values():
    if len(value) == 1:
        keys_with_one_go_term_count += 1

print("Number of keys with only one GO term:", keys_with_one_go_term_count)

In [73]:
# Count how many different GO terms exist in the original dictionary

# Create a set to store unique GO terms
unique_go_terms = set()

# Loop through the values in the dictionary and add GO terms to the set
for value in aspect_P.values():
    unique_go_terms.update(value)

# Get the count of different GO terms
num_unique_go_terms = len(unique_go_terms)

print("Number of different GO terms:", num_unique_go_terms)

Number of different GO terms: 12389


In [72]:
# Count how many different GO terms exist in the corrected dictionary

# Create a set to store unique GO terms
unique_go_terms = set()

# Loop through the values in the dictionary and add GO terms to the set
for value in aspect_P_corrected.values():
    unique_go_terms.update(value)

# Get the count of different GO terms
num_unique_go_terms = len(unique_go_terms)

print("Number of different GO terms:", num_unique_go_terms)

Number of different GO terms: 7240


In [87]:
# NEW METHOD
def get_most_general(go_terms, godag):
    most_general_term = None
    max_similarity = 0.5  # Semantic similarity threshold
    for term1 in go_terms:
        for term2 in go_terms:
            if term1 != term2:
                similarity = semantic_similarity(term1, term2, godag)
                if similarity > max_similarity:
                    most_general_term = term1 if godag[term1].level < godag[term2].level else term2
                    break
        if most_general_term is not None:
            break
    return most_general_term

# Load the GO DAG
obo_file = "go-basic.obo"  
go_dag = obo_parser.GODag(obo_file)

# Create a new dictionary with the reduced GO terms
new_data_dict = {}
for key, go_terms in aspect_P.items():
    if len(go_terms) == 1:
        new_data_dict[key] = go_terms[0]
    else:
        most_general_term = get_most_general(go_terms, go_dag)
        if most_general_term is not None:
            new_data_dict[key] = most_general_term
        else:
            # If no term is more general (similarity threshold not met), keep all terms
            new_data_dict[key] = go_terms

# What if similarity threshold is met but both terms are on the same lvl (meaning that none is more 
# general than the other?)

go-basic.obo: fmt(1.2) rel(2023-06-11) 46,420 Terms


In [98]:
for gene_id, go_ids in list(new_data_dict.items())[:100]:
    print(f"{gene_id}: {go_ids}")

A0A075B6H7: GO:0006955
A0A075B6H8: GO:0006955
A0A075B6H9: GO:0006955
A0A075B6I0: GO:0006955
A0A075B6I1: GO:0006955
A0A075B6I3: GO:0006955
A0A075B6I4: GO:0006955
A0A075B6I6: GO:0006955
A0A075B6I7: GO:0006955
A0A075B6I9: GO:0006955
A0A075B6J1: GO:0006955
A0A075B6J2: GO:0006955
A0A075B6J6: GO:0006955
A0A075B6J9: GO:0006955
A0A075B6K0: GO:0006955
A0A075B6K2: GO:0006955
A0A075B6K4: GO:0006955
A0A075B6K5: GO:0006955
A0A075B6K6: GO:0006955
A0A075B6L2: GO:0002250
A0A075B6L6: ['GO:0002250', 'GO:0007166']
A0A075B6N1: ['GO:0002250', 'GO:0007166']
A0A075B6N2: ['GO:0002250', 'GO:0007166']
A0A075B6N3: ['GO:0002250', 'GO:0007166']
A0A075B6N4: ['GO:0002250', 'GO:0007166']
A0A075B6P5: GO:0006955
A0A075B6R0: ['GO:0002250', 'GO:0045087']
A0A075B6R9: GO:0006955
A0A075B6S0: GO:0002250
A0A075B6S2: GO:0006955
A0A075B6S4: GO:0006955
A0A075B6S5: GO:0006955
A0A075B6S6: GO:0006955
A0A075B6S9: GO:0006955
A0A075B6T6: GO:0002250
A0A075B6T7: ['GO:0002250', 'GO:0009617']
A0A075B6T8: GO:0002250
A0A075B6U4: ['GO:000225

In [None]:
for gene_id, go_ids in list(aspect_P.items())[:100]:
    print(f"{gene_id}: {go_ids}")

In [88]:
new_data_dict['A0A024RBG1']

['GO:1901907', 'GO:1901909', 'GO:0071543', 'GO:1901911']

In [99]:
#OTHER METHOD

# Load the Gene Ontology data (replace 'go.obo' with the path to your Gene Ontology OBO file)
go_data = obo_parser.GODag("go-basic.obo")

def reduce_go_terms(go_terms, threshold=0.8):
    reduced_terms = set(go_terms)

    for term1 in go_terms:
        for term2 in go_terms:
            if term1 != term2:
                similarity = semantic_similarity(term1, term2, go_data, ic_map)
                if similarity >= threshold:
                    reduced_terms.discard(term2)

    # Make sure at least one GO term is always included
    if not reduced_terms:
        reduced_terms.add(min(go_terms, key=lambda x: ic_map[x]))

    return list(reduced_terms)

# Now, if the reduction process removes all GO terms for a key, we will add back the 
# GO term with the minimum information content (ic_map[x]) to ensure that at least one
# term is retained.

reduced_dictionary = {}

for key, go_terms in aspect_P.items():
    reduced_terms = reduce_go_terms(go_terms)
    reduced_dictionary[key] = reduced_terms

go-basic.obo: fmt(1.2) rel(2023-06-11) 46,420 Terms


NameError: name 'ic_map' is not defined

In [95]:
reduced_dictionary['A0A024RBG1']

['GO:0071543', 'GO:1901911']

In [96]:
aspect_P['A0A024RBG1']

['GO:1901907', 'GO:1901909', 'GO:0071543', 'GO:1901911']

In [97]:
for gene_id, go_ids in list(reduced_dictionary.items())[:100]:
    print(f"{gene_id}: {go_ids}")

A0A075B6H7: ['GO:0002377']
A0A075B6H8: ['GO:0002377']
A0A075B6H9: ['GO:0002377']
A0A075B6I0: ['GO:0002377']
A0A075B6I1: ['GO:0002377']
A0A075B6I3: ['GO:0002377']
A0A075B6I4: ['GO:0002377']
A0A075B6I6: ['GO:0002377']
A0A075B6I7: ['GO:0002377']
A0A075B6I9: ['GO:0002377']
A0A075B6J1: ['GO:0002377']
A0A075B6J2: ['GO:0002377']
A0A075B6J6: ['GO:0002377']
A0A075B6J9: ['GO:0002377']
A0A075B6K0: ['GO:0002377']
A0A075B6K2: ['GO:0002377']
A0A075B6K4: ['GO:0002377']
A0A075B6K5: ['GO:0002377']
A0A075B6K6: ['GO:0002377']
A0A075B6L2: ['GO:0002250']
A0A075B6L6: ['GO:0002250', 'GO:0007166']
A0A075B6N1: ['GO:0002250', 'GO:0007166']
A0A075B6N2: ['GO:0002250', 'GO:0007166']
A0A075B6N3: ['GO:0002250', 'GO:0007166']
A0A075B6N4: ['GO:0002250', 'GO:0007166']
A0A075B6P5: ['GO:0002377']
A0A075B6R0: ['GO:0002250', 'GO:0045087']
A0A075B6R9: ['GO:0002377']
A0A075B6S0: ['GO:0002250']
A0A075B6S2: ['GO:0002377']
A0A075B6S4: ['GO:0002377']
A0A075B6S5: ['GO:0002377']
A0A075B6S6: ['GO:0002377']
A0A075B6S9: ['GO:0002377'