In [None]:
def reverse_protein_go_dict(protein_to_go):
    """
    Invert a protein -> GO IDs mapping into a GO ID -> proteins mapping.

    Args:
        protein_to_go (dict): Maps each protein ID to an iterable of GO IDs.

    Returns:
        defaultdict(list): Maps each GO ID to the list of protein IDs that
        carry it, in the iteration order of ``protein_to_go``. Proteins
        without any GO ID contribute nothing.
    """
    inverted = defaultdict(list)

    # Flatten the mapping into (GO ID, protein ID) pairs, then group by GO ID
    annotation_pairs = (
        (term, protein_id)
        for protein_id, terms in protein_to_go.items()
        for term in terms
    )
    for term, protein_id in annotation_pairs:
        inverted[term].append(protein_id)

    return inverted

In [None]:
def expand_go_terms_with_ancestors(protein_to_go, go_obo):
    """
    Expand each protein's GO IDs with all of their ancestor GO IDs.

    Args:
        protein_to_go (dict): Dictionary mapping protein IDs to lists of GO IDs.
        go_obo (GODag): Parsed GO DAG; terms are looked up by GO ID and must
            offer ``get_all_parents()`` returning the ancestor GO IDs.

    Returns:
        dict: Protein ID -> sorted list of unique GO IDs (the original IDs
        plus every ancestor). GO IDs missing from ``go_obo`` are kept but not
        expanded. Proteins without any GO ID are left out of the result.
    """
    expanded_protein_to_go = {}

    for protein, go_terms in protein_to_go.items():
        expanded = set()
        for go_id in go_terms:
            # Keep the GO ID itself
            expanded.add(go_id)

            # Add all ancestor IDs of that GO ID when the DAG knows the term
            if go_id in go_obo:
                expanded.update(go_obo[go_id].get_all_parents())

        if expanded:
            # Sort so the output (and any file written from it) is reproducible;
            # plain set iteration order changes between interpreter runs.
            expanded_protein_to_go[protein] = sorted(expanded)

    return expanded_protein_to_go


In [None]:
def get_go_terms_given_goid(protein_go_dict: Dict[str, List[str]], timeout: float = 10.0) -> Dict[str, str]:
    """
    Convert GO IDs to their corresponding terms using the Gene Ontology API.

    Args:
        protein_go_dict (dict) : Dictionary mapping protein IDs to lists of GO IDs
        timeout (float) : Seconds to wait for each API response before giving up

    Returns:
        dict : Dictionary mapping GO IDs to their terms. When a lookup fails,
        the value is a placeholder string ('Error fetching term' /
        'Error processing term'), and 'Term not found' when the response has
        no 'label' field.
    """
    # Extract unique GO IDs from the dictionary
    go_ids = set()
    for go_list in protein_go_dict.values():
        go_ids.update(go_list)

    # Initialize results dictionary
    go_terms_dict = {}
    if not go_ids:
        # Nothing to look up, so do not open a connection at all
        return go_terms_dict

    # Base URL for the Gene Ontology API
    base_url = "http://api.geneontology.org/api/ontology/term/"

    # One session for all requests so the connection to the API is reused
    with requests.Session() as session:
        for go_id in go_ids:
            try:
                # Small delay between requests
                time.sleep(0.1)

                # Without a timeout a single stalled connection would block forever
                response = session.get(f"{base_url}{go_id}", timeout=timeout)
                response.raise_for_status()

                # Extract the corresponding GO term
                data = response.json()
                go_terms_dict[go_id] = data.get('label', 'Term not found')

            # Network / HTTP level problems
            except requests.exceptions.RequestException as e:
                print(f"Error fetching term for {go_id}: {str(e)}")
                go_terms_dict[go_id] = 'Error fetching term'

            # Best effort: any other problem (e.g. unexpected payload) must not abort the loop
            except Exception as e:
                print(f"Unexpected error processing {go_id}: {str(e)}")
                go_terms_dict[go_id] = 'Error processing term'

    return go_terms_dict

In [None]:
def fetch_go_annotations(protein_id, timeout=10.0):
    """
    Fetch the GO IDs annotated to a single protein from the UniProt REST API.

    Args:
        protein_id (str): single protein ID of our family
        timeout (float): Seconds to wait for the UniProt response

    Returns:
        List : List of the GO ids found for that protein. The list is empty
        both when the protein has no GO annotation and when the request or
        the XML parsing fails (the error is printed in that case).
    """
    go_ids = []

    # Per-entry XML endpoint of the UniProt API
    url = f"https://rest.uniprot.org/uniprotkb/{protein_id}.xml"

    try:
        # Without a timeout one stalled request would block the whole run
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except requests.exceptions.RequestException as e:
        print(f"Error fetching GO annotations for {protein_id}: {e}")
        return go_ids
    except ET.ParseError as e:
        # A malformed response must not abort the loop over all proteins
        print(f"Error parsing UniProt XML for {protein_id}: {e}")
        return go_ids

    namespaces = {'ns': 'http://uniprot.org/uniprot'}

    # Get all GO IDs for the protein
    for db_ref in root.findall(".//ns:dbReference[@type='GO']", namespaces):
        go_id = db_ref.attrib.get('id')
        if go_id:
            go_ids.append(go_id)

    return go_ids


# Let's add some debugging output to help understand what's happening.
# Here we see that the big .xml file has the same structure as the small ones
# we already analyzed; thus, we can use the same parsing structure, but this time
# we directly collect just the counts of GO terms, because that is all we need
# (no different categories, which would just make our code slower).
def print_swissprot_file(swissprot_xml_path, length=50):
    """
    Print the first lines of a file to inspect its structure.

    Args:
        swissprot_xml_path (str): Path of the file to preview
        length (int): Maximum number of lines to print

    Each line is printed with leading and trailing whitespace stripped.
    """
    # Read explicitly as UTF-8 so the preview does not depend on the platform default
    with open(swissprot_xml_path, 'r', encoding='utf-8') as f:
        print(f"First {length} lines of the file:")
        for i, line in enumerate(f):
            if i >= length:
                break
            print(line.strip())



def parse_swissprot_go_terms(swissprot_xml_path, family_proteins):
    """
    Parse GO IDs from SwissProt XML file for each protein, excluding proteins in the family.

    The file is streamed entry by entry, and each entry is released once it
    has been processed, so the whole document is never held in memory.

    Args:
        swissprot_xml_path (str): Path to SwissProt XML file
        family_proteins (set): UniProt IDs in protein family (any iterable of
            IDs is accepted; it is converted to a set once)

    Returns:
        dict: protein ID : [GO IDs] for that protein. The ID is the first
        accession of the entry. Entries without any GO annotation do not
        appear in the result.
    """
    protein_to_go = defaultdict(list)
    total_proteins = 0
    skipped_proteins = 0

    # Convert once so the per-entry membership test is O(1) even for a list
    family_proteins = set(family_proteins)

    namespaces = {'ns': 'http://uniprot.org/uniprot'}

    # 'start' events are requested only to grab the root element first;
    # the entries themselves are handled on their 'end' event.
    context = iter(ET.iterparse(swissprot_xml_path, events=('start', 'end')))

    print("Starting to parse SwissProt XML...")

    _, root = next(context, (None, None))
    if root is None:
        return protein_to_go

    for event, elem in context:
        if event != 'end' or not elem.tag.endswith('entry'):
            continue

        accession = elem.find(".//ns:accession", namespaces)
        if accession is not None:
            uniprot_id = accession.text

            # Exclude family proteins
            if uniprot_id in family_proteins:
                skipped_proteins += 1
            else:
                # Get all GO IDs for the protein (same structure as in fetch_go_annotations)
                for db_ref in elem.findall(".//ns:dbReference[@type='GO']", namespaces):
                    go_id = db_ref.attrib.get('id')
                    if go_id:
                        protein_to_go[uniprot_id].append(go_id)
                total_proteins += 1

            # Keep track of progress, as it takes some time to parse the whole file.
            # Checked only right after a counter changed, so a line is never repeated.
            if (total_proteins + skipped_proteins) % 10000 == 0:
                print(f"Processed {total_proteins} proteins "
                      f"(skipped {skipped_proteins} family proteins)...")

        # Detach the finished entry from the root; clearing only the entry
        # would leave an empty element behind for every processed protein.
        root.clear()

    return protein_to_go

def calculate_go_enrichment(go_to_proteins_family, go_to_proteins_swissprot, total_proteins_family, total_proteins_swissprot, go_id_to_go_term):
    """
    Perform Fisher's exact test to calculate GO term enrichment in our protein family.

    For every GO ID of the family a 2x2 contingency table is built:

                        Protein in family    Protein not in family
        Has GO term            a                    b
        No GO term             c                    d

    where "not in family" means the SwissProt proteins without the family.
    GO IDs whose table would contain a negative cell (inconsistent totals)
    are skipped.

    Null hypothesis (H0): the proportion of proteins with the GO term is the
    same in the family and in the rest of SwissProt, i.e. carrying the term is
    independent of family membership. Two p-values are reported:
        - two-sided: the proportion differs (higher or lower)
        - greater (right tail): the term is over-represented in the family

    Args:
        go_to_proteins_family (dict): GO ID to list of proteins in family
        go_to_proteins_swissprot (dict): GO ID to list of proteins in SwissProt
        total_proteins_family (int): Total proteins in family
        total_proteins_swissprot (int): Total proteins in SwissProt
        go_id_to_go_term (dict): GO ID to GO term mapping

    Returns:
        pd.DataFrame: DataFrame with GO term enrichment results, sorted by the
        two-sided p-value. The same table is written to
        "Function/enrichment_results.csv".
    """
    results = []

    # The family proteins are added back to the SwissProt proteins to get the
    # size of the whole population used for the overall GO term frequency.
    population_size = total_proteins_swissprot + total_proteins_family

    for go_id, family_members in go_to_proteins_family.items():
        a = len(family_members)  # Proteins with this GO term in family
        b = len(go_to_proteins_swissprot.get(go_id, []))  # Proteins with GO term in rest of SwissProt
        c = total_proteins_family - a  # Proteins without GO term in family
        d = total_proteins_swissprot - b  # Proteins without GO term in rest of SwissProt

        # All cells must be non-negative for a valid contingency table
        if min(a, b, c, d) < 0:
            continue

        contingency_table = [[a, b], [c, d]]

        # Only the p-values are needed; the odds ratio is discarded
        _, pvalue_two_tail = fisher_exact(contingency_table, alternative='two-sided')
        _, pvalue_greater = fisher_exact(contingency_table, alternative='greater')

        # Proportions, guarded so that empty totals do not raise ZeroDivisionError
        my_proportion = a / total_proteins_family if total_proteins_family else 0.0
        swissprot_proportion = (a + b) / population_size if population_size else 0.0
        if swissprot_proportion:
            fold_enrichment = round(my_proportion / swissprot_proportion, 2)
        else:
            fold_enrichment = float('nan')

        results.append({
            'GO_ID': go_id,
            'GO_Term': go_id_to_go_term.get(go_id, 'N/A'),  # Include GO term name
            'Count_Prot_Dataset (a)': a,
            'Count_Prot_SwissProt (b)': b,
            'c': c,
            'd': d,
            'Count_Prot_SwissProt_Actual': a + b,
            'Percentage_Dataset': round(my_proportion * 100, 2),
            'Percentage_SwissProt': round(swissprot_proportion * 100, 10),
            'Fold_Enrichment': fold_enrichment,
            'P_Value_Two_Tail': pvalue_two_tail,
            'P_Value_Greater': pvalue_greater,
        })

    # Convert to DataFrame and sort by p-value
    df_results = pd.DataFrame(results)
    if not df_results.empty:
        df_results = df_results.sort_values('P_Value_Two_Tail')

    df_results.to_csv("Function/enrichment_results.csv")

    return df_results

In [None]:
# Hierarchical Structure
def analyze_go_hierarchy():
    """
    Summarise which high-level GO branches collect the enriched GO terms.

    Reads the enrichment table from "Function/enrichment_results.csv" and the
    ontology from 'go.obo'. Every significantly enriched term (both p-values
    below 0.05) is credited to all of its ancestors. Ancestors with a depth of
    at most 3 and at least two enriched descendants are ranked by the summed
    -log10 of the two-tailed p-values, and the top 20 are written to
    'Function/enriched_branches.csv'.
    """
    # Ontology DAG and enrichment table
    dag = obo_parser.GODag('go.obo')
    enrichment = pd.read_csv("Function/enrichment_results.csv")

    # Keep only the significantly enriched terms
    is_significant = (
        (enrichment['P_Value_Two_Tail'] < 0.05) &
        (enrichment['P_Value_Greater'] < 0.05)
    )
    significant_rows = enrichment[is_significant]

    # Ancestor GO ID -> collected information about that branch
    branches = {}

    for _, record in significant_rows.iterrows():
        term_id = record['GO_ID']
        if term_id not in dag:
            continue

        go_term = dag[term_id]
        two_tail_p = record['P_Value_Two_Tail']

        # Walk over every ancestor up to the root; the current term is a
        # (not necessarily direct) descendant of each of them
        for parent_id in go_term.get_all_parents():
            entry = branches.get(parent_id)
            if entry is None:
                parent_term = dag[parent_id]
                direct_parents = parent_term.parents
                # The branch is named after one immediate parent if there is
                # one, otherwise after the ancestor itself
                if direct_parents:
                    label = next(iter(direct_parents)).name
                else:
                    label = parent_term.name

                entry = {
                    'term_name': parent_term.name,
                    'branch_name': label,
                    'enriched_children': [],
                    'total_significance': 0,
                    'depth': parent_term.depth,
                }
                branches[parent_id] = entry

            # Register the enriched term under this ancestor
            entry['enriched_children'].append({
                'id': term_id,
                'name': go_term.name,
                'p_value': two_tail_p
            })
            # Lower p-values contribute a higher score
            entry['total_significance'] += -np.log10(two_tail_p)

    # High-level terms (depth <= 3) with at least two enriched descendants
    ranked = [
        (branch_id, details)
        for branch_id, details in branches.items()
        if details['depth'] <= 3 and len(details['enriched_children']) >= 2
    ]
    # Most significant branches first
    ranked.sort(key=lambda pair: pair[1]['total_significance'], reverse=True)

    # One output row per branch, top 20 only
    rows = [
        {
            'GO_ID': branch_id,
            'GO_Term': details['term_name'],
            'Branch_Name': details['branch_name'],
            'Hierarchy_Depth': details['depth'],
            'Number_Enriched_Terms': len(details['enriched_children']),
            'Total_Significance_Score': details['total_significance']
        }
        for branch_id, details in ranked[:20]
    ]

    # Save the branch table to CSV
    pd.DataFrame(rows).to_csv('Function/enriched_branches.csv', index=False)

In [None]:
def main():
    """
    Run the complete GO enrichment pipeline for the protein family.

    Steps:
        1. Load the family protein IDs from the PSI-BLAST and HMM search outputs.
        2. Fetch the GO annotations of every family protein.
        3. Parse the GO annotations of the remaining SwissProt proteins.
        4. Expand both annotation sets with the ancestors from 'go.obo'.
        5. Compute the enrichment table and run the hierarchy analyses.
        6. Draw a word cloud of the significantly enriched terms.

    All results are written below the "Function/" directory.
    """
    psiblast_file = "Model/Evaluation/Predictions/PSI-BLAST/psiblastsearch_output.csv"
    hmm_file = "Model/Evaluation/Predictions/HMM-SEARCH/hmmsearch_output.csv"
    protein_ids = load_protein_ids(psiblast_file, hmm_file)

    # Proteins_to_GO terms for our family
    print("Fetching GO annotations...")
    family_annotations = {}
    for pid in tqdm(protein_ids, desc="Fetching GO annotations"):
        family_annotations[pid] = fetch_go_annotations(pid)

    total_proteins_family = len(family_annotations)

    # Proteins_to_GO terms for SwissProt (family proteins are excluded).
    # The parser documents a set of IDs, so hand it one for fast membership tests.
    swissprot_annotations = parse_swissprot_go_terms("uniprot_sprot.xml", set(protein_ids))

    total_proteins_swissprot = len(swissprot_annotations)

    # Load the GO DAG for ancestor expansion
    # We downloaded the go.obo file so we can parse the whole ontology
    # Note that "go.obo" we downloaded locally and not to the Git Repository due to its size
    print("Expanding GO terms to include ancestors...")
    go_obo = obo_parser.GODag('go.obo')
    expanded_family_annotations = expand_go_terms_with_ancestors(family_annotations, go_obo)
    expanded_swissprot_annotations = expand_go_terms_with_ancestors(swissprot_annotations, go_obo)

    # This we didn't directly use, but since it is Task 1 in the Function assignment, we include it
    expanded_family_annotations_df = pd.DataFrame(expanded_family_annotations.items(), columns=['Protein ID', 'GO IDs'])
    expanded_family_annotations_df.to_csv("Function/expanded_family_annotations.csv", index=False)

    # Fetch all GO terms for all found GO IDs in the family after expanding
    go_id_to_go_term = get_go_terms_given_goid(expanded_family_annotations)

    # Reverse mapping (go_id to proteins mapping) for enrichment
    go_to_proteins_family = reverse_protein_go_dict(expanded_family_annotations)
    go_to_proteins_swissprot = reverse_protein_go_dict(expanded_swissprot_annotations)

    # Calculate GO enrichments (the result is also written to Function/enrichment_results.csv)
    _ = calculate_go_enrichment(go_to_proteins_family, go_to_proteins_swissprot,
                                total_proteins_family, total_proteins_swissprot, go_id_to_go_term)

    # Analyze hierarchy using the "cut tree" approach
    print("Analyzing GO hierarchy using the 'cut tree' approach...")
    analyze_go_hierarchy_cut_tree(
        enrichment_results_path="Function/enrichment_results.csv",
        go_obo_path="go.obo",
        output_path="Function/enriched_branches_cut_tree.csv"
    )

    # Read the enrichment results
    df = pd.read_csv("Function/enrichment_results.csv")

    # Filter for significantly enriched terms
    enriched_terms = df[
        (df['P_Value_Two_Tail'] < 0.05) &
        (df['P_Value_Greater'] < 0.05)
    ]

    # Create word frequencies using the actual GO terms instead of IDs
    word_frequencies = {}
    for _, row in enriched_terms.iterrows():
        go_id = row['GO_ID']
        if go_id in go_id_to_go_term:  # Make sure we have the term for this ID
            term = go_id_to_go_term[go_id]
            # Use fold enrichment as weight
            word_frequencies[term] = row['Fold_Enrichment']

    if word_frequencies:
        # Create the word cloud
        wordcloud = WordCloud(
            width=1200,
            height=800,
            background_color='white',
            prefer_horizontal=0.7,
            max_words=50,  # Limit to top 50 terms for better readability
            min_font_size=10,
            max_font_size=60
        ).generate_from_frequencies(word_frequencies)

        # Plot and save the word cloud
        plt.figure(figsize=(20, 12))
        plt.imshow(wordcloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('GO Term Enrichment Word Cloud', fontsize=16, pad=20)
        plt.savefig('Function/go_enrichment_wordcloud.png', dpi=300, bbox_inches='tight')
        plt.close()
    else:
        # WordCloud cannot be generated from an empty frequency table
        print("No significantly enriched GO terms found; skipping word cloud.")

    # Print out the enriched terms for verification
    print("\nTop enriched GO terms:")
    sorted_terms = sorted(word_frequencies.items(), key=lambda x: x[1], reverse=True)
    for term, weight in sorted_terms[:10]:
        print(f"\nTerm: {term}")
        print(f"Weight in word cloud: {weight:.2f}")

    # Hierarchy
    analyze_go_hierarchy()

    
# Run the pipeline only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()