# EXPLORATORY DATA ANALYSIS

In [None]:
!pip install obonet -q
!pip install pyvis -q

In [None]:
import os
import json
from PIL import Image
from typing import Dict
from collections import Counter

import random
import cv2
import obonet
import networkx
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import matplotlib.patches as mpatch
from Bio import SeqIO
from pyvis.network import Network

### The CAFA dataset contains the following important files:
- **go-basic.obo:** The Gene Ontology (GO) graph. Each node holds information about a GO term and its relationships to other GO terms.
- **train_sequences.fasta:** The training proteins, each with a unique ID, some metadata, and its amino-acid sequence.
- **train_taxonomy.tsv:** The taxonomy ID of each protein.
- **train_terms.tsv:** The mapping from protein IDs to GO term IDs.
- **IA.txt:** The information accretion (IA) of each GO term, used to weight precision and recall.

In [None]:
class CFG:
    """Filesystem locations of the CAFA 5 input files used by this notebook."""

    # Root of the competition dataset. Every path below is derived from it, so
    # pointing the notebook at another copy of the data is a one-line change.
    data_dir: str = "/kaggle/input/cafa-5-protein-function-prediction"

    # GO graph in OBO format.
    train_go_obo_path: str = f"{data_dir}/Train/go-basic.obo"
    # Training protein sequences in FASTA format.
    train_seq_fasta_path: str = f"{data_dir}/Train/train_sequences.fasta"
    # Protein id -> GO term annotations.
    train_terms_path: str = f"{data_dir}/Train/train_terms.tsv"
    # Protein id -> taxonomy id.
    train_taxonomy_path: str = f"{data_dir}/Train/train_taxonomy.tsv"
    # Information accretion weight per GO term.
    train_ia_path: str = f"{data_dir}/IA.txt"

In [None]:
class color:
    """ANSI escape sequences for styling text printed to a terminal/notebook.

    Wrap text as ``color.BOLD + text + color.END``; ``END`` resets all styling.
    """

    # Foreground colours.
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'

    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'

    # Reset sequence.
    END = '\033[0m'

In [None]:
def plot_dag(graph, term, radius=1, output_path="network.html"):
    """Render the neighbourhood of a term as an interactive pyvis network.

    Args:
        graph: networkx graph whose nodes are term ids. Node attribute dicts
            normally carry a "name" entry, which is used in the node label.
        term: id of the node the view is centred on.
        radius: include all neighbours at distance <= radius from ``term``
            (increase it to add further parent branches).
        output_path: HTML file the visualisation is written to.

    Returns:
        Whatever ``Network.show`` returns for the generated HTML file.
    """
    # Work on a small subgraph around the term so the rendering stays readable.
    ng_graph = networkx.ego_graph(graph, term, radius=radius)

    for node_id, attrs in ng_graph.nodes(data=True):
        # Label each node with its id followed by its human-readable name;
        # fall back to the id alone when a node has no "name" attribute
        # instead of failing with a KeyError.
        if "name" in attrs:
            attrs["label"] = f"{node_id} {attrs['name']}"
        else:
            attrs["label"] = str(node_id)

    nt = Network(directed=True, notebook=True, cdn_resources="in_line")
    nt.from_nx(ng_graph)
    return nt.show(output_path)

In [None]:
# Load the Gene Ontology from the OBO file into a graph object.
graph = obonet.read_obo(CFG.train_go_obo_path)

In [None]:
# Report how many nodes the loaded ontology graph contains.
print(f"Number of nodes: {len(graph)}")

In [None]:
# Report how many edges the loaded ontology graph contains.
print(f"Number of edges: {graph.number_of_edges()}")

In [None]:
# Count the training proteins by streaming the FASTA file once; each record is
# dropped as soon as it has been counted, so nothing is accumulated in memory.
sequences = SeqIO.parse(CFG.train_seq_fasta_path, "fasta")
num_sequences = sum(1 for _record in sequences)
print(num_sequences)

In [None]:
# The parser is a one-shot iterator, so it is re-created for this pass.
sequences = SeqIO.parse(CFG.train_seq_fasta_path, "fasta")

# Length of every protein sequence, in file order.
lengths = list(map(len, sequences))

# Histogram of the sequence lengths.
fig = px.histogram(
    x=lengths,
    nbins=1000,
    color_discrete_sequence=['goldenrod'],
)
fig.update_layout(
    # Centre the title horizontally near the top of the figure.
    title=dict(
        text="Distribution of protein sequence lengths",
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top',
    ),
    xaxis_title="Sequence length",
    yaxis_title="Count",
)

fig.show()

In [None]:
# Re-open the FASTA file: the parser is a one-shot iterator.
records = SeqIO.parse(CFG.train_seq_fasta_path, "fasta")

# Flatten every sequence into one list of single residues.
# NOTE(review): this keeps one list element per residue of the whole dataset.
aa_list = [residue for rec in records for residue in rec.seq]

# Occurrences of each residue over the whole training set.
aa_count = Counter(aa_list)

# Horizontal bar chart: one bar per residue, bar length = number of occurrences.
fig = px.bar(
    x=list(aa_count.values()),
    y=list(aa_count.keys()),
    orientation='h',
    height=700,
    color_discrete_sequence=['darkslateblue'],
)
fig.update_layout(
    # Centre the title horizontally near the top of the figure.
    title=dict(
        text="Amino Acid Composition",
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top',
    ),
    xaxis_title="Frequency",
    yaxis_title="Amino Acid",
)
fig.show()

In [None]:
# NOTE(review): this cell repeats the preceding amino-acid composition cell
# verbatim (same parsing, same counts, same chart) — consider removing one.
records = SeqIO.parse(CFG.train_seq_fasta_path, "fasta")

# One list element per residue across all sequences.
aa_list = [symbol for entry in records for symbol in entry.seq]

# Tally how often each residue occurs.
aa_count = Counter(aa_list)

# Horizontal bars: residues on the y axis, their counts on the x axis.
fig = px.bar(
    x=list(aa_count.values()),
    y=list(aa_count.keys()),
    height=700,
    orientation='h',
    color_discrete_sequence=['darkslateblue'],
)
fig.update_layout(
    title=dict(
        text="Amino Acid Composition",
        y=0.95,
        x=0.5,
        yanchor='top',
        xanchor='center',
    ),
    yaxis_title="Amino Acid",
    xaxis_title="Frequency",
)
fig.show()

In [None]:
# Read the tab-separated term annotations and preview the first five rows.
train_terms_df = pd.read_csv(CFG.train_terms_path, delimiter="\t")
train_terms_df.head(5)

In [None]:
# Summary statistics of the term-annotation table.
train_terms_df.describe()

In [None]:
# Number of annotation rows per value of the "aspect" column.
aspect_counts = train_terms_df["aspect"].value_counts()

# Pie chart with one slice per aspect value.
fig = px.pie(names=aspect_counts.index, values=aspect_counts.values)
fig.update_traces(textfont_size=14, textposition='inside')
fig.update_layout(
    # Centre the title horizontally near the top of the figure.
    title=dict(
        text="Pie distribution of aspect values",
        x=0.5,
        y=0.95,
        xanchor='center',
        yanchor='top',
    ),
    legend_title_text='Aspect:',
)
fig.show()

In [None]:
# Read the tab-separated taxonomy table and preview the first five rows.
train_taxonomy_df = pd.read_csv(CFG.train_taxonomy_path, delimiter="\t")
train_taxonomy_df.head(5)

In [None]:
# Summary statistics of the taxonomy table.
train_taxonomy_df.describe()

In [None]:
len(train_taxonomy_df)
# Row count of the taxonomy table. The original note says it matches the number
# of unique entries in "train_terms.fasta"; no such file is configured, so
# presumably train_sequences.fasta or train_terms.tsv was meant — TODO confirm.

In [None]:
# Attach the taxonomy columns to each term annotation row, matching on the
# shared EntryID column (default join type).
merged_df = train_terms_df.merge(train_taxonomy_df, on='EntryID')
merged_df.head(5)

In [None]:
# Number of parsed rows to preview at the end of the cell.
limit = 10

# Parse IA.txt: each line becomes a list of its tab-separated fields, with the
# newline character removed. All fields are kept as strings.
with open(CFG.train_ia_path) as f:
    ia_weights = [line.replace("\n", "").split("\t") for line in f]

ia_weights[:limit]