In [22]:
import networkx
import obonet
import pandas as pd
import numpy as np

from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm

In [107]:
# Constant
TRAIN_DATASET_PATH = '../biological_data_pfp/train/train_set.tsv'
GENE_ONTOLOGY_PATH = './dataset/taxonomy/go-basic.obo'
RELATIONSHIP = {'is_a': 1, 'part_of': 0}
N_LABELS = 1500

### Read the training dataset

In [122]:
# Function to read a TSV file with a progress bar and concatenate into a single DataFrame
def read_tsv_with_progress(filename):
    # Count the number of lines (for the progress bar)
    num_lines = sum(1 for line in open(filename, 'r'))
    
    # Create a tqdm object for the progress bar
    tqdm_iterator = tqdm(pd.read_csv(filename, delimiter='\t', chunksize=1000), total=num_lines/1000)
    
    # List to store each chunk
    chunks = []

    # Read the file in chunks and append each chunk to the list
    for chunk in tqdm_iterator:
        chunks.append(chunk)

    # Concatenate all chunks into a single DataFrame
    df = pd.concat(chunks, ignore_index=True)

    return df

# Replace 'your_file.tsv' with the path to your TSV file
main_df = read_tsv_with_progress(TRAIN_DATASET_PATH)

cc_df = main_df[main_df['aspect'] == 'cellular_component']
bp_df = main_df[main_df['aspect'] == 'biological_process']
mf_df = main_df[main_df['aspect'] == 'molecular_function']

cc_df = main_

4278it [00:02, 1842.43it/s]                                                     


In [121]:
# Assuming df is your DataFrame
go_term_df = df[['GO_term']]  # Reshape to 2D array

encoder = OneHotEncoder(sparse=True)  # Use sparse output
encoder.fit(go_term_df)

codes_sparse = encoder.transform(go_term_df)



### Read Gene Ontology

In [116]:
%%time
graph = obonet.read_obo(GENE_ONTOLOGY_PATH)

CPU times: user 4.47 s, sys: 36.3 ms, total: 4.51 s
Wall time: 4.49 s


In [117]:
len(graph)

42837

In [118]:
graph.number_of_edges()

83581

In [119]:
networkx.is_directed_acyclic_graph(graph)

True

### Parse Node Properties

In [124]:
# Get pair relationship
def generate_pairing(df, name):
    pairing = {'child': [], 'relationship': [], 'parent': []}
    GO_terms = df['GO_term'].copy().unique()
    for term in tqdm(GO_terms, desc="Processing GO terms"):
        for parent, child, key in graph.in_edges(term, keys=True):
            if key not in RELATIONSHIP:
                continue
    
            pairing['child'].append(child)
            pairing['relationship'].append(key)
            pairing['parent'].append(parent)
            
    pairing_df = pd.DataFrame(pairing)
    pairing_df.to_csv(f'./dataset/train/{name}_pairing.csv', index=False)

for k, df in {'cellular_component': cc_df, 'biological_process': mf_df, 'molecullar_function': bp_df}.items():
    generate_pairing(df, k)

Processing GO terms: 100%|████████████████| 678/678 [00:00<00:00, 102937.02it/s]
Processing GO terms: 100%|█████████████████| 839/839 [00:00<00:00, 82135.68it/s]
Processing GO terms: 100%|███████████████| 1487/1487 [00:00<00:00, 65461.71it/s]


In [129]:
# Get frequency dictionary
def generate_frequency(df, name):  
    go_term_df = df[['GO_term']]
    frequency = {}
    for term in tqdm(go_term_df["GO_term"], desc="Processing GO terms"):
        frequency.setdefault(term, 0)
        frequency[term] += 1

    freq_attributes = {'id': [], 'frequency': []}
    # Sorted the value
    for k, v in tqdm(sorted(frequency.items(), key=lambda item: item[1], reverse=True), desc="process sorting"):
        freq_attributes['id'].append(k)
        freq_attributes['frequency'].append(v)
        
    freq_df = pd.DataFrame(freq_attributes)
    freq_df.to_csv(f'./dataset/train/{name}_freq.csv', index=False)

for k, df in {'cellular_component': cc_df, 'biological_process': mf_df, 'molecullar_function': bp_df}.items():
    generate_frequency(df, k)

Processing GO terms: 100%|███████| 1109632/1109632 [00:00<00:00, 4912211.72it/s]
process sorting: 100%|███████████████████| 678/678 [00:00<00:00, 1275790.99it/s]
Processing GO terms: 100%|█████████| 532532/532532 [00:00<00:00, 3990768.31it/s]
process sorting: 100%|███████████████████| 839/839 [00:00<00:00, 3460197.70it/s]
Processing GO terms: 100%|███████| 2634883/2634883 [00:00<00:00, 5150747.41it/s]
process sorting: 100%|█████████████████| 1487/1487 [00:00<00:00, 4155183.24it/s]
