In [1]:
import pandas as pd
import numpy as np

In [2]:
def preprocess_train(path_taxo, max_labels=500):
    """Build multi-hot label vectors from a tab-separated taxonomy file.

    The file must provide the columns ``EntryID``, ``term`` and ``aspect``.
    The label budget is shared between the aspects in proportion to the number
    of distinct terms each aspect has; inside an aspect the most frequent terms
    are kept (ties resolved with ``keep='first'``).

    Parameters:
        path_taxo (str): path to the taxonomy file
        max_labels (int): approximate number of terms to keep over all aspects.
            Every aspect's share is rounded on its own, so the final count can
            differ slightly from this value.

    Returns:
        labels (pd.Series): indexed by EntryID; every value is a list of 0/1
            integers with one position per selected term, ordered like
            ``list(selected_terms)``
        selected_terms (set): the terms that were kept

    Note:
        The position of a term in the label vectors follows the iteration
        order of the returned set. That order is consistent for the returned
        object but is not guaranteed to be the same in another interpreter
        session (string hashing is randomised unless PYTHONHASHSEED is fixed),
        so always pair the vectors with ``list(selected_terms)`` from the same
        call.
    """
    # we read the taxonomy file
    df_taxo = pd.read_csv(path_taxo, sep='\t')

    # number of annotations per (aspect, term) pair
    terms = (
        df_taxo.groupby(['aspect', 'term'])['term']
        .count()
        .reset_index(name='frequency')
    )

    # share of the label budget given to each aspect
    fractions = (
        terms.groupby('aspect')['term'].nunique() / terms['term'].nunique() * max_labels
    ).apply(round)

    selected_terms = set()
    for aspect, number in fractions.items():
        selection = terms.loc[terms.aspect == aspect]
        selection = selection.nlargest(int(number), columns='frequency', keep='first')
        selected_terms.update(selection.term.to_list())

    # The term array is identical for every protein, so it is built once here
    # instead of once per EntryID.
    term_array = np.array(list(selected_terms))

    def assign_labels(annotations):
        # Only selected terms are tested, so checking membership against the
        # raw annotations equals checking against their intersection with
        # the selected terms.
        membership = np.isin(term_array, list(annotations))
        return list(membership.astype('int'))

    annotations = df_taxo.groupby('EntryID')['term'].apply(set)
    labels = annotations.apply(assign_labels)

    return labels, selected_terms



In [3]:

# Compute the per-protein label vectors and the set of retained terms.
labels, selected_terms = preprocess_train('/workspaces/protein_ontologies/dataset_kg/Train/train_terms.tsv')

# Assemble the label table: rows keep the EntryID index of `labels`, columns
# are named after the selected terms in the order used by the label vectors.
labels_df = pd.DataFrame(
    data=np.array(labels.to_list()),
    index=labels.index,
    columns=list(selected_terms),
)

# Write the label table to disk in parquet format.
labels_df.to_parquet('/workspaces/protein_ontologies/dataset_kg/Train/train_labels.parquet')


In [4]:
# Show the label table: rows are indexed by EntryID, one 0/1 column per selected term.
labels_df

Unnamed: 0_level_0,GO:0019637,GO:1901363,GO:0009893,GO:0051726,GO:0015031,GO:0010648,GO:0003700,GO:0004674,GO:1901698,GO:0051093,...,GO:0051174,GO:0003690,GO:0010035,GO:0005216,GO:0048646,GO:0003723,GO:0043207,GO:0004842,GO:0031399,GO:0009605
EntryID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A009IHW8,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A021WW32,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A021WZA4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A023FBW4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A023FBW7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X6RKS3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X6RLN4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X6RLP6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X6RLR1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:

from Bio import SeqIO


def preprocess_seq_file(path_fasta, train=True):
    """Load a FASTA file into a DataFrame with one row per record.

    Parameters:
        path_fasta (str): path to the FASTA file
        train (bool): when True the record description is also parsed and
            three extra columns are produced; when False only the record id
            and the sequence are kept

    Returns:
        df_seq (pd.DataFrame): columns ``EntryID`` and ``sequence``; with
            ``train=True`` also ``other_entry`` (first space-separated token
            of the description), ``organism`` (text before the first ``|`` of
            the second token) and ``animal`` (second ``_``-separated piece of
            the third ``|``-separated field of the second token)

    Raises:
        IndexError: with ``train=True``, when a description does not contain
            the pieces listed above
    """
    column_names = ['EntryID', 'sequence']
    if train:
        column_names = column_names + ['other_entry', 'organism', 'animal']

    rows = []
    with open(path_fasta) as fasta_handle:
        for entry in SeqIO.parse(fasta_handle, "fasta"):
            row = [entry.id, str(entry.seq)]

            if train:
                # description looks like "<token0> <a>|<b>|<c>_<d> ..."
                header_tokens = entry.description.split(' ')
                pipe_fields = header_tokens[1].split('|')
                row.append(header_tokens[0])
                row.append(pipe_fields[0])
                row.append(pipe_fields[2].split('_')[1])

            rows.append(row)

    return pd.DataFrame(rows, columns=column_names)


# now we have to preprocess the sequence column
def preprocess_sequence(sequence):
    """Insert a single space between the characters of a sequence.

    Example: 'ASJIOJA' becomes 'A S J I O J A'. No special tokens such as
    '[CLS]' or '[SEP]' are added by this function.

    Parameters:
        sequence (str): the raw sequence

    Returns:
        str: the characters of ``sequence`` joined by single spaces; an empty
            input yields an empty string
    """
    # str.join iterates over the characters directly
    return ' '.join(sequence)


In [None]:
# Parse the training FASTA, space-separate every sequence and index the
# resulting table by EntryID.
df_seq = (
    preprocess_seq_file("/workspaces/protein_ontologies/dataset_kg/Train/train_sequences.fasta")
    .assign(sequence=lambda frame: frame['sequence'].apply(preprocess_sequence))
    .set_index('EntryID')
)

# Write the table to disk in parquet format.
df_seq.to_parquet('/workspaces/protein_ontologies/dataset_kg/Train/train_sequences.parquet')

In [16]:
# Parse the test FASTA (id and sequence only), space-separate every sequence
# and index the resulting table by EntryID.
df_seq = (
    preprocess_seq_file("/workspaces/protein_ontologies/dataset_kg/Test/testsuperset.fasta", train=False)
    .assign(sequence=lambda frame: frame['sequence'].apply(preprocess_sequence))
    .set_index('EntryID')
)

# Write the table to disk in parquet format.
df_seq.to_parquet('/workspaces/protein_ontologies/dataset_kg/Test/test_sequences.parquet')

Unnamed: 0_level_0,sequence
EntryID,Unnamed: 1_level_1
Q9CQV8,M T M D K S E L V Q K A K L A E Q A E R Y D D ...
P62259,M D D R E D L V Y Q A K L A E Q A E R Y D E M ...
P68510,M G D R E Q L L Q R A R L A E Q A E R Y D D M ...
P61982,M V D R E Q L V Q K A R L A E Q A E R Y D D M ...
O70456,M E R A S L I Q K A K L A E Q A E R Y E D M A ...
...,...
P08380,G N C K C D D E G P N V R T A P L T G Y V D L ...
C0HK72,R G I C L E P K V V G P C K A R I R R F Y Y D ...
C0HK73,G S I C L E P K V V G P C K A G I R R F Y F D ...
C0HK74,G S I C L E P K V V G P C T A Y F P R F Y F D ...


In [6]:
# Show the label table again. Its columns are named from list(selected_terms), i.e. in the
# iteration order of a set, so the column order can differ between kernel sessions.
labels_df

Unnamed: 0_level_0,GO:0005488,GO:0007267,GO:0043167,GO:0008219,GO:0009892,GO:0008284,GO:0140640,GO:0006508,GO:0050793,GO:0043069,...,GO:0098542,GO:0048699,GO:1901576,GO:0007417,GO:0019725,GO:0016746,GO:0048583,GO:1903047,GO:0043603,GO:0000977
EntryID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A009IHW8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A021WW32,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
A0A021WZA4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A023FBW4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A023FBW7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X6RKS3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X6RLN4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X6RLP6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X6RLR1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
