In [1]:
import pandas as pd
import numpy as np

In [2]:
def preprocess_train(path_taxo, max_labels=500):
    """Build multi-hot label vectors from a taxonomy (term annotation) file.

    The file is read as tab-separated values and must provide the columns
    ``EntryID``, ``term`` and ``aspect``. Within each aspect, terms are ranked
    by the number of rows that mention them and the most frequent ones are
    kept. The quota of an aspect is ``max_labels`` multiplied by the aspect's
    number of distinct terms and divided by the overall number of distinct
    terms. Quotas are rounded per aspect, so the number of selected terms can
    differ slightly from ``max_labels``.

    Parameters:
        path_taxo (str): path to the taxonomy file (passed as-is to
            ``pd.read_csv``)
        max_labels (int): approximate total number of terms to keep

    Returns:
        labels (pd.Series): indexed by ``EntryID``; each value is a list of
            0/1 integers, one per selected term, ordered like
            ``list(selected_terms)``
        selected_terms (set): the retained terms

    Note:
        The label order is the iteration order of the returned set. It is
        stable while the set is left unmodified, but the order of a set of
        strings may differ between interpreter runs, so store the column names
        (``list(selected_terms)``) together with the labels.
    """
    # we read the taxonomy file
    df_taxo = pd.read_csv(path_taxo, sep='\t')

    # number of annotation rows per (aspect, term) pair
    terms = df_taxo.groupby(['aspect', 'term'])['term'].count().reset_index(name='frequency')
    # per-aspect quota of terms to keep (rounded to an integer)
    fractions = (terms.groupby('aspect')['term'].nunique() / terms['term'].nunique() * max_labels).apply(round)

    selected_terms = set()
    for aspect, number in fractions.items():
        selection = terms.loc[(terms.aspect == aspect)]
        selection = selection.nlargest(number, columns='frequency', keep='first')
        selected_terms.update(selection.term.to_list())

    # Built once instead of once per EntryID: the set is not modified after this
    # point, so this array has the same order as list(selected_terms).
    term_order = np.array(list(selected_terms))

    def assign_labels(annotations, selected_terms=selected_terms):
        # keep only the annotated terms that were selected, then flag their
        # positions in the fixed term order
        intersection = selected_terms.intersection(annotations)
        labels = np.isin(term_order, np.array(list(intersection)))

        return list(labels.astype('int'))

    # one set of annotated terms per EntryID
    annotations = df_taxo.groupby('EntryID')['term'].apply(set)
    labels = annotations.apply(assign_labels)

    return labels, selected_terms


# Build the per-EntryID label vectors and the retained term set from the training annotations.
labels, selected_terms = preprocess_train(
    path_taxo='/workspaces/protein_ontologies/dataset_kg/Train/train_terms.tsv',
)

In [3]:
# Turn the label lists into a 2-D table: one row per EntryID (the index of
# `labels`) and one column per selected term, in the order of list(selected_terms).
labels_df = pd.DataFrame(
    np.array(labels.to_list()),
    index=labels.index,
    columns=list(selected_terms),
)

# Persist the label table in parquet format.
labels_df.to_parquet('/workspaces/protein_ontologies/dataset_kg/Train/train_labels.parquet')


In [None]:
# Show the index of the label table.
# NOTE(review): the recorded output below is a RangeIndex although the cell above
# sets the index from labels.index — the output looks stale; re-run to confirm.
labels_df.index

RangeIndex(start=0, stop=142246, step=1)

In [4]:

from Bio import SeqIO

def preprocess_seq_file(path_fasta):
    """Parse a FASTA file into a DataFrame with one row per record.

    Each record description is split on single spaces. The first token is kept
    as ``other_entry``; the second token is split on ``|``: its first field is
    stored as ``organism`` and the second underscore-separated piece of its
    third field is stored as ``animal``.

    Parameters:
        path_fasta (str): path to the FASTA file

    Returns:
        df_seq (pd.DataFrame): columns ``EntryID`` (the record id),
            ``sequence``, ``other_entry``, ``organism`` and ``animal``
    """
    rows = []

    with open(path_fasta) as fasta_handle:
        for entry in SeqIO.parse(fasta_handle, "fasta"):
            header_tokens = entry.description.split(' ')

            # presumably shaped like `db|accession|NAME_SPECIES` (the recorded
            # output shows values such as `sp` and `VACCC`) — confirm against
            # the FASTA headers
            db_fields = header_tokens[1].split('|')

            rows.append([
                entry.id,
                str(entry.seq),
                header_tokens[0],
                db_fields[0],
                db_fields[2].split('_')[1],
            ])

    # assemble all records at once
    return pd.DataFrame(
        rows,
        columns=['EntryID', 'sequence', 'other_entry', 'organism', 'animal'],
    )

# Load the training sequences into a DataFrame (one row per FASTA record).
df_seq = preprocess_seq_file(
    path_fasta="/workspaces/protein_ontologies/dataset_kg/Train/train_sequences.fasta",
)


In [5]:
# now we have to preprocess the sequence column
def preprocess_sequence(sequence):
    """Insert a single space between consecutive characters of a sequence.

    For example ``'ASJIOJA'`` becomes ``'A S J I O J A'``. No special tokens
    such as ``[CLS]`` or ``[SEP]`` are added by this function.

    Parameters:
        sequence (str): the sequence to space out

    Returns:
        str: the characters of ``sequence`` joined by single spaces (an empty
            string stays empty)
    """
    # str.join iterates over the characters directly; no intermediate list or
    # f-string wrapper is needed
    return ' '.join(sequence)

# Space-separate every stored sequence. This overwrites the column in place,
# so running the cell a second time would feed already-spaced strings back
# through preprocess_sequence.
df_seq['sequence'] = df_seq['sequence'].map(preprocess_sequence)


In [41]:
# Set EntryID as the index. The guard keeps this cell re-runnable: once
# EntryID has become the index it is no longer a column, and calling
# set_index('EntryID') again would raise a KeyError.
if 'EntryID' in df_seq.columns:
    df_seq = df_seq.set_index('EntryID')

# save the dataframe in parquet format
df_seq.to_parquet('/workspaces/protein_ontologies/dataset_kg/Train/train_sequences.parquet')

Unnamed: 0,EntryID,sequence,other_entry,organism,animal
0,P20536,[CLS] M N S V T V S H A P Y T I T Y H D D W E ...,P20536,sp,VACCC
1,O73864,[CLS] M T E Y R N F L L L F I T S L S V I Y P ...,O73864,sp,DANRE
2,O95231,[CLS] M R L S S S P P R G P Q Q L S S F G S V ...,O95231,sp,HUMAN
3,A0A0B4J1F4,[CLS] M G G E A G A D G P R G R V K S L G L V ...,A0A0B4J1F4,sp,MOUSE
4,P54366,[CLS] M V E T N S P P A G Y T L K R S P S D L ...,P54366,sp,DROME
...,...,...,...,...,...
142241,A0A286YAI0,[CLS] M E T E V D D F P G K A S I F S Q V N P ...,A0A286YAI0,tr,DANRE
142242,A0A1D5NUC4,[CLS] M S A A A S A E M I E T P P V L N F E E ...,A0A1D5NUC4,tr,CHICK
142243,Q5RGB0,[CLS] M A D K G P I L T S V I I F Y L S I G A ...,Q5RGB0,tr,DANRE
142244,A0A2R8QMZ5,[CLS] M G R K K I Q I T R I M D E R N R Q V T ...,A0A2R8QMZ5,tr,DANRE


In [6]:
# Display the label table: one row per EntryID, one 0/1 column per selected term.
labels_df

Unnamed: 0_level_0,GO:0005488,GO:0007267,GO:0043167,GO:0008219,GO:0009892,GO:0008284,GO:0140640,GO:0006508,GO:0050793,GO:0043069,...,GO:0098542,GO:0048699,GO:1901576,GO:0007417,GO:0019725,GO:0016746,GO:0048583,GO:1903047,GO:0043603,GO:0000977
EntryID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A009IHW8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A021WW32,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,1,0,0
A0A021WZA4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A023FBW4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
A0A023FBW7,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X6RKS3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X6RLN4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X6RLP6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
X6RLR1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
