In [None]:
!pip install h5py



In [40]:
import pandas as pd
import numpy as np
import h5py

# Data Pre-processing

This notebook preprocesses the data in the provided `.h5` files and saves the results as dataframes and/or NumPy arrays for use in the main project notebook.

In [None]:
# Colab-only step: mount Google Drive so the zipped dataset under MyDrive is reachable.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [41]:
# Name of the dataset directory; the data paths in this notebook are built from it.
# Alternative directory name (kept for reference):
# FOLDER = 'bio_data'
FOLDER = 'biological_data_pfp'

In [None]:
#%%capture
# Extract the dataset archive from the mounted Drive into the current working directory.
# Enabling the `%%capture` line above hides the per-file listing.
# NOTE(review): the listing shows the archive extracting into `bio_data/`, while FOLDER is set
# to 'biological_data_pfp' -- confirm which directory name is current.
!unzip '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data.zip'

Archive:  /content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data.zip
   creating: bio_data/train/
  inflating: bio_data/train/go-basic.obo  
  inflating: bio_data/train/train.fasta  
  inflating: bio_data/train/train_embeddings.h5  
  inflating: bio_data/train/train_ids.txt  
  inflating: bio_data/train/train_protein2ipr.dat  
  inflating: bio_data/train/train_set.tsv  
  inflating: bio_data/train_embeddings_BiologicalProcesses.pkl  
  inflating: bio_data/train_embeddings_CellularComponent.pkl  
  inflating: bio_data/train_embeddings_MolecularFunction.pkl  


## Preprocessing Training Data

### Splitting the Data by GO Branch
We first split the training data by 'aspect', i.e. the GO branch each annotation belongs to: biological process, molecular function, or cellular component.

In [42]:
# Tab-separated training annotations: one row per (Protein_ID, aspect, GO_term).
# Colab location: '/content/'+FOLDER+'/train/train_set.tsv'
PATH_TO_TRAIN_SET = './data/'+FOLDER+'/train/train_set.tsv'

# Parse the file once. It holds several million rows, so reading it a second time only to
# obtain an identical frame doubles the load time for no benefit.
df = pd.read_csv(PATH_TO_TRAIN_SET, sep='\t')

# Independent copy used later for label preparation, so the two names never alias each other.
train_set_df = df.copy()

df.head()

Unnamed: 0,Protein_ID,aspect,GO_term
0,P91124,cellular_component,GO:0005575
1,P91124,cellular_component,GO:0110165
2,P91124,cellular_component,GO:0005737
3,P91124,cellular_component,GO:0005622
4,P91124,cellular_component,GO:0043226


In [43]:
# Show every annotation row belonging to one example protein.
is_example_protein = df['Protein_ID'].eq('P91124')
df.loc[is_example_protein]

Unnamed: 0,Protein_ID,aspect,GO_term
0,P91124,cellular_component,GO:0005575
1,P91124,cellular_component,GO:0110165
2,P91124,cellular_component,GO:0005737
3,P91124,cellular_component,GO:0005622
4,P91124,cellular_component,GO:0043226
5,P91124,cellular_component,GO:0005794
6,P91124,cellular_component,GO:0012505
7,P91124,cellular_component,GO:0043231
8,P91124,cellular_component,GO:0043227
9,P91124,cellular_component,GO:0043229


In [44]:
# Total number of annotation rows in the training set.
df.shape[0]

4277047

After loading the data we can proceed to split by aspect.

In [45]:
# Split the DataFrame into different DataFrames based on 'aspect'
aspect_groups = df.groupby('aspect')

# Map each aspect name to the sub-frame holding only that aspect's rows
aspect_dfs = {aspect: aspect_groups.get_group(aspect) for aspect in aspect_groups.groups}

# Bind each GO branch by its exact aspect name. An explicit lookup raises KeyError if a
# branch is missing, and an unexpected aspect value can no longer be silently stored as the
# molecular-function frame (which a catch-all `else` branch would do).
BP_df = aspect_dfs['biological_process']
CC_df = aspect_dfs['cellular_component']
MF_df = aspect_dfs['molecular_function']

# Display the separate DataFrames
print("\nDataFrames split by 'aspect':")

print(f"\nDataFrame for aspect Biological Processes with {len(BP_df)} elements")
print(BP_df.head())

print(f"\nDataFrame for aspect Cellular Component with {len(CC_df)} elements")
print(CC_df.head())

print(f"\nDataFrame for aspect Molecular Function with {len(MF_df)} elements")
print(MF_df.head())


DataFrames split by 'aspect':

DataFrame for aspect Biological Processes with 2634883 elements
   Protein_ID              aspect     GO_term
10     P91124  biological_process  GO:0008150
11     P91124  biological_process  GO:0050789
12     P91124  biological_process  GO:0050795
13     P91124  biological_process  GO:0051239
14     P91124  biological_process  GO:0065007

DataFrame for aspect Cellular Component with 1109632 elements
  Protein_ID              aspect     GO_term
0     P91124  cellular_component  GO:0005575
1     P91124  cellular_component  GO:0110165
2     P91124  cellular_component  GO:0005737
3     P91124  cellular_component  GO:0005622
4     P91124  cellular_component  GO:0043226

DataFrame for aspect Molecular Function with 532532 elements
   Protein_ID              aspect     GO_term
26     P91124  molecular_function  GO:0005488
27     P91124  molecular_function  GO:0003674
28     P91124  molecular_function  GO:0005515
29     P91124  molecular_function  GO:0051020
30 

### Splitting Embeddings by GO Branch

In [106]:
# HDF5 file with the precomputed training embeddings.
# Colab location (kept for reference):
# PATH_TO_TRAIN_EMBED = '/content/'+FOLDER+'/train/train_embeddings.h5'
PATH_TO_TRAIN_EMBED = './data/'+FOLDER+'/train/train_embeddings.h5'

In [107]:
# Open the training embeddings read-only; the following cells read from this handle.
# NOTE(review): the handle is not closed anywhere in the visible part of the notebook --
# consider calling hf.close() once the embeddings have been copied into memory.
hf = h5py.File(PATH_TO_TRAIN_EMBED, 'r')

In [108]:
# Display the file handle's repr as a quick check that it opened in read mode.
hf

<HDF5 file "train_embeddings.h5" (mode r)>

In [109]:
# Collect the top-level dataset names of the embeddings file (used below as protein IDs).
hf_dsets = [*hf.keys()]

# Sanity check: the example protein from the annotation table also has an embedding.
example_protein_present = 'P91124' in hf_dsets
if example_protein_present:
    print(True)

True


In [110]:
# Number of top-level entries in the embeddings file.
len(hf_dsets)

123969

In [111]:
# Look at one entry: indexing the file by key returns an HDF5 dataset object whose repr
# reports its shape and dtype.
dset1 = hf['A0A009IHW8']
dset1

<HDF5 dataset "A0A009IHW8": shape (1024,), type "<f2">

In [112]:
# Convert one stored embedding into a NumPy array to inspect its values.
np.array(hf.get('A0A021WW32'))  # other ID tried here: A0A009IHW8

array([-0.01643 , -0.001583,  0.00389 , ..., -0.03296 ,  0.0532  ,
        0.0299  ], dtype=float16)

In [113]:
# First key in the list; the array built below keeps this same ordering.
hf_dsets[0]

'A0A009IHW8'

In [114]:
# Read every embedding into memory as a NumPy array, in the same order as `hf_dsets`.
loa = [np.array(hf[protein_key]) for protein_key in hf_dsets]

# Preview the first two embeddings.
loa[:2]

[array([ 0.0682  , -0.04648 ,  0.001752, ..., -0.02461 ,  0.03476 ,
        -0.031   ], dtype=float16),
 array([-0.01643 , -0.001583,  0.00389 , ..., -0.03296 ,  0.0532  ,
         0.0299  ], dtype=float16)]

In [115]:
# Stack the per-protein vectors into a single 2-D array (one row per protein).
train_embed_array = np.asarray(loa)
train_embed_array.shape

(123969, 1024)

In [116]:
# Tabular view of the embeddings: one row per entry of `loa`, one column per dimension.
train_embed_df = pd.DataFrame(data=loa)
train_embed_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,0.008865,...,-0.040771,-0.013138,-0.049591,-0.101074,0.066406,0.00898,-0.003506,-0.024612,0.03476,-0.031006
1,-0.016434,-0.001583,0.003889,0.073425,0.012428,0.028168,-0.040375,-0.093811,-0.017807,0.025497,...,0.011879,-0.033325,-0.031342,-0.005245,0.014732,0.08197,0.017456,-0.032959,0.053192,0.029907
2,0.007904,0.087708,-0.001715,0.037659,0.017883,0.025589,-0.011749,-0.084717,-0.016266,-0.034973,...,0.004829,-0.049713,-0.027176,-0.037415,-0.006241,-0.039703,0.001784,0.004719,-0.004288,0.001847
3,0.002447,0.007053,0.064453,0.007271,-0.033569,-0.009933,-0.022186,-0.083862,-0.003841,-0.018631,...,-0.053589,-0.002508,-0.016647,-0.069458,0.042206,-0.051758,-0.025436,0.057373,0.099121,0.032898
4,0.049316,0.020691,0.108643,0.016342,-0.051056,-0.017334,-0.042084,-0.154053,0.007347,0.029907,...,-0.100647,-0.063293,0.002346,-0.104675,-0.000757,-0.047485,0.003002,-0.036774,0.103577,0.005245


In [117]:
# Attach the protein IDs (same order as the rows) and promote them to the index so that
# embeddings can be selected by Protein_ID. Both calls modify `train_embed_df` in place.
train_embed_df.insert(loc=0, column="Protein_ID", value=hf_dsets)
train_embed_df.set_index(keys="Protein_ID", inplace=True)
train_embed_df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
Protein_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A0A009IHW8,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,0.008865,...,-0.040771,-0.013138,-0.049591,-0.101074,0.066406,0.00898,-0.003506,-0.024612,0.03476,-0.031006
A0A021WW32,-0.016434,-0.001583,0.003889,0.073425,0.012428,0.028168,-0.040375,-0.093811,-0.017807,0.025497,...,0.011879,-0.033325,-0.031342,-0.005245,0.014732,0.08197,0.017456,-0.032959,0.053192,0.029907
A0A021WZA4,0.007904,0.087708,-0.001715,0.037659,0.017883,0.025589,-0.011749,-0.084717,-0.016266,-0.034973,...,0.004829,-0.049713,-0.027176,-0.037415,-0.006241,-0.039703,0.001784,0.004719,-0.004288,0.001847
A0A023FBW4,0.002447,0.007053,0.064453,0.007271,-0.033569,-0.009933,-0.022186,-0.083862,-0.003841,-0.018631,...,-0.053589,-0.002508,-0.016647,-0.069458,0.042206,-0.051758,-0.025436,0.057373,0.099121,0.032898
A0A023FBW7,0.049316,0.020691,0.108643,0.016342,-0.051056,-0.017334,-0.042084,-0.154053,0.007347,0.029907,...,-0.100647,-0.063293,0.002346,-0.104675,-0.000757,-0.047485,0.003002,-0.036774,0.103577,0.005245


In [118]:
# Persist the full embedding matrix; np.save appends the `.npy` extension to the name.
np.save(file='train_embeddings_array', arr=train_embed_array)

Now, we can split the train embeddings dataframe by aspect based on whether or not the protein ID appears in the dataframes for BP, CC, and MF.

In [119]:
# Distinct protein IDs annotated in each GO branch, in order of first appearance.
BP_Protein_IDs = BP_df['Protein_ID'].unique()
CC_Protein_IDs = CC_df['Protein_ID'].unique()
MF_Protein_IDs = MF_df['Protein_ID'].unique()

In [120]:
# Report how many distinct proteins each branch contains.
print('Number of unique proteins in BP: ', BP_Protein_IDs.shape[0])
print('Number of unique proteins in CC: ', CC_Protein_IDs.shape[0])
print('Number of unique proteins in MF: ', MF_Protein_IDs.shape[0])

Number of unique proteins in BP:  83064
Number of unique proteins in CC:  84638
Number of unique proteins in MF:  55698


In [121]:
# Keep only the embedding rows whose Protein_ID is annotated in the biological-process branch.
train_embed_BP_df = train_embed_df.filter(items=BP_Protein_IDs, axis='index')
train_embed_BP_df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
Protein_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P91124,-0.012238,-0.042206,-0.013374,0.018265,0.009674,0.020203,-0.004852,-0.044098,-0.013695,0.000777,...,-0.037201,-0.030762,-0.03418,0.015869,-0.00239,0.031677,0.016922,0.020508,0.038361,0.005409
Q04418,-0.022003,-0.069641,-0.007042,0.054413,-0.046326,0.068176,-0.009506,-0.056396,0.046906,-0.027664,...,-0.037201,-0.047577,-0.028397,0.050079,0.027481,0.015579,-0.079224,-0.008293,-0.011238,0.045013
Q7ZT12,0.040283,-0.033569,0.104614,0.066895,-0.079346,0.109314,0.056274,-0.108826,0.039764,0.047546,...,-0.088196,-0.0042,-0.012558,0.014381,-0.002611,0.011345,-0.03479,-0.002676,-0.004436,0.075134
Q6DBW0,0.011063,0.022766,0.028946,0.032928,-0.006409,0.017044,-0.026642,-0.025192,0.027771,-0.048584,...,-0.008736,0.001218,-0.024841,-0.022797,0.039276,0.078369,-0.016266,-0.000606,-0.031494,0.016129
Q9WUC4,0.06134,-0.00452,0.014717,0.001324,0.031616,0.075317,-0.004963,-0.059937,0.050568,-0.033661,...,-0.04599,0.021606,-0.013168,-0.037689,0.031677,-0.111023,-0.030685,0.011009,0.03096,-0.008629


In [122]:
# Keep only the embedding rows whose Protein_ID is annotated in the cellular-component branch.
train_embed_CC_df = train_embed_df.filter(items=CC_Protein_IDs, axis='index')
train_embed_CC_df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
Protein_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P91124,-0.012238,-0.042206,-0.013374,0.018265,0.009674,0.020203,-0.004852,-0.044098,-0.013695,0.000777,...,-0.037201,-0.030762,-0.03418,0.015869,-0.00239,0.031677,0.016922,0.020508,0.038361,0.005409
Q55DL5,0.141846,0.062073,0.073669,-0.071228,0.070312,-0.023636,0.05127,-0.118286,-0.072815,0.07019,...,-0.051941,-0.067566,0.004971,-0.068237,0.154907,-0.064087,-0.177979,-0.067932,0.082275,0.035675
O81027,0.049103,0.03891,-0.017807,0.027786,-0.00568,0.063477,-0.038452,-0.028214,0.071472,-0.018372,...,-0.017303,-0.006474,0.00165,-0.066528,0.018433,0.027634,-0.033997,0.005363,0.011383,0.021683
Q04418,-0.022003,-0.069641,-0.007042,0.054413,-0.046326,0.068176,-0.009506,-0.056396,0.046906,-0.027664,...,-0.037201,-0.047577,-0.028397,0.050079,0.027481,0.015579,-0.079224,-0.008293,-0.011238,0.045013
Q7ZT12,0.040283,-0.033569,0.104614,0.066895,-0.079346,0.109314,0.056274,-0.108826,0.039764,0.047546,...,-0.088196,-0.0042,-0.012558,0.014381,-0.002611,0.011345,-0.03479,-0.002676,-0.004436,0.075134


In [123]:
# Keep only the embedding rows whose Protein_ID is annotated in the molecular-function branch.
train_embed_MF_df = train_embed_df.filter(items=MF_Protein_IDs, axis='index')
train_embed_MF_df.head(n=5)

Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
Protein_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
P91124,-0.012238,-0.042206,-0.013374,0.018265,0.009674,0.020203,-0.004852,-0.044098,-0.013695,0.000777,...,-0.037201,-0.030762,-0.03418,0.015869,-0.00239,0.031677,0.016922,0.020508,0.038361,0.005409
O81027,0.049103,0.03891,-0.017807,0.027786,-0.00568,0.063477,-0.038452,-0.028214,0.071472,-0.018372,...,-0.017303,-0.006474,0.00165,-0.066528,0.018433,0.027634,-0.033997,0.005363,0.011383,0.021683
Q8IXT2,-0.025146,-0.013313,0.005749,0.004353,-0.069824,0.101807,-0.042328,-0.042084,0.032318,0.054108,...,0.02356,0.013626,-0.008537,-0.021698,0.015884,0.002615,-0.019516,-0.025925,0.024017,0.062866
Q9WUC4,0.06134,-0.00452,0.014717,0.001324,0.031616,0.075317,-0.004963,-0.059937,0.050568,-0.033661,...,-0.04599,0.021606,-0.013168,-0.037689,0.031677,-0.111023,-0.030685,0.011009,0.03096,-0.008629
P30041,0.025467,0.09613,0.008034,-0.018234,-0.007935,0.083313,-0.066223,-0.055603,0.02829,-0.027191,...,-0.050629,-0.007927,0.02066,-0.037262,0.042175,-0.03479,-0.019852,-0.065979,0.014198,0.043304


In [124]:
# Write one pickle per GO branch; these files are read back further down in this notebook.
train_embed_BP_df.to_pickle(path='train_embeddings_BiologicalProcesses.pkl')
train_embed_CC_df.to_pickle(path='train_embeddings_CellularComponent.pkl')
train_embed_MF_df.to_pickle(path='train_embeddings_MolecularFunction.pkl')

### Preparing Labels for each GO Branch

In [65]:
# Colab location (kept for reference):
# train_protein_ids = np.loadtxt('/content/biological_data_pfp_2/train/train_ids.txt', dtype = str)

# Build the path from FOLDER, like the other training inputs, so that changing the dataset
# directory in the configuration cell also redirects this file.
PATH_TO_TRAIN_IDS = './data/'+FOLDER+'/train/train_ids.txt'

# One protein ID per line, loaded as strings.
train_protein_ids = np.loadtxt(PATH_TO_TRAIN_IDS, dtype=str)
print(train_protein_ids.shape)

(123969,)


In [89]:
# Upper bound on the number of GO terms kept as prediction targets
num_labels = 1500

# value_counts() orders terms from most to least frequent, so the leading `num_labels`
# index entries are the most common GO term IDs across the whole training set
term_frequencies = train_set_df['GO_term'].value_counts()
labels = term_frequencies.index[:num_labels].tolist()

# Restrict the annotation table to rows whose GO term is one of the selected labels
is_selected_term = train_set_df['GO_term'].isin(labels)
train_set_top1500terms = train_set_df.loc[is_selected_term]

In [93]:
# Display the filtered annotation table (pandas abbreviates the middle rows).
train_set_top1500terms

Unnamed: 0,Protein_ID,aspect,GO_term
0,P91124,cellular_component,GO:0005575
1,P91124,cellular_component,GO:0110165
2,P91124,cellular_component,GO:0005737
3,P91124,cellular_component,GO:0005622
4,P91124,cellular_component,GO:0043226
...,...,...,...
4277042,P28271,biological_process,GO:0010608
4277043,P28271,biological_process,GO:0080090
4277044,P28271,biological_process,GO:0006417
4277045,P28271,biological_process,GO:0051246


In [67]:
import progressbar

In [91]:
# Text progress bar sized to the number of labels; it is advanced once per label while the
# label matrices are being filled.
bar = progressbar.ProgressBar(
    maxval=num_labels,
    widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()],
)

In [92]:
# Build the GO term -> set of annotated Protein_IDs lookup once, ahead of the label loop,
# so the loop does not have to re-scan the annotation table for every label
grouped = {
    go_term: set(protein_ids)
    for go_term, protein_ids in train_set_top1500terms.groupby('GO_term')['Protein_ID']
}

In [100]:
# (output file suffix, unique protein IDs) for each GO branch.
set_ids = [
    ('BiologicalProcesses', BP_Protein_IDs),
    ('CellularComponent', CC_Protein_IDs),
    ('MolecularFunction', MF_Protein_IDs),
]

for title, ids in set_ids:

    id_size = ids.shape[0]
    # Multi-hot target matrix: one row per protein in `ids`, one column per entry of `labels`.
    label_set = np.zeros((id_size, num_labels))

    # Row position of every protein. `ids` comes from `.unique()`, so each ID maps to one row.
    row_of_protein = {protein_id: row for row, protein_id in enumerate(ids)}

    bar.start()
    for i, label in enumerate(labels):
        label_related_proteins = grouped.get(label, set())
        # Mark only the rows of proteins annotated with this label. This replaces testing
        # every protein against every label in a Python loop (proteins x labels membership
        # checks) and avoids shadowing the builtin `id`. Proteins annotated with the label
        # but absent from this branch's ID list are skipped, as before.
        rows = [
            row_of_protein[protein_id]
            for protein_id in label_related_proteins
            if protein_id in row_of_protein
        ]
        if rows:
            label_set[rows, i] = 1.0
        bar.update(i + 1)
    bar.finish()

    # Label the matrix: columns are GO term IDs, rows are indexed by Protein_ID.
    labels_df = pd.DataFrame(data=label_set, columns=labels)
    labels_df.insert(0, "Protein_ID", ids)
    labels_df.set_index("Protein_ID", inplace=True)
    labels_df.to_pickle(f'train_labels_{title}.pkl')

    print(labels_df.shape)
    print(labels_df.describe())



(83064, 1500)


[                                                                                                           ]   0%

         GO:0005575    GO:0110165  GO:0008150    GO:0005622    GO:0043226  \
count  83064.000000  83064.000000     83064.0  83064.000000  83064.000000   
mean       0.603354      0.597323         1.0      0.475549      0.416823   
std        0.489204      0.490440         0.0      0.499405      0.493036   
min        0.000000      0.000000         1.0      0.000000      0.000000   
25%        0.000000      0.000000         1.0      0.000000      0.000000   
50%        1.000000      1.000000         1.0      0.000000      0.000000   
75%        1.000000      1.000000         1.0      1.000000      1.000000   
max        1.000000      1.000000         1.0      1.000000      1.000000   

         GO:0009987    GO:0003674    GO:0043229    GO:0043227    GO:0005737  \
count  83064.000000  83064.000000  83064.000000  83064.000000  83064.000000   
mean       0.674059      0.539126      0.399704      0.375265      0.357303   
std        0.468728      0.498470      0.489840      0.484194      0.



(84638, 1500)


[=                                                                                                          ]   1%

       GO:0005575    GO:0110165    GO:0008150    GO:0005622    GO:0043226  \
count     84638.0  84638.000000  84638.000000  84638.000000  84638.000000   
mean          1.0      0.991198      0.592134      0.795068      0.685319   
std           0.0      0.093407      0.491441      0.403654      0.464392   
min           1.0      0.000000      0.000000      0.000000      0.000000   
25%           1.0      1.000000      0.000000      1.000000      0.000000   
50%           1.0      1.000000      1.000000      1.000000      1.000000   
75%           1.0      1.000000      1.000000      1.000000      1.000000   
max           1.0      1.000000      1.000000      1.000000      1.000000   

         GO:0009987    GO:0003674    GO:0043229    GO:0043227    GO:0005737  \
count  84638.000000  84638.000000  84638.000000  84638.000000  84638.000000   
mean       0.447175      0.429878      0.656809      0.623668      0.599364   
std        0.497205      0.495061      0.474777      0.484468      0.



(55698, 1500)
         GO:0005575    GO:0110165    GO:0008150    GO:0005622    GO:0043226  \
count  55698.000000  55698.000000  55698.000000  55698.000000  55698.000000   
mean       0.653237      0.646666      0.804015      0.522478      0.446946   
std        0.475944      0.478010      0.396961      0.499499      0.497182   
min        0.000000      0.000000      0.000000      0.000000      0.000000   
25%        0.000000      0.000000      1.000000      0.000000      0.000000   
50%        1.000000      1.000000      1.000000      1.000000      0.000000   
75%        1.000000      1.000000      1.000000      1.000000      1.000000   
max        1.000000      1.000000      1.000000      1.000000      1.000000   

         GO:0009987  GO:0003674    GO:0043229    GO:0043227    GO:0005737  \
count  55698.000000     55698.0  55698.000000  55698.000000  55698.000000   
mean       0.603433         1.0      0.429190      0.406819      0.405113   
std        0.489189         0.0      0.4949

In [125]:
# Read back the per-branch label and embedding frames written earlier in this notebook.
# Biological process
BP_train_label_df = pd.read_pickle('train_labels_BiologicalProcesses.pkl')
BP_train_embedding_df = pd.read_pickle('train_embeddings_BiologicalProcesses.pkl')
# Molecular function
MF_train_label_df = pd.read_pickle('train_labels_MolecularFunction.pkl')
MF_train_embedding_df = pd.read_pickle('train_embeddings_MolecularFunction.pkl')
# Cellular component
CC_train_label_df = pd.read_pickle('train_labels_CellularComponent.pkl')
CC_train_embedding_df = pd.read_pickle('train_embeddings_CellularComponent.pkl')


### Choose a subset of the BP data

Since the BP ontology is quite large and training on such a large dataset may be computationally expensive, we randomly select 75% of the data for use in model building. The cell below applies the same 75% sampling to all three branches (BP, MF, and CC).

In [131]:
# Branch suffixes of the pickles to sub-sample.
datasets = ['BiologicalProcesses', 'MolecularFunction', 'CellularComponent']

for dataset in datasets:
    # Load the full label and embedding frames for this branch
    label_path = f'train_labels_{dataset}.pkl'
    embedding_path = f'train_embeddings_{dataset}.pkl'
    label_df = pd.read_pickle(label_path)
    embedding_df = pd.read_pickle(embedding_path)

    # Draw a reproducible 75% sample of the proteins (fixed seed), then take the label rows
    # for exactly those proteins so both frames stay aligned row for row
    sampled_embedding_df = embedding_df.sample(frac=0.75, random_state=5)
    sampled_ids = sampled_embedding_df.index
    sampled_label_df = label_df.loc[sampled_ids]

    # Save the sampled dataframes
    sampled_embedding_df.to_pickle(f'75percent_train_embeddings_{dataset}.pkl')
    sampled_label_df.to_pickle(f'75percent_train_labels_{dataset}.pkl')


## Preprocessing Test Data

### Splitting the Data by GO branch

After loading the data we can proceed to split by aspect.

In [None]:
# HDF5 file with the precomputed test embeddings. The path is rooted at './data/' like the
# active training paths in this notebook; the Colab location is kept for reference.
# PATH_TO_TEST_EMBED = '/content/'+FOLDER+'/test/test_embeddings.h5'
PATH_TO_TEST_EMBED = './data/'+FOLDER+'/test/test_embeddings.h5'

In [None]:
# Open the test embeddings read-only and collect the top-level dataset names.
hf_test = h5py.File(PATH_TO_TEST_EMBED, mode='r')
hf_test_dsets = [*hf_test.keys()]

# Number of entries in the test file.
len(hf_test_dsets)

In [None]:
# Read every test embedding into memory as a NumPy array, in the order of `hf_test_dsets`.
loa_testdata = [np.array(hf_test[protein_key]) for protein_key in hf_test_dsets]

[array([ 0.0682  , -0.04648 ,  0.001752, ..., -0.02461 ,  0.03476 ,
        -0.031   ], dtype=float16),
 array([-0.01643 , -0.001583,  0.00389 , ..., -0.03296 ,  0.0532  ,
         0.0299  ], dtype=float16)]

In [None]:
# Stack the test vectors into one 2-D array and persist it (np.save appends `.npy`).
test_embed_array = np.asarray(loa_testdata)
np.save(file='test_embeddings_array', arr=test_embed_array)
test_embed_array.shape

In [None]:
# Tabular view of the test embeddings: one row per entry of `loa_testdata`.
test_embed_df = pd.DataFrame(data=loa_testdata)
test_embed_df.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1014,1015,1016,1017,1018,1019,1020,1021,1022,1023
0,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,0.008865,...,-0.040771,-0.013138,-0.049591,-0.101074,0.066406,0.00898,-0.003506,-0.024612,0.03476,-0.031006
1,-0.016434,-0.001583,0.003889,0.073425,0.012428,0.028168,-0.040375,-0.093811,-0.017807,0.025497,...,0.011879,-0.033325,-0.031342,-0.005245,0.014732,0.08197,0.017456,-0.032959,0.053192,0.029907
2,0.007904,0.087708,-0.001715,0.037659,0.017883,0.025589,-0.011749,-0.084717,-0.016266,-0.034973,...,0.004829,-0.049713,-0.027176,-0.037415,-0.006241,-0.039703,0.001784,0.004719,-0.004288,0.001847
3,0.002447,0.007053,0.064453,0.007271,-0.033569,-0.009933,-0.022186,-0.083862,-0.003841,-0.018631,...,-0.053589,-0.002508,-0.016647,-0.069458,0.042206,-0.051758,-0.025436,0.057373,0.099121,0.032898
4,0.049316,0.020691,0.108643,0.016342,-0.051056,-0.017334,-0.042084,-0.154053,0.007347,0.029907,...,-0.100647,-0.063293,0.002346,-0.104675,-0.000757,-0.047485,0.003002,-0.036774,0.103577,0.005245
