In [1]:
import os
import gc

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

# annoy for approximate nearest neighbors
from annoy import AnnoyIndex

import gc



# Assigning labels

In [2]:
# Root folder of the competition data; 'Train/train_terms.tsv' is read from here.
DATA_DIR = '/kaggle/input/cafa-5-protein-function-prediction'
# Total label budget: it is shared out between aspects in proportion to each
# aspect's number of distinct terms when the label set is selected.
MAX_LABELS = 1500

In [3]:
# Read the raw annotation table (tab separated).
train_terms_path = os.path.join(DATA_DIR, 'Train', 'train_terms.tsv')
train_terms = pd.read_csv(train_terms_path, sep='\t')

# One row per (aspect, term) pair with the number of annotation rows it has.
term_counts = train_terms.groupby(['aspect', 'term'])['term'].count()
terms = term_counts.reset_index(name='frequency')

# How many distinct terms each aspect contains.
print(terms.groupby('aspect')['term'].nunique())

aspect
BPO    21285
CCO     2957
MFO     7224
Name: term, dtype: int64


In [4]:
# Split the MAX_LABELS budget between aspects in proportion to the number of
# distinct terms each aspect has, rounded to whole labels.
distinct_per_aspect = terms.groupby('aspect')['term'].nunique()
distinct_overall = terms['term'].nunique()
fractions = (distinct_per_aspect / distinct_overall * MAX_LABELS).apply(round)
print(fractions)

# For every aspect keep its most frequent terms, up to that aspect's quota.
# NOTE(review): selected_terms is a set and later cells take the label column
# order from iterating it; that order is stable within one kernel session but
# presumably not across sessions (string hashing) - confirm before persisting labels.
selected_terms = set()
for aspect, number in fractions.items():
    aspect_rows = terms.loc[terms['aspect'] == aspect]
    top_rows = aspect_rows.nlargest(number, columns='frequency', keep='first')
    selected_terms.update(top_rows['term'].to_list())

aspect
BPO    1015
CCO     141
MFO     344
Name: term, dtype: int64


In [5]:
# Preview only: report the size of the label set and a short sorted sample
# instead of dumping every selected term into the cell output.
print(f"{len(selected_terms)} selected terms, e.g. {sorted(selected_terms)[:10]}")

{'GO:0009056', 'GO:0014070', 'GO:0008080', 'GO:0001525', 'GO:0048736', 'GO:0070482', 'GO:0050839', 'GO:0003674', 'GO:0042110', 'GO:0006357', 'GO:0010256', 'GO:0016922', 'GO:0050795', 'GO:0007281', 'GO:0021700', 'GO:0043087', 'GO:0048638', 'GO:0042546', 'GO:0140013', 'GO:0005543', 'GO:0044270', 'GO:1902680', 'GO:0017148', 'GO:0034655', 'GO:0031327', 'GO:0009896', 'GO:0009536', 'GO:0009506', 'GO:0032386', 'GO:0072521', 'GO:0010605', 'GO:0005774', 'GO:0070085', 'GO:0033674', 'GO:0003729', 'GO:0016620', 'GO:0016875', 'GO:0009266', 'GO:0022804', 'GO:0043604', 'GO:0008016', 'GO:0044325', 'GO:0005635', 'GO:0030674', 'GO:0019220', 'GO:1903829', 'GO:0098687', 'GO:0016835', 'GO:0048839', 'GO:0140014', 'GO:0015297', 'GO:0051051', 'GO:0021953', 'GO:0002164', 'GO:0007399', 'GO:0099568', 'GO:0009886', 'GO:0004725', 'GO:0009150', 'GO:0051603', 'GO:0032504', 'GO:0016772', 'GO:0007346', 'GO:0005539', 'GO:0050867', 'GO:0008284', 'GO:0045184', 'GO:0019903', 'GO:0005777', 'GO:0046872', 'GO:0008276', 'GO:0

In [6]:
def assign_labels(annotations, selected_terms=selected_terms):
    """Encode a protein's annotations as a 0/1 vector over the selected terms.

    Parameters
    ----------
    annotations : iterable of str
        Terms annotated to one protein.
    selected_terms : set of str
        Label vocabulary. Position ``k`` of the result corresponds to the
        ``k``-th element produced by iterating this set, i.e. the same order
        as ``list(selected_terms)``.

    Returns
    -------
    list of int
        One entry per selected term: 1 if the term is among ``annotations``,
        otherwise 0.
    """
    # Restrict to the vocabulary once so each membership test is a set lookup.
    present = selected_terms.intersection(annotations)
    # Plain membership tests avoid rebuilding a NumPy string array of the whole
    # vocabulary (and running np.isin on it) for every single protein.
    return [int(term in present) for term in selected_terms]

# Collect, for every EntryID, the set of terms it is annotated with.
annotations = train_terms.groupby('EntryID')['term'].apply(set)
# Turn each term set into a 0/1 label list; progress_apply is the tqdm-enabled
# apply made available by the tqdm.pandas() call in the import cell.
labels = annotations.progress_apply(assign_labels)

# Preview the first few label vectors.
labels.head()

100%|██████████| 142246/142246 [03:06<00:00, 764.61it/s]


EntryID
A0A009IHW8    [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
A0A021WW32    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
A0A021WZA4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
A0A023FBW4    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
A0A023FBW7    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
Name: term, dtype: object

# Loading train embeddings

In [7]:
# Same preview of the label Series as at the end of the previous cell.
labels.head()

EntryID
A0A009IHW8    [1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
A0A021WW32    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, ...
A0A021WZA4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
A0A023FBW4    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
A0A023FBW7    [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...
Name: term, dtype: object

In [8]:
# Protein identifiers and their embeddings, loaded from two separate arrays.
# NOTE(review): assumes row k of train_embeds.npy belongs to train_ids[k] - confirm.
train_ids = np.load('/kaggle/input/t5embeds/train_ids.npy')
x_train = np.load('/kaggle/input/t5embeds/train_embeds.npy')

# Look the label lists up by identifier, in train_ids order, and stack them into a 2-D array.
labels_in_embedding_order = labels[train_ids]
y_train = np.asarray(labels_in_embedding_order.tolist())

# Training

In [9]:
# Hold out part of the data for validation (shuffled, fixed seed for repeatability).
# Note that x_train / y_train are rebound to the training part only: re-running this
# cell without re-running the loading cell splits the already-reduced arrays again,
# and the Annoy index built below only contains this training part.
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, shuffle=True, random_state=42)

In [10]:
# Approximate nearest-neighbour index over the training embeddings.
# Item ids are the row numbers of x_train, so a returned id can be used
# directly to look up the matching row of y_train.
embedding_dim = x_train.shape[1]
index = AnnoyIndex(embedding_dim, 'angular')
for row_number, embedding in enumerate(x_train):
    index.add_item(row_number, embedding)
# Build the forest with 100 trees.
index.build(100)

True

In [11]:
# Number of neighbours retrieved per query vector.
K = 16

# Query the index once per validation vector, keeping both the neighbour
# ids (rows of x_train) and their distances.
idxs, dists = [], []
for query in tqdm(x_valid):
    neighbor_ids, neighbor_dists = index.get_nns_by_vector(query, K, include_distances=True)
    idxs.append(neighbor_ids)
    dists.append(neighbor_dists)

# Stack the per-query results into arrays with one row per query.
idxs, dists = np.array(idxs), np.array(dists)

100%|██████████| 35562/35562 [00:55<00:00, 639.14it/s]


In [12]:
# Score every label for each validation vector: each retrieved neighbour adds
# its label row, weighted by (1 - distance) / K.
# NOTE(review): Annoy documents the 'angular' metric as sqrt(2 * (1 - cos)), so
# (1 - distance) is not the cosine similarity and turns negative for distances
# above 1 - confirm this weighting is intended. Scores are clipped to [0, 1] below.
n_valid = len(x_valid)
y_hat = np.zeros((n_valid, y_valid.shape[1]))
for row in tqdm(range(n_valid)):
    for neighbor, distance in zip(idxs[row], dists[row]):
        y_hat[row] += (1 - distance) / K * y_train[neighbor]

# Keep the scores inside [0, 1].
y_hat = np.clip(y_hat, 0, 1)

100%|██████████| 35562/35562 [00:07<00:00, 4885.29it/s]


In [13]:
# ROC AUC of every label column on the validation split. Column k of
# y_valid / y_hat corresponds to the k-th term obtained by iterating selected_terms.
auc_by_term = {
    term: roc_auc_score(y_valid[:, column], y_hat[:, column])
    for column, term in enumerate(selected_terms)
}
scores = pd.DataFrame(auc_by_term, index=['roc_auc'])

# Average AUC across all terms.
scores.mean(axis=1)

roc_auc    0.829647
dtype: float64

# Submission

In [14]:
# Identifiers of the test proteins (used as the first column of the submission).
test_ids = np.load('/kaggle/input/t5embeds/test_ids.npy')
# Test embeddings. NOTE(review): assumes row k belongs to test_ids[k] - confirm.
x_test = np.load('/kaggle/input/t5embeds/test_embeds.npy')

In [15]:
# Retrieve the K nearest training neighbours of every test vector.
# This rebinds idxs / dists, replacing the validation results.
idxs, dists = [], []
for query in tqdm(x_test):
    neighbor_ids, neighbor_dists = index.get_nns_by_vector(query, K, include_distances=True)
    idxs.append(neighbor_ids)
    dists.append(neighbor_dists)

# Stack the per-query results into arrays with one row per test vector.
idxs, dists = np.array(idxs), np.array(dists)

100%|██████████| 141865/141865 [03:38<00:00, 649.96it/s]


In [16]:
# Score every label for each test vector with the same weighting as on the
# validation split: each neighbour adds its label row times (1 - distance) / K.
# The result is kept as a list of per-protein arrays; it is converted and
# clipped later, after the large inputs have been released.
predictions = []
n_labels = y_train.shape[1]
for row in tqdm(range(len(x_test))):
    weighted_votes = np.zeros(n_labels)
    for neighbor, distance in zip(idxs[row], dists[row]):
        weighted_votes += (1 - distance) / K * y_train[neighbor]
    predictions.append(weighted_votes)

100%|██████████| 141865/141865 [00:26<00:00, 5293.01it/s]


In [17]:
# Free memory before building the submission table: drop the embeddings, label
# arrays, the Annoy index and the neighbour/validation results. `predictions`,
# `test_ids` and `selected_terms` are kept because the next cells still use them.
del x_train, x_valid, y_train, y_valid, x_test, index, idxs, dists, y_hat
# Run a garbage collection pass right away (the cell displays its return value).
gc.collect()

0

In [18]:
# Stack the per-protein score vectors into one 2-D array and clip it to [0, 1].
# Clipping in place avoids allocating a second array of the same size.
predictions = np.array(predictions)
np.clip(predictions, 0, 1, out=predictions)

# Work through the proteins in fixed-size row ranges so that the long-format
# table of any single chunk stays small.
chunk_size = 10_000
chunks = [range(i, min(i + chunk_size, len(predictions))) for i in range(0, len(predictions), chunk_size)]

# Column labels follow the iteration order of selected_terms, which is the
# order the label vectors were built in; compute the list once for all chunks.
term_columns = list(selected_terms)

print(f"processing {len(chunks)} chunks of {chunk_size} predictions each")

# Collect the per-chunk frames and concatenate once at the end: concatenating
# inside the loop would re-copy all previously accumulated rows on every chunk.
chunk_frames = []
for chunk in chunks:
    print(f"processing chunk {chunk}")
    sub = pd.DataFrame(data=predictions[chunk], columns=term_columns, index=test_ids[chunk])
    # Wide (protein x term) -> long rows of (protein id, term, prediction).
    sub = sub.T.unstack().reset_index(name='prediction')
    # Only strictly positive scores are kept in the submission.
    chunk_frames.append(sub.loc[sub['prediction'] > 0])

# pd.concat rejects an empty list, so fall back to an empty frame when there are no chunks.
final_sub = pd.concat(chunk_frames) if chunk_frames else pd.DataFrame()

final_sub.head()

processing 15 chunks of 10000 predictions each
processing chunk range(0, 10000)
processing chunk range(10000, 20000)
processing chunk range(20000, 30000)
processing chunk range(30000, 40000)
processing chunk range(40000, 50000)
processing chunk range(50000, 60000)
processing chunk range(60000, 70000)
processing chunk range(70000, 80000)
processing chunk range(80000, 90000)
processing chunk range(90000, 100000)
processing chunk range(100000, 110000)
processing chunk range(110000, 120000)
processing chunk range(120000, 130000)
processing chunk range(130000, 140000)
processing chunk range(140000, 141865)


Unnamed: 0,level_0,level_1,prediction
6,Q9CQV8,GO:0050839,0.110393
7,Q9CQV8,GO:0003674,0.687724
9,Q9CQV8,GO:0006357,0.052687
10,Q9CQV8,GO:0010256,0.052687
24,Q9CQV8,GO:0031327,0.164868


In [19]:
# Write the long-format predictions as a tab-separated file with no index
# column and no header row.
final_sub.to_csv('submission.tsv', sep='\t', index=False, header=False)