In [1]:
import os
import gc

from sklearn.model_selection import train_test_split
from sklearn.linear_model import Ridge
from sklearn.metrics import roc_auc_score

import numpy as np
import pandas as pd

from tqdm import tqdm
tqdm.pandas()

from keras.models import Sequential
from keras.layers import Dense
# measure roc auc score metric 
from tensorflow.keras.metrics import AUC



# Assigning labels

In [2]:
DATA_DIR = '/kaggle/input/cafa-5-protein-function-prediction'
MAX_LABELS = 500

In [3]:
train_terms = pd.read_csv(os.path.join(DATA_DIR, 'Train', 'train_terms.tsv'), sep='\t')

terms = train_terms.groupby(['aspect', 'term'])['term'].count().reset_index(name='frequency')
print(terms.groupby('aspect')['term'].nunique())

aspect
BPO    21285
CCO     2957
MFO     7224
Name: term, dtype: int64


In [4]:
fractions = (terms.groupby('aspect')['term'].nunique() / terms['term'].nunique() * MAX_LABELS).apply(round)
print(fractions)

selected_terms = set()
for aspect, number in fractions.items():
    selection = terms.loc[(terms.aspect == aspect)]
    selection = selection.nlargest(number, columns='frequency', keep='first')
    selected_terms.update(selection.term.to_list())

aspect
BPO    338
CCO     47
MFO    115
Name: term, dtype: int64


In [5]:
print(selected_terms)

{'GO:0042802', 'GO:0051174', 'GO:0097708', 'GO:0065009', 'GO:0043229', 'GO:0003674', 'GO:0019904', 'GO:0051252', 'GO:0007154', 'GO:0019901', 'GO:0009887', 'GO:0022803', 'GO:0009891', 'GO:0043085', 'GO:1902680', 'GO:0031328', 'GO:0016791', 'GO:1901575', 'GO:0006139', 'GO:1990837', 'GO:0050790', 'GO:0140097', 'GO:0010604', 'GO:0042127', 'GO:0048589', 'GO:0031344', 'GO:0051240', 'GO:0000003', 'GO:0010035', 'GO:0023051', 'GO:0010941', 'GO:0009266', 'GO:0042221', 'GO:0055085', 'GO:0048729', 'GO:0019752', 'GO:0004672', 'GO:0019787', 'GO:1901565', 'GO:0043169', 'GO:0007049', 'GO:0045597', 'GO:0048568', 'GO:0022836', 'GO:0009057', 'GO:0006955', 'GO:0010243', 'GO:0009628', 'GO:0005829', 'GO:0001216', 'GO:0048513', 'GO:2000026', 'GO:0048583', 'GO:0008284', 'GO:0030234', 'GO:0019900', 'GO:0000166', 'GO:1902531', 'GO:0042578', 'GO:0000977', 'GO:0003723', 'GO:0140677', 'GO:0001654', 'GO:0032559', 'GO:0044248', 'GO:0005739', 'GO:0002376', 'GO:0043067', 'GO:0051649', 'GO:0031410', 'GO:0043436', 'GO:0

In [6]:
def assign_labels(annotations, selected_terms=selected_terms):
    
    intersection = selected_terms.intersection(annotations)
    labels = np.isin(np.array(list(selected_terms)), np.array(list(intersection)))
    
    return list(labels.astype('int'))

annotations = train_terms.groupby('EntryID')['term'].apply(set)
labels = annotations.progress_apply(assign_labels)

labels.head()

100%|██████████| 142246/142246 [00:54<00:00, 2604.73it/s]


EntryID
A0A009IHW8    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
A0A021WW32    [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
A0A021WZA4    [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
A0A023FBW4    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
A0A023FBW7    [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
Name: term, dtype: object

# Loading train embeddings

In [7]:
train_ids = np.load('/kaggle/input/t5embeds/train_ids.npy')

x_train = np.load('/kaggle/input/t5embeds/train_embeds.npy')
y_train = np.array(labels[train_ids].to_list())

# Training

In [8]:
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, shuffle=True, random_state=42)

In [9]:
# build a simple MLP model in Keras with ReLU activation and nothing else
nfeats = x_train.shape[1]
nlabels = y_train.shape[1]
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=nfeats))
model.add(Dense(128, activation='relu'))
model.add(Dense(nlabels, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
                optimizer='adam',
                metrics=[AUC()])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 256)               262400    
                                                                 
 dense_1 (Dense)             (None, 128)               32896     
                                                                 
 dense_2 (Dense)             (None, 500)               64500     
                                                                 
Total params: 359,796
Trainable params: 359,796
Non-trainable params: 0
_________________________________________________________________


In [10]:
model.fit(x_train, y_train, epochs=15, batch_size=128, validation_data=(x_valid, y_valid))

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x788e0d22b760>

In [11]:
y_hat = model.predict(x_valid)

scores = pd.DataFrame(columns=list(selected_terms), index=['roc_auc'])

for i, term in enumerate(selected_terms):
    score = roc_auc_score(y_valid[:, i], y_hat[:, i])
    scores[term] = score

scores.mean(axis=1)



roc_auc    0.870418
dtype: float64

# Submission

In [12]:
test_ids = np.load('/kaggle/input/t5embeds/test_ids.npy')
x_test = np.load('/kaggle/input/t5embeds/test_embeds.npy')

In [13]:
del x_train, y_train, x_valid, y_valid, labels
gc.collect()

1572

In [14]:
predictions = model.predict(x_test)
del x_test
gc.collect()

chunk_size = 5_000
chunks = [range(i, min(i + chunk_size, len(predictions))) for i in range(0, len(predictions), chunk_size)]

final_sub = pd.DataFrame()  # Create an empty DataFrame to hold the final result

print(f"processing {len(chunks)} chunks of {chunk_size} predictions each")

for chunk in chunks:
    print(f"processing chunk {chunk}")
    sub = pd.DataFrame(data=predictions[chunk], columns=list(selected_terms), index=test_ids[chunk])
    sub = sub.T.unstack().reset_index(name='prediction')
    sub = sub.loc[sub['prediction'] > 0]
    final_sub = pd.concat([final_sub, sub])  # Concatenate current chunk DataFrame to the final DataFrame

final_sub.head()

processing 29 chunks of 5000 predictions each
processing chunk range(0, 5000)
processing chunk range(5000, 10000)
processing chunk range(10000, 15000)
processing chunk range(15000, 20000)
processing chunk range(20000, 25000)
processing chunk range(25000, 30000)
processing chunk range(30000, 35000)
processing chunk range(35000, 40000)
processing chunk range(40000, 45000)
processing chunk range(45000, 50000)
processing chunk range(50000, 55000)
processing chunk range(55000, 60000)
processing chunk range(60000, 65000)
processing chunk range(65000, 70000)
processing chunk range(70000, 75000)
processing chunk range(75000, 80000)
processing chunk range(80000, 85000)
processing chunk range(85000, 90000)
processing chunk range(90000, 95000)
processing chunk range(95000, 100000)
processing chunk range(100000, 105000)
processing chunk range(105000, 110000)
processing chunk range(110000, 115000)
processing chunk range(115000, 120000)
processing chunk range(120000, 125000)
processing chunk range(1

Unnamed: 0,level_0,level_1,prediction
0,Q9CQV8,GO:0042802,0.166924
1,Q9CQV8,GO:0051174,0.195427
2,Q9CQV8,GO:0097708,0.136584
3,Q9CQV8,GO:0065009,0.239023
4,Q9CQV8,GO:0043229,0.49063


In [15]:
final_sub.to_csv('submission.tsv', sep='\t', index=False, header=False)