# Multilabel Classification with XGBoost

This notebook trains a multilabel classification model using scikit learn's implementatoin of the XGBoost algorithm. The input to the model needs to be feature vectors extarcted from the protein sequences provided by the train_sequences.fasta files. The T5 embeddings generated by Grandmaster Sergei Fironov are used as the feature vectors. These

In [1]:
n_labels_to_consider = 1499 # We will choose only top frequent labels (in train) and predict only them. 
n_max_preds = 1499

In [3]:
import time
t0start = time.time() 

import numpy as np
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, KFold, RandomizedSearchCV
from sklearn.linear_model import Ridge,RidgeCV
from sklearn.neural_network import MLPClassifier
from sklearn.multioutput import MultiOutputClassifier

import xgboost as xgb

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/t5embeds/train_ids.npy
/kaggle/input/t5embeds/test_embeds.npy
/kaggle/input/t5embeds/train_embeds.npy
/kaggle/input/t5embeds/test_ids.npy
/kaggle/input/cafa-5-protein-function-prediction/sample_submission.tsv
/kaggle/input/cafa-5-protein-function-prediction/IA.txt
/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta
/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset-taxon-list.tsv
/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv
/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta
/kaggle/input/cafa-5-protein-function-prediction/Train/train_taxonomy.tsv
/kaggle/input/cafa-5-protein-function-prediction/Train/go-basic.obo


In [4]:
%%time
trainTerms = pd.read_csv("/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv",sep="\t")
print(trainTerms.shape)
display(trainTerms.head(2))
vec_freqCount = (trainTerms['term'].value_counts())
print(vec_freqCount )

(5363863, 3)


Unnamed: 0,EntryID,term,aspect
0,A0A009IHW8,GO:0008152,BPO
1,A0A009IHW8,GO:0034655,BPO


GO:0005575    92912
GO:0008150    92210
GO:0110165    91286
GO:0003674    78637
GO:0005622    70785
              ...  
GO:0031772        1
GO:0042324        1
GO:0031771        1
GO:0051041        1
GO:0102628        1
Name: term, Length: 31466, dtype: int64
CPU times: user 3.07 s, sys: 362 ms, total: 3.44 s
Wall time: 4.39 s


In [5]:
## drop very rares
vec_freqCount = vec_freqCount[vec_freqCount>=30]
print(vec_freqCount.shape[0])
vec_freqCount.describe().round()

8632


count     8632.0
mean       602.0
std       3255.0
min         30.0
25%         49.0
50%         93.0
75%        261.0
max      92912.0
Name: term, dtype: float64

In [6]:
vec_freqCount[vec_freqCount>200].shape[0]

2597

In [7]:
labels_to_consider = list(vec_freqCount.index[:n_labels_to_consider] )
print('n_labels_to_consider:', len(labels_to_consider), 'First 10:', labels_to_consider[:10] ) 

n_labels_to_consider: 1499 First 10: ['GO:0005575', 'GO:0008150', 'GO:0110165', 'GO:0003674', 'GO:0005622', 'GO:0009987', 'GO:0043226', 'GO:0043229', 'GO:0005488', 'GO:0043227']


In [8]:
%%time
fn = '/kaggle/input/t5embeds/train_ids.npy'
vec_train_protein_ids = np.load(fn)
print(vec_train_protein_ids.shape)
vec_train_protein_ids

(142246,)
CPU times: user 0 ns, sys: 8.35 ms, total: 8.35 ms
Wall time: 45.8 ms


array(['P20536', 'O73864', 'O95231', ..., 'Q5RGB0', 'A0A2R8QMZ5',
       'A0A8I6GHU0'], dtype='<U10')

In [9]:
%%time 
train_size = 142246 # len(X)
Y = np.zeros( (train_size ,n_labels_to_consider) )
print(Y.shape)

series_train_protein_ids = pd.Series(vec_train_protein_ids ) # 

trainTerms_smaller = trainTerms[ trainTerms['term'].isin( labels_to_consider ) ] # to speed-up the next step 
print( trainTerms_smaller.shape)

for i in range(Y.shape[1]):
    m = trainTerms_smaller['term'] ==  labels_to_consider[i]
#     m.sum()
    Y[:,i] =  series_train_protein_ids.isin(  set(trainTerms_smaller[m]['EntryID'] ) ).astype(float )
    if (i % 10) == 0: 
        print(i, m.sum())
Y

(142246, 1499)
(4420307, 3)
0 92912


KeyboardInterrupt: 

In [10]:
%%time 
# save for possible future reuse 
fn4saveY = 'Y_'+str(Y.shape[1])
print(fn4saveY)
np.save( fn4saveY , Y) 

Y_1499
CPU times: user 2.2 ms, sys: 1.42 s, total: 1.42 s
Wall time: 1.43 s


In [14]:
%%time
fn4save_labels = 'Y_'+str(Y.shape[1]) + '_labels'
np.save(fn4save_labels, labels_to_consider )

CPU times: user 1.89 ms, sys: 0 ns, total: 1.89 ms
Wall time: 2.03 ms


In [15]:
%%time 
# Someone may prefer  Y as dataframe 
if 1:
    df_Y = pd.DataFrame(data = Y, columns = labels_to_consider)
    display(df_Y.head(2))
#     print( df.info().sum() )
    print('memory_usage:', df_Y.memory_usage(index=True).sum() )
    display(df_Y.describe() )    
    fn4save =  'df_Y_'+str(Y.shape[1]) + '.csv'
    df_Y.to_csv(fn4save)


KeyboardInterrupt



In [3]:
%%time

fn = '/kaggle/input/t5embeds/train_embeds.npy'

print(fn)
if '.csv' in fn:
    df = pd.read_csv(fn, index_col = 0)
    X = df.values
elif '.npy' in fn:
    X = np.load(fn)
print(X.shape)
X

/kaggle/input/t5embeds/train_embeds.npy
(142246, 1024)
CPU times: user 1.12 ms, sys: 674 ms, total: 675 ms
Wall time: 11.5 s


array([[ 0.04948843, -0.03293516,  0.03247323, ..., -0.04353154,
         0.0964628 ,  0.07306959],
       [-0.04461636,  0.06492499, -0.08026284, ...,  0.02672353,
         0.02787905, -0.04842958],
       [-0.02012804, -0.04977943,  0.00789446, ..., -0.03610279,
         0.00769301,  0.10623412],
       ...,
       [ 0.01691809,  0.04133058,  0.00079253, ...,  0.0088079 ,
         0.00648063, -0.01334958],
       [ 0.06125151,  0.08340203,  0.0440247 , ...,  0.00138361,
        -0.04754627,  0.01012351],
       [ 0.02160021,  0.06516985,  0.07492343, ...,  0.0496657 ,
        -0.01987522,  0.04471432]])

In [11]:
%%time
fn = '/kaggle/input/t5embeds/train_ids.npy'
vec_train_protein_ids = np.load(fn)
print(vec_train_protein_ids.shape)
vec_train_protein_ids

(142246,)
CPU times: user 1.01 ms, sys: 3 ms, total: 4.01 ms
Wall time: 4.96 ms


array(['P20536', 'O73864', 'O95231', ..., 'Q5RGB0', 'A0A2R8QMZ5',
       'A0A8I6GHU0'], dtype='<U10')

In [12]:
print(X.shape)
print(len(X))

(142246, 1024)
142246


In [13]:
IX = np.arange(len(X))
print(IX.shape)
print(IX)
IX_train, IX_test, _,_ = train_test_split( IX, IX, train_size=0.1, random_state=42)
# print(len(IX_train), len(IX_test),  IX_train[:10], IX_test[:10] )

(142246,)
[     0      1      2 ... 142243 142244 142245]


In [14]:
clf_xgb = xgb.XGBClassifier(objective="binary:logistic", random_state=42, tree_method="gpu_hist", verbosity=2)

In [16]:
clf_xgb.fit(X[IX_train,:], Y[IX_train,:])

In [17]:
y_pred_test = clf_xgb.predict(X[IX_train[:10],:])

In [18]:
print(y_pred_test)

[[1. 1. 1. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 ...
 [0. 1. 0. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]
 [1. 1. 1. ... 0. 0. 0.]]


In [1]:
clf_xgb.save_model("model.json")

NameError: name 'clf_xgb' is not defined