In [None]:
# Importing Libraries
import numpy as np
import pandas as pd
import os
import xgboost as xgb
from sklearn.metrics import confusion_matrix, roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV

In [None]:
# XGBoost classifier configured for binary (logistic) classification.
# NOTE(review): "gpu_hist" needs a GPU runtime; recent XGBoost releases
# deprecate it in favour of tree_method="hist" with device="cuda" — confirm
# against the installed version.
clf_xgb = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42,
    tree_method="gpu_hist",
)

In [None]:
# Restore the saved model (per the original note: trained on all labels)
# into the classifier instance created above.
xgb_model_path = "/kaggle/input/xgbdata/xgboost-full-2905-5pm.json"
clf_xgb.load_model(xgb_model_path)

In [None]:
# GO label array; later cells index it as Y[rows, label_column].
go_labels_path = "/kaggle/input/xgbdata/Y_1499.npy"
Y = np.load(go_labels_path)

In [None]:
# Label identifiers, used further down as the 'GO Term Id' values of the
# submission frame.
# NOTE(review): presumably one entry per column of Y — verify.
go_term_ids_path = "/kaggle/input/xgbdata/Y_1499_labels.npy"
labels_to_consider = np.load(go_term_ids_path)

In [None]:
# Path of the feature file that the next cell loads into X.
fn = "/kaggle/input/t5embeds/test_embeds.npy"

In [None]:
# Load the feature matrix X from `fn`, picking the reader from the file name.
print(fn)
if '.csv' in fn:
    # The first CSV column is used as the index; the remaining columns
    # become the feature values.
    df = pd.read_csv(fn, index_col=0)
    X = df.values
elif '.npy' in fn:
    X = np.load(fn)
else:
    # Fail loudly: otherwise X would be undefined on a fresh kernel, or
    # silently keep a stale value from an earlier run of this cell.
    raise ValueError(f"Unsupported feature file type (expected .csv or .npy): {fn!r}")
print(X.shape)
X

## To generate new predictions, uncomment and run the following 2 cells

In [None]:
# Y_pred = clf_xgb.predict(X)

In [None]:
# np.save("xgb-prediction-full-2905-5pm.json", Y_pred)

In [None]:
# Previously saved predictions, loaded from disk instead of recomputed;
# later cells read one label per column (Y_pred[:, i]).
saved_predictions_path = "/kaggle/input/predictions-t5-test-embeds/xgb-prediction-test-embeds-full-2905-5pm.json.npy"
Y_pred = np.load(saved_predictions_path)

In [None]:
%%time 
# Empty submission table; its columns are filled in by the cells below.
submission_columns = ['Protein Id', 'GO Term Id', 'Prediction']
df_finalSubmission = pd.DataFrame(columns=submission_columns)

In [None]:
%%time
# Identifiers of the test proteins.
# NOTE(review): presumably in the same row order as test_embeds.npy — verify.
# Note that this rebinds `fn`, which previously pointed at the embeddings file.
fn = "/kaggle/input/t5embeds/test_ids.npy"
vec_test_protein_ids = np.load(fn)
print(np.shape(vec_test_protein_ids))
vec_test_protein_ids

In [None]:
# Number of leading test proteins expanded into the submission frame below.
test_data_batch_sz = 5_000

In [None]:
%%time 
# Expand the first `test_data_batch_sz` protein ids so that each id occurs
# once per prediction column (consecutive copies of the same id).
n_pred_columns = Y_pred.shape[1]
l = []
for k in list(vec_test_protein_ids[:test_data_batch_sz]):
    l.extend([k] * n_pred_columns)
print(len(l), l[:20])

df_finalSubmission['Protein Id'] = l

In [None]:
# The full label sequence repeated once per protein in the batch, so that it
# lines up with the per-protein repetition used for the 'Protein Id' column.
labels_to_consider_repeated = [item for _ in range(test_data_batch_sz) for item in labels_to_consider]
# Show only the length and a short preview: the full list holds
# test_data_batch_sz * len(labels_to_consider) entries and would flood the
# cell output if rendered whole.
len(labels_to_consider_repeated), labels_to_consider_repeated[:20]

In [None]:
# `labels_to_consider` is a NumPy array (it comes from np.load), so
# `labels_to_consider * test_data_batch_sz` is an element-wise operation and
# never repeats the sequence the way list multiplication would. np.tile
# repeats the whole label sequence once per protein in the batch, matching
# the layout of the 'Protein Id' column.
# NOTE(review): assumes labels_to_consider is 1-D with one entry per
# prediction column — confirm.
df_finalSubmission['GO Term Id'] = np.tile(labels_to_consider, test_data_batch_sz)

In [None]:
# confusion_matrix(Y[IX_test,:],Y_pred)

In [None]:
# ROC AUC for every label column; the scores are collected in `l`.
# NOTE(review): `IX_test` is not defined in any visible cell — confirm where
# it comes from and that Y[IX_test] rows correspond to the rows of Y_pred.
l = []
for i in range(Y.shape[1]):
    y_true = Y[IX_test, i]
    y_score = Y_pred[:, i]
    if np.unique(y_true).size > 1:
        s = roc_auc_score(y_true, y_score)
    else:
        # Only one class among the true labels: fall back to 0.5
        # instead of calling roc_auc_score.
        s = 0.5
    l.append(s)
    # Progress output for every 10th label.
    if i % 10 == 0:
        print(i, s)

In [None]:
# One-row summary table for the "XGB" model, filled cell by cell through
# label-based assignment on an initially empty frame.
df_models_stat = pd.DataFrame()
# Mean of the per-label scores currently held in `l`.
df_models_stat.loc["XGB",'RocAuc Mean Test'] = np.mean(l)
# NOTE(review): `IX_test` is not defined in any visible cell — confirm its origin.
df_models_stat.loc["XGB",'Test Size'] = len(IX_test)
df_models_stat

In [None]:
import matplotlib.pyplot as plt

# Histogram of the per-label scores held in `l`, followed by their summary
# statistics as the cell output.
fig, ax = plt.subplots()
ax.hist(l)
plt.show()
pd.Series(l).describe()

In [None]:
# Accuracy for every label column. This rebinds `l`, replacing the scores
# computed by the earlier ROC AUC cell.
# NOTE(review): `IX_test` is not defined in any visible cell — confirm its
# origin. The 0.5 fallback for single-class columns mirrors the ROC AUC cell;
# confirm it is intended for accuracy as well.
l = []
for i in range(Y.shape[1]):
    column_truth = Y[IX_test, i]
    has_both_classes = len(np.unique(column_truth)) > 1
    s = accuracy_score(column_truth, Y_pred[:, i]) if has_both_classes else 0.5
    l.append(s)
    if i % 10 == 0:
        print(i, s)

In [None]:
# Start again from an empty submission table; this discards any columns
# that earlier cells filled in on the previous frame.
df_finalSubmission = pd.DataFrame(
    columns=[
        'Protein Id',
        'GO Term Id',
        'Prediction',
    ]
)

In [None]:
%%time
# Reload the test protein identifiers (same file as the earlier load cell).
fn = "/kaggle/input/t5embeds/test_ids.npy"
vec_test_protein_ids = np.load(fn)
ids_shape = vec_test_protein_ids.shape
print(ids_shape)
vec_test_protein_ids

In [None]:
# Expand every test protein id so that it occurs once per prediction column,
# then store the result as the 'Protein Id' column.
labels_per_protein = Y_pred.shape[1]
l = [
    protein_id
    for protein_id in list(vec_test_protein_ids)
    for _ in range(labels_per_protein)
]
print(len(l), l[:20])

df_finalSubmission['Protein Id'] = l

In [None]:
# Build a single-point parameter grid: every current parameter of clf_xgb,
# each wrapped in a one-element list.
default_params = {}
gparams = clf_xgb.get_params()
for key in gparams.keys():
    gp = gparams[key]
    default_params[key] = [gp]

    
# Benchmark model. No real grid search is performed, since every parameter has a single candidate value.
# However, cross-validation (cv=3) is still executed.
clf0 = GridSearchCV(estimator=clf_xgb, scoring='roc_auc', param_grid=default_params, return_train_score=True, verbose=1, cv=3)
# NOTE(review): X rows are selected with IX_test while Y rows are selected
# with IX_train, and neither index is defined in any visible cell — confirm
# the intended split.
# NOTE(review): Y is loaded with np.load earlier in this notebook, so the
# pandas-style `.values` access looks like a leftover from a DataFrame
# version and will likely fail — confirm.
clf0.fit(X[IX_test,:], Y[IX_train,:].values.ravel())

# Cross-validation results as a table. This rebinds `df`, which the CSV
# branch of the feature-loading cell also uses.
df = pd.DataFrame(clf0.cv_results_)

# NOTE(review): X_train, X_test, y_train and y_test are not defined in any
# visible cell — confirm where they come from.
train_predictions = clf0.predict(X_train)
test_predictions = clf0.predict(X_test)

# Accuracy of the predictions on the train and test splits.
accs_train = accuracy_score(y_train, train_predictions)
accs_test = accuracy_score(y_test, test_predictions)