# Random Forest Classifier CAFA5 - Protein Function Prediction

# Import Statements

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import cudf
# import cuml

from sklearn.model_selection import train_test_split
from cuml.ensemble import RandomForestClassifier as CuMLRFClassifier

from sklearn.metrics import hamming_loss

!pip install pickle-mixin

from tqdm import tqdm
import pickle

In [None]:
import os

# Print the full path of every file found anywhere below the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

In [None]:
class Paths:
    """Locations of the Kaggle input files read by this notebook."""

    # GO term annotations (TSV) and the pre-computed label table (CSV)
    train_terms_tsv = "/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv"
    train_labels_csv = "/kaggle/input/train-labels-cafa5/train_labels.csv"

    # T5 embeddings and the matching protein ids for the training proteins
    train_embeds_npy = "/kaggle/input/t5embeds/train_embeds.npy"
    train_ids_npy = "/kaggle/input/t5embeds/train_ids.npy"

    # T5 embeddings and the matching protein ids for the test proteins
    test_embeds_npy = "/kaggle/input/t5embeds/test_embeds.npy"
    test_ids_npy = "/kaggle/input/t5embeds/test_ids.npy"

# Read The Training Dataset

In [None]:
# Load the GO term annotations (a tab-separated file) and report the table size.
train_terms = pd.read_csv(
    Paths.train_terms_tsv,
    sep="\t",
)
print(train_terms.shape)

In [None]:
# How many of the most frequent GO terms to model, and their annotation counts
# (value_counts sorts by descending frequency, so the head is the top-N).
num_of_labels = 500
labels_to_consider = train_terms["term"].value_counts().head(num_of_labels)

In [None]:
# Bar chart of annotation counts for the selected terms. Tick labels are hidden
# because hundreds of GO ids would be unreadable on the x axis.
plt.bar(labels_to_consider.index, labels_to_consider)
plt.xticks([])
plt.show()

In [None]:
# Announce how many of the most frequent GO terms the model is restricted to.
print("We will choose only top {} frequent labels (in train) and predict only them.".format(num_of_labels))

In [None]:
# Rebind labels_to_consider to a plain list of the top GO term ids (most
# frequent first); from here on it no longer carries the counts.
labels_to_consider = (
    train_terms["term"].value_counts().head(num_of_labels).index.tolist()
)
print("labels_to_consider")
print("First {}\n {}".format(10, labels_to_consider[:10]))

In [None]:
# Keep only the annotation rows whose GO term is one of the selected labels.
train_terms_updated = train_terms[train_terms["term"].isin(labels_to_consider)]
train_terms_updated.shape

# Load the t5_embeds Data

In [None]:
# Load the protein ids and their embeddings; the embeddings are cast to float32.
# NOTE(review): allow_pickle=True can run arbitrary code from a crafted .npy
# file, so this is only appropriate for trusted datasets.
prot_ids = np.load(Paths.train_ids_npy, allow_pickle=True)
prot_embeddings = np.load(Paths.train_embeds_npy, allow_pickle=True)
prot_embeddings = prot_embeddings.astype(np.float32)

In [None]:
# Number of proteins for which an id (and presumably an embedding row) was loaded.
num_of_proteins = prot_ids.shape[0]

In [None]:
# Wrap the embedding matrix in a DataFrame indexed by protein id, with one
# column per embedding dimension named Column_1 .. Column_<width>.
column_num = prot_embeddings.shape[1]
prot_df = pd.DataFrame(
    prot_embeddings,
    index=prot_ids,
    columns=["Column_" + str(i + 1) for i in range(column_num)],
    dtype=np.float32,
)
prot_df.head()

In [None]:
# Sanity check: (number of proteins, embedding width) of the feature table.
print(prot_df.shape)

# Create a prot_labels DataFrame which contains binary (multi-hot) encoded GO labels

In [None]:
# !pip install progressbar
# import progressbar

# # Creating a label_df which is a binary representation of which protein is
# # annotated by which of the chosen top protein labels and which are not.

# bar = progressbar.ProgressBar(maxval=num_of_labels, \
#     widgets=[progressbar.Bar('=', '[', ']'), ' ', progressbar.Percentage()])

# prot_labels = np.zeros((num_of_proteins ,num_of_labels), dtype=np.float32)
# # convert prot_ids to series to use the .isin method
# prot_ids_series = pd.Series(prot_ids)

# bar.start()
# for label_i in range(num_of_labels):

#     label_i_go_id = labels_to_consider[label_i]

#     prots_annotated_by_label = train_terms_updated[train_terms_updated['term'] == label_i_go_id ]["EntryID"].unique()

#     prot_labels[:,label_i] = prot_ids_series.isin(prots_annotated_by_label).astype(np.float32)

#     bar.update(label_i+1)
# bar.finish()

# prot_labels = pd.DataFrame(prot_labels,columns=labels_to_consider, dtype=np.float32)
# prot_labels.head()

In [None]:
# load pre-created train_labels dataset from private dataset
# (dtype=np.float32 applies to every column of the CSV, including "Unnamed: 0")
prot_labels = pd.read_csv(Paths.train_labels_csv, dtype=np.float32)
prot_labels.head()

In [None]:
# The pre-computed label table holds more GO terms than we train on (1500 per
# the original author's note), so keep only the first `num_of_labels` columns
# instead of a hard-coded 500 that could drift from the label selection.
# NOTE(review): assumes the CSV columns are ordered like labels_to_consider
# (most frequent first) and its rows follow the order of prot_ids; confirm.

# errors="ignore" keeps the cell re-runnable once the column is already gone.
prot_labels = prot_labels.drop(columns="Unnamed: 0", errors="ignore")
prot_labels = prot_labels.iloc[:, :num_of_labels]

# Index by protein id so the labels line up with prot_df; set_index raises
# if the number of rows does not match the number of ids.
prot_labels.set_index(prot_ids, inplace=True)
print(prot_labels.shape)
prot_labels.head()

# Split data into testing and training set

In [None]:
# Hold out 40% of the proteins (features and labels split together) for evaluation;
# the fixed random_state makes the shuffle, and therefore the split, repeatable.
train_df, test_df, train_labels, test_labels = train_test_split(prot_df, prot_labels, test_size=0.4, random_state=42)

# Training the Model Using GPU Acceleration (GPU T4 x2)

In [None]:
# Convert the pandas frames to cuDF DataFrames (cast to float32) for the cuML classifier.
# test_labels is not converted: it is only passed to scikit-learn's hamming_loss later.
train_df_cudf = cudf.DataFrame.from_pandas(train_df).astype('float32')
train_labels_cudf = cudf.DataFrame.from_pandas(train_labels).astype('float32')
test_df_cudf = cudf.DataFrame.from_pandas(test_df).astype('float32')

In [None]:
from time import sleep

In [54]:
# Zero-filled float32 frame (train proteins x GO terms) that will receive the
# per-term predictions for the training rows.
predicted_labels_train_df = pd.DataFrame(
    np.float32(0),
    index=train_df.index,
    columns=train_labels.columns,
    dtype=np.float32,
)

In [55]:
# Zero-filled float32 frame (held-out proteins x GO terms) that will receive
# the per-term predictions for the held-out rows.
predicted_labels_df = pd.DataFrame(
    np.float32(0),
    index=test_df.index,
    columns=test_labels.columns,
    dtype=np.float32,
)

In [57]:
# Train one independent binary random forest per GO term and store its
# predictions for both the held-out proteins and the training proteins.
#
# The number of terms is taken from the label table rather than a hard-coded
# 500, so the loop cannot run past the last column or silently skip terms if
# the label selection changes.
n_columns = len(train_labels_cudf.columns)

# NOTE(review): no random_state is passed to the classifier, so predictions
# may differ between runs; confirm whether that is intended.
for col in tqdm(range(n_columns), ncols=100, desc="Training", unit="column"):
    term = train_labels_cudf.columns[col]

    # A fresh model per term: each GO term is treated as its own binary task.
    rf_clf = CuMLRFClassifier()
    rf_clf.fit(train_df_cudf, train_labels_cudf.iloc[:, col])

    # Predictions come back from cuML; convert to NumPy before writing them
    # into the pandas result frames.
    predict_labels = rf_clf.predict(test_df_cudf)
    predicted_labels_df[term] = predict_labels.to_numpy()

    predict_labels_train = rf_clf.predict(train_df_cudf)
    predicted_labels_train_df[term] = predict_labels_train.to_numpy()

Training: 100%|███████████████████████████████████████████████| 500/500 [24:44<00:00,  2.97s/column]


In [58]:
# Hamming loss on the training proteins: the fraction of individual
# (protein, term) labels predicted incorrectly, over the first n_columns terms.
# NOTE(review): if the label matrix is mostly zeros, a low value may largely
# reflect correctly predicted negatives; consider a per-term F1 as well.
loss = hamming_loss(train_labels.iloc[:,:n_columns],predicted_labels_train_df.iloc[:,:n_columns])
print(f"Hamming Loss on the train set itself : {loss}.")

Hamming Loss on the train set itself : 0.014508090501130678.


In [59]:
# Hamming loss on the held-out proteins (the 40% split that was not used for fitting),
# over the first n_columns terms. This overwrites the `loss` computed for the train set.
loss = hamming_loss(test_labels.iloc[:,:n_columns],predicted_labels_df.iloc[:,:n_columns])
print(f"Hamming Loss on the test set : {loss}.")

Hamming Loss on the test set : 0.0424265452819909.


In [60]:
# Persist both prediction tables (protein id index + one column per GO term) as CSV
# in the working directory. The "random_state_42" suffix presumably refers to the
# seed used for the train/test split.
predicted_labels_train_df.to_csv("rf_pred_train_upto_500_terms_random_state_42.csv")
predicted_labels_df.to_csv("rf_pred_upto_500_terms_random_state_42.csv")

## Evaluating how well the model's predictions match test_labels