# Using PCA to reduce the dimensions of the protein training data in CAFA 5 PFP

In [1]:
import cudf
from tqdm import tqdm

from sklearn.model_selection import train_test_split
# from sklearn.ensemble import RandomForestClassifier

from cuml.ensemble import RandomForestClassifier as CuMLRFClassifier

from sklearn.decomposition import PCA

from sklearn.metrics import hamming_loss

import numpy as np
import pandas as pd


import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/t5embeds/train_ids.npy
/kaggle/input/t5embeds/test_embeds.npy
/kaggle/input/t5embeds/train_embeds.npy
/kaggle/input/t5embeds/test_ids.npy
/kaggle/input/train-labels-cafa5/random_forest_pickle_thousand
/kaggle/input/train-labels-cafa5/random_forest_pickle
/kaggle/input/train-labels-cafa5/train_labels.csv


In [3]:
# Protein identifiers; used further down as the row index of the label and feature frames.
prot_ids = np.load("/kaggle/input/t5embeds/train_ids.npy")
# Embeddings cast to single precision; the rest of the notebook treats them as
# row-aligned with prot_ids.
prot_embeds = np.load("/kaggle/input/t5embeds/train_embeds.npy").astype(np.float32)

In [5]:
# Label matrix restricted to the top 1500 GO terms: one row per protein, one column per term.
prot_labels = (
    pd.read_csv("/kaggle/input/train-labels-cafa5/train_labels.csv", dtype=np.float32)
    .drop("Unnamed: 0", axis=1)  # presumably the positional index saved with the CSV
    .set_index(prot_ids)         # key the rows by protein ID instead
)
prot_labels.head()

Unnamed: 0,GO:0005575,GO:0008150,GO:0110165,GO:0003674,GO:0005622,GO:0009987,GO:0043226,GO:0043229,GO:0005488,GO:0043227,...,GO:0034250,GO:0140053,GO:0031345,GO:0098802,GO:0045861,GO:0051783,GO:0031674,GO:0001818,GO:0006874,GO:0016887
P20536,0.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O73864,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
O95231,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A0A0B4J1F4,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
P54366,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Reduce the protein embeddings to 256 dimensions with PCA

In [6]:
# PCA model that keeps the first 256 principal components of the embeddings.
# random_state is pinned (same seed as the train/test split) because the default
# svd_solver='auto' may select the randomized solver for large inputs, in which case
# an unseeded fit gives slightly different components on every run.
pca_256 = PCA(n_components=256, random_state=42)
pca_256

In [7]:
%%time
# Learn the principal components from every available embedding row.
# NOTE(review): the fit happens before the train/test split further down, so the
# held-out rows also influence the learned components.
pca_256.fit(prot_embeds)

CPU times: user 26.4 s, sys: 2.49 s, total: 28.9 s
Wall time: 16.6 s


In [8]:
# Project the embeddings onto the fitted components and index the result by protein ID.
prot_embeds_pca = pca_256.transform(prot_embeds)
prot_df = pd.DataFrame(prot_embeds_pca, index=prot_ids)
prot_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,246,247,248,249,250,251,252,253,254,255
P20536,0.27066,-0.478872,-0.150257,-0.012765,-0.120466,-0.143243,-0.03466,-0.015345,-0.057034,-0.078058,...,0.011249,-0.023013,0.00069,-0.055629,-0.115525,-0.003904,-0.00599,0.003964,0.006203,-0.064973
O73864,-0.117618,-0.010339,-0.206943,0.039822,-0.157024,0.058493,-0.168782,-0.250182,0.104331,-0.38656,...,-0.042388,0.018114,-0.040248,0.008605,-0.034004,0.068799,-0.026455,0.006045,0.042596,-0.035406
O95231,0.224266,0.694896,-0.190856,0.283427,-0.032432,0.342178,-0.126916,-0.004499,-0.562844,-0.149241,...,0.030598,-0.0059,-0.00612,0.02232,-0.00879,0.022833,0.022804,-0.009777,0.030373,-0.00077
A0A0B4J1F4,-0.280479,0.377051,-0.213807,-0.139802,-0.007929,0.089613,-0.077499,0.019848,0.032122,-0.07264,...,-0.026817,-0.00309,-0.004056,-0.006177,0.028495,-0.002037,0.018092,-0.006559,0.019666,-0.017296
P54366,0.542605,0.471899,0.676284,0.192114,0.130854,-0.22421,-0.315011,-0.227262,0.085949,-0.479028,...,0.00806,-0.01297,-0.021175,0.042514,-0.027147,-0.009879,0.014824,0.002485,-0.012398,-0.01317


In [9]:
# Write the PCA-reduced features (protein IDs as the index column) to the working directory.
prot_df.to_csv("prot_df_256_pca.csv")

In [10]:
# Sanity check: features and labels should have the same number of rows.
print(f"prot_df shape {prot_df.shape}")
print(f"prot_labels shape {prot_labels.shape}")

prot_df shape (142246, 256)
prot_labels shape (142246, 1500)


# Prepare to train the model

In [11]:
# Hold out 40% of the proteins for evaluation; the fixed seed keeps the split repeatable.
train_df, test_df, train_labels, test_labels = train_test_split(
    prot_df,
    prot_labels,
    test_size=0.4,
    random_state=42,
)

In [12]:
# Convert the pandas frames needed for fitting and prediction into float32 cuDF frames.
train_df_cudf, train_labels_cudf, test_df_cudf = (
    cudf.DataFrame.from_pandas(frame).astype('float32')
    for frame in (train_df, train_labels, test_df)
)

# Train the model

In [13]:
def _zero_prediction_frame(index, columns):
    """Return a float32 DataFrame of zeros with the given row index and column labels."""
    zeros = np.zeros((len(index), len(columns)), dtype=np.float32)
    return pd.DataFrame(zeros, index=index, columns=columns)


# Zero-filled placeholders for the predictions on the train and test splits;
# the training loop overwrites them one label column at a time.
predicted_labels_train_df = _zero_prediction_frame(train_df.index, train_labels.columns)

predicted_labels_df = _zero_prediction_frame(test_df.index, test_labels.columns)

In [14]:
# Seed for every forest. Matches the train/test split seed and the "random_state_42"
# tag in the prediction file names, so the saved predictions correspond to a seeded run.
# NOTE(review): cuML may still vary slightly between runs unless n_streams=1 is also
# set — confirm against the cuML documentation if exact reproducibility is required.
RF_SEED = 42

n_columns = prot_labels.shape[1]

# Fit one independent binary random forest per label column (GO term) and store its
# predictions for both splits in the pre-allocated pandas frames.
label_names = train_labels_cudf.columns[:n_columns]

for label_name in tqdm(label_names, total=n_columns, ncols=100, desc="Training", unit="column"):
    rf_clf = CuMLRFClassifier(random_state=RF_SEED)
    rf_clf.fit(train_df_cudf, train_labels_cudf[label_name])

    # Predictions come back as cuDF objects; convert to NumPy before writing into pandas.
    predict_labels = rf_clf.predict(test_df_cudf)
    predicted_labels_df[label_name] = predict_labels.to_numpy()

    # Also predict on the training rows so the train-set loss can be reported.
    predict_labels_train = rf_clf.predict(train_df_cudf)
    predicted_labels_train_df[label_name] = predict_labels_train.to_numpy()

Training: 100%|█████████████████████████████████████████████| 1500/1500 [47:59<00:00,  1.92s/column]


In [15]:
# Hamming loss: share of individual (protein, GO term) labels predicted incorrectly,
# computed here on the rows the forests were trained on.
y_true_train = train_labels.iloc[:, :n_columns]
y_pred_train = predicted_labels_train_df.iloc[:, :n_columns]
loss = hamming_loss(y_true_train, y_pred_train)
print(f"Hamming Loss on the train set itself : {loss}.")

Hamming Loss on the train set itself : 0.006939255822309708.


In [16]:
# Same metric on the held-out rows: share of individual labels predicted incorrectly.
y_true_test = test_labels.iloc[:, :n_columns]
y_pred_test = predicted_labels_df.iloc[:, :n_columns]
loss = hamming_loss(y_true_test, y_pred_test)
print(f"Hamming Loss on the test set : {loss}.")

Hamming Loss on the test set : 0.018094424623748512.


In [17]:
# Write the train-split and test-split predictions to CSV in the working directory.
# NOTE(review): the file names say "upto_500_terms", but the frames hold a column for
# every label (1500 in the recorded run). Names are kept as-is in case other
# notebooks read these files; "random_state_42" matches the train/test split seed.
for csv_name, frame in (
    ("rf_pred_train_upto_500_terms_random_state_42.csv", predicted_labels_train_df),
    ("rf_pred_upto_500_terms_random_state_42.csv", predicted_labels_df),
):
    frame.to_csv(csv_name)