In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd 'drive/MyDrive/Uni/UniPD/BioData/project/biological_data_pfp'

/content/drive/MyDrive/Uni/UniPD/BioData/project/biological_data_pfp


In [1]:
import h5py
import pandas as pd
import numpy as np
import networkx
import hashlib
import obonet
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from scipy import sparse
from tensorflow.keras import backend as K
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import KFold
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras import layers

2024-01-26 15:37:47.990527: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-01-26 15:37:47.990581: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-01-26 15:37:47.991473: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-01-26 15:37:47.997077: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


### Common Function

In [2]:
def readh5_to_dict(file_path):
  # Create an empty dictionary to store the data
  p_embeddings_data = {}

  # Open the HDF5 file
  with h5py.File(file_path, 'r') as p_embeddings:
    # Store the data in the dictionary
    for key in p_embeddings.keys():
      p_embeddings_data[key] = p_embeddings[key][...]

  return p_embeddings_data

In [3]:
def sample_protein_ids(file_path,percentage):

  # Read the IDs from the text file
  with open(file_path, 'r') as file:
    ids = [line.strip() for line in file]

  # Calculate the index to get the first 30% of IDs
  split_index = int(len(ids) * percentage)

  # Select the first 30% of IDs
  selected_ids = ids[:split_index]

  return selected_ids

In [4]:
def read_tsv(tsv_file_path):
  # Read the TSV file into a Pandas DataFrame
  df_train_set = pd.read_csv(tsv_file_path, sep='\t')

  # Display the DataFrame
  return df_train_set

In [5]:
def read_dat(file_path):
  column_names = ['Protein_ID', 'IPR_ID', 'description', 'domain','dc1','dc2']
  df = pd.read_csv(file_path, delimiter='\t',names=column_names)

  return df


In [6]:
def filter_train_data(df, selected_ids, category):
  filtered_df = df[df['Protein_ID'].isin(selected_ids)]
  filtered_df = filtered_df[filtered_df['aspect'] == category]

  return filtered_df

In [7]:
def encode_go_terms(train_df):
  one_hot_encoding = pd.get_dummies(train_df['GO_term'])

  # Concatenate the one-hot encoded columns with the original DataFrame
  df_encoded = pd.concat([train_df, one_hot_encoding], axis=1)
  df_encoded_grouped = df_encoded.groupby('Protein_ID').sum().reset_index()

  return df_encoded_grouped

In [8]:
def encode_go_terms_sparse(train_df, chunk_size=10000):
    # Unique GO terms and Protein IDs
    go_terms = train_df['GO_term'].unique()
    protein_ids = train_df['Protein_ID'].unique()
    
    # Mapping of GO terms and Protein IDs to integer indices
    go_term_to_index = {go_term: i for i, go_term in enumerate(go_terms)}
    protein_id_to_index = {protein_id: i for i, protein_id in enumerate(protein_ids)}
    
    # Initialize a sparse matrix
    encoded_matrix = sparse.lil_matrix((len(protein_ids), len(go_terms)), dtype=np.int8)
    
    # Process in chunks using tqdm for progress bar
    total_rows = train_df.shape[0]
    for start in tqdm(range(0, total_rows, chunk_size), desc="Encoding", total=total_rows // chunk_size + 1):
        end = min(start + chunk_size, total_rows)
        chunk = train_df.iloc[start:end]
        
        rows = chunk['Protein_ID'].map(protein_id_to_index)
        cols = chunk['GO_term'].map(go_term_to_index)
        data = np.ones(len(chunk), dtype=np.int8)
        
        # Create a sparse matrix for the chunk
        chunk_matrix = sparse.coo_matrix((data, (rows, cols)), shape=encoded_matrix.shape, dtype=np.int8)
        
        # Add the chunk matrix to the main matrix
        encoded_matrix += chunk_matrix

    # Convert to DataFrame
    df_encoded = pd.DataFrame.sparse.from_spmatrix(encoded_matrix, index=protein_ids, columns=go_terms)
    df_encoded = df_encoded.reset_index()
    df_encoded.rename(columns={'index': 'Protein_ID'}, inplace=True)

    return df_encoded

In [9]:
def encode_ipr_domain(df_ipr):
    df_ipr = df_ipr.drop(columns=['IPR_ID', 'description','dc1','dc2'])
    one_hot_encoding = pd.get_dummies(df_ipr['domain'],sparse=True)

    # Concatenate the one-hot encoded columns with the original DataFrame
    df_encoded = pd.concat([df_ipr, one_hot_encoding], axis=1)
    df_encoded_grouped = df_encoded.groupby('Protein_ID').sum().reset_index()

    return df_encoded

In [10]:
def get_embeddings(df, embeddings_dict):
  df['embedding'] = df['Protein_ID'].map(embeddings_dict)

  return df

In [11]:
def get_ipr(df_ipr,df_train):
   isp_dict = df_ipr.set_index('Protein_ID')['domain'].to_dict()
   df_train['ipr'] = df_train['Protein_ID'].map(isp_dict)

   return df_train

In [12]:
def create_y(df):
  y = df.to_numpy()
  return y


In [13]:
def create_X(df,variables):
  X = np.array(df[variables])
  X = np.vstack(X)

  return X

In [14]:
def get_top_freq(column, freq):
    top_freq = []
    for col in column:
        if col not in freq:
            continue
        top_freq.append(col)
    return top_freq

In [15]:
def create_training_ids(selected_ids, train_ids, percentage):
    ids = sample_protein_ids(train_ids, percentage)
    new_ids = sample_protein_ids(selected_ids, 1.0)
    difference = list(set(ids) - set(new_ids))
    return difference

In [16]:
def build_test_data(test_embeddings_data, test_ids):
    X_test = []
    for id in test_ids:
        X_test.append(test_embeddings_data[id])
    X_test = np.array(X_test)
    return X_test

In [17]:
def f1_score(y_true, y_pred):
    precision = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) / (K.sum(K.round(K.clip(y_pred, 0, 1))) + K.epsilon())
    recall = K.sum(K.round(K.clip(y_true * y_pred, 0, 1))) / (K.sum(K.round(K.clip(y_true, 0, 1))) + K.epsilon())
    return 2 * (precision * recall) / (precision + recall + K.epsilon())

In [18]:
def k_fold_cv(X, y, model, n_splits):
    histories = []
    
    # Initialize KFold
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    
    # Iterate over each fold
    fold_no = 1
    for train_index, test_index in kf.split(X):
        # Splitting the data into training and testing sets for this fold
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # Compile the model
        model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    
        # Fit the model
        print(f'Training for fold {fold_no} ...')
        history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))
        histories.append(history)
        
        # Here, you can evaluate the model on the test set, e.g., calculate metrics
        results = model.evaluate(X_test, y_test)
        print(f"Test results - Fold {fold_no}: {model.metrics_names[0]} of {results[0]}; {model.metrics_names[1]} of {results[1]*100}%")
    
        fold_no += 1

In [19]:
### Propagating the probability of the children to the parent, in this case if the parent has several children, we will take the max probabilty of the children
def post_processing(y_pred, pred_scolumns, graph):
    new_preds = []

    for pred in y_pred:
        ### Build prediction dict
        preds = {k: 0 for k in pred_columns}
        new_pred = [0 for i in range(len(pred))]
        for i in range(len(pred)):
            term = pred_columns[i]
            preds[term] = pred[i]

        ### Search the probabilty for the parent
        pool = set()
        for term, prob in preds.items():
            for parent, child, key in graph.in_edges(term, keys=True):
                if key not in {'is_a', 'part_of'} or parent not in preds:
                    continue

                probability = max(prob, preds[parent])
                preds[parent] = probability
                    
        ### Build the array for the new preds
        for term, prob in preds.items():
            idx = pred_columns.index(term)
            new_pred[idx] = prob
        new_preds.append(new_pred)
    return np.array(new_preds)

In [20]:
def generate_submission_df(y_pred, test_ids, pred_columns):
    # assert that the length of y_pred must be same as test_ids
    assert len(y_pred) == len(test_ids)
    
    # Group by the result and then sort by score id
    out = {'id': [], 'term': [], 'score': []}
    for i in range(len(y_pred)):
        for j in range(len(y_pred[i])):
            out['id'].append(test_ids[i])
            out['term'].append(pred_columns[j])
            out['score'].append(y_pred[i][j])
    
    out_df = pd.DataFrame(out).reset_index(drop=True)
    
    out_df = out_df.groupby('id', group_keys=False)
    out_df = out_df.apply(lambda x: x.sort_values(by='score', ascending=False))
    
    # Filter the DataFrame
    out_df = out_df[out_df['id'].isin(test_ids)]
    
    # Convert the 'ID' column to a Categorical with the order defined in filter_array
    out_df['id'] = pd.Categorical(out_df['id'], categories=test_ids, ordered=True)
    
    # Sort by the 'ID' column
    out_df = out_df.sort_values('id')
    out_df['id'] = out_df['id'].astype(str)
    return out_df

In [132]:
def train(X_train, y_train, name_path=None):
    '''
        Train the model.
    '''
    embedding_size = len(X_train[1]) 
    num_classes = len(y_train[1])
    
    final_model = keras.Sequential([
        layers.Input(shape=(embedding_size,)),
        layers.Dense(128, activation='relu'),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='sigmoid')
    ])
    
    final_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=[f1_score])
    final_model.fit(X_train, y_train, epochs=50, batch_size=32)

    if name_path:
        final_model.save(f'{name_path}.h5')

    return final_model

In [22]:
def concat_predictions(bp_path, mf_path, cc_path):
    # Read the files into DataFrames
    bp_df = pd.read_csv(bp, sep='\t', header=None, names=['Protein_ID', 'GO_term', 'score'])
    mf_df = pd.read_csv(mf, sep='\t', header=None, names=['Protein_ID', 'GO_term', 'score'])
    cc_df = pd.read_csv(cc, sep='\t', header=None, names=['Protein_ID', 'GO_term', 'score'])
    
    # Concatenate the DataFrames
    concatenated_df = pd.concat([bp_df, mf_df, cc_df])
    
    # Create a custom sorting order based on the external list
    sorting_order = {id: index for index, id in enumerate(test_ids)}
    concatenated_df['sort_order'] = concatenated_df['Protein_ID'].map(sorting_order)
    
    # Sort by custom order and then by probability within each group
    sorted_df = concatenated_df.sort_values(by=['sort_order', 'score'], ascending=[True, False])
    
    # Limit to 1500 rows per ID
    limited_df = sorted_df.groupby('Protein_ID').head(1500)
    
    # Drop the auxiliary 'sort_order' column
    limited_df = limited_df.drop(columns=['sort_order'])
    
    return limited_df

### Preprocess Dataset - Combined Dataset

In [86]:
df_train_set_all = read_tsv('./dataset/train/train_set.tsv')
# Desired aspects
desired_aspects = {'cellular_component', 'biological_process', 'molecular_function'}

# Function to check if all aspects are present
def check_aspects(group):
    return desired_aspects == set(group['aspect'])

# Apply the function to each group
result = df_train_set_all.groupby('Protein_ID').filter(check_aspects)['Protein_ID'].unique()
np.savetxt('./dataset/train/sampled_train.txt', result, fmt='%s')

In [87]:
# Create test dataset containing 1000 data of the proteins in the result, this is created to help validate our modell using cafa evaluator
selected_test_ids = np.array(sample_protein_ids('./dataset/train/sampled_train.txt', 0.03140))
np.savetxt('./dataset/test/sampled_test.txt', selected_test_ids, fmt='%s')

# Create ground truth file
ground_truth_df = df_train_set_all[df_train_set_all['Protein_ID'].isin(selected_test_ids)]
ground_truth_df = ground_truth_df[['Protein_ID', 'GO_term']]
ground_truth_df.to_csv('./dataset/test/sampled_gt.tsv', sep='\t', index=False, header=False)

In [88]:
p_embeddings_data = readh5_to_dict('./dataset/train/train_embeddings.h5')
test_embeddings_data = readh5_to_dict('./dataset/test/test_embeddings.h5')

In [89]:
%%time
graph = obonet.read_obo('./dataset/taxonomy/go-basic.obo')

CPU times: user 4.05 s, sys: 28.5 ms, total: 4.08 s
Wall time: 4.08 s


### Preprocess Dataset - Celullar component

In [119]:
# Train dataset
selected_ids = create_training_ids('./dataset/test/sampled_test.txt', './dataset/train/train_ids.txt', 1.0)
df_train_set = filter_train_data(df_train_set_all, selected_ids,'cellular_component')

In [120]:
# Test dataset
test_ids = sample_protein_ids('./dataset/test/sampled_test.txt', 1.0)
X_test_gt = build_test_data(p_embeddings_data, test_ids)

In [121]:
df_encoded = encode_go_terms_sparse(df_train_set)
df_encoded = get_embeddings(df_encoded, p_embeddings_data)
df_encoded.head()

Encoding: 100%|███████████████████████████████| 110/110 [01:02<00:00,  1.77it/s]


Unnamed: 0,Protein_ID,GO:0005575,GO:0110165,GO:0005622,GO:0043226,GO:0030139,GO:0097708,GO:0005737,GO:0045335,GO:0031982,...,GO:0005637,GO:0099092,GO:0099091,GO:0030140,GO:0005703,GO:0097386,GO:0000235,GO:0070822,GO:0010287,embedding
0,Q55DL5,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"[0.1418, 0.06207, 0.07367, -0.0712, 0.0703, -0..."
1,O81027,1,1,1,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,"[0.0491, 0.0389, -0.0178, 0.02779, -0.00568, 0..."
2,Q04418,1,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,"[-0.022, -0.06964, -0.007042, 0.0544, -0.04633..."
3,Q7ZT12,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.04028, -0.03357, 0.1046, 0.0669, -0.07935, ..."
4,Q07627,1,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,"[0.013565, 0.1422, 0.1249, 0.05283, 0.00569, -..."


In [122]:
df_encoded.isna().sum().sum()

  df_encoded.isna().sum().sum()


0

In [123]:
# Select top N labels, for celullar component we select top 300
freq_df = pd.read_csv('./dataset/train/cellular_component_freq.csv')[:300]

In [124]:
y_columns = df_encoded.iloc[:, 1:-1]
pred_columns = get_top_freq(y_columns.columns.tolist(), set(freq_df['id']))

y_columns = y_columns[pred_columns]
y = create_y(y_columns)
X = create_X(df_encoded,'embedding')

In [133]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
model = train(X_train, y_train, './model/cc_model')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(


In [134]:
y_pred = model.predict(X_test)



### Post Processing - Cellular Component

In [44]:
# new_y_pred = post_processing(y_pred, pred_columns, graph)

### Evaluation - Cellular Component

In [135]:
# Convert probabilities to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12546
           1       0.99      1.00      1.00     12438
           2       0.89      0.96      0.92      9942
           3       0.81      0.90      0.85      8531
           4       0.41      0.13      0.20       138
           5       0.49      0.28      0.36       768
           6       0.76      0.86      0.81      7405
           7       0.39      0.11      0.18        96
           8       0.51      0.22      0.31      1045
           9       0.77      0.86      0.81      7343
          10       0.79      0.86      0.82      7814
          11       0.80      0.90      0.85      8154
          12       0.49      0.28      0.35       766
          13       0.83      0.60      0.69      1329
          14       0.76      0.36      0.49       968
          15       0.73      0.32      0.44       728
          16       0.58      0.38      0.46      2544
          17       0.69    

  _warn_prf(average, modifier, msg_start, len(result))


In [100]:
# # Convert probabilities to binary predictions
# new_y_pred_binary = (new_y_pred > 0.5).astype(int)
# print(classification_report(y_test, new_y_pred_binary))

300

In [136]:
### Prediction using sample test
y_pred_gt = model.predict(X_test_gt)



In [137]:
submission_df = generate_submission_df(y_pred_gt, test_ids, pred_columns)
submission_df.to_csv('./dataset/prediction/sample_prediction_cc.tsv', sep='\t', header=False, index=False)

In [53]:
# new_y_pred_gt = post_processing(y_pred_gt, pred_columns, graph)

In [54]:
# submission_df = generate_submission_df(new_y_pred_gt, test_ids, pred_columns)
# submission_df.to_csv('./dataset/prediction/sample_prediction_cc_propagate.tsv', sep='\t', header=False, index=False)

### Preprocess Dataset - Molecular Function

In [138]:
# Train dataset
selected_ids = create_training_ids('./dataset/test/sampled_test.txt', './dataset/train/train_ids.txt', 1.0)
df_train_set = filter_train_data(df_train_set_all, selected_ids, 'molecular_function')

In [139]:
# Test dataset
test_ids = sample_protein_ids('./dataset/test/sampled_test.txt', 1.0)
X_test_gt = build_test_data(p_embeddings_data, test_ids)

In [140]:
df_encoded = encode_go_terms_sparse(df_train_set)
df_encoded = get_embeddings(df_encoded, p_embeddings_data)
df_encoded.head()

Encoding: 100%|█████████████████████████████████| 53/53 [00:18<00:00,  2.83it/s]


Unnamed: 0,Protein_ID,GO:0016830,GO:0016829,GO:0016833,GO:0003824,GO:0003674,GO:0005488,GO:0005515,GO:0003676,GO:0003690,...,GO:0001091,GO:0042287,GO:0050897,GO:0015923,GO:0016251,GO:0004559,GO:0030515,GO:0016863,GO:0005337,embedding
0,O81027,1,1,1,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,"[0.0491, 0.0389, -0.0178, 0.02779, -0.00568, 0..."
1,Q8IXT2,0,0,0,0,1,1,0,1,1,...,0,0,0,0,0,0,0,0,0,"[-0.02515, -0.01331, 0.00575, 0.004353, -0.069..."
2,Q9WUC4,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,"[0.06134, -0.00452, 0.01472, 0.001324, 0.03162..."
3,Q6P6T4,0,0,0,0,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,"[0.02074, 0.09515, 0.0519, 0.00766, -0.02692, ..."
4,P04014,0,0,0,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,"[0.01222, -0.0453, 0.0269, -0.00953, -0.01057,..."


In [141]:
# Select top N labels, for molecular function we pick top 450
freq_df = pd.read_csv('./dataset/train/molecular_function_freq.csv')[:450]

In [142]:
y_columns = df_encoded.iloc[:, 1:-1]
pred_columns = get_top_freq(y_columns.columns.tolist(), set(freq_df['id']))
y_columns = y_columns[pred_columns]
y = create_y(y_columns)
X = create_X(df_encoded,'embedding')

In [143]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
model = train(X_train, y_train, './model/mf_model')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(


In [144]:
y_pred = model.predict(X_test)



### Evaluation - Molecular Function

In [145]:
# Convert probabilities to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       0.76      0.46      0.57        70
           1       0.84      0.60      0.70       250
           2       0.87      0.92      0.90      3627
           3       1.00      1.00      1.00      8205
           4       0.85      0.86      0.85      5453
           5       0.75      0.73      0.74      4182
           6       0.79      0.71      0.75      1431
           7       0.63      0.60      0.62       488
           8       0.74      0.69      0.71       606
           9       0.76      0.59      0.67      1795
          10       0.62      0.58      0.60       446
          11       0.79      0.72      0.76       845
          12       0.49      0.04      0.07       579
          13       0.00      0.00      0.00        25
          14       0.77      0.42      0.54       414
          15       0.66      0.65      0.65       139
          16       0.64      0.55      0.59        51
          17       0.63    

  _warn_prf(average, modifier, msg_start, len(result))


In [146]:
### Prediction using sample test
y_pred_gt = model.predict(X_test_gt)



In [147]:
submission_df = generate_submission_df(y_pred_gt, test_ids, pred_columns)
submission_df.to_csv('./dataset/prediction/sample_prediction_mf.tsv', sep='\t', header=False, index=False)

### Preprocess - Biological Process

In [148]:
# Train dataset
selected_ids = create_training_ids('./dataset/test/sampled_test.txt', './dataset/train/train_ids.txt', 1.0)
df_train_set = filter_train_data(df_train_set_all, selected_ids, 'biological_process')

In [149]:
# Test dataset
test_ids = sample_protein_ids('./dataset/test/sampled_test.txt', 1.0)
X_test_gt = build_test_data(p_embeddings_data, test_ids)

In [150]:
df_encoded = encode_go_terms_sparse(df_train_set)
df_encoded = get_embeddings(df_encoded, p_embeddings_data)
df_encoded.head()

Encoding: 100%|███████████████████████████████| 260/260 [03:02<00:00,  1.43it/s]


Unnamed: 0,Protein_ID,GO:0090304,GO:0044271,GO:0010467,GO:0034641,GO:0016070,GO:0006366,GO:0044249,GO:0043170,GO:0009058,...,GO:0008360,GO:0030522,GO:1901264,GO:0051983,GO:0042129,GO:0050777,GO:0051445,GO:0071322,GO:2000027,embedding
0,Q04418,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,"[-0.022, -0.06964, -0.007042, 0.0544, -0.04633..."
1,Q7ZT12,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.04028, -0.03357, 0.1046, 0.0669, -0.07935, ..."
2,Q6DBW0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.01106, 0.02277, 0.02895, 0.03293, -0.00641,..."
3,Q9WUC4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.06134, -0.00452, 0.01472, 0.001324, 0.03162..."
4,Q03370,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[-0.014366, -0.0655, 0.0208, 0.0652, 0.0433, 0..."


In [151]:
# Select top N labels, for biological process 1100
freq_df = pd.read_csv('./dataset/train/biological_process_freq.csv')[:1100]

In [152]:
y_columns = df_encoded.iloc[:, 1:-1]
pred_columns = get_top_freq(y_columns.columns.tolist(), set(freq_df['id']))
y_columns = y_columns[pred_columns]
y = create_y(y_columns)
X = create_X(df_encoded,'embedding')

In [153]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
model = train(X_train, y_train, './model/bp_model')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


  saving_api.save_model(


In [154]:
y_pred = model.predict(X_test)



### Evaluation - Bilogical Process

In [155]:
# Convert probabilities to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       0.62      0.53      0.57      1023
           1       0.60      0.28      0.38       689
           2       0.59      0.42      0.49       943
           3       0.65      0.50      0.56      1572
           4       0.57      0.49      0.52       672
           5       0.39      0.07      0.12       100
           6       0.59      0.49      0.53      1961
           7       0.61      0.52      0.56      2363
           8       0.61      0.53      0.57      2090
           9       0.66      0.63      0.64      3411
          10       0.63      0.54      0.58      2908
          11       0.70      0.70      0.70      4173
          12       0.51      0.11      0.18       175
          13       0.54      0.21      0.30       598
          14       0.72      0.92      0.81      8284
          15       0.55      0.18      0.28       509
          16       1.00      1.00      1.00     12310
          17       0.64    

  _warn_prf(average, modifier, msg_start, len(result))


In [156]:
### Prediction using sample test
y_pred_gt = model.predict(X_test_gt)



In [157]:
submission_df = generate_submission_df(y_pred_gt, test_ids, pred_columns)
submission_df.to_csv('./dataset/prediction/sample_prediction_bp.tsv', sep='\t', header=False, index=False)

### Concat the prediction

In [159]:
# Non propagated predictions
# File paths
bp = './dataset/prediction/sample_prediction_bp.tsv'
mf = './dataset/prediction/sample_prediction_mf.tsv'
cc = './dataset/prediction/sample_prediction_cc.tsv'

concat_df = concat_predictions(bp, mf, cc)

# Output the result as a TSV file
output_file = './dataset/prediction/sample_test_pred_all.tsv'
concat_df.to_csv(output_file, sep='\t', index=False, header=False)

### Evaluate using CAFA Evaluator

In [160]:
import cafaeval
from cafaeval.evaluation import cafa_eval, write_results
res = cafa_eval("./dataset/taxonomy/go-basic.obo", "./dataset/prediction/pred_all/", "./dataset/test/sampled_gt.tsv")
write_results(*res)