In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
cd 'drive/MyDrive/Uni/UniPD/BioData/project/biological_data_pfp'

/content/drive/MyDrive/Uni/UniPD/BioData/project/biological_data_pfp


In [3]:
import h5py
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import classification_report
from tensorflow import keras
from tensorflow.keras import layers

In [4]:
def readh5_to_dict(file_path):
  # Create an empty dictionary to store the data
  p_embeddings_data = {}

  # Open the HDF5 file
  with h5py.File(file_path, 'r') as p_embeddings:
    # Store the data in the dictionary
    for key in p_embeddings.keys():
      p_embeddings_data[key] = p_embeddings[key][...]

  return p_embeddings_data

In [5]:
def sample_protein_ids(file_path,percentage):

  # Read the IDs from the text file
  with open(file_path, 'r') as file:
    ids = [line.strip() for line in file]

  # Calculate the index to get the first 30% of IDs
  split_index = int(len(ids) * percentage)

  # Select the first 30% of IDs
  selected_ids = ids[:split_index]

  return selected_ids

In [6]:
def read_tsv(tsv_file_path):
  # Read the TSV file into a Pandas DataFrame
  df_train_set = pd.read_csv(tsv_file_path, sep='\t')

  # Display the DataFrame
  return df_train_set

In [7]:
def read_dat(file_path):
  column_names = ['Protein_ID', 'IPR_ID', 'description', 'domain','dc1','dc2']
  df = pd.read_csv(file_path, delimiter='\t',names=column_names)

  return df


In [8]:
def filter_train_data(df, selected_ids, category):
  filtered_df = df[df['Protein_ID'].isin(selected_ids)]
  filtered_df = filtered_df[filtered_df['aspect'] == category]

  return filtered_df

In [9]:
def encode_go_terms(train_df):
  one_hot_encoding = pd.get_dummies(train_df['GO_term'])

  # Concatenate the one-hot encoded columns with the original DataFrame
  df_encoded = pd.concat([train_df, one_hot_encoding], axis=1)
  df_encoded_grouped = df_encoded.groupby('Protein_ID').sum().reset_index()

  return df_encoded_grouped

In [10]:
def encode_ipr_domain(df_ipr):
    df_ipr = df_ipr.drop(columns=['IPR_ID', 'description','dc1','dc2'])
    one_hot_encoding = pd.get_dummies(df_ipr['domain'],sparse=True)

    # Concatenate the one-hot encoded columns with the original DataFrame
    df_encoded = pd.concat([df_ipr, one_hot_encoding], axis=1)
    df_encoded_grouped = df_encoded.groupby('Protein_ID').sum().reset_index()

    return df_encoded

In [11]:
def get_embeddings(df, embeddings_dict):
  df['embedding'] = df['Protein_ID'].map(embeddings_dict)

  return df

In [12]:
def get_ipr(df_ipr,df_train):
   isp_dict = df_ipr.set_index('Protein_ID')['domain'].to_dict()
   df_train['ipr'] = df_train['Protein_ID'].map(isp_dict)

   return df_train

In [13]:
def create_y(df):
  y = df.to_numpy()
  return y


In [14]:
def create_X(df,variables):
  X = np.array(df[variables])
  X = np.vstack(X)

  return X

In [15]:
p_embeddings_data = readh5_to_dict('train/train_embeddings.h5')
selected_ids = sample_protein_ids('train/train_ids.txt',0.6)
df_train_set = read_tsv('train/train_set.tsv')
df_ipr = read_dat('train/train_protein2ipr.dat')
df_train_set_filter = filter_train_data(df_train_set, selected_ids,'cellular_component')

In [16]:
#df_ipr_encoded = encode_ipr_domain(df_ipr)
#df_ipr_encoded.head()

In [17]:
df_encoded = encode_go_terms(df_train_set_filter)
df_encoded = get_embeddings(df_encoded, p_embeddings_data)
#df_encoded = get_ipr(df_ipr,df_encoded)
df_encoded.head()

  df_encoded_grouped = df_encoded.groupby('Protein_ID').sum().reset_index()


Unnamed: 0,Protein_ID,GO:0000118,GO:0000123,GO:0000124,GO:0000131,GO:0000137,GO:0000138,GO:0000139,GO:0000145,GO:0000151,...,GO:1905360,GO:1905368,GO:1905369,GO:1990023,GO:1990204,GO:1990234,GO:1990351,GO:1990752,GO:1990904,embedding
0,A0A021WW32,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[-0.01643, -0.001583, 0.00389, 0.0734, 0.01243..."
1,A0A023GPJ3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.01512, 0.01102, 0.0217, -0.02512, 0.0396, 0..."
2,A0A023GUT0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[-0.00414, -0.01288, 0.0716, 0.01605, -0.03983..."
3,A0A023T787,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.02449, 0.04828, 0.0592, 0.01656, 0.04898, 0..."
4,A0A023VTS2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,"[0.0628, 0.0604, 0.02348, 0.0619, -0.00581, 0...."


In [18]:
df_encoded.isna().sum().sum()

0

In [19]:
df_encoded.columns[3:-1][df_encoded.iloc[1,3:-1] == 1]

Index(['GO:0005575', 'GO:0005622', 'GO:0005737', 'GO:0005829', 'GO:0110165'], dtype='object')

In [20]:
y_columns = df_encoded.iloc[:, 3:-1]

y = create_y(y_columns)
X = create_X(df_encoded,'embedding')

In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)

In [24]:
X_train

array([[ 0.0655   ,  0.02339  , -0.02263  , ...,  0.0004447,  0.010994 ,
        -0.01154  ],
       [-0.03192  , -0.03287  , -0.01333  , ...,  0.04922  ,  0.04382  ,
        -0.04977  ],
       [ 0.03525  ,  0.00977  ,  0.0742   , ..., -0.03418  ,  0.041    ,
         0.07465  ],
       ...,
       [ 0.04263  , -0.02618  ,  0.0571   , ...,  0.009544 ,  0.02147  ,
         0.02808  ],
       [-0.03004  ,  0.002043 ,  0.02539  , ...,  0.0002644, -0.007496 ,
         0.03265  ],
       [ 0.02054  , -0.03125  ,  0.014656 , ..., -0.11633  ,  0.0083   ,
         0.0453   ]], dtype=float16)

In [25]:
y_train

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=uint8)

In [23]:
embedding_size = len(X_train[1]) #1024
num_classes = len(y_train[1]) #678

# Build a neural network model
model = keras.Sequential([
    layers.Input(shape=(embedding_size,)),  # Adjust the input shape based on your embedding size
    layers.Dense(128, activation='relu'),
    layers.Dense(64, activation='relu'),
    layers.Dense(num_classes, activation='sigmoid')  # Sigmoid for multi-label classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x7b4a9aa75630>

In [26]:
#Evaluate the model on the test set
y_pred = model.predict(X_test)

# Convert probabilities to binary predictions
y_pred_binary = (y_pred > 0.5).astype(int)
print(classification_report(y_test, y_pred_binary))

              precision    recall  f1-score   support

           0       0.33      0.09      0.14        11
           1       0.00      0.00      0.00        12
           2       0.00      0.00      0.00         4
           3       0.50      0.20      0.29        10
           4       0.53      0.12      0.20        67
           5       0.00      0.00      0.00         6
           6       0.83      0.09      0.16        57
           7       0.00      0.00      0.00        13
           8       1.00      0.14      0.25         7
           9       1.00      0.10      0.18        10
          10       0.00      0.00      0.00        68
          11       0.00      0.00      0.00         3
          12       0.00      0.00      0.00        14
          13       0.54      0.45      0.49        31
          14       0.50      0.11      0.18         9
          15       0.57      0.20      0.30        20
          16       0.25      0.02      0.03        57
          17       0.77    

  _warn_prf(average, modifier, msg_start, len(result))


In [27]:
y_pred

array([[3.2386151e-08, 3.9992160e-03, 1.8691221e-03, ..., 1.0666171e-03,
        7.5460065e-07, 1.9598716e-05],
       [2.0764054e-08, 7.5291631e-05, 3.5829244e-06, ..., 8.9707908e-05,
        5.1495681e-07, 4.7574186e-04],
       [1.9594839e-09, 3.4960454e-08, 8.9052683e-06, ..., 1.9464815e-02,
        2.7311444e-07, 1.4946540e-04],
       ...,
       [9.1172202e-04, 5.8677484e-05, 2.6187792e-05, ..., 1.8050175e-06,
        2.7428081e-05, 3.1990224e-01],
       [5.8725793e-05, 7.1629847e-06, 2.7147398e-05, ..., 6.8063056e-04,
        1.7142012e-04, 1.2627012e-02],
       [1.2836513e-06, 1.3922578e-04, 1.8492260e-03, ..., 1.5000098e-03,
        4.3055294e-05, 2.8789653e-03]], dtype=float32)