In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
import torch
import matplotlib.pyplot as plt
from tqdm import tqdm

## Analysis of full dataset and data preprocessing

In [2]:
# Location of the pickled benchmark dataset, relative to the project root.
dataset_path = './data/benchmarkingGS_v1-0_similarityMeasure_sequence_v3-1.pkl'

# Absolute path of the current working directory (the notebook's folder on a
# fresh kernel).
notebook_path = os.path.abspath('')

# Navigate to the project root (CS182-Final-Project).
# The working directory is changed below, so blindly taking the parent would
# climb one level further every time this cell is re-run. Only step up when the
# dataset is not already reachable from the current directory.
if os.path.isfile(os.path.join(notebook_path, dataset_path)):
    project_root = notebook_path
else:
    project_root = os.path.dirname(notebook_path)
os.chdir(project_root)

# Now the data can be loaded using a path relative to the project root.
# NOTE: unpickling can execute arbitrary code -- only load files from a
# trusted source.
print("Loading full dataset...")
full_data = pd.read_pickle(dataset_path)
display(full_data)


# Keep only the required columns. The explicit copy makes `data` independent
# of `full_data`, so later cells never operate on a view of the full frame.
columns_to_keep = ['uniprotID_A', 'uniprotID_B', 'isInteraction', 'trainTest', 'sequence_A', 'sequence_B']
data = full_data[columns_to_keep].copy()
display(data)

# Header for the per-split statistics printed by the next cell
print("\n--- Full Dataset Statistics ---")

Loading full dataset...


Unnamed: 0,uniprotID_A,uniprotID_B,isInteraction,trainTest,RNAseqHPA,tissueHPA,tissueCellHPA,subcellularLocationHPA,bioProcessUniprot,cellCompUniprot,molFuncUniprot,domainUniprot,motifUniprot,Bgee,sequence_A,sequence_B
0,P28223,P41595,1,test2,0.160188,-0.449930,-0.060381,,0.400892,0.404061,0.680414,0.0,0.790569,0.422078,MDILCEENTSLSSTTNSLMQLNDDTRLYSNDFNSGEANTSDAFNWT...,MALSYRVSELQSTIPEHILQSTFVHVISSNWSGLQTESIPEEMKQI...
1,O00161,P56962,1,train,0.825131,0.851690,0.675880,0.000000,0.190693,0.200000,0.353553,0.0,0.000000,0.922975,MDNLSSEEIQQRAHQITDESLESTRRILGLAIESQDAGIKTITMLD...,MSEDEEKVKLRRLEPAIQKFIKIVIPTDLERLRKHQINIEKYQRCR...
2,P82979,Q01081,1,train,0.930790,0.954869,0.911887,0.000000,0.547723,0.365148,0.408248,0.0,0.000000,,MATETVELHKLKLAELKQECLARGLETKGIKQDLIHRLQAYLEEHA...,MAEYLASIFGTEKDKVNCSFYFKIGACRHGDRCSRLHNKPTFSQTI...
3,O60678,Q14524,1,train,0.219384,,,,0.000000,0.000000,0.000000,0.0,0.000000,,MCSLASGATGGRGAVENEEDLPELSDSGDEAAWEDEDDADLPHGKQ...,MANFLLPRGTSSFRRFTRESLAAIEKRMAEKQARGSTTLQESREGL...
4,P10275,Q15648,1,train,0.500558,-0.311704,-0.371061,0.000000,0.169811,0.375000,0.157895,0.0,0.000000,,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,MKAQGETEESEKLSKMSSLLERLHAKFNQNRPWSETIKLVRQVMEK...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
268494,O95678,Q8IYX3,0,test2,0.022646,,,,0.000000,0.000000,0.000000,0.0,0.000000,,MSRQSSITFQSGSRRGFSTTSAITPAAGRSRFSSVSVARSAAGSGG...,MARCRHHSGYLADDEASHSMCSARVQLPKKPLVPEMRPACKPGRVP...
268495,O95835,Q8WUJ0,0,test2,0.854290,,,,0.000000,0.447214,0.000000,0.0,0.000000,0.930655,MKRSEKPEGYRQMRPKTFPASNYTVSSRQMLQEIRESLRNLSKPSD...,MEDVKLEFPSLPQCKEDAEEWTYPMRREMQEILPGLFLGPYSSAMK...
268496,P60409,Q9NUB4,0,test2,0.894159,,,,0.000000,0.000000,0.000000,0.0,0.000000,,MAASTMSVCSSDLSYGSRVCLPGSCDSCSDSWQVDDCPESCCEPPC...,MTRLCLPRPEAREDPIPVPPRGLGAGEGSGSPVRPPVSTWGPSWAQ...
268497,O43294,Q6ZRT6,0,test2,0.062636,,,0.288675,0.000000,0.000000,0.000000,0.0,0.000000,-0.606365,MEDLDALLSDLETTTSHMPRSGAPKERPAEPLTPPPSYGHQPQTGS...,MVSRPRSPSAFPAPWWGQQPGGPGPAKRLRLEEPAGPEPRAAPSLE...


Unnamed: 0,uniprotID_A,uniprotID_B,isInteraction,trainTest,sequence_A,sequence_B
0,P28223,P41595,1,test2,MDILCEENTSLSSTTNSLMQLNDDTRLYSNDFNSGEANTSDAFNWT...,MALSYRVSELQSTIPEHILQSTFVHVISSNWSGLQTESIPEEMKQI...
1,O00161,P56962,1,train,MDNLSSEEIQQRAHQITDESLESTRRILGLAIESQDAGIKTITMLD...,MSEDEEKVKLRRLEPAIQKFIKIVIPTDLERLRKHQINIEKYQRCR...
2,P82979,Q01081,1,train,MATETVELHKLKLAELKQECLARGLETKGIKQDLIHRLQAYLEEHA...,MAEYLASIFGTEKDKVNCSFYFKIGACRHGDRCSRLHNKPTFSQTI...
3,O60678,Q14524,1,train,MCSLASGATGGRGAVENEEDLPELSDSGDEAAWEDEDDADLPHGKQ...,MANFLLPRGTSSFRRFTRESLAAIEKRMAEKQARGSTTLQESREGL...
4,P10275,Q15648,1,train,MEVQLGLGRVYPRPPSKTYRGAFQNLFQSVREVIQNPGPRHPEAAS...,MKAQGETEESEKLSKMSSLLERLHAKFNQNRPWSETIKLVRQVMEK...
...,...,...,...,...,...,...
268494,O95678,Q8IYX3,0,test2,MSRQSSITFQSGSRRGFSTTSAITPAAGRSRFSSVSVARSAAGSGG...,MARCRHHSGYLADDEASHSMCSARVQLPKKPLVPEMRPACKPGRVP...
268495,O95835,Q8WUJ0,0,test2,MKRSEKPEGYRQMRPKTFPASNYTVSSRQMLQEIRESLRNLSKPSD...,MEDVKLEFPSLPQCKEDAEEWTYPMRREMQEILPGLFLGPYSSAMK...
268496,P60409,Q9NUB4,0,test2,MAASTMSVCSSDLSYGSRVCLPGSCDSCSDSWQVDDCPESCCEPPC...,MTRLCLPRPEAREDPIPVPPRGLGAGEGSGSPVRPPVSTWGPSWAQ...
268497,O43294,Q6ZRT6,0,test2,MEDLDALLSDLETTTSHMPRSGAPKERPAEPLTPPPSYGHQPQTGS...,MVSRPRSPSAFPAPWWGQQPGGPGPAKRLRLEEPAGPEPRAAPSLE...



--- Full Dataset Statistics ---


In [3]:
def _print_split_stats(label, split, include_negative=True):
    """Print the size and class balance of one dataset split.

    Args:
        label: Text printed before ": <n> examples" (may start with a newline
            to separate it from the previous block of output).
        split: DataFrame with an 'isInteraction' column; rows equal to 1 are
            reported as positive and rows equal to 0 as negative.
        include_negative: When False, only the positive line is printed.

    Returns:
        Tuple ``(positives, negatives)`` of the filtered DataFrames.
    """
    total = split.shape[0]
    positives = split[split['isInteraction'] == 1]
    negatives = split[split['isInteraction'] == 0]

    def _pct(count):
        # Guard against an empty split instead of raising ZeroDivisionError.
        return count / total * 100 if total else 0.0

    print(f"{label}: {total} examples")
    print(f"  Positive: {positives.shape[0]} ({_pct(positives.shape[0]):.2f}%)")
    if include_negative:
        print(f"  Negative: {negatives.shape[0]} ({_pct(negatives.shape[0]):.2f}%)")
    return positives, negatives


# Training set stats
train_data = data[data['trainTest'] == 'train']
train_pos, train_neg = _print_split_stats("Train set", train_data)

# Test1 set stats
test1_data = data[data['trainTest'] == 'test1']
test1_pos, test1_neg = _print_split_stats("\nTest1 set", test1_data)

# Test2 set stats
test2_data = data[data['trainTest'] == 'test2']
test2_pos, test2_neg = _print_split_stats("\nTest2 set", test2_data)

# Create validation set from training data (20% of training)
print("\n--- Creating Validation Set ---")
# Split the training data into train and validation
train_data_new, val_data = train_test_split(
    train_data,
    test_size=0.2,
    random_state=42,  # fixed seed so the split is reproducible
    stratify=train_data['isInteraction']  # Maintain the same pos/neg ratio
)

# Relabel the held-out rows; copy first so the assignment does not touch a
# slice of the original frame.
val_data = val_data.copy()
val_data['trainTest'] = 'validation'

# Sanity check: the two new splits together cover the original training set.
assert len(train_data_new) + len(val_data) == len(train_data)

_print_split_stats("New train set", train_data_new, include_negative=False)
_print_split_stats("Validation set", val_data, include_negative=False)

# Save datasets to full_dataset directory
output_dir = './data/full_dataset'
os.makedirs(output_dir, exist_ok=True)

print("\nSaving datasets...")
splits_to_save = {
    'train_data': train_data_new,
    'validation_data': val_data,
    'test1_data': test1_data,
    'test2_data': test2_data,
}
for file_stem, frame in splits_to_save.items():
    frame.to_pickle(f'{output_dir}/{file_stem}.pkl')

print(f"All datasets successfully saved to '{output_dir}' directory")

Train set: 106662 examples
  Positive: 53331 (50.00%)
  Negative: 53331 (50.00%)

Test1 set: 24898 examples
  Positive: 12449 (50.00%)
  Negative: 12449 (50.00%)

Test2 set: 136939 examples
  Positive: 12449 (9.09%)
  Negative: 124490 (90.91%)

--- Creating Validation Set ---
New train set: 85329 examples
  Positive: 42665 (50.00%)
Validation set: 21333 examples
  Positive: 10666 (50.00%)

Saving datasets...
All datasets successfully saved to './data/full_dataset' directory
