DATA EXPLORATION AND ANALYSIS
=

Imports
------------

In [None]:
# Import essential libraries
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns

# Configure visualization aesthetics
# IPython line magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
# Apply seaborn's theme with the "whitegrid" style to the plots drawn below.
sns.set(style="whitegrid")


Dataset Loading
--

Initial Data Inspection
--

EDA
=

Unique proteins and ligands in full dataset
-

In [None]:
# Path to the cleaned dataset file
file_path = 'C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/cleaned_dataset.txt'

# Load the file into a DataFrame. Columns are separated by any run of whitespace
# (regex `\s+`); the pattern is a raw string so `\s` is not treated as an
# invalid string escape (SyntaxWarning on Python 3.12+).
df = pd.read_csv(
    file_path,
    sep=r'\s+',
    engine='python',
    header=None,
    names=['SMILES', 'Protein', 'Label'],
)

# Number of unique proteins
unique_proteins = df['Protein'].nunique()
print(f'Αριθμός μοναδικών πρωτεϊνών: {unique_proteins}')

# Number of unique SMILES strings
unique_smiles = df['SMILES'].nunique()
print(f'Αριθμός μοναδικών SMILES: {unique_smiles}')


Label Distribution
--

In [None]:
# Count how many samples carry each label value
label_counts = df['Label'].value_counts()
print(label_counts)

# Visualization of the same counts.
# Plot from `df` (the frame the counts above were computed on): `df_cleaned`
# is never defined in this notebook, so the original call raised NameError
# on a fresh kernel.
plt.figure(figsize=(6,4))
sns.countplot(x='Label', data=df)
plt.title('Distribution of Labels')
plt.xlabel('Label')
plt.ylabel('Count')
plt.show()


Protein Distribution
--

In [None]:
# Count of samples per protein (value_counts returns them in descending order).
# Uses `df`, the frame loaded earlier in the notebook: `df_cleaned` is never
# defined here, so the original line raised NameError on a fresh kernel.
protein_counts = df['Protein'].value_counts()

# Keep only the 20 proteins with the most samples; slice once and reuse
top_proteins = protein_counts.head(20)

# Visualization (Top 20 Proteins)
plt.figure(figsize=(12, 6))
sns.barplot(x=top_proteins.index, y=top_proteins.values)

# Suppress x-axis tick labels (the index values are full protein entries)
plt.xticks([])

plt.title('Top 20 Proteins by Sample Count')
plt.xlabel('Protein')  # axis label is kept even though the ticks are hidden
plt.ylabel('Number of Samples')
plt.show()


Protein vs Label Distribution
---

In [None]:
# Per-protein sample counts, split by label.
# Plot from `df`: `df_cleaned` is never defined in this notebook, so the
# original call raised NameError on a fresh kernel.
plt.figure(figsize=(14,8))
sns.countplot(x='Protein', hue='Label', data=df, palette='Set2')
plt.title('Label Distribution Across Proteins')
plt.xlabel('Protein')
plt.ylabel('Count')
# Hide x tick labels: one tick per protein is too dense to read
plt.xticks([])
plt.legend(title='Label')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Font sizes for this presentation-sized version of the figure
TITLE_AND_AXIS_FONTSIZE = 30
LEGEND_AND_TICK_FONTSIZE = 20

# Open a 14x8 figure and draw per-protein counts split by label
plt.figure(figsize=(14, 8))
ax = sns.countplot(x='Protein', hue='Label', data=df, palette='Set2')

# Title and axis labels in the larger font
ax.set_title('Label Distribution Across Proteins', fontsize=TITLE_AND_AXIS_FONTSIZE)
ax.set_xlabel('Protein', fontsize=TITLE_AND_AXIS_FONTSIZE)
ax.set_ylabel('Count', fontsize=TITLE_AND_AXIS_FONTSIZE)

# Legend entries and legend title share one size
ax.legend(
    title='Label',
    fontsize=LEGEND_AND_TICK_FONTSIZE,
    title_fontsize=LEGEND_AND_TICK_FONTSIZE,
)

# Enlarge y tick labels; drop x ticks entirely because they are too dense
plt.yticks(fontsize=LEGEND_AND_TICK_FONTSIZE)
ax.set_xticks([])

# Fit everything inside the figure, then render
plt.tight_layout()
plt.show()



Finding min and max lengths of protein sequences
==

In [None]:
import pandas as pd

# Load the whitespace-separated dataset. The separator regex is a raw string
# so `\s` is not treated as an invalid string escape (SyntaxWarning on
# Python 3.12+).
df = pd.read_csv('C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/cleaned_dataset.txt',
                 sep=r"\s+", header=None, names=['SMILES', 'Protein', 'Label'])

# Make sure the 'Protein' column contains strings (in case of any non-string values)
df['Protein'] = df['Protein'].astype(str)

# Length of every protein sequence (vectorized string accessor)
sequence_lengths = df['Protein'].str.len()

# Find the minimum and maximum lengths
min_length = sequence_lengths.min()
max_length = sequence_lengths.max()

# Shortest sequence: idxmin() returns the row label of the first minimum,
# which .loc resolves explicitly by label
shortest_sequence = df.loc[sequence_lengths.idxmin(), 'Protein']

# Print the results
print(f"Minimum sequence length: {min_length}")
print(f"Maximum sequence length: {max_length}")
print(f"Shortest sequence: {shortest_sequence}")


In [None]:
import matplotlib.pyplot as plt

# Location of the whitespace-separated dataset file
file_path = 'C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/cleaned_dataset.txt'

# Collect the distinct protein sequences: the second whitespace-separated
# field of every line that has at least two fields
with open(file_path, 'r') as file:
    unique_proteins = {
        fields[1]
        for fields in (raw_line.strip().split() for raw_line in file)
        if len(fields) >= 2
    }

# One length per distinct sequence
sequence_lengths = list(map(len, unique_proteins))

# Histogram of the lengths (50 bins)
plt.figure(figsize=(10, 6))
plt.hist(sequence_lengths, bins=50, color='skyblue', edgecolor='black')
plt.xticks(fontsize=15)
plt.yticks(fontsize=15)
plt.title('Protein Sequence Length Distribution',fontsize=20)
plt.xlabel('Sequence Length',fontsize=20)
plt.ylabel('Frequency',fontsize=20)
plt.grid(True)
plt.show()


Size of Dictionaries For the Complete Dataset
-

In [None]:
import pickle

# Pickled fingerprint dictionary produced by preprocessing
file_path='C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/preprocessed_clean_dataset/fingerprint_dict.pickle'

# Open through a context manager so the handle is closed after loading
# (the original left it open).
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open(file_path,'rb') as hd:
    inter=pickle.load(hd)

# Number of entries in the loaded object
print(len(inter))


In [None]:
# Pickled word dictionary produced by preprocessing
file_path='C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/preprocessed_clean_dataset/word_dict.pickle'

import pickle

# Open through a context manager so the handle is closed after loading
# (the original left it open).
# NOTE: pickle.load can execute arbitrary code; only load files you created.
with open(file_path,'rb') as file:
    data=pickle.load(file)

# Number of entries in the loaded object
print(len(data))

In [None]:
# Look at the first sample of `dataset_test` to learn the input shapes.
# NOTE(review): `dataset_test` is only created by the split / reload cells
# further down the notebook, so this cell fails on a fresh top-to-bottom run
# unless one of those cells has been executed first.
sample_data = dataset_test[0]

# Each sample is unpacked as four components; the last one is presumably the
# interaction label (verify against the preprocessing step)
fingerprints, adjacency, words, inter = sample_data

# Report the shape of every component
for caption, component in (
    ("Fingerprints shape:", fingerprints),
    ("Adjacency shape:", adjacency),
    ("Words shape:", words),
    ("Int shape:", inter),
):
    print(caption, component.shape)


PROTEIN-CENTRIC SPLIT
=

Custom protein-centric data split: 72 proteins for training/validation, 30 for testing
==

In [None]:
import random
from collections import defaultdict
import pickle
import sys
import timeit
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.metrics import roc_auc_score, precision_score, recall_score

def load_tensor(file_name, dtype):
    """Load a pickled sequence and convert every element to a tensor on `device`.

    Args:
        file_name: Path of the pickle file without its '.pkl' extension.
        dtype: Callable applied to each loaded element (the notebook passes
            torch tensor constructors such as ``torch.LongTensor``).

    Returns:
        A list with ``dtype(element).to(device)`` for every loaded element,
        where ``device`` is the module-level device chosen in this cell.
    """
    with open(file_name + '.pkl', 'rb') as handle:
        raw_items = pickle.load(handle)
        converted = []
        for item in raw_items:
            converted.append(dtype(item).to(device))
        return converted


def load_pickle(file_name):
    """Return the object stored in the pickle file at `file_name`."""
    with open(file_name, mode='rb') as handle:
        loaded = pickle.load(handle)
    return loaded
# Directory that holds the preprocessed tensors and dictionaries
dir_input = 'C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/preprocessed_clean_dataset/'

# Choose the compute device: CUDA when available, otherwise the CPU
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')
print('The code uses GPU...' if use_gpu else 'The code uses CPU!!!')

# Per-sample inputs, each loaded from '<name>.pkl' and moved to `device`
compounds = load_tensor(dir_input + 'compounds', torch.LongTensor)
adjacencies = load_tensor(dir_input + 'adjacencies', torch.FloatTensor)
proteins = load_tensor(dir_input + 'proteins', torch.LongTensor)
interactions = load_tensor(dir_input + 'interactions', torch.LongTensor)

# Lookup dictionaries and their sizes
fingerprint_dict = load_pickle(dir_input + 'fingerprint_dict.pickle')
word_dict = load_pickle(dir_input + 'word_dict.pickle')
n_fingerprint, n_word = len(fingerprint_dict), len(word_dict)

# One (compound, adjacency, protein, interaction) tuple per sample
dataset = list(zip(compounds, adjacencies, proteins, interactions))
def custom_split(dataset, n_train_val_proteins=72, train_fraction=0.8):
    """Split samples into train/dev/test so that no protein spans two splits.

    Proteins are ordered by how many samples they have (most first; ties keep
    first-seen order). The first `n_train_val_proteins` proteins go to
    training/validation and all remaining proteins go to the test set, so the
    test set receives the proteins with the fewest samples. The
    training/validation proteins are then cut in order: the first
    ``int(len * train_fraction)`` proteins form the training set and the rest
    the validation set.

    Args:
        dataset: Iterable of ``(compound, adjacency, protein, interaction)``
            tuples. ``protein`` must provide ``.tolist()``; its contents are
            used as the grouping key.
        n_train_val_proteins: Number of proteins assigned to
            training + validation (default 72, the value previously
            hard-coded).
        train_fraction: Share of the training/validation proteins used for
            training (default 0.8, the value previously hard-coded).

    Returns:
        Tuple ``(dataset_train, dataset_dev, dataset_test)`` of sample lists.
    """
    # Step 1: group samples by protein; the tensor is turned into a tuple so
    # it can be used as a dictionary key
    protein_to_samples = defaultdict(list)
    for compound, adjacency, protein, interaction in dataset:
        protein_to_samples[tuple(protein.tolist())].append(
            (compound, adjacency, protein, interaction)
        )

    # Order proteins by sample count, largest first (dicts keep this order)
    sorted_protein_to_samples = dict(
        sorted(protein_to_samples.items(), key=lambda item: len(item[1]), reverse=True)
    )

    # Step 2: split the ordered proteins.
    # The seed is kept so the global `random` state is unchanged for callers;
    # the split itself is deterministic because the proteins are not shuffled.
    unique_proteins = list(sorted_protein_to_samples.keys())
    random.seed(1234)

    train_val_proteins = unique_proteins[:n_train_val_proteins]
    test_proteins = unique_proteins[n_train_val_proteins:]

    train_size = int(len(train_val_proteins) * train_fraction)
    train_proteins = train_val_proteins[:train_size]
    val_proteins = train_val_proteins[train_size:]

    # Step 3: collect the samples belonging to each group of proteins
    dataset_train = [sample for protein in train_proteins for sample in sorted_protein_to_samples[protein]]
    dataset_dev = [sample for protein in val_proteins for sample in sorted_protein_to_samples[protein]]
    dataset_test = [sample for protein in test_proteins for sample in sorted_protein_to_samples[protein]]
    return dataset_train,dataset_dev,dataset_test
# Run the protein-centric split on the full dataset
dataset_train, dataset_dev, dataset_test = custom_split(dataset)

# Report how many samples landed in each split (one line per split)
print(
    f"Training set: {len(dataset_train)} samples",
    f"Validation set: {len(dataset_dev)} samples",
    f"Test set: {len(dataset_test)} samples",
    sep="\n",
)


In [None]:
import torch

# Persist each split (a list of sample tuples) with torch.save, one file per split
for split_samples, destination in (
    (dataset_test, 'C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/base_model/test_set_ex3.pt'),
    (dataset_train, 'C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/base_model/train_set_ex3.pt'),
    (dataset_dev, 'C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/base_model/dev_set_ex3.pt'),
):
    torch.save(split_samples, destination)

print("Dataset saved successfully.")


In [None]:
# Reload the saved test split from disk.
# weights_only=False makes torch.load unpickle arbitrary Python objects, so
# only use it on files you created yourself.
test_set_path = 'C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/base_model/test_set_ex3.pt'
dataset_test = torch.load(test_set_path, weights_only=False)

# Number of samples in the reloaded split (shown as the cell output)
len(dataset_test)

In [None]:
import matplotlib.pyplot as plt
from collections import Counter
import torch

# Alternative input kept for reference (loads a saved split from disk instead):
#dataset=torch.load('C:/Users/alexandra/PycharmProjects/GING_DIPLOMA/DeepEmbedding-DTI/dataset/dude/base_model/val_set_ex3.pt',weights_only=False)
# NOTE: this rebinds the notebook-level name `dataset` to the test split.
dataset=dataset_test


def plot_label_distribution(dataset, split_name="Training"):
    """Print and plot how many samples carry each label.

    Args:
        dataset: Sequence of samples whose last element is the label and
            supports ``.item()`` (e.g. a one-element tensor).
        split_name: Name of the split shown in the printed heading and the
            plot title. Defaults to "Training", which reproduces the text
            that was previously hard-coded.
    """
    # Extract the labels; `.item()` converts a one-element tensor to a scalar
    labels = [data[-1].item() for data in dataset]
    label_counts = Counter(labels)

    heading = f"Label Distribution in {split_name} Set"

    # Print the distribution
    print(f"{heading}:")
    for label, count in label_counts.items():
        print(f"Label {label}: {count} samples")

    # Bar plot of the distribution
    plt.figure(figsize=(8, 6))
    plt.bar(list(label_counts.keys()), list(label_counts.values()), color='skyblue')
    plt.xlabel("Labels")
    plt.ylabel("Frequency")
    plt.title(heading)
    plt.xticks(list(label_counts.keys()))  # one tick under each label value
    plt.show()


# The data passed in is the test split, so label the output accordingly
# (the original printed and titled it as "Training Set").
plot_label_distribution(dataset, split_name="Test")

