In [2]:
import os
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

class CustomDataset(Dataset):
    def __init__(self, folder_path, split):
        self.folder_path = folder_path
        self.split = split
        self.data = self.load_data()
        self.label_encoding = {'I': 0, 'O':1, 'P': 2, 'S': 3, 'M':4, 'B': 5}

    def load_data(self):
        file_list = [f for f in os.listdir(self.folder_path) if f.endswith('.npy')]
        file_list.sort()  # Make sure the order is consistent

        data = []
        for file_name in file_list:
            file_path = os.path.join(self.folder_path, file_name)
            data.append(np.load(file_path, allow_pickle=True).item())

        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        sample = self.data[index]
        
        inputs = sample['data']
        labels_str = sample['labels']

        labels_list = [self.label_encoding[label] for label in labels_str]
        print(labels_list)
        print(np.shape(labels_list))
        labels_tensor = torch.tensor(labels_list, dtype=torch.long)
        return {'data': inputs, 'labels': labels_tensor}
def collate_fn(batch):
    # Get the length of each sequence in the batch
    lengths = [len(sample['data']) for sample in batch]
    
    # Pad sequences to the maximum length in the batch
    max_len = max(lengths)
    padded_data = [torch.nn.functional.pad(torch.tensor(sample['data']), pad=(0, max_len - len(sample['data']))) for sample in batch]
    
    # Stack padded sequences and labels
    data = torch.stack(padded_data)
    labels = torch.stack([sample['labels'] for sample in batch])
    
    return {'data': data, 'labels': labels}

def create_data_loaders(data_root):
    splits = [
        (['cv0', 'cv1', 'cv2'], 'cv3' , 'cv4'),
        (['cv1', 'cv2', 'cv3',], 'cv4', 'cv0'),
        (['cv2', 'cv3', 'cv4'], 'cv0', 'cv1'),
        (['cv3', 'cv4', 'cv0'], 'cv1', 'cv2'),
        (['cv4', 'cv0', 'cv1'], 'cv2', 'cv3'),
    ]

    data_loaders = {}

    for train_folders, val_folder, test_folder in splits:
        train_datasets = [CustomDataset(os.path.join(data_root, folder), 'train') for folder in train_folders]
        val_dataset = CustomDataset(os.path.join(data_root, val_folder), 'val')
        test_dataset = CustomDataset(os.path.join(data_root, test_folder), 'test')
        
        for train_folder, train_dataset in zip(train_folders, train_datasets):
            data_loaders[train_folder] = DataLoader(train_dataset, batch_size=32, shuffle=True)
        data_loaders[val_folder] = DataLoader(val_dataset, batch_size=32, shuffle=False)
        data_loaders[test_folder] = DataLoader(test_dataset, batch_size=32, shuffle=False)

    return data_loaders

data_root = os.getcwd() + '\encoder_proteins'
print(data_root)
data_loaders = create_data_loaders(data_root)

# Accessing the data loaders
train_loader_cv0 = data_loaders['cv0']
val_loader_cv1 = data_loaders['cv1']
test_loader_cv2 = data_loaders['cv2']

# Iterate through a few batches to test the DataLoader
for batch_idx, batch in enumerate(train_loader_cv0):
    inputs, labels = batch['data'], batch['labels']

    print(f"Batch {batch_idx + 1}:")
    print("Inputs shape:", inputs.shape)  # Assuming inputs is a NumPy array or a PyTorch tensor
    print("Labels shape:", labels.shape)  # Assuming labels is a NumPy array or a PyTorch tensor

    # Break the loop after a few batches for testing purposes
    if batch_idx == 1:
        break

C:\Users\andre\Documents\DeepLearning Project\encoder_proteins
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
(244,)
[3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2

RuntimeError: stack expects each tensor to be equal size, but got [244, 512] at entry 0 and [446, 512] at entry 1

cv0, cv1, cv2 for train, cv3 for validation, cv4 for test
cv1, cv2, cv3 for train, cv4 for validation, cv0 for test
cv2, cv3, cv4 for train, cv0 for validation, cv1 for test
cv3, cv4, cv0 for train, cv1 for validation, cv2 for test
cv4, cv0, cv1 for train, cv2 for validation, cv3 for test


In [1]:
import os
import json
import torch
import numpy as np
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

**DENSE NN Model** 

In [2]:
class DenseNN(nn.Module):
    def __init__(self, input_size, num_classes):
        super(DenseNN, self).__init__()

        # Flatten the input to a 1D array
        self.flatten = nn.Flatten()

        # Define fully connected layers
        self.fc1 = nn.Linear(input_size, 256)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(256, 128)
        self.relu2 = nn.ReLU()

        # Output layer with softmax activation for multiclass classification
        self.fc3 = nn.Linear(128, num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.flatten(x)
        x = self.relu1(self.fc1(x))
        x = self.relu2(self.fc2(x))
        x = self.softmax(self.fc3(x))
        return x

# Specify the input size (L * 512) and number of classes (6)
input_size = L * 512
num_classes = 6

# Create the model
dense_nn_model = DenseNN(input_size, num_classes)

# Print the model architecture
print(dense_nn_model)


NameError: ignored

In [19]:
# Define the base directory where your protein folders are located
print(os.getcwd())
os.chdir('encoder_proteins') 
base_dir = os.getcwd()

C:\Users\andre\Documents\DeepLearning Project


In [26]:
# List to store loaded proteins
train_proteins = []
test_proteins = []
val_proteins = []
# Iterate through the folders 'cv0' to 'cv5'
print(os.listdir(base_dir))
# Train split:
for i in range(2):  # Assuming folders are named 'cv0' to 'cv5'
    folder_name = f'cv{i}'
    print("cv:", i)
    folder_path = os.path.join(base_dir, folder_name)
    # Iterate through files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.npy'):
            file_path = os.path.join(folder_path, filename)

            # Load the protein and append to the list
            protein = np.load(file_path, allow_pickle=True).item()
            train_proteins.append(protein)
# Test split:
for i in range(1):
    folder_name = f'cv{i+3}'
    print("cv:", i+3)
    folder_path = os.path.join(base_dir, folder_name)

    # Iterate through files in the folder
    for filename in os.listdir(folder_path):
        if filename.endswith('.npy'):
            file_path = os.path.join(folder_path, filename)

            # Load the protein and append to the list
            protein = np.load(file_path, allow_pickle=True).item()
            test_proteins.append(protein)
# Val dataset:

folder_name = f'cv{5}'
folder_path = os.path.join(base_dir, folder_name)

# Iterate through files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.npy'):
        file_path = os.path.join(folder_path, filename)

        # Load the protein and append to the list
        protein = np.load(file_path, allow_pickle=True).item()
        val_proteins.append(protein)

print("Train proteins shape: ",len(train_proteins))
print("Test proteins shape: ", len(test_proteins))
print("Val proteins shape: ", len(val_proteins))

['cv0', 'cv1', 'cv2', 'cv3', 'cv4', 'cv5']
cv: 0
cv: 1
cv: 3
Train proteins shape:  2
Test proteins shape:  1
Val proteins shape:  1


In [None]:

# Now, all_proteins contains the loaded proteins from all folders

# Assuming X and y are your feature matrix and target variable, as in the scikit-learn example
print(os.getcwd())
f = open("DeepTMHMM.partitions.json")

labels = json.load(f)
print(labels.keys())
encoder_path = f"../encoder_proteins/{cv}/{protein['id']}"
read_dictionary = np.load(encoder_path + ".npy", allow_pickle='TRUE').item()
print(read_dictionary)
# Convert data to PyTorch tensors
X = torch.tensor(X.values, dtype=torch.float32)
y = torch.tensor(y.values, dtype=torch.float32)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create DataLoader for training and testing sets
train_dataset = TensorDataset(X_train, y_train)
test_dataset = TensorDataset(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)


# Initialize the model, loss function, and optimizer
input_size = X.shape[1]
model = LogisticRegression(input_size)
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
num_epochs = 50
for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels.view(-1, 1))
        loss.backward()
        optimizer.step()

# Evaluation on the test set
model.eval()
with torch.no_grad():
    all_predictions = []
    all_labels = []
    for inputs, labels in test_loader:
        outputs = model(inputs)
        predictions = (outputs > 0.5).float()
        all_predictions.extend(predictions.numpy())
        all_labels.extend(labels.numpy())

# Calculate accuracy
accuracy = accuracy_score(all_labels, all_predictions)
print(f"Accuracy: {accuracy}")
