In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, random_split, Dataset
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import os
from torchvision import transforms
import matplotlib.pyplot as plt

from sklearn.preprocessing import RobustScaler, QuantileTransformer, MinMaxScaler

from umap import UMAP
from sklearn.cluster import DBSCAN
import time

2024-06-09 11:21:34.428932: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


# Add labels to meta data

In [2]:
labels_raw = [np.load('./labels/'+x) for x in os.listdir('./labels/') if x.endswith('.npy')]

label_dict = {}
value_array = []
for values in labels_raw:
    for value,ind in zip(values[0],values[1]):
        value_array.append(value)
        #print(value)
        label_dict[value] = ind

meta_data = pd.read_csv('cluster_meta.csv')
file_index = meta_data['cluster'].values

label_list = [label_dict[int(x[:-2])] for x in file_index]

meta_data.insert(5,'label',label_list)

meta_data.drop(meta_data[meta_data['label'] == 4].index)
meta_data.drop(meta_data[meta_data['label'] == 0].index)
meta_data['label'] = meta_data['label'] - 1

meta_data = meta_data.to_csv('cluster_meta_labels.csv', index = False)

# Import Data

In [3]:


meta_data = pd.read_csv('cluster_meta_labels.csv')
print(meta_data.head())

def load_data_with_label(folder_path, meta_data):
    data = []
    for file in meta_data['cluster']:
        file = file + '.csv'
        file_path = os.path.join(folder_path,file)
        cluster = pd.read_csv(file_path,header=None).values
        cluster = cluster.flatten()
        # cluster = np.append(cluster, meta_data[meta_data['cluster'] == file[:-4]][['y', 'x', 'E', 'size']].values.flatten())
        data.append(cluster)
    combined_array = np.stack(data,axis=0)
    print('shape of combined array: ')
    print(combined_array.shape)
    return combined_array, meta_data[['y', 'x', 'E', 'size']].values, meta_data['label'].values




meta_scaler = RobustScaler()
meta_data_values = meta_scaler.fit_transform(meta_data[['y', 'x', 'E', 'size']])



   Unnamed: 0   cluster     y     x          E  label   size
0           0  004580_A   9.0  26.0   715232.0      2  108.0
1           1  011701_G  15.0  17.0  1184202.0      3  179.0
2           2  003882_A   3.0   3.0    31156.0      0    6.0
3           3  009717_G   7.0  62.0  1423016.0      2  245.0
4           4  005590_A  32.0  31.0  1014772.0      2  169.0


In [4]:
class NumpyArrayDataset(Dataset):
    def __init__(self, data, transform=None):
        self.data = data.astype(np.float32)
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        if self.transform:
            sample = self.transform(sample)
        return sample  # Return only the sample and dummy label





folder_path = 'clusters_colour_rotations_rescaled'
combined_array, meta_data_values, meta_data_labels = load_data_with_label(folder_path, meta_data)

meta_data_labels = meta_data_labels.astype(int)







shape of combined array: 
(3936, 4096)


# Normelize Data

In [5]:
scaler = MinMaxScaler()
combined_array_scaled = scaler.fit_transform(np.log(combined_array))
print(combined_array.shape)

meta_scaler = QuantileTransformer()
meta_data_values = meta_scaler.fit_transform(meta_data_values)

combined_array_scaled = np.append(combined_array_scaled, meta_data_values, axis=1)

combined_array_scaled = np.append(combined_array_scaled, meta_data_labels[:,None], axis=1)
#print(combined_array_scaled.T[-5:])

transform = transforms.Compose([
    transforms.ToTensor()
])

np.random.shuffle(combined_array_scaled)

#meta_data_values = NumpyArrayDataset(meta_data_values, transform=transform)
dataset = NumpyArrayDataset(combined_array_scaled, transform=transform)
print(combined_array_scaled.shape)

# lazy, non-random split
test_size = 0.2
split_index = int(len(dataset) * (1 - test_size))
train_dataset = dataset[:split_index]
test_dataset = dataset[split_index:]
#train_meta = meta_data_values[:split_index]
#test_meta = meta_data_values[split_index:]
#train_labels = meta_data_labels[:split_index]
#test_labels = meta_data_labels[split_index:]


train_loader = DataLoader(train_dataset, batch_size=32, shuffle=False)
valid_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)


print(train_loader.dataset[0].shape)




(3936, 4096)
(3936, 4101)
torch.Size([3148, 4101])


# Chosse device

In [6]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cpu")
device

device(type='cpu')

# Create Model

In [13]:



def create_torch_model_CNN(trial):
    N_conv_layers = 2 #trial.suggest_int('N_conv_layers', 1, 3)
    N_dense_layers = 2 #trial.suggest_int('N_dense_layers', 1, 3)
    dropout_rate = 0.5 #trial.suggest_float('dropout_rate', 0.0, 0.5)
    num_classes = 3
    class torch_CNN(nn.Module):
        def __init__(self, output_size):
            super().__init__()

            ### Convolutional section:
            #self.cnn_layer = nn.Sequential(
            #    nn.Conv2d(1, 32, 3, stride=1, padding=1),  # 1x64x64 -> 32x64x64
            #    nn.ReLU(True),
            #    nn.Conv2d(32, 64, 3, stride=1, padding=1),  # 32x128x128 -> 64x64x64
            #    nn.ReLU(True),
            #    nn.MaxPool2d(2)  # 64x64x64 -> 64x32x32
            #)

            ### Flattening:
            self.flatten = nn.Flatten(start_dim=0)

            ### Linerar section
            
            #self.linerar_layer = nn.Sequential(
            #    nn.Linear(65536, 128),
            #    nn.ReLU(True)
            #)
            
            ### output layer
            #self.output_layer = nn.Sequential(
            #    nn.Linear(128, output_size),
            #    nn.Softmax()
            #)

            ### dropout
            self.dropout = nn.Dropout(dropout_rate, inplace=True)

            self.compile_model()
        
        def create_CNN_layer(self, filters_in, filters_out):
            #filters_out = 32
            filter_size = 3
            stride_cnn = 1
            padding = 1
            pool_size = 3
            pool_stride = 1
            pool_padding = 1
            cnn_layer = nn.Sequential(
                nn.Conv2d(filters_in, filters_out, filter_size, stride=stride_cnn, padding=padding),  # 1x256x256 -> 32x128x128
                nn.ReLU(True),
                nn.MaxPool2d(pool_size, stride=pool_stride, padding=pool_padding)  # 64x64x64 -> 64x32x32
            ).to(device)
            return cnn_layer, filters_out
        
        def create_liniar_layer(self, input_size, output_size):
            liniar_layer = nn.Sequential(
                nn.Linear(input_size, output_size),
                nn.ReLU(True)
            ).to(device)
            return liniar_layer, output_size
        
        def create_output_layer(self, input_size, output_size):
            output_layer = nn.Sequential(
                nn.Linear(input_size, output_size),
                nn.Softmax()
            ).to(device)
            return output_layer
        
        def compile_model(self):
            #print('Model compiled')
            cnn_layer0, filters_out0 = self.create_CNN_layer(1, 32)
            cnn_layer1, filters_out1 = self.create_CNN_layer(filters_out0, 64)
            liniar_in = 64 * 64 * filters_out1
            liniar_layer0, output_size0 = self.create_liniar_layer(liniar_in, 128)
            liniar_layer1, output_size1 = self.create_liniar_layer(output_size0, 64)
            output_layer = self.create_output_layer(output_size1 + 4, num_classes)

            def model_compiled(x, meta):
                x = cnn_layer0(x)
                x = cnn_layer1(x)
                x = self.flatten(x)
                x = self.dropout(x)
                x = liniar_layer0(x)
                x = liniar_layer1(x)
                y = torch.cat((x, meta), dim=0)
                x = output_layer(y)
                return x
            self.model_compiled = model_compiled


        def forward(self, x):
            
            meta = x[-5:-1]
            label = x[-1]
            x = x[:-5].unflatten(0, (1, 1, 64, 64))

            x = self.model_compiled(x, meta)
           
            return x, label.type(torch.cuda.ByteTensor)
    return torch_CNN(num_classes).to(device)


def create_tf_optimizer(trial):
    # We optimize the choice of optimizers as well as their parameters.
    kwargs = {}
    
    optimizer_selected = "Adam"
    
    kwargs["learning_rate"] = trial.suggest_float("adam_learning_rate", 1e-7, 1e-1, log=True)

    optimizer = getattr(tf.optimizers, optimizer_selected)(**kwargs)
    return optimizer

def objective(trial, X_train, Y_train, X_test, Y_test):
    # Build model and optimizer.
    model = create_tf_model_CNN(trial)
    optimizer = create_tf_optimizer(trial)
    model.compile(optimizer, loss=CategoricalCrossentropy())
    # Fit the model to the data
    model.fit(x=X_train, y = Y_train, epochs=30, validation_data=(X_test, Y_test), verbose=0)
    # Find the accuracy
    cce = CategoricalCrossentropy()
    accuracy = cce(Y_test, model(X_test))
    # Return accuracy
    return accuracy





def loss_function(output, target):
    return F.cross_entropy(output, target)





def train_epoch(dataset_train, model, optimizer, loss_function):
    running_loss = 0

    
    for batch in dataset_train:
        optimizer.zero_grad()
        #print(next(model.parameters()).device, batch, model.device)
        output, label_train = model(batch)
        print(label_train.type(), output.type())
        loss = loss_function(output, label_train)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(dataset_train)

def validate_epoch(dataset_val, model, loss_function):
    running_loss = 0
    with torch.no_grad():
        for batch in dataset_val:
            output, label_val = model(batch)
            loss = loss_function(output, label_val)
            running_loss += loss.item()
    return running_loss / len(dataset_val)

def train_model(model, num_epochs, dataset_train, dataset_val, optimizer, loss_function):
    

    train_losses = []
    valid_losses = []
    for epoch in range(num_epochs):
        start_time = time.time()
        train_loss = train_epoch(dataset_train, model, optimizer, loss_function)
        valid_loss = validate_epoch(dataset_val, model, loss_function)
        train_losses.append(train_loss)
        valid_losses.append(valid_loss)
        end_time = time.time()
        print(f"Epoch: {epoch+1}/{num_epochs}, Train Loss: {train_loss:.4f}, Valid Loss: {valid_loss:.4f}, Duration: {end_time - start_time:.2f} sec")
    return train_losses, valid_losses

def plot_losses(train_losses, valid_losses):
    plt.plot(train_losses, label='Train Loss')
    plt.plot(valid_losses, label='Valid Loss')
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.yscale('log')
    plt.legend()
    plt.show()

# Load data to gpu

In [8]:
dataset_train = train_loader.dataset[0]
dataset_train = dataset_train.to(device)

# create model

In [9]:
model = create_torch_model_CNN(3)
#model = model.to(device)

learning_rate = 0.001
torch.manual_seed(42)
peram_to_optimize = model.parameters()
optimizer = optim.Adam(peram_to_optimize, lr=learning_rate)
model.train()


torch_CNN(
  (cnn_layer): Sequential(
    (0): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU(inplace=True)
    (2): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (3): ReLU(inplace=True)
    (4): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (flatten): Flatten(start_dim=0, end_dim=-1)
  (linerar_layer): Sequential(
    (0): Linear(in_features=65536, out_features=128, bias=True)
    (1): ReLU(inplace=True)
  )
  (output_layer): Sequential(
    (0): Linear(in_features=128, out_features=3, bias=True)
    (1): Softmax(dim=None)
  )
  (dropout): Dropout(p=0.5, inplace=True)
)

# Run model

In [10]:
def test_func(model, dataset):
    output = np.zeros(len(dataset))
    label = np.zeros(len(dataset))
    for batch in dataset:
        model(batch)
        
for i in range(10):
    test_func(model, dataset_train)

  return self._call_impl(*args, **kwargs)


In [11]:
print()




# CPU test

# Train model

In [12]:
model = create_torch_model_CNN(3).to(device)
model = model.to(device)


dataset_train = train_loader.dataset[0]
dataset_val = valid_loader.dataset[0]


dataset_train = dataset_train.to(device)
dataset_val = dataset_val.to(device)

learning_rate = 0.001
torch.manual_seed(42)
peram_to_optimize = model.parameters()



optimizer = optim.Adam(peram_to_optimize, lr=learning_rate)

model = model.to(device)

model.train()

num_epochs = 10



print(next(model.parameters()).device, dataset_train.is_cuda)


train_losses, valid_losses = train_model(model, num_epochs, dataset_train, dataset_val, optimizer, loss_function)

plot_losses(train_losses, valid_losses)


cpu False
torch.cuda.ByteTensor torch.FloatTensor


RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument target in method wrapper_CUDA_nll_loss_forward)