In [1]:
import random
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import tqdm

import utils.dataloader
from utils.model import SpectralCNNClassifier

In [2]:
# Select the compute device: the first CUDA GPU when one is visible, otherwise the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
# cuDNN autotuning (`torch.backends.cudnn.benchmark`) is deliberately NOT enabled
# here: the seeding cell further down switches it off for reproducibility, so
# turning it on at this point had no lasting effect and contradicted the
# determinism settings.

In [3]:
# Report which device was selected and how many CUDA devices PyTorch can see.
for caption, value in (
    ("Using device: ", device),
    ("Device count: ", torch.cuda.device_count()),
):
    print(caption, value)

Using device:  cuda:0
Device count:  1


In [4]:
# Fix every random number generator the notebook relies on so runs are repeatable.
seed = 42

# Python's `random`, NumPy and PyTorch each keep their own generator state.
for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    seed_fn(seed)

# When CUDA is present, seed the GPU generators as well.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

# cuDNN: request deterministic behaviour and disable benchmark mode.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [5]:
import torch
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import numpy as np

class CustomSpectrumDataset(Dataset):
    """Map-style dataset pairing spectra with integer class labels read from CSV.

    The spectrum CSV is indexed column-wise: sample ``i`` is column ``i`` of the
    file, i.e. the matrix is laid out as ``(n_points, n_samples)``. The label CSV
    supplies one value per sample.

    Args:
        spectrum_file: Path to the comma-separated spectrum matrix.
        label_file: Path to the comma-separated labels, one per spectrum column.

    Raises:
        ValueError: If the number of spectrum columns differs from the number
            of labels.
    """

    def __init__(self, spectrum_file, label_file):
        # `ndmin` keeps the expected rank even when a file holds a single
        # sample; without it np.loadtxt squeezes the result and the
        # column-based indexing used below fails.
        spectra = np.loadtxt(spectrum_file, delimiter=",", dtype=np.float32, ndmin=2)
        # Labels are parsed as float32 first and then cast to int64, exactly as
        # before, so label files written in floating-point notation still load.
        labels = np.loadtxt(label_file, delimiter=",", dtype=np.float32, ndmin=1)

        # Validate once, up front, instead of asserting on every len() call
        # (asserts are also stripped when Python runs with -O).
        if spectra.shape[1] != labels.shape[0]:
            raise ValueError(
                f"spectrum file has {spectra.shape[1]} samples (columns) but "
                f"label file has {labels.shape[0]} labels"
            )

        self.spectrum_data = torch.from_numpy(spectra)
        self.label_data = torch.from_numpy(labels.astype(np.int64))

    def __len__(self):
        """Return the number of samples (columns of the spectrum matrix)."""
        return self.spectrum_data.shape[1]

    def __getitem__(self, idx):
        """Return ``(spectrum, label)`` for sample ``idx``.

        ``spectrum`` is column ``idx`` of the spectrum matrix (a 1-D float32
        tensor) and ``label`` is the matching int64 scalar tensor.
        """
        spectrum = self.spectrum_data[:, idx]
        label = self.label_data[idx]
        return spectrum, label


In [6]:
# spectrum_file = "datasets/training_set.csv"
# total_dataset = np.loadtxt(spectrum_file, delimiter=",", dtype=np.float32)

In [7]:
#scaler = MinMaxScaler()
#scaler.fit(total_dataset)
#total_dataset = scaler.transform(total_dataset)
# print(total_dataset.shape)
# print(np.min(total_dataset))
# print(np.max(total_dataset))

In [8]:
# total_dataset_norm = (total_dataset/np.max(total_dataset))
# print(np.min(total_dataset_norm))
# print(np.max(total_dataset_norm))
# print(total_dataset_norm.shape)

In [9]:

# training_dataset = total_dataset_norm[:, 0:40000]
# validation_dataset = total_dataset_norm[:, 40000:43152]
# np.savetxt("datasets/training_set_new.csv", training_dataset, delimiter=",")
# np.savetxt("datasets/validation_set_new.csv", validation_dataset, delimiter=",")

In [10]:
# label_file = "datasets/training_labels.csv"
# label = np.loadtxt(label_file, delimiter=",", dtype=np.float32).astype(np.int64)

In [11]:
# print(label.shape)
# training_label = label[0:40000]
# validation_label = label[40000:43152]
# np.savetxt("datasets/training_labels_new.csv", training_label, delimiter=",")
# np.savetxt("datasets/validation_labels_new.csv", validation_label, delimiter=",")

In [12]:
# test_set = np.loadtxt("datasets/test_set.csv", delimiter=",", dtype=np.float32)
# print(test_set.shape)
# print(np.min(test_set))
# print(np.max(test_set))
# test_set_norm = (test_set/np.max(total_dataset))
# print(np.min(test_set_norm))
# print(np.max(test_set_norm))



In [13]:
# np.savetxt("datasets/test_set_new.csv", test_set_norm, delimiter=",")

In [14]:
# Build the training dataset from its spectrum / label CSV pair.
training_set_path = "datasets/training_set_new.csv"
training_label_path = "datasets/training_labels_new.csv"
training_dataset = CustomSpectrumDataset(
    spectrum_file=training_set_path,
    label_file=training_label_path,
)


In [15]:
# Sanity-check the training dataset: its size, its first item, and that item's shapes.
print(len(training_dataset))
first_item = training_dataset[0]
print(first_item)
sample, label = first_item
for part in (sample, label):
    print(part.shape)

40000
(tensor([0.3533, 0.3535, 0.3536,  ..., 0.3014, 0.3015, 0.3013]), tensor(7))
torch.Size([19967])
torch.Size([])


In [16]:
# Build the validation dataset from its spectrum / label CSV pair.
validation_label_path = "datasets/validation_labels_new.csv"
validation_set_path = "datasets/validation_set_new.csv"
validation_dataset = CustomSpectrumDataset(
    spectrum_file=validation_set_path,
    label_file=validation_label_path,
)

In [17]:
# Same sanity check for the validation dataset: size, first item, and its shapes.
print(len(validation_dataset))
first_item = validation_dataset[0]
print(first_item)
sample, label = first_item
print(sample.shape)
print(label.shape)

3152
(tensor([0.5623, 0.5622, 0.5623,  ..., 0.4603, 0.4603, 0.4600]), tensor(8))
torch.Size([19967])
torch.Size([])


In [18]:
# Create the train and validation data loaders.
batch_size = 32
# Training batches are drawn in a freshly shuffled order each epoch.
train_dataloader = DataLoader(training_dataset, batch_size=batch_size, shuffle=True)
# Validation is iterated in file order: shuffling brings no benefit for
# evaluation and a fixed order keeps per-batch results comparable between runs.
val_dataloader = DataLoader(validation_dataset, batch_size=batch_size, shuffle=False)



In [19]:
# Peek at the first training batch (if any) to confirm the tensor shapes.
first_batch = next(enumerate(train_dataloader), None)
if first_batch is not None:
    idx, batch = first_batch
    print(idx, batch[0].shape, batch[1].shape)
    # NOTE(review): this previews a new *leading* axis (unsqueeze(0)), whereas
    # the training loop adds the axis at position 1 — confirm which shape this
    # cell is meant to illustrate.
    print(batch[0].unsqueeze(0).shape)

0 torch.Size([32, 19967]) torch.Size([32])
torch.Size([1, 32, 19967])


In [20]:
# Model dimensions are read off the training data rather than hard-coded.
first_spectrum = training_dataset[0][0]
distinct_labels = np.unique(training_dataset.label_data)

# Length of one spectrum = number of input points fed to the model.
input_size = len(first_spectrum)
print(input_size)
# NOTE(review): counting distinct labels equals the number of classes only if
# every class occurs in the training labels — confirm for this dataset.
num_classes = len(distinct_labels)
print(num_classes)

19967
10


In [23]:
# Training setup: loss, model, optimizer and epoch count (the loop itself runs
# in a later cell). `nn` and `optim` come from the import cell at the top of the
# notebook, so they are not re-imported here.

# Loss function used to compare the model outputs with the integer labels.
criterion = nn.CrossEntropyLoss()
# Network sized from the dataset-derived input length and class count.
model = SpectralCNNClassifier(input_size, num_classes)
# Adam over all model parameters with a fixed learning rate.
optimizer = optim.Adam(model.parameters(), lr=1e-4)
# Number of full passes over the training data.
epochs = 50

In [24]:
# Move the model onto the device chosen at the top of the notebook (a GPU when
# one is available; in Google Colab set the runtime to GPU).
# `device` is reused rather than redefined here so it stays the single
# `torch.device` object created earlier instead of being shadowed by a string.
model.to(device)

# Put the model in training mode. `train()` returns the model, so as the last
# expression of the cell it also displays the architecture.
model.train()

SpectralCNNClassifier(
  (conv1): Conv1d(1, 64, kernel_size=(3,), stride=(1,), padding=(1,))
  (relu): ReLU()
  (pool): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv1d(64, 128, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv3): Conv1d(128, 256, kernel_size=(3,), stride=(1,), padding=(1,))
  (conv4): Conv1d(256, 512, kernel_size=(3,), stride=(1,), padding=(1,))
  (fc1): Linear(in_features=638464, out_features=512, bias=True)
  (batch_norm): BatchNorm1d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (dropout): Dropout(p=0.5, inplace=False)
  (fc2): Linear(in_features=512, out_features=10, bias=True)
)

In [25]:
# Main optimisation loop: `epochs` passes over `train_dataloader`.
# NOTE(review): the recorded run of this cell stopped during the first epoch
# with a CUDA out-of-memory error (a 1.22 GiB allocation failed on an 8 GiB
# GPU). The printed architecture shows fc1 as Linear(638464, 512), so the model
# size is the likely cause rather than this loop — confirm in utils.model.
for epoch in range(epochs):
    print(f"Epoch {epoch+1} of {epochs}")

    for idx, (spectra, targets) in enumerate(train_dataloader):
        # Add a channel axis at position 1 (batch, points) -> (batch, 1, points)
        # and move inputs and labels to the training device.
        X = spectra.unsqueeze(1).to(device)
        y = targets.to(device)

        # Clear gradients left over from the previous step.
        optimizer.zero_grad()

        # Forward pass and loss.
        y_pred = model(X)
        loss = criterion(y_pred, y)

        # Backward pass, then parameter update.
        loss.backward()
        optimizer.step()

        # Log the current batch loss every 100 batches.
        if not idx % 100:
            print(f"    batch {idx} loss: {loss.item()}")

    

Epoch 1 of 50


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.22 GiB. GPU 0 has a total capacty of 8.00 GiB of which 0 bytes is free. Including non-PyTorch memory, this process has 17179869184.00 GiB memory in use. Of the allocated memory 6.12 GiB is allocated by PyTorch, and 248.78 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Persist the trained weights (the state_dict only, not the whole module).
# torch.save does not create missing parent directories, so make sure the target
# folder exists first; otherwise a finished training run could fail at the very
# last step without writing its weights.
# NOTE(review): the file name says "100Epochs" while `epochs` is set to 50 in the
# setup cell. The name is kept unchanged so anything that loads this path keeps
# working — confirm which value is intended.
checkpoint_path = Path('models_saved/models_100Epochs.pth')
checkpoint_path.parent.mkdir(parents=True, exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)