In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

from sklearn.preprocessing import LabelEncoder
import numpy as np
from tqdm import tqdm

In [2]:
# Dataset layout: <path>/<split>/<subset>/<subset>_<split>.csv
split = 'split1'
path = "../DATA/"
# `path` already ends with "/", so the formatted paths contain a double slash.
_path_fields = (path, split, split)
train_path = "{}/{}/train/train_{}.csv".format(*_path_fields)
valid_path = "{}/{}/val/val_{}.csv".format(*_path_fields)
test_path = "{}/{}/test/test_{}.csv".format(*_path_fields)

In [3]:
# Echo the resolved CSV locations (train, test, validation) as a sanity check.
for csv_path in (train_path, test_path, valid_path):
    print(csv_path)

../DATA//split1/train/train_split1.csv
../DATA//split1/test/test_split1.csv
../DATA//split1/val/val_split1.csv


In [4]:
# Load the training split.
train_data = pd.read_csv(train_path)
print(train_data.shape)

# Every column from position 2 onward is used as a feature.
feature_columns = train_data.columns[2:]
x_train = train_data[feature_columns]
print(x_train.shape)

# The column at position 1 holds the class label; encode it as integers.
label_column = train_data.columns[1]
le = LabelEncoder()
y_train = np.array(le.fit_transform(train_data[label_column]))
print(le.classes_)

(292, 32)
(292, 30)
['B' 'M']


In [5]:
# Load the testing set
test_data = pd.read_csv(test_path)
print(test_data.shape)
# Every column from position 2 onward is used as a feature.
x_test = test_data[test_data.columns[2:]]
print(x_test.shape)
# The column at position 1 holds the class label.
y_test = test_data[test_data.columns[1]]
# Fit the encoder on the *training* labels rather than on this split, so the
# label -> integer mapping is guaranteed to match y_train even if a class is
# missing here. transform() raises ValueError on a label unseen in training.
le = LabelEncoder().fit(train_data[train_data.columns[1]])
y_test = np.array(le.transform(y_test))
print(le.classes_)

(91, 32)
(91, 30)
['B' 'M']


In [6]:
# Load the validation set
val_data = pd.read_csv(valid_path)
print(val_data.shape)
# Every column from position 2 onward is used as a feature.
x_val = val_data[val_data.columns[2:]]
print(x_val.shape)
# The column at position 1 holds the class label.
y_val = val_data[val_data.columns[1]]
# Fit the encoder on the *training* labels rather than on this split, so the
# label -> integer mapping is guaranteed to match y_train even if a class is
# missing here. transform() raises ValueError on a label unseen in training.
le = LabelEncoder().fit(train_data[train_data.columns[1]])
y_val = np.array(le.transform(y_val))
print(le.classes_)

(72, 32)
(72, 30)
['B' 'M']


In [7]:
# Define a custom dataset class
class WisconsinDataset(Dataset):
    """In-memory dataset pairing feature rows with integer class labels.

    Args:
        x: Feature table with one row per sample. A pandas DataFrame (the
            original usage) or any array-like accepted by ``np.asarray``.
        y: One integer label per row of ``x`` (array-like).

    Raises:
        ValueError: If ``x`` and ``y`` do not hold the same number of samples.

    Attributes:
        X: ``float32`` tensor built from ``x``.
        y: ``int64`` tensor built from ``y``.
    """

    def __init__(self, x, y):
        # np.asarray covers DataFrames, Series, ndarrays and plain lists alike,
        # so callers are no longer required to pass an object with `.values`.
        features = np.asarray(x)
        labels = np.asarray(y)
        # Fail fast: a mismatch would otherwise only surface later as an
        # IndexError (or silently ignored labels) while iterating.
        if len(features) != len(labels):
            raise ValueError(
                "x and y must have the same number of samples, got %d and %d"
                % (len(features), len(labels))
            )
        self.X = torch.tensor(features, dtype=torch.float32)
        self.y = torch.tensor(labels, dtype=torch.int64)

    def __len__(self):
        """Return the number of samples."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        return self.X[idx], self.y[idx]

In [8]:
class CNN(nn.Module):
    """1-D convolutional feature extractor over embedded integer indices.

    Layer order: Embedding -> Conv1d(k=2) -> MaxPool1d(2) -> Conv1d(k=2)
    -> AvgPool1d(2) -> Flatten. ``forward`` returns the flattened feature
    map; this module contains no classification layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector; also the number of
            input channels of the first convolution.
        input_length: Accepted but not used by any layer defined here.
    """

    def __init__(self, vocab_size, embedding_dim, input_length):
        super().__init__()
        # Index 0 is reserved as the padding entry of the embedding table.
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.conv1d_1 = nn.Conv1d(embedding_dim, 5000, kernel_size=2)
        self.max_pooling = nn.MaxPool1d(2)
        self.conv1d_2 = nn.Conv1d(5000, 1000, kernel_size=2)
        self.avg_pooling = nn.AvgPool1d(2)
        self.flatten = nn.Flatten()

    def forward(self, x):
        """Embed ``x`` and run it through the conv/pool stack.

        ``x`` is cast to int64 before the embedding lookup, so any
        fractional part of a floating-point input is dropped.
        Assumes ``x`` is shaped (batch, length) -- TODO confirm with callers.
        """
        indices = x.long()
        embedded = self.embedding(indices)
        # Conv1d convolves over the last axis, so swap the last two axes to
        # put the embedding dimension in the channel position.
        features = torch.transpose(embedded, 1, 2)
        stages = (
            self.conv1d_1,
            self.max_pooling,
            self.conv1d_2,
            self.avg_pooling,
            self.flatten,
        )
        for stage in stages:
            features = stage(features)
        return features

In [38]:
# Wrap each split in a Dataset, then expose it through a DataLoader.
train_dataset, val_dataset, test_dataset = (
    WisconsinDataset(features, labels)
    for features, labels in ((x_train, y_train), (x_val, y_val), (x_test, y_test))
)

# One sample per batch for every split; the training order is kept fixed.
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=False)
test_loader, val_loader = (
    DataLoader(dataset, batch_size=1) for dataset in (test_dataset, val_dataset)
)

In [10]:
# NOTE: despite its name, this is the number of distinct label classes known
# to the encoder, not the number of input feature columns.
num_features = le.classes_.shape[0]
print(num_features)

2


In [11]:
# Number of batches per loader. The third line previously printed
# len(val_data) (DataFrame rows), which only matches the loader length
# while batch_size is 1; report the loader like the other two.
for loader in (train_loader, test_loader, val_loader):
    print(len(loader))

292
91
72


In [12]:
# Inspect the dtype of the feature tensor of the first training sample.
first_features, _first_label = train_dataset[0]
first_features.dtype

torch.float32

## Check data

In [13]:
# # extract a single instance from the dataset
# data = next(iter(train_dataset))
# print(data[0])

In [14]:
# # convert tensor to numpy array and flatten it
# numpy_array = data[0].numpy().flatten()
# print(numpy_array)

In [15]:
# # create a pandas DataFrame from the numpy array
# df = pd.DataFrame(numpy_array)
# print(df)

### Data is correct

In [16]:
# !pip install torchinfo
# !pip install keras


In [17]:
# Build the network and push one random index batch through it as a smoke test.
VOCAB_SIZE, EMBEDDING_DIM, INPUT_LENGTH = 20000, 300, 31
model = CNN(
    vocab_size=VOCAB_SIZE,
    embedding_dim=EMBEDDING_DIM,
    input_length=INPUT_LENGTH,
)
# A single sequence of INPUT_LENGTH random indices drawn from [0, VOCAB_SIZE).
inputs = torch.randint(0, VOCAB_SIZE, (1, INPUT_LENGTH))
outputs = model(inputs)

In [18]:
# Print the module tree to confirm the layer configuration.
print(model)

CNN(
  (embedding): Embedding(20000, 300, padding_idx=0)
  (conv1d_1): Conv1d(300, 5000, kernel_size=(2,), stride=(1,))
  (max_pooling): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv1d_2): Conv1d(5000, 1000, kernel_size=(2,), stride=(1,))
  (avg_pooling): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  (flatten): Flatten(start_dim=1, end_dim=-1)
)


In [19]:
# torchinfo is a third-party package (see the commented-out pip cell earlier
# in this notebook); summary(model) reports per-layer parameter counts.
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                   Param #
CNN                                      --
├─Embedding: 1-1                         6,000,000
├─Conv1d: 1-2                            3,005,000
├─MaxPool1d: 1-3                         --
├─Conv1d: 1-4                            10,001,000
├─AvgPool1d: 1-5                         --
├─Flatten: 1-6                           --
Total params: 19,006,000
Trainable params: 19,006,000
Non-trainable params: 0

In [20]:
# Switch the model to evaluation mode; eval() returns the module itself,
# which is why this cell echoes the module repr.
model.eval()

CNN(
  (embedding): Embedding(20000, 300, padding_idx=0)
  (conv1d_1): Conv1d(300, 5000, kernel_size=(2,), stride=(1,))
  (max_pooling): MaxPool1d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv1d_2): Conv1d(5000, 1000, kernel_size=(2,), stride=(1,))
  (avg_pooling): AvgPool1d(kernel_size=(2,), stride=(2,), padding=(0,))
  (flatten): Flatten(start_dim=1, end_dim=-1)
)

In [21]:
# device = "cuda:0" if torch.cuda.is_available() else "cpu"
# print(f"Using {device} device")

In [22]:
# model = model.to(device)

In [23]:
# Loss function, optimizer and learning-rate schedule.
# (alternative left disabled: criterion = nn.BCELoss())
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
# Multiply the learning rate by 0.1 after every 7 scheduler steps.
exp_lr_scheduler = optim.lr_scheduler.StepLR(
    optimizer=optimizer,
    step_size=7,
    gamma=0.1,
)

In [34]:
# Training hyper-parameters read by the training cell.
num_epochs = 10
learning_rate = 1e-4
# StepLR settings: scale the learning rate by `lr_gamma` every `lr_step_size` steps.
lr_step_size, lr_gamma = 7, 0.1

In [None]:
# Move model and data to GPU if available
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define optimizer and learning rate scheduler
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=lr_step_size, gamma=lr_gamma)

# Define loss function
criterion = nn.CrossEntropyLoss()

# NOTE(review): CNN.forward returns the flattened feature map (the module has
# no classification layer), so CrossEntropyLoss treats every flattened feature
# position as a class. The recorded run (train loss 0.000, val loss ~10.1,
# constant val acc) looks consistent with that -- confirm this is intended.

# Train the model
for epoch in range(num_epochs):
    # Set model to train mode
    model.train()

    running_loss = 0.0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Zero the parameter gradients
        optimizer.zero_grad()

        # Forward + backward + optimize
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean; weight it by the batch size so the
        # epoch figure is a true per-sample average even when the last batch
        # is smaller (identical to the per-batch average while batch_size=1).
        running_loss += loss.item() * labels.size(0)

    train_loss = running_loss / len(train_loader.dataset)

    # Set model to eval mode
    model.eval()

    # Evaluate on validation set
    val_loss = 0.0
    val_acc = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs, labels = inputs.to(device), labels.to(device)

            outputs = model(inputs)
            loss = criterion(outputs, labels)
            val_loss += loss.item() * labels.size(0)

            # Index of the largest output per sample is the predicted class.
            _, predicted = torch.max(outputs, 1)
            val_acc += (predicted == labels).sum().item()

    num_val_samples = len(val_loader.dataset)
    val_loss /= num_val_samples
    val_acc /= num_val_samples

    # Print progress
    print('Epoch %d, train loss: %.3f, val loss: %.3f, val acc: %.3f' %
          (epoch + 1, train_loss, val_loss, val_acc))

    # Update learning rate scheduler
    scheduler.step()

Epoch 1, train loss: 0.000, val loss: 10.146, val acc: 0.708
Epoch 2, train loss: 0.000, val loss: 10.147, val acc: 0.708
Epoch 3, train loss: 0.000, val loss: 10.148, val acc: 0.708
Epoch 4, train loss: 0.000, val loss: 10.149, val acc: 0.708
Epoch 5, train loss: 0.000, val loss: 10.150, val acc: 0.708
Epoch 6, train loss: 0.000, val loss: 10.149, val acc: 0.708
Epoch 7, train loss: 0.000, val loss: 10.149, val acc: 0.708
Epoch 8, train loss: 0.000, val loss: 10.149, val acc: 0.708


In [26]:
# Persist the learned parameters (the state_dict) to disk.
weights_file = 'breast-cancer-wisconsin-cnn.pth'
trained_weights = model.state_dict()
torch.save(trained_weights, weights_file)

In [None]:
# Get the feature maps for the input data
# NOTE(review): `input_data` is not assigned in any cell visible in this
# notebook, so this cell raises NameError on a fresh kernel. Define it first;
# presumably it should be an index tensor on the same device as the model --
# TODO confirm which split is intended.
model.eval()
with torch.no_grad():
    # Inference only: gradient tracking is disabled for this forward pass.
    feature_maps = model(input_data)