In [191]:
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import torch.optim as optim

In [424]:
class XRayDataset(Dataset):
    def __init__(self, label_path, imdir_path, remote=True):
        # label_path is the path to the label sheet containing,
        # at minimum, columns 'id', 'Path', and the 9 needed labels
        # imdir_path is the path to the directory of all images
        self.label_path = label_path
        self.imdir_path = imdir_path
        self.remote = remote
        self.inds = None
        if remote:
            self.labels = pd.read_csv(label_path, low_memory=False, index_col='id')
        else:
            labs = pd.read_csv(self.label_path, low_memory=False)
            labs.rename(columns={'Unnamed: 0':'id'}, inplace=True)
            self.inds = []
            for i, path in enumerate(labs.Path.values):
                if i < len(labs.Path.values) - 1:
                    if int(path[9:14]) <= 20:
                        self.inds.append(i)
            labes = labs.iloc[inds]
            self.labels = labes.reset_index()
    
    def __len__(self):
        return self.labels.shape[0]
    
    def __getitem__(self, idx):
        row = self.labels.loc[idx]
        impath = row['Path'].replace('/','_')[6:21] + row['Path'][27:-4] + '.npy'
        im = np.load(os.path.join(imdir_path, impath))
        return torch.from_numpy(im), torch.tensor((row[5:].values).astype(int)+1).type(torch.long)

In [425]:
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.lin1 = nn.Linear(200, 200)
        self.lin2 = nn.Linear(200, 100)
        self.lin3 = nn.Linear(100, 50)
        self.lin4 = nn.Linear(50, 16)
    
    def forward(self, x):
        x = torch.Tensor(x.type(torch.float))
        x = F.relu(F.dropout(self.lin1(x)))
        x = F.relu(F.dropout(self.lin2(x)))
        x = F.relu(F.dropout(self.lin3(x)))
        x = self.lin4(x)
        return F.softmax(x, dim=1)

In [426]:
label_path = "C:\\Users\\danny\\OneDrive\\Documents\\Caltech Stuff\\Junior_Year\\CS156b\\labels_ints.csv" # "/groups/CS156b/2023/BbbBbbB/labels.csv"
imdir_path = "D:\\cs156\\images_200x200" # "/groups/CS156b/2023/BbbBbbB/images_200x200/"

data = XRayDataset(label_path, imdir_path, remote=False) # remote=True

In [431]:
train_test = random_split(data, [int(len(data)*0.8), len(data)-int(len(data)*0.8)])

batch_size = 8
train_loader = DataLoader(train_test[0], batch_size=batch_size, shuffle=True)
test_loader = DataLoader(train_test[1], batch_size=batch_size, shuffle=False)

In [432]:
net = Net()
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

In [433]:
for epoch in range(10):
    for im in train_loader:
        inputs, labels = im
        i += 1
        optimizer.zero_grad()
        
        outputs = net(inputs)
        loss = loss_fn(outputs, labels)
        loss.backward()
        optimizer.step()
        
    if epoch % 5 == 4:
        print('Epoch:', epoch+1)
        
print('Finished Training')

Epoch: 5
Epoch: 10
Finished Training


In [434]:
correct = 0
total = 0
preds = []
truth = []
with torch.no_grad():
    for data in test_loader:
        inputs, labels = data
        _, true_labels = torch.max(labels, 1)
        outputs = net(inputs)
        _, predicted = torch.max(outputs.data, 1)
        preds.extend(predicted)
        truth.extend(true_labels)
        total += true_labels.size(0)
        correct += (predicted == true_labels).sum().item()

print(f'Accuracy on the test set: {100 * correct // total} %')

RuntimeError: The size of tensor a (16) must match the size of tensor b (8) at non-singleton dimension 1