In [17]:
import copy
import random
import time
import os

import torch
import torch.nn as nn
import torch.nn.functional 
import torch.optim 
import torch.utils.data

import torchvision.transforms
import torchvision.datasets

import skimage.io
import skimage.transform
import sklearn.preprocessing

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Functions

In [18]:
def set_seeds(seed):
    """sets seeds for several used packages"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [19]:
def encode_column(column):
    """
    takes single columned Pandas DataFrame of categorical data and encodes it
    into array of class binarys
    """
    encoder = sklearn.preprocessing.OneHotEncoder()
    shape_arr = encoder.fit_transform(column).toarray().astype(int)
        
    return list(shape_arr)

In [20]:
def prep_data(labels):
    """
    Takes in raw labels dataframe and converts it into the format
    expected for tenX_dataset class
    """

    #Splitting description column into color and shape columns
    new = labels["Description"].str.split(" ", n=1, expand=True)
    labels.drop(columns=['Description'], inplace=True)
    labels['Color'] = new[0].values
    labels['Shape'] = new[1].values
    
    #Decomposing sample keywords into seperate strings
    sample_names = labels["Sample"].str.split(" ", n=1, expand=False)
    labels['Sample'] = sample_names
    
    #Converting identification into boolean for is/is not plastic
    PLASTICS = ['polystyrene', 'polyethylene','polypropylene','Nylon','ink + plastic','PET','carbon fiber']
    identification = labels['Identification']
    
    for i in range(0,len(identification)):
        if identification[i] in PLASTICS:
            identification[i] = True
        else:
            identification[i] = False

    labels['Identification']=identification
    labels.rename(columns={'Identification': 'isPlastic'}, inplace=True)
    labels['isPlastic'] = labels["isPlastic"].astype(int)
    
    
    #Encoding shape and color data
    labels['Shape'] = encode_column(labels[['Shape']])
    labels['Color'] = encode_column(labels[['Color']])
    
    return labels

In [22]:
labels.head(20)

Unnamed: 0,Sample,Sample origin,Size (um),isPlastic,Color,Shape
0,[252_1],mussels,50,0,"[0, 1, 0, 0, 0]","[0, 0, 1, 0, 0]"
1,[252_2],mussels,50,0,"[0, 0, 0, 1, 0]","[1, 0, 0, 0, 0]"
2,[252_3],mussels,25,0,"[0, 0, 0, 1, 0]","[0, 0, 0, 0, 1]"
3,[252_4],mussels,75,1,"[0, 0, 0, 1, 0]","[0, 1, 0, 0, 0]"
4,[252_5],mussels,50,0,"[0, 0, 0, 1, 0]","[0, 0, 0, 0, 1]"
5,[252_6],mussels,50,1,"[0, 0, 0, 0, 1]","[0, 0, 0, 0, 1]"
6,[252_7],mussels,50,1,"[0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]"
7,[252_8],mussels,30,0,"[0, 0, 0, 1, 0]","[0, 0, 1, 0, 0]"
8,[252_9],mussels,100,0,"[0, 1, 0, 0, 0]","[0, 1, 0, 0, 0]"
9,[252_10],mussels,50,0,"[0, 1, 0, 0, 0]","[0, 1, 0, 0, 0]"


In [16]:
def get_filenames(labels, image_root):
    """Replaces sample column of labels with the actual filename so that the dataset class
    doesn't have to do that work."""

SyntaxError: unexpected EOF while parsing (<ipython-input-16-a55670bd33ae>, line 1)

# Custom Dataset

In [14]:
class tenX_dataset(torch.utils.data.Dataset):
    """
    Class inherited from torch Dataset. Required methods are, init,
    len, and getitem.
    """
    def __init__(self, labels_frame, image_dir, transform):
        """
        initializes an instance of the class. Here we store 4 variables
        in the class. Calling init just looks like dataset = tenX_dataset(lables, 'image_folder', transform).
        
        labels: altered version of csv file
        image_dir: The file path to the folder the images are in
        image_filenames: A list of all the image file names in the image folder
        transform: A pytorch object. Works like a function. You call transform(x) and it performs
                    a series of operations on x
        """
        self.labels = labels_frame
        self.image_dir = image_dir
        self.image_filenames = os.listdir(self.image_dir)
        self.transform = transform
        

    def __len__(self):
        """Returns the length of the dataset"""
        return len(self.labels)
    
    
    def __getitem__(self, idx):
        """
        Returns a dictionary containing image and image data. Right now
        it looks like: 
        sample = {'image': image, 'plastic': [0], 'shape':[0,0,0,0,0], 'color':[0,0,0,0,0]}
        """
        image_id = self.labels['Sample'][idx]
        image_file = None
        image = None
        
        #Searching through image folder for the filename we want.
        #Has unsolved bug. Ex. when looking for image 252_1, it will display 252_10 instead
        for filename in self.image_filenames:
            if len(image_id) == 1:
                if image_id[0] in filename:
                    image_file = filename
                    break
            else:
                if image_id[0] in filename and image_id[1] in filename:
                    image_file = filename
                    break
                
        #Right now if the image file is not found I just use the first image in the folder
        #To refine this more, we should take this step out of the dataset class and into
        #The training and testing loop. Where we will check if the sample['image'] == None
        #For just testing the code this works for now
        if image_file:
            image_filepath = os.path.join(self.image_dir, image_file)
            image = skimage.io.imread(image_filepath)
            
            
            if self.transform is not None:
                image = self.transform(image)
            
        print((image_id, image_file))
        sample = {'image': image,
                  'shape': self.labels['Shape'][idx],
                  'color': self.labels['Color'][idx],
                  'plastic': self.labels['isPlastic'][idx]}
  
        return sample

### Plotting first 20 images of dataset. Obviously getting quite a few duplicates

In [15]:
labels_filepath = 'data/10x_labels.csv'
image_dir = 'data/images_10x'
labels = prep_data(pd.read_csv(labels_filepath))
tenX = tenX_dataset(labels, image_dir, None)


for i in range(len(tenX)):
    sample = tenX[i]
    #plt.figure(i)
    #if sample['image'] is not None:
        #plt.imshow(sample['image'])
    #if i>50:
        #break

(['252_1'], '252_10 - 10x.bmp')
(['252_2'], '252_2 - 10x.bmp')
(['252_3'], '252_3 - 10x.bmp')
(['252_4'], '252_4 - 10x.bmp')
(['252_5'], '252_5 - 10x.bmp')
(['252_6'], '252_6 - 10x.bmp')
(['252_7'], None)
(['252_8'], '252_8 - 10x.bmp')
(['252_9'], None)
(['252_10'], '252_10 - 10x.bmp')
(['20200824', '250_1'], '20200824 250_1 - 10x.bmp')
(['20200824', '252_1'], '20200824 252_1 - 10x.bmp')
(['20200824', '272_2'], '20200824 272_2 - 10x.bmp')
(['20200819', '93_1'], '20200819 93_1 - 10x.bmp')
(['20200819', '93_2'], '20200819 93_2 - 10x.bmp')
(['20200819', '93_3'], '20200819 93_3 - 10x.bmp')
(['20200819', '93_4'], '20200819 93_4 - 10x.bmp')
(['20200819', '93_5'], '20200819 93_5 - 10x.bmp')
(['20200819', '93_6'], '20200819 93_6 - 10x.bmp')
(['20200819', '135_1'], '20200819 135_1 - 10x.bmp')
(['2020819', '237_1'], None)
(['2020819', '237_2'], None)
(['2020819', '237_3'], None)
(['2020819', '252_1'], None)
(['2020819', '252_2'], None)
(['2020819', '252_3'], None)
(['2020819', '252_4'], None)
([

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


# Things to improve/fix
* if data is for sure consistent. Take datacleaning steps, generalize, and put into a function. Then 10x_dataset class with get passed in the filename of the labels and in the init method the dataclean function should be called.
* Make sure the nonetypes are because the file actually isn't in my folder of images
* 252_1 is displayign 252_10 because of way code is written. 
* Code for normalizing image data
* Image augmentation. Probably want to cut off some of the edges to get rid of number stuff and decrease extraneous information. The think we actually care about is only occupying like 5-10% of the image.

# Start of me trying to plug into cnn

Most of the code came from this tutorial: https://github.com/bentrevett/pytorch-image-classification/blob/master/2_lenet.ipynb

I was just trying to get this to work so I won't understand it as much

In [None]:
image_dir = 'data/images_10x'
labels_frame = labels

#This transform just resizes the images to 3,480,752. So 3 for red green blue then height of 480
#and width of 752. 
transform = torchvision.transforms.Compose([
                            torchvision.transforms.ToPILImage(),
                            torchvision.transforms.Resize((480, 752)),
                            torchvision.transforms.ToTensor()
                                      ])


train_data = tenX_dataset(labels_frame, image_dir, transform = transform)

#### Splitting into train/validation set

In [None]:
VALID_RATIO = 0.9

n_train_examples = int(len(train_data) * VALID_RATIO)
n_valid_examples = len(train_data) - n_train_examples

train_data, valid_data = torch.utils.data.random_split(train_data, 
                                           [n_train_examples, n_valid_examples])

In [None]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

#### Declaring iterator. The thing that will loop through our dataset.

In [None]:
BATCH_SIZE = 5

train_iterator = torch.utils.data.DataLoader(train_data, 
                                 shuffle = True, 
                                 batch_size = BATCH_SIZE)

valid_iterator = torch.utils.data.DataLoader(valid_data, 
                                 batch_size = BATCH_SIZE)

#### The CNN archetecture

In [None]:
class LeNet(nn.Module):
    def __init__(self, output_dim):
        """
        Initializes CNN. Here we just define layer shapes that we call in the forward func
        """
        super().__init__()

        #Convulution layer 1. 
        #3 input channels (for three images Red, Green, Blue)
        #6 output channels (I THINK this means we are applying two different filters to each image
        #3 images, two filters each, we end up with 6 'images')
        #kernel size is I THINK telling the filters took filter each set of 5 pixels into one.
        #So are images will shrink a little as the edges get cut off
        self.conv1 = nn.Conv2d(in_channels = 3, 
                               out_channels = 6, 
                               kernel_size = 5)
        
        #Convultion layer 2. See above
        self.conv2 = nn.Conv2d(in_channels = 6, 
                               out_channels = 12, 
                               kernel_size = 5)
        
        #Linear layers. These probably arent complicated but I don't follow haha
        #I think it turning the 259740 pixel values into 6 values. Then the second layers
        #Turns the 6 into a different 6? and then 6 into 2. I'm not sure why 2 and not 1.
        #Seeing as the output should be a number between 0-1. Closer to 0 = not plastic,
        #closer to 1 = plastic. But I got errors about not having enough classes when
        #I only had 1 output neuron.
        #TBH these linear layers I just changed based on the error messages I got.
        self.fc_1 = nn.Linear(259740, 6)
        self.fc_2 = nn.Linear(6, 6)
        self.fc_3 = nn.Linear(6, 2)

    def forward(self, x):
        """
        Function that performs all the neural network forward calculation i.e.
        takes image data from the input of the neural network to the output
        """

        
        x = self.conv1(x)
    
        #Gonna have to look at tutorial link.
        x = nn.functional.max_pool2d(x, kernel_size = 2)
        
        x = nn.functional.relu(x)
        
        x = self.conv2(x)
                
        x = nn.functional.max_pool2d(x, kernel_size = 2)
        
        x = nn.functional.relu(x)
        
        x = x.view(x.shape[0], -1)
                
        h = x
        
        x = self.fc_1(x)
                
        x = nn.functional.relu(x)

        x = self.fc_2(x)
                
        x = nn.functional.relu(x)

        x = self.fc_3(x)
        
        return x, h

In [None]:
#Instancing model, loss criteria, device to perform calculations on, and optimizer.
OUTPUT_DIM = 1
model = LeNet(OUTPUT_DIM)


criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters())

In [None]:
#Telling the model and loss function to do math on whatever device is
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def calculate_accuracy(y_pred, y):
    """
    Function calculate accuracy. See tutorial, may not
    even be accurate for our model but it at least runs
    """
    top_pred = y_pred.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [None]:
def train(model, iterator, optimizer, criterion, device):
    """
    Training loop. Takes data through NN calculates loss and adjusts NN. Repeat
    """
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for sample in iterator:
        image = sample['image'].to(device)
        isPlastic = sample['plastic'].to(device)
        
        optimizer.zero_grad()      
        y_pred, what = model(image)

        loss = criterion(y_pred, isPlastic)
        acc = calculate_accuracy(y_pred, isPlastic)
        loss.backward()    
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
#Here the model is actually trained
EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    
    start_time = time.monotonic()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)

    
    end_time = time.monotonic()

    #epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    #print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    #print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')