In [2]:
import copy
import random
import time
import os

import torch
import torch.nn as nn
import torch.nn.functional 
import torch.optim 
import torch.utils.data

import torchvision.transforms
import torchvision.datasets
import torch.utils.data 
import skimage.io
import skimage.transform
from sklearn.preprocessing import OneHotEncoder

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

ModuleNotFoundError: No module named 'pandas'

In [None]:
# One seed shared by every random number generator the notebook touches, so a
# fresh "Restart & Run All" draws the same random numbers each time.
SEED = 1234

# Python's `random`, NumPy, PyTorch (CPU) and PyTorch (CUDA), in that order.
for seed_fn in (random.seed,
                np.random.seed,
                torch.manual_seed,
                torch.cuda.manual_seed):
    seed_fn(SEED)

# Ask cuDNN to use deterministic algorithms only.
torch.backends.cudnn.deterministic = True

# Cleaning Data

In [None]:
# Path of the labels CSV, relative to the notebook's working directory.
labels_filepath = 'data/10x_labels.csv'
# Raw label table; the cells below read its "Description", "Sample" and
# "Identification" columns.
labels=pd.read_csv(labels_filepath)

### Splitting Description column into shape and color

In [None]:
# Split "Description" at the FIRST space only (n=1): column 0 of `new` becomes
# the colour and column 1 keeps everything after it (the shape).
# The guard makes the cell safe to re-run: once "Description" has been dropped,
# running it again is a no-op instead of a KeyError.
if "Description" in labels.columns:
    new = labels["Description"].str.split(" ", n=1, expand=True)
    # Rebind instead of mutating in place so the step stays explicit.
    labels = labels.drop(columns=["Description"])

In [None]:
# Attach the two halves of the split description as new columns.
# `.values` hands over plain arrays, so the assignment is positional rather
# than aligned on the index.
labels['Color'] = new[0].values
labels['Shape'] = new[1].values

### Decomposing Sample Keywords

In [None]:
# Turn every "Sample" string into a list of at most two keywords by splitting
# at the first space, e.g. "a b c" -> ["a", "b c"].
# Only genuine strings are split: values that are already lists (the cell was
# run before) or missing are passed through unchanged, so re-running the cell
# cannot wipe out the column.
sample_names = labels["Sample"].map(
    lambda name: name.split(" ", 1) if isinstance(name, str) else name
)

# Kept under its original name in case other cells refer to it.
sample_names_frame = pd.DataFrame(sample_names)

labels['Sample'] = sample_names

In [None]:
# Preview the first 20 rows to eyeball the new Color/Shape/Sample columns.
labels.head(20)

### Changing Identification to boolean is or is not plastic

In [None]:
# List the distinct identification values; the plastics list in the next cell
# is matched against these exact strings.
labels['Identification'].unique()

In [None]:
# Identification values that count as plastic (matched exactly, case-sensitive).
plastics = ['polystyrene', 'polyethylene','polypropylene','Nylon','ink + plastic','PET','carbon fiber']

# The guard keeps the cell re-runnable: after the rename below there is no
# 'Identification' column left, and a second run must not raise KeyError.
if 'Identification' in labels.columns:
    # Vectorised membership test. This replaces a row-by-row loop that wrote
    # into a view of the column (chained assignment) and indexed by label
    # while counting by position.
    identification = labels['Identification'].isin(plastics)
    labels['Identification'] = identification
    labels = labels.rename(columns={'Identification': 'isPlastic'})

labels.head(10)

### Shape/isPlastic/Color Encoding

In [None]:
# Encode the boolean plastic flag as an integer (False -> 0, True -> 1).
labels['isPlastic'] = labels["isPlastic"].astype(int)

In [None]:
# Distinct colour values, i.e. the categories a one-hot encoding will produce.
labels['Color'].unique()

In [None]:
# One-hot encoder (scikit-learn defaults) intended for the Shape column.
shape_encoder = OneHotEncoder()

In [None]:
# Fit the encoder on the Shape column and display the dense one-hot matrix.
# The matrix is only shown, not stored in a variable or written back to `labels`.
shape_encoder.fit_transform(labels[['Shape']]).toarray()

In [None]:
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `get_feature_names_out`; use the new method when it exists and
# fall back to the old one on older installations.
if hasattr(shape_encoder, "get_feature_names_out"):
    shape_feature_names = shape_encoder.get_feature_names_out()
else:
    shape_feature_names = shape_encoder.get_feature_names()

shape_feature_names

In [None]:
# Separate encoder for the Color column.
color_encoder = OneHotEncoder()
# Fit the colour encoder itself. Fitting `shape_encoder` here would overwrite
# the shape categories it learned and leave `color_encoder` unfitted.
color_encoder.fit_transform(labels[['Color']]).toarray()

# Custom Dataset

In [None]:
class tenX_dataset(torch.utils.data.Dataset):
    """Dataset that pairs each row of a labels frame with an image on disk.

    Every item is a dict with the keys ``'image'``, ``'shape'``, ``'color'``
    and ``'plastic'``. The image file is located by searching the file names
    in ``image_dir`` for the keywords stored in the row's ``'Sample'`` entry.
    """

    def __init__(self, labels_frame, image_dir, transform = None):
        """Store the labels, index the image directory and keep the transform.

        Args:
            labels_frame: DataFrame with the columns 'Sample', 'Shape',
                'Color' and 'isPlastic'. 'Sample' holds a list of keywords
                (or a single string) identifying the image file.
            image_dir: directory that contains the image files.
            transform: optional callable applied to the image of every item.
                It receives the image exactly as ``skimage.io.imread``
                returns it, so pipelines built from PIL-based transforms
                need to convert it first. Defaults to no transform.
        """
        self.labels = labels_frame
        self.image_dir = image_dir
        # os.listdir gives no ordering guarantee; sorting makes both the
        # "first match wins" search and the fallback file reproducible.
        self.image_filenames = sorted(os.listdir(self.image_dir))
        # Keep the caller's transform (it used to be discarded).
        self.transform = transform

    def __len__(self):
        """Number of items, i.e. number of rows in the labels frame."""
        return len(self.labels)

    @staticmethod
    def _keyword_in_filename(keyword, filename):
        """Return True if `keyword` occurs in `filename` as a whole number-aware token.

        A keyword that starts or ends with a digit must not continue into
        further digits in the file name, so '252_1' matches '252_1.jpg' but
        not '252_10.jpg'. Keywords without digits at their edges behave like
        a plain substring test.
        """
        import re

        pattern = re.escape(keyword)
        if keyword[:1].isdigit():
            pattern = r'(?<!\d)' + pattern
        if keyword[-1:].isdigit():
            pattern = pattern + r'(?!\d)'
        return re.search(pattern, filename) is not None

    def _find_image_file(self, image_id):
        """Return the first file name containing every keyword of `image_id`, or None.

        `image_id` is normally a list of keywords; a bare string is treated
        as a single keyword.
        """
        keywords = [image_id] if isinstance(image_id, str) else list(image_id)
        if not keywords:
            return None

        for filename in self.image_filenames:
            if all(self._keyword_in_filename(word, filename) for word in keywords):
                return filename
        return None

    def __getitem__(self, idx):
        """Return the item at position `idx` as a dict.

        `idx` is positional (0 .. len-1) regardless of the frame's index
        labels. When no file matches the row's keywords, the first file of
        the directory is used as a stand-in (the previous best-effort
        behaviour) and a warning is emitted, because the returned image then
        does not belong to the returned labels.
        """
        import warnings

        image_id = self.labels['Sample'].iloc[idx]
        image_file = self._find_image_file(image_id)

        if image_file is None:
            warnings.warn(
                f'No image file found for sample {image_id!r}; '
                f'using {self.image_filenames[0]!r} as a placeholder',
                stacklevel=2,
            )
            image_file = self.image_filenames[0]

        image = skimage.io.imread(os.path.join(self.image_dir, image_file))

        # The transform edits the image only (resize, type conversion,
        # augmentation); the labels pass through untouched.
        if self.transform is not None:
            image = self.transform(image)

        return {'image': image,
                'shape': self.labels['Shape'].iloc[idx],
                'color': self.labels['Color'].iloc[idx],
                'plastic': self.labels['isPlastic'].iloc[idx]}
                
            
            

In [None]:
image_dir = 'data/images_10x'
labels_frame = labels

# Images are read with skimage, i.e. as NumPy arrays, while Resize works on
# PIL images or tensors; ToPILImage bridges the two.
transform = torchvision.transforms.Compose([
    torchvision.transforms.ToPILImage(),
    torchvision.transforms.Resize(256),
    torchvision.transforms.ToTensor(),
])

# The assignment below rebinds the name `tenX_dataset` from the class to an
# instance. Remember the class first so this cell can be run again without
# failing with "'tenX_dataset' object is not callable".
if isinstance(tenX_dataset, type):
    tenX_dataset_class = tenX_dataset

tenX_dataset = tenX_dataset_class(labels_frame, image_dir, transform = transform)

In [None]:
#tenX_dataset.image_filenames

In [None]:
# Collect the images of the first twelve items (indices 0 through 11), or of
# every item when the dataset is shorter than that.
samples = [
    tenX_dataset[index]['image']
    for index in range(min(len(tenX_dataset), 12))
]

In [None]:
# Show every collected image in its own figure; entries that are None are
# skipped and do not consume a figure number.
count = 0
for im in samples:
    if im is None:
        continue

    if torch.is_tensor(im):
        # ToTensor yields channel-first (C, H, W) tensors, whereas imshow
        # expects channel-last (H, W, C) arrays.
        im = im.detach().cpu().numpy()
        if im.ndim == 3:
            im = np.transpose(im, (1, 2, 0))

    plt.figure(count)
    plt.imshow(im)
    count += 1

In [None]:
# Dimensions of one collected image (index 8), as a quick sanity check.
np.shape(samples[8])

# Things to improve/fix
* If the data format is confirmed to be consistent, generalize the data-cleaning steps and put them into a function. The `tenX_dataset` class should then be passed the filename of the labels, and its `__init__` method should call the data-cleaning function.
* Verify the labels are coming through (i.e. train some sort of model on this data).
* Make sure the `None` images occur only because the file actually isn't in my folder of images.
* Sample 252_1 is displaying the image of 252_10 because of the way the code is written (file names are matched by substring).
* One hot encode categorical data

In [None]:
# Number of items in the dataset (one per row of the labels frame).
len(tenX_dataset)

In [None]:
def get_mean_of_dataset(image_dataset):
    """Placeholder for computing the mean of `image_dataset`.

    Not implemented yet: the argument is ignored and None is returned.
    """
    # TODO: implement the actual computation.
    return None
    

In [None]:
def get_std_of_dataset(image_dataset):
    """Placeholder for computing the standard deviation of `image_dataset`.

    Not implemented yet: the argument is ignored and None is returned.
    """
    # TODO: implement the actual computation.
    return None

In [None]:
def clean_labels(label_frame):
    """Placeholder for a function that cleans `label_frame`.

    Not implemented yet: the argument is ignored and None is returned.
    """
    # TODO: move the label-cleaning steps of this notebook in here.
    return None

In [None]:
class LeNet(nn.Module):
    """LeNet-style CNN: two 5x5 convolutions followed by three linear layers.

    Each convolution is followed by 2x2 max pooling and a ReLU, in that order.

    Args:
        output_dim: number of values produced per input image.
        in_channels: number of channels of the input images (default 1).
        input_size: spatial size of the input images, either one int for
            square images or a ``(height, width)`` pair (default 28). It
            determines the width of the first linear layer.

    Raises:
        ValueError: if `input_size` is too small to survive both
            convolution + pooling stages.
    """

    def __init__(self, output_dim, in_channels = 1, input_size = 28):
        super().__init__()

        if isinstance(input_size, int):
            height, width = input_size, input_size
        else:
            height, width = input_size

        self.conv1 = nn.Conv2d(in_channels = in_channels,
                               out_channels = 6,
                               kernel_size = 5)

        self.conv2 = nn.Conv2d(in_channels = 6,
                               out_channels = 16,
                               kernel_size = 5)

        feat_h = self._size_after_features(height)
        feat_w = self._size_after_features(width)
        if feat_h < 1 or feat_w < 1:
            raise ValueError(
                f'input_size {input_size!r} is too small for two 5x5 '
                f'convolutions with 2x2 pooling'
            )

        # For the default 28x28 input this is 16 * 4 * 4 = 256.
        self.fc_1 = nn.Linear(16 * feat_h * feat_w, 120)
        self.fc_2 = nn.Linear(120, 84)
        self.fc_3 = nn.Linear(84, output_dim)

    @staticmethod
    def _size_after_features(size):
        """Length of one spatial dimension after both conv + pool stages."""
        for _ in range(2):
            # An unpadded 5x5 convolution removes 4; 2x2 pooling halves (floor).
            size = (size - 4) // 2
        return size

    def forward(self, x):
        """Run the network.

        Args:
            x: batch of images, [batch size, in_channels, height, width].

        Returns:
            Tuple ``(output, h)``: `output` is [batch size, output dim] and
            `h` is the flattened feature map fed to the first linear layer.
        """
        # Referenced through `nn` because no `F` alias is imported.
        functional = nn.functional

        # Shapes in the comments below assume the default 1x28x28 input.
        x = self.conv1(x)                             # [batch, 6, 24, 24]
        x = functional.max_pool2d(x, kernel_size = 2) # [batch, 6, 12, 12]
        x = functional.relu(x)

        x = self.conv2(x)                             # [batch, 16, 8, 8]
        x = functional.max_pool2d(x, kernel_size = 2) # [batch, 16, 4, 4]
        x = functional.relu(x)

        # Flatten everything but the batch dimension: [batch, 16*4*4 = 256]
        x = x.view(x.shape[0], -1)
        h = x

        x = functional.relu(self.fc_1(x))             # [batch, 120]
        x = functional.relu(self.fc_2(x))             # [batch, 84]
        x = self.fc_3(x)                              # [batch, output dim]

        return x, h

In [None]:
# Cross-entropy over class indices needs one logit per class. With a single
# logit the softmax is always 1, so the loss is constantly zero and any target
# other than 0 is out of range.
# NOTE(review): 2 assumes the binary isPlastic label (0/1) is the training
# target - adjust if shape or colour is predicted instead.
OUTPUT_DIM = 2

model = LeNet(OUTPUT_DIM)

In [None]:
# Adam over all model parameters; no hyperparameters are overridden, so the
# optimizer's defaults apply.
optimizer = torch.optim.Adam(model.parameters())

In [None]:
# Cross-entropy loss with default settings.
# NOTE(review): it is fed raw model outputs and integer class targets - confirm
# that the model's output dimension equals the number of classes.
criterion = nn.CrossEntropyLoss()

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [None]:
# Place the model and the loss module on the selected device; the training
# and evaluation loops move each batch to the same device.
model = model.to(device)
criterion = criterion.to(device)

In [None]:
def calculate_accuracy(y_pred, y):
    """Fraction of rows in `y_pred` whose highest-scoring column equals `y`.

    Args:
        y_pred: scores of shape [batch size, classes].
        y: true class indices, one per row of `y_pred`.

    Returns:
        A zero-dimensional float tensor: correct predictions divided by the
        size of the first dimension of `y`.
    """
    predicted = torch.argmax(y_pred, dim = 1)
    n_correct = (predicted == y.view(predicted.shape)).sum()
    return n_correct.float() / y.shape[0]

In [None]:
def train(model, iterator, optimizer, criterion, device,
          input_key = 'image', target_key = 'plastic'):
    """Train `model` for one pass over `iterator`.

    Args:
        model: network whose forward pass returns ``(predictions, features)``.
        iterator: sized iterable of batches. A batch is either an
            ``(inputs, targets)`` pair or a dict, as produced when the
            dataset items are dicts.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, targets)``.
        device: device every batch is moved to.
        input_key: dict key holding the inputs of a dict batch.
        target_key: dict key holding the targets of a dict batch.

    Returns:
        Tuple ``(mean loss, mean accuracy)``, averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.train()

    for batch in iterator:
        # Dict batches come from datasets whose items are dicts; anything
        # else is unpacked as an (inputs, targets) pair, as before.
        if isinstance(batch, dict):
            x, y = batch[input_key], batch[target_key]
        else:
            x, y = batch

        x = x.to(device)
        y = y.to(device)

        optimizer.zero_grad()

        # The second output (intermediate features) is not needed here.
        y_pred, _ = model(x)

        loss = criterion(y_pred, y)
        acc = calculate_accuracy(y_pred, y)

        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
# Number of dataset items per batch.
BATCH_SIZE = 20

# Batches are drawn from the dataset in a freshly shuffled order each epoch.
# NOTE(review): default batching stacks the items of a batch, which presumably
# requires all images to share one shape - confirm against the transform used.
train_iterator = torch.utils.data.DataLoader(tenX_dataset, 
                                 shuffle = True, 
                                 batch_size = BATCH_SIZE)

In [None]:
def evaluate(model, iterator, criterion, device,
             input_key = 'image', target_key = 'plastic'):
    """Evaluate `model` on `iterator` without updating its parameters.

    Args:
        model: network whose forward pass returns ``(predictions, features)``.
        iterator: sized iterable of batches. A batch is either an
            ``(inputs, targets)`` pair or a dict, as produced when the
            dataset items are dicts.
        criterion: loss called as ``criterion(predictions, targets)``.
        device: device every batch is moved to.
        input_key: dict key holding the inputs of a dict batch.
        target_key: dict key holding the targets of a dict batch.

    Returns:
        Tuple ``(mean loss, mean accuracy)``, averaged over the batches.
    """
    epoch_loss = 0
    epoch_acc = 0

    model.eval()

    # No gradients are needed for evaluation.
    with torch.no_grad():

        for batch in iterator:
            # Dict batches come from datasets whose items are dicts; anything
            # else is unpacked as an (inputs, targets) pair, as before.
            if isinstance(batch, dict):
                x, y = batch[input_key], batch[target_key]
            else:
                x, y = batch

            x = x.to(device)
            y = y.to(device)

            y_pred, _ = model(x)

            loss = criterion(y_pred, y)
            acc = calculate_accuracy(y_pred, y)

            epoch_loss += loss.item()
            epoch_acc += acc.item()

    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [None]:
# Number of full passes over the training data.
EPOCHS = 20

# Reserved for tracking the best validation loss; nothing updates it yet
# because this loop runs no validation pass.
best_valid_loss = float('inf')

for epoch in range(EPOCHS):

    start_time = time.monotonic()

    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)

    end_time = time.monotonic()

    # Whole minutes and remaining whole seconds of the epoch, computed inline
    # because no `epoch_time` helper is defined in the cells above.
    epoch_mins, epoch_secs = divmod(int(end_time - start_time), 60)

    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    # Validation metrics are not printed: `valid_loss` / `valid_acc` are never
    # computed here, so printing them raised NameError.
    # TODO: add a validation split, call `evaluate` on it and report it here.