In [226]:
import copy
import random
import time
import os
import re

import torch
import torch.nn as nn
import torch.nn.functional 
import torch.optim 
import torch.utils.data

import torchvision.transforms
import torchvision.datasets

import skimage.io
import skimage.transform
import sklearn.preprocessing

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Functions

In [2]:
def set_seeds(seed):
    """sets seeds for several used packages"""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [3]:
def encode_column(column):
    """
    takes single columned Pandas DataFrame of categorical data and encodes it
    into array of class binarys
    """
    encoder = sklearn.preprocessing.OneHotEncoder()
    shape_arr = encoder.fit_transform(column).toarray().astype(int)
        
    return list(shape_arr)

In [196]:
def prep_data(labels):
    """
    Takes in raw labels dataframe and converts it into the format
    expected for tenX_dataset class
    """

    #Splitting description column into color and shape columns
    new = labels["Description"].str.split(" ", n=1,  expand=True)
    labels.drop(columns=['Description'], inplace=True)
    labels['Color'] = new[0].values
    labels['Shape'] = new[1].values
    
    #Decomposing sample keywords into seperate strings
    print(labels["Sample"])
    sample_names = labels["Sample"].str.split(" ", expand=False)
    labels['Sample'] = sample_names
    
    #Converting identification into boolean for is/is not plastic
    PLASTICS = ['polystyrene', 'polyethylene','polypropylene','Nylon','ink + plastic','PET','carbon fiber']
    identification = labels['Identification']
    
    for i in range(0,len(identification)):
        if identification[i] in PLASTICS:
            identification[i] = True
        else:
            identification[i] = False

    labels['Identification']=identification
    labels.rename(columns={'Identification': 'isPlastic'}, inplace=True)
    labels['isPlastic'] = labels["isPlastic"].astype(int)
    
    
    #Encoding shape and color data
    labels['Shape'] = encode_column(labels[['Shape']])
    labels['Color'] = encode_column(labels[['Color']])
    
    return labels

In [197]:
prep_data('./data/10x_labels.csv')

TypeError: string indices must be integers

In [198]:
labels = pd.read_csv('data/10x_labels.csv', sep='\t')

In [199]:
labels_2 = pd.read_csv('data/10x_labels_more.csv', sep='\t')

In [200]:
prep_data(labels_2)

0                     252_1
1                     252_2
2                     252_3
3                     252_4
4                     252_5
               ...         
375    OaklandBay_Oyster 3a
376    OaklandBay_Oyster 3b
377    OaklandBay_Oyster 3c
378    OaklandBay_Oyster 3d
379    OaklandBay_Oyster 3f
Name: Sample, Length: 380, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Unnamed: 0,Sample,Sample origin,Size (um),isPlastic,Color,Shape
0,[252_1],mussels,50.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,[252_2],mussels,50.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[252_3],mussels,25.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,[252_4],mussels,75.0,1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,[252_5],mussels,50.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...,...,...
375,"[OaklandBay_Oyster, 3a]",,550.0,0,"[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
376,"[OaklandBay_Oyster, 3b]",size wrong,650.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
377,"[OaklandBay_Oyster, 3c]",,800.0,0,"[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
378,"[OaklandBay_Oyster, 3d]",,125.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [202]:
labels_2.tail(n=1)

Unnamed: 0,Sample,Sample origin,Size (um),isPlastic,Color,Shape
379,"[OaklandBay_Oyster, 3f]",,425.0,0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [205]:
labels.head()

Unnamed: 0,Sample,file,Sample origin,Size (um),isPlastic,Color,Shape
0,[252_1],OaklandBay_Oyster 3f - 10x.bmp,mussels,50.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,[252_2],OaklandBay_Oyster 3f - 10x.bmp,mussels,50.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[252_3],OaklandBay_Oyster 3f - 10x.bmp,mussels,25.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,[252_4],OaklandBay_Oyster 3f - 10x.bmp,mussels,75.0,1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,[252_5],OaklandBay_Oyster 3f - 10x.bmp,mussels,50.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [206]:
prep_data(labels)

KeyError: 'Description'

In [207]:
def get_filenames(path):
    filenames = []
    for root, dirs, files in os.walk(path):
        for filename in files:
            filenames.append(filename)
    return filenames

In [208]:
image_filenames = get_filenames('./data/images_10x/')

In [269]:
labels = prep_data(pd.read_csv('data/10x_labels_more.csv', sep='\t'))

labels.insert(loc=1, column='file', value=None)

image_filenames = get_filenames('./data/images_10x/')
for index, row in labels.iterrows():
    #print(index, row)
    sample = row['Sample']
    for fname in image_filenames:
        str_id = '^' + ' '.join(row['Sample']) + ' .*'
        result = re.search(str_id, fname)
        if result:
            print('str_id = {}, fname = {}'.format(str_id, fname))
            image_file = result.group()
            assert(os.path.exists('./data/images_10x/' + image_file))
            break
    else:
        image_file = 'None'
        print('no file with ID "{}" exists'.format(row['Sample']))
    
    labels.loc[index, 'file'] = image_file 
 
    
# matches result it should be giving
# make sure it can accept multiple sample ids and be ok
# test the assertion works
# test path with two correct file names and bad examples too
# sample id without ocrresponding filename
# 

#     if len(row['Sample']) == 1:
#         for fname in image_filenames:
#             if ' '.join(row['Sample']) + ' ' in fname:
#                 image_file = fname
#                 break
#     else:
#         for fname in image_filenames:
#             if ' '.join(row['Sample']) + ' ' in fname:
#                 image_file = fname
#                 break
                
           
#     #image_id = row['Sample'][idx]


labels = prep_data(pd.read_csv('data/10x_labels_more.csv', sep='\t'))

def add_filenames(labels, image_root):
    """
    Replaces sample column of labels with the actual filename so that the dataset class doesn't have to do that work.
    """
    image_filenames = get_filenames(image_root)
    
    for index, row in labels.iterrows():
        sample = row['Sample']
        for fname in image_filenames:
            str_id = '^' + ' '.join(row['Sample']) + ' .*'
            result = re.search(str_id, fname)
            if result:
                print('str_id = {}, fname = {}'.format(str_id, fname))
                image_file = result.group()
                assert(os.path.exists('./data/images_10x/' + image_file))
                break
        else:
            image_file = 'None'
            print('no file with ID "{}" exists'.format(row['Sample']))
        labels.loc[index, 'file'] = image_file
    return


0                     252_1
1                     252_2
2                     252_3
3                     252_4
4                     252_5
               ...         
375    OaklandBay_Oyster 3a
376    OaklandBay_Oyster 3b
377    OaklandBay_Oyster 3c
378    OaklandBay_Oyster 3d
379    OaklandBay_Oyster 3f
Name: Sample, Length: 380, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


str_id = ^252_1 .*, fname = 252_1 - 10x.bmp
str_id = ^252_2 .*, fname = 252_2 - 10x.bmp
str_id = ^252_3 .*, fname = 252_3 - 10x.bmp
str_id = ^252_4 .*, fname = 252_4 - 10x.bmp
str_id = ^252_5 .*, fname = 252_5 - 10x.bmp
str_id = ^252_6 .*, fname = 252_6 - 10x.bmp
no file with ID "['252_7']" exists
str_id = ^252_8 .*, fname = 252_8 - 10x.bmp
no file with ID "['252_9']" exists
str_id = ^252_10 .*, fname = 252_10 - 10x.bmp
str_id = ^20200824 250_1 .*, fname = 20200824 250_1 - 10x.bmp
str_id = ^20200824 252_1 .*, fname = 20200824 252_1 - 10x.bmp
str_id = ^20200824 272_2 .*, fname = 20200824 272_2 - 10x.bmp
str_id = ^20200819 93_1 .*, fname = 20200819 93_1 - 10x.bmp
str_id = ^20200819 93_2 .*, fname = 20200819 93_2 - 10x.bmp
str_id = ^20200819 93_3 .*, fname = 20200819 93_3 - 10x.bmp
str_id = ^20200819 93_4 .*, fname = 20200819 93_4 - 10x.bmp
str_id = ^20200819 93_5 .*, fname = 20200819 93_5 - 10x.bmp
str_id = ^20200819 93_6 .*, fname = 20200819 93_6 - 10x.bmp
str_id = ^20200819 135_1 .*, f

no file with ID "['20190112', 'Heritage', 'Control', '19-21a']" exists
no file with ID "['20190112', 'Heritage', 'Control', '19-21b']" exists
no file with ID "['20190112', 'Heritage', 'Control', '19-21c']" exists
str_id = ^20190112 Heritage Oyster 19a .*, fname = 20190112 Heritage Oyster 19a - 10x.bmp
str_id = ^20190112 Heritage Oyster 19b .*, fname = 20190112 Heritage Oyster 19b - 10x.bmp
str_id = ^20190112 Heritage Oyster 19c .*, fname = 20190112 Heritage Oyster 19c - 10x.bmp
str_id = ^20190112 Heritage Oyster 19d .*, fname = 20190112 Heritage Oyster 19d - 10x.bmp
str_id = ^20190112 Heritage Oyster 19e .*, fname = 20190112 Heritage Oyster 19e - 10x.bmp
str_id = ^20190112 Heritage Oyster 19f .*, fname = 20190112 Heritage Oyster 19f - 10x.bmp
str_id = ^20190112 Heritage Oyster 19g .*, fname = 20190112 Heritage Oyster 19g - 10x.bmp
str_id = ^20190112 Heritage Oyster 20a .*, fname = 20190112 Heritage Oyster 20a - 10x.bmp
str_id = ^20190112 Heritage Oyster 20b .*, fname = 20190112 Heritag

str_id = ^20190113 Penrose Pt Oyster 12c .*, fname = 20190113 Penrose Pt Oyster 12c - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 12d .*, fname = 20190113 Penrose Pt Oyster 12d - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 12e .*, fname = 20190113 Penrose Pt Oyster 12e - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 19a .*, fname = 20190113 Penrose Pt Oyster 19a - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 19b .*, fname = 20190113 Penrose Pt Oyster 19b - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 19c .*, fname = 20190113 Penrose Pt Oyster 19c - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 19d .*, fname = 20190113 Penrose Pt Oyster 19d - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 20a .*, fname = 20190113 Penrose Pt Oyster 20a - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 20b .*, fname = 20190113 Penrose Pt Oyster 20b - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 20c .*, fname = 20190113 Penrose Pt Oyster 20c - 10x.bmp
str_id = ^20190113 Penrose Pt Oyster 20d .*, fname = 2019011

str_id = ^20190114 Jacoby Oyster 31d .*, fname = 20190114 Jacoby Oyster 31d - 10x.bmp
str_id = ^20190114 NBay Control 13-15a .*, fname = 20190114 NBay Control 13-15a - 10x.bmp
str_id = ^20190114 NBay Control 13-15b .*, fname = 20190114 NBay Control 13-15b - 10x.bmp
str_id = ^20190114 NBay Control 19-21a .*, fname = 20190114 NBay Control 19-21a - 10x.bmp
str_id = ^20190114 NBay Control 19-21b .*, fname = 20190114 NBay Control 19-21b - 10x.bmp
str_id = ^20190114 NBay Oyster 12a .*, fname = 20190114 NBay Oyster 12a - 10x.bmp
str_id = ^20190114 NBay Oyster 12b .*, fname = 20190114 NBay Oyster 12b - 10x.bmp
str_id = ^20190114 NBay Oyster 12c .*, fname = 20190114 NBay Oyster 12c - 50x.bmp
str_id = ^20190114 NBay Oyster 12e .*, fname = 20190114 NBay Oyster 12e - 10x.bmp
str_id = ^20190114 NBay Oyster 12f .*, fname = 20190114 NBay Oyster 12f - 10x.bmp
str_id = ^20190114 NBay Oyster 13a .*, fname = 20190114 NBay Oyster 13a - 10x.bmp
str_id = ^20190114 NBay Oyster 13b .*, fname = 20190114 NBay O

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [287]:
def add_filenames(labels, image_root):
    """
    Replaces sample column of labels with the actual filename so that the dataset class doesn't have to do that work.
    """
    image_filenames = get_filenames(image_root)
    labels.insert(loc=1, column='file', value=None)
    for index, row in labels.iterrows():
        sample = row['Sample']
        for fname in image_filenames:
            str_id = '^' + ' '.join(row['Sample']) + ' .*'
            result = re.search(str_id, fname)
            if result:
                image_file = result.group()
                assert(os.path.exists('./data/images_10x/' + image_file))
                break
        else:
            image_file = 'None'
        labels.loc[index, 'file'] = image_file
    return labels

In [288]:
labels = prep_data(pd.read_csv('data/10x_labels_more.csv', sep='\t'))

0                     252_1
1                     252_2
2                     252_3
3                     252_4
4                     252_5
               ...         
375    OaklandBay_Oyster 3a
376    OaklandBay_Oyster 3b
377    OaklandBay_Oyster 3c
378    OaklandBay_Oyster 3d
379    OaklandBay_Oyster 3f
Name: Sample, Length: 380, dtype: object


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [289]:
labeled = add_filenames(labels, './data/images_10x/')

In [292]:
labeled.head(n=50)

Unnamed: 0,Sample,file,Sample origin,Size (um),isPlastic,Color,Shape
0,[252_1],252_1 - 10x.bmp,mussels,50.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,[252_2],252_2 - 10x.bmp,mussels,50.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,[252_3],252_3 - 10x.bmp,mussels,25.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,[252_4],252_4 - 10x.bmp,mussels,75.0,1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,[252_5],252_5 - 10x.bmp,mussels,50.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
5,[252_6],252_6 - 10x.bmp,mussels,50.0,1,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
6,[252_7],,mussels,50.0,1,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
7,[252_8],252_8 - 10x.bmp,mussels,30.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,[252_9],,mussels,100.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
9,[252_10],252_10 - 10x.bmp,mussels,50.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [159]:
testfile = '252_10 - 10x.bmp'
testlabel = '252_10'

In [162]:
testlabel + ' ' in testfile

True

In [164]:
testlabel, testcase = ['20200824', '250_1'], '20200824 250_1 - 10x.bmp'

In [216]:
' '.join(['hi','there']) #in testcase

'hi there'

In [243]:
re.search('hi.*', 'hi there').group()

'hi there'

In [251]:
labels.tail(n=30)

Unnamed: 0,Sample,file,Sample origin,Size (um),isPlastic,Color,Shape
350,"[20190114, NBay, Oyster, 15f]",OaklandBay_Oyster 3f - 10x.bmp,oysters,150.0,0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
351,"[20190114, NBay, Oyster, 19a]",OaklandBay_Oyster 3f - 10x.bmp,oysters,150.0,0,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
352,"[20190114, NBay, Oyster, 19b]",OaklandBay_Oyster 3f - 10x.bmp,oysters,125.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
353,"[20190114, NBay, Oyster, 20a]",OaklandBay_Oyster 3f - 10x.bmp,oysters,3000.0,0,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
354,"[20190114, NBay, Oyster, 20b]",OaklandBay_Oyster 3f - 10x.bmp,oysters,60.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
355,"[20190114, NBay, Oyster, 20c]",OaklandBay_Oyster 3f - 10x.bmp,oysters,525.0,0,"[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
356,"[20190114, NBay, Oyster, 20d]",OaklandBay_Oyster 3f - 10x.bmp,oysters,300.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
357,"[20190114, NBay, Oyster, 20e]",OaklandBay_Oyster 3f - 10x.bmp,oysters,100.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
358,"[20190114, NBay, Oyster, 21a]",OaklandBay_Oyster 3f - 10x.bmp,oysters,30.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
359,"[20190114, NBay, Oyster, 21b]",OaklandBay_Oyster 3f - 10x.bmp,oysters,60.0,0,"[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Custom Dataset

In [89]:
class tenX_dataset(torch.utils.data.Dataset):
    """
    Class inherited from torch Dataset. Required methods are, init,
    len, and getitem.
    """
    def __init__(self, labels_frame, image_dir, transform):
        """
        initializes an instance of the class. Here we store 4 variables
        in the class. Calling init just looks like dataset = tenX_dataset(lables, 'image_folder', transform).
        
        labels: altered version of csv file
        image_dir: The file path to the folder the images are in
        image_filenames: A list of all the image file names in the image folder
        transform: A pytorch object. Works like a function. You call transform(x) and it performs
                    a series of operations on x
        """
        self.labels = labels_frame
        self.image_dir = image_dir
        self.image_filenames = os.listdir(self.image_dir)
        self.transform = transform
        

    def __len__(self):
        """Returns the length of the dataset"""
        return len(self.labels)
    
    
    def __getitem__(self, idx):
        """
        Returns a dictionary containing image and image data. Right now
        it looks like: 
        sample = {'image': image, 'plastic': [0], 'shape':[0,0,0,0,0], 'color':[0,0,0,0,0]}
        """
        image_id = self.labels['Sample'][idx]
        image_file = None
        image = None
        
        #Searching through image folder for the filename we want.
        #Has unsolved bug. Ex. when looking for image 252_1, it will display 252_10 instead
        for filename in self.image_filenames:
            if len(image_id) == 1:
                if image_id[0] in filename:
                    image_file = filename
                    break
            else:
                if image_id[0] in filename and image_id[1] in filename:
                    image_file = filename
                    
                    break
                
        #Right now if the image file is not found I just use the first image in the folder
        #To refine this more, we should take this step out of the dataset class and into
        #The training and testing loop. Where we will check if the sample['image'] == None
        #For just testing the code this works for now
        if image_file:
            image_filepath = os.path.join(self.image_dir, image_file)
            image = skimage.io.imread(image_filepath)
            
            
            if self.transform is not None:
                image = self.transform(image)
            
        print((image_id, image_file))
        sample = {'image': image,
                  'shape': self.labels['Shape'][idx],
                  'color': self.labels['Color'][idx],
                  'plastic': self.labels['isPlastic'][idx]}
  
        return sample

### Plotting first 20 images of dataset. Obviously getting quite a few duplicates

In [91]:
labels_filepath = 'data/10x_labels.csv'
image_dir = 'data/images_10x'
labels = prep_data(pd.read_csv(labels_filepath, sep='\t'))
tenX = tenX_dataset(labels, image_dir, None)


for i in range(len(tenX)):
    sample = tenX[i]
    #plt.figure(i)
    #if sample['image'] is not None:
        #plt.imshow(sample['image'])
    #if i>50:
        #break

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


(['252_1'], '252_10 - 10x.bmp')
(['252_2'], '252_2 - 10x.bmp')
(['252_3'], '252_3 - 10x.bmp')
(['252_4'], '252_4 - 10x.bmp')
(['252_5'], '252_5 - 10x.bmp')
(['252_6'], '252_6 - 10x.bmp')
(['252_8'], '252_8 - 10x.bmp')
(['252_10'], '252_10 - 10x.bmp')
(['20200824', '250_1'], '20200824 250_1 - 10x.bmp')
(['20200824', '252_1'], '20200824 252_1 - 10x.bmp')
(['20200824', '272_2'], '20200824 272_2 - 10x.bmp')
(['20200818', '72_1'], '20200818 72_1 - 10x.bmp')
(['20200818', '72_2'], '20200818 72_2 - 10x.bmp')
(['20200818', '72_3'], '20200818 72_3 - 10x.bmp')
(['20200818', '72_4'], '20200818 72_4 - 10x.bmp')
(['20200818', '73_1'], '20200818 73_1 - 10x.bmp')
(['20200818', '74_1'], '20200818 74_1 - 10x.bmp')
(['20200818', '74_2'], '20200818 74_2 - 10x.bmp')
(['20200817', '73_1'], '20200817 73_1 - 10x.bmp')
(['20200817', '73_2'], '20200817 73_2 - 10x.bmp')
(['20200817', '110_1'], '20200817 110_1 - 10x.bmp')
(['20200817', '110_3'], '20200817 110_3 - 10x.bmp')
(['20200817', '110_4'], '20200817 110_4

# Things to improve/fix
* if data is for sure consistent. Take datacleaning steps, generalize, and put into a function. Then 10x_dataset class with get passed in the filename of the labels and in the init method the dataclean function should be called.
* Make sure the nonetypes are because the file actually isn't in my folder of images
* 252_1 is displayign 252_10 because of way code is written.- fixed in new funcion 
* Code for normalizing image data
* Image augmentation. Probably want to cut off some of the edges to get rid of number stuff and decrease extraneous information. The think we actually care about is only occupying like 5-10% of the image.

# Start of me trying to plug into cnn

Most of the code came from this tutorial: https://github.com/bentrevett/pytorch-image-classification/blob/master/2_lenet.ipynb

I was just trying to get this to work so I won't understand it as much

In [92]:
image_dir = 'data/images_10x'
labels_frame = labels

#This transform just resizes the images to 3,480,752. So 3 for red green blue then height of 480
#and width of 752. 
transform = torchvision.transforms.Compose([
                            torchvision.transforms.ToPILImage(),
                            torchvision.transforms.Resize((480, 752)),
                            torchvision.transforms.ToTensor()
                                      ])


train_data = tenX_dataset(labels_frame, image_dir, transform = transform)

#### Splitting into train/validation set

In [93]:
VALID_RATIO = 0.9

n_train_examples = int(len(train_data) * VALID_RATIO)
n_valid_examples = len(train_data) - n_train_examples

train_data, valid_data = torch.utils.data.random_split(train_data, 
                                           [n_train_examples, n_valid_examples])

In [94]:
print(f'Number of training examples: {len(train_data)}')
print(f'Number of validation examples: {len(valid_data)}')

Number of training examples: 23
Number of validation examples: 3


#### Declaring iterator. The thing that will loop through our dataset.

In [95]:
BATCH_SIZE = 5

train_iterator = torch.utils.data.DataLoader(train_data, 
                                 shuffle = True, 
                                 batch_size = BATCH_SIZE)

valid_iterator = torch.utils.data.DataLoader(valid_data, 
                                 batch_size = BATCH_SIZE)

#### The CNN archetecture

In [96]:
class LeNet(nn.Module):
    def __init__(self, output_dim):
        """
        Initializes CNN. Here we just define layer shapes that we call in the forward func
        """
        super().__init__()

        #Convulution layer 1. 
        #3 input channels (for three images Red, Green, Blue)
        #6 output channels (I THINK this means we are applying two different filters to each image
        #3 images, two filters each, we end up with 6 'images')
        #kernel size is I THINK telling the filters took filter each set of 5 pixels into one.
        #So are images will shrink a little as the edges get cut off
        self.conv1 = nn.Conv2d(in_channels = 3, 
                               out_channels = 6, 
                               kernel_size = 5)
        
        #Convultion layer 2. See above
        self.conv2 = nn.Conv2d(in_channels = 6, 
                               out_channels = 12, 
                               kernel_size = 5)
        
        #Linear layers. These probably arent complicated but I don't follow haha
        #I think it turning the 259740 pixel values into 6 values. Then the second layers
        #Turns the 6 into a different 6? and then 6 into 2. I'm not sure why 2 and not 1.
        #Seeing as the output should be a number between 0-1. Closer to 0 = not plastic,
        #closer to 1 = plastic. But I got errors about not having enough classes when
        #I only had 1 output neuron.
        #TBH these linear layers I just changed based on the error messages I got.
        self.fc_1 = nn.Linear(259740, 6)
        self.fc_2 = nn.Linear(6, 6)
        self.fc_3 = nn.Linear(6, 2)

    def forward(self, x):
        """
        Function that performs all the neural network forward calculation i.e.
        takes image data from the input of the neural network to the output
        """

        
        x = self.conv1(x)
    
        #Gonna have to look at tutorial link.
        x = nn.functional.max_pool2d(x, kernel_size = 2)
        
        x = nn.functional.relu(x)
        
        x = self.conv2(x)
                
        x = nn.functional.max_pool2d(x, kernel_size = 2)
        
        x = nn.functional.relu(x)
        
        x = x.view(x.shape[0], -1)
                
        h = x
        
        x = self.fc_1(x)
                
        x = nn.functional.relu(x)

        x = self.fc_2(x)
                
        x = nn.functional.relu(x)

        x = self.fc_3(x)
        
        return x, h

In [97]:
#Instancing model, loss criteria, device to perform calculations on, and optimizer.
OUTPUT_DIM = 1
model = LeNet(OUTPUT_DIM)


criterion = nn.CrossEntropyLoss()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
optimizer = torch.optim.Adam(model.parameters())

In [98]:
#Telling the model and loss function to do math on whatever device is
model = model.to(device)
criterion = criterion.to(device)

In [99]:
def calculate_accuracy(y_pred, y):
    """
    Function calculate accuracy. See tutorial, may not
    even be accurate for our model but it at least runs
    """
    top_pred = y_pred.argmax(1, keepdim = True)
    correct = top_pred.eq(y.view_as(top_pred)).sum()
    acc = correct.float() / y.shape[0]
    return acc

In [100]:
def train(model, iterator, optimizer, criterion, device):
    """
    Training loop. Takes data through NN calculates loss and adjusts NN. Repeat
    """
    epoch_loss = 0
    epoch_acc = 0
    
    model.train()
    
    for sample in iterator:
        image = sample['image'].to(device)
        isPlastic = sample['plastic'].to(device)
    
        optimizer.zero_grad()      
        y_pred, what = model(image)

        loss = criterion(y_pred, isPlastic)
        acc = calculate_accuracy(y_pred, isPlastic)
        loss.backward()    
        optimizer.step()
        
        epoch_loss += loss.item()
        epoch_acc += acc.item()
        
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

In [104]:
#Here the model is actually trained
EPOCHS = 20

best_valid_loss = float('inf')

for epoch in range(EPOCHS):
    
    start_time = time.monotonic()
    
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    
    end_time = time.monotonic()

    #epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    #print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    #print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    #print(f'\t Val. Loss: {valid_loss:.3f} |  Val. Acc: {valid_acc*100:.2f}%')

(['252_3'], '252_3 - 10x.bmp')
(['252_6'], '252_6 - 10x.bmp')
(['20200818', '72_2'], '20200818 72_2 - 10x.bmp')
(['252_8'], '252_8 - 10x.bmp')
(['20200824', '250_1'], '20200824 250_1 - 10x.bmp')
(['20200818', '74_1'], '20200818 74_1 - 10x.bmp')
(['252_2'], '252_2 - 10x.bmp')
(['20200817', '117_1'], '20200817 117_1 - 10x.bmp')
(['20200818', '72_4'], '20200818 72_4 - 10x.bmp')
(['20200818', '72_3'], '20200818 72_3 - 10x.bmp')
(['252_1'], '252_10 - 10x.bmp')
(['20200818', '74_2'], '20200818 74_2 - 10x.bmp')
(['20200817', '110_3'], '20200817 110_3 - 10x.bmp')
(['20200818', '72_1'], '20200818 72_1 - 10x.bmp')
(['20200817', '117_2'], '20200817 117_2 - 10x.bmp')
(['252_5'], '252_5 - 10x.bmp')
(['20200817', '110_5'], '20200817 110_5 - 10x.bmp')
(['20200817', '73_2'], '20200817 73_2 - 10x.bmp')
(['252_4'], '252_4 - 10x.bmp')
(['252_10'], '252_10 - 10x.bmp')
(['20200817', '73_1'], '20200817 73_1 - 10x.bmp')
(['20200818', '73_1'], '20200818 73_1 - 10x.bmp')
(['20200824', '272_2'], '20200824 272_2

(['252_6'], '252_6 - 10x.bmp')
(['20200817', '110_3'], '20200817 110_3 - 10x.bmp')
(['252_4'], '252_4 - 10x.bmp')
(['20200817', '73_1'], '20200817 73_1 - 10x.bmp')
(['20200818', '72_1'], '20200818 72_1 - 10x.bmp')
(['252_1'], '252_10 - 10x.bmp')
(['252_3'], '252_3 - 10x.bmp')
(['252_5'], '252_5 - 10x.bmp')
(['20200818', '72_2'], '20200818 72_2 - 10x.bmp')
(['20200824', '272_2'], '20200824 272_2 - 10x.bmp')
(['20200817', '110_5'], '20200817 110_5 - 10x.bmp')
(['20200817', '117_1'], '20200817 117_1 - 10x.bmp')
(['20200817', '117_2'], '20200817 117_2 - 10x.bmp')
(['252_10'], '252_10 - 10x.bmp')
(['20200818', '74_1'], '20200818 74_1 - 10x.bmp')
(['20200817', '73_2'], '20200817 73_2 - 10x.bmp')
(['252_8'], '252_8 - 10x.bmp')
(['20200824', '250_1'], '20200824 250_1 - 10x.bmp')
(['20200818', '74_1'], '20200818 74_1 - 10x.bmp')
(['20200817', '117_1'], '20200817 117_1 - 10x.bmp')
(['20200817', '73_2'], '20200817 73_2 - 10x.bmp')
(['20200817', '110_3'], '20200817 110_3 - 10x.bmp')
(['252_6'], '2

(['20200818', '74_1'], '20200818 74_1 - 10x.bmp')
(['20200818', '74_2'], '20200818 74_2 - 10x.bmp')
(['20200817', '110_5'], '20200817 110_5 - 10x.bmp')
(['20200817', '110_3'], '20200817 110_3 - 10x.bmp')
(['252_4'], '252_4 - 10x.bmp')
(['252_8'], '252_8 - 10x.bmp')
(['252_1'], '252_10 - 10x.bmp')
(['20200818', '72_4'], '20200818 72_4 - 10x.bmp')
(['252_2'], '252_2 - 10x.bmp')
(['20200817', '117_1'], '20200817 117_1 - 10x.bmp')
(['20200818', '72_3'], '20200818 72_3 - 10x.bmp')
(['20200818', '72_2'], '20200818 72_2 - 10x.bmp')
(['20200818', '73_1'], '20200818 73_1 - 10x.bmp')
(['20200818', '74_1'], '20200818 74_1 - 10x.bmp')
(['252_1'], '252_10 - 10x.bmp')
(['20200817', '117_2'], '20200817 117_2 - 10x.bmp')
(['252_4'], '252_4 - 10x.bmp')
(['20200824', '272_2'], '20200824 272_2 - 10x.bmp')
(['20200817', '110_5'], '20200817 110_5 - 10x.bmp')
(['20200818', '72_1'], '20200818 72_1 - 10x.bmp')
(['252_10'], '252_10 - 10x.bmp')
(['20200818', '72_4'], '20200818 72_4 - 10x.bmp')
(['20200817', '11