In [1]:
import copy
import random
import time
import os

import torch
import torch.nn
import torch.nn.functional 
import torch.optim 
import torch.utils.data

import torchvision.transforms
import torchvision.datasets
import torch.utils.data 
import skimage.io
import skimage.transform

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [2]:
SEED = 1234

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# Cleaning Data

In [3]:
labels_filepath = 'data/10x_labels.csv'
labels=pd.read_csv(labels_filepath)

In [4]:
labels.head(10)

Unnamed: 0,Sample,Sample origin,Description,Size (um),Identification
0,252_1,mussels,clear irregular,50,filter paper
1,252_2,mussels,opaque fiber,50,non-microplastic
2,252_3,mussels,opaque spheroid,25,non-microplastic
3,252_4,mussels,opaque fragment,75,polystyrene
4,252_5,mussels,opaque spheroid,50,non-microplastic
5,252_6,mussels,white spheroid,50,Nylon
6,252_7,mussels,opaque irregular,50,polystyrene
7,252_8,mussels,opaque irregular,30,filter paper
8,252_9,mussels,clear fragment,100,quartz
9,252_10,mussels,clear fragment,50,quartz


In [5]:
new= labels["Description"].str.split(" ", n = 1, expand = True)
labels.head(10)
new.head(10)
labels.drop(columns=['Description'],inplace=True)

In [6]:
labels['Color'] = new[0].values
labels['Shape'] = new[1].values

In [7]:
labels.head(10)

Unnamed: 0,Sample,Sample origin,Size (um),Identification,Color,Shape
0,252_1,mussels,50,filter paper,clear,irregular
1,252_2,mussels,50,non-microplastic,opaque,fiber
2,252_3,mussels,25,non-microplastic,opaque,spheroid
3,252_4,mussels,75,polystyrene,opaque,fragment
4,252_5,mussels,50,non-microplastic,opaque,spheroid
5,252_6,mussels,50,Nylon,white,spheroid
6,252_7,mussels,50,polystyrene,opaque,irregular
7,252_8,mussels,30,filter paper,opaque,irregular
8,252_9,mussels,100,quartz,clear,fragment
9,252_10,mussels,50,quartz,clear,fragment


In [8]:
image_dir = 'data/images_10x'
#os.listdir(image_dir)

In [9]:
sample_names = labels["Sample"].str.split(" ", n = 1, expand = False)

sample_names_frame = pd.DataFrame(sample_names)

        
        
labels['Sample'] = sample_names_frame

In [10]:
labels.head(20)

Unnamed: 0,Sample,Sample origin,Size (um),Identification,Color,Shape
0,[252_1],mussels,50,filter paper,clear,irregular
1,[252_2],mussels,50,non-microplastic,opaque,fiber
2,[252_3],mussels,25,non-microplastic,opaque,spheroid
3,[252_4],mussels,75,polystyrene,opaque,fragment
4,[252_5],mussels,50,non-microplastic,opaque,spheroid
5,[252_6],mussels,50,Nylon,white,spheroid
6,[252_7],mussels,50,polystyrene,opaque,irregular
7,[252_8],mussels,30,filter paper,opaque,irregular
8,[252_9],mussels,100,quartz,clear,fragment
9,[252_10],mussels,50,quartz,clear,fragment


In [11]:
labels['Identification'].unique()

array(['filter paper ', 'non-microplastic', 'polystyrene', 'Nylon',
       'filter paper', 'quartz', 'carbon fiber', 'orthoclase',
       'polypropylene', 'ink + plastic', 'PET', 'polyethylene',
       'fluorescence'], dtype=object)

In [12]:
plastics = ['polystyrene', 'polyethylene','polypropylene','Nylon','ink + plastic','PET','carbon fiber']
identification = labels['Identification']

for i in range(0,len(identification)):
    if identification[i] in plastics:
        identification[i] = True
    else:
        identification[i] = False
    
identification.head
    

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


<bound method NDFrame.head of 0     False
1     False
2     False
3      True
4     False
      ...  
58    False
59    False
60    False
61    False
62    False
Name: Identification, Length: 63, dtype: object>

In [13]:
labels['Identification']=identification
labels.rename(columns={'Identification': 'isPlastic'}, inplace=True)
labels.head(10)

Unnamed: 0,Sample,Sample origin,Size (um),isPlastic,Color,Shape
0,[252_1],mussels,50,False,clear,irregular
1,[252_2],mussels,50,False,opaque,fiber
2,[252_3],mussels,25,False,opaque,spheroid
3,[252_4],mussels,75,True,opaque,fragment
4,[252_5],mussels,50,False,opaque,spheroid
5,[252_6],mussels,50,True,white,spheroid
6,[252_7],mussels,50,True,opaque,irregular
7,[252_8],mussels,30,False,opaque,irregular
8,[252_9],mussels,100,False,clear,fragment
9,[252_10],mussels,50,False,clear,fragment


# Custom Dataset

In [14]:
class tenX_dataset(torch.utils.data.Dataset):
    """"""
    
    #Initializes dataset. Is only used once during the creating on a 'tenX_dataset' class
    #transform is an optional parameter, it defaults to none if nothing is passed into the class
    def __init__(self, labels_frame, image_dir, transform = None):
        'Initialization'
        self.labels = labels_frame
        self.image_dir = image_dir
        self.image_filenames = os.listdir(self.image_dir)
        self.transform = None
        
    #Length of dataset
    def __len__(self):
        return len(self.labels)
    
    #Return an single image with labels based on given index
    def __getitem__(self, idx):
        image_id = self.labels['Sample'][idx]
        image_file = None
        image = None
        
        for filename in self.image_filenames:
            if image_id in filename:
                print(image_id)
                image_file = filename
                break
                
        if not image_file:
            #raise Exception('Could not find image file')
            sample = {'image': image,
                'shape': self.labels['Shape'][idx],
                'color': self.labels['Color'][idx],
                'plastic': self.labels['isPlastic'][idx]}
            
            return sample
        
        
        image_filepath = os.path.join(self.image_dir, image_file)
        image = skimage.io.imread(image_filepath)
        
        sample = {'image': image,
                'shape': self.labels['Shape'][idx],
                'color': self.labels['Color'][idx],
                'plastic': self.labels['isPlastic'][idx]}
            
        #This 'transform' will be where we specify how we edit the images (resize, 
        #change file type, data augmentation). It is defined outside this classs.
        
        
        if self.transform:
            sample = self.transform(sample)
        

        return sample
                
            
            

In [16]:
image_dir = 'data/images_10x'
labels_frame = labels
transform = None

tenX_dataset = tenX_dataset(labels_frame, image_dir, transform = transform)

In [None]:
#tenX_dataset.image_filenames

In [17]:
samples=[]
for i in range(len(tenX_dataset)):
    sample = tenX_dataset[i]
    samples.append(sample['image'])
    if i>10:
        break

#samples
samples

TypeError: 'in <string>' requires string as left operand, not list

In [None]:
count = 0
for im in samples:
    if type(im) != type(None):
        plt.figure(count)
        plt.imshow(im)
        count+=1

In [None]:
type(samples[8])

# Things to improve/fix
* if data is for sure consistent. Take datacleaning steps, generalize, and put into a function. Then 10x_dataset class with get passed in the filename of the labels and in the init method the dataclean function should be called.
* Verify the labels are coming through (i.e. train some sort of model on this data
* Make sure the nonetypes are because the file actually isn't in my folder of images
* Figure out why I'm getting duplicate images