In [7]:
from helper_functions import *
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

## Analysis

In [11]:
# read csv into dataframe
data = pd.read_csv("data/patterns.csv")

# inspect the shape
print("Data Shape: " + str(data.shape))

# get the average image size
width, height = get_average_image_size('/home/cassidy/DSC412/project/DSC412-project-cassidy-petrykowski/data/photos')
print("Width: " + str(width), "Height: " + str(height))


Data Shape: (56, 1)
Width: 218 Height: 218


###  Step 1: Isolate the primary object in the image and fill in the backdrop

This uses an image segmentation technique called thresholding

In [12]:
# Adaptive Thresholding from open cv https://docs.opencv.org/4.x/d7/d4d/tutorial_py_thresholding.html

import cv2 as cv
import numpy as np
from matplotlib import pyplot as plt
 
img = cv.imread('/home/cassidy/DSC412/project/DSC412-project-cassidy-petrykowski/data/photos/edba2a6bbe/4ef61f0b26.png', cv.IMREAD_GRAYSCALE)
assert img is not None, "file could not be read, check with os.path.exists()"
 
# global thresholding
ret1,th1 = cv.threshold(img,127,255,cv.THRESH_BINARY)
 
# Otsu's thresholding
ret2,th2 = cv.threshold(img,0,255,cv.THRESH_BINARY+cv.THRESH_OTSU)
 
# Otsu's thresholding after Gaussian filtering
blur = cv.GaussianBlur(img,(5,5),0)
ret3,th3 = cv.threshold(blur,0,255,cv.THRESH_BINARY+cv.THRESH_OTSU)
 
# plot all the images and their histograms
images = [img, 0, th1,
          img, 0, th2,
          blur, 0, th3]
titles = ['Original Noisy Image','Histogram','Global Thresholding (v=127)',
          'Original Noisy Image','Histogram',"Otsu's Thresholding",
          'Gaussian filtered Image','Histogram',"Otsu's Thresholding"]
 
for i in range(3):
    plt.subplot(3,3,i*3+1),plt.imshow(images[i*3],'gray')
    plt.title(titles[i*3]), plt.xticks([]), plt.yticks([])
    plt.subplot(3,3,i*3+2),plt.hist(images[i*3].ravel(),256)
    plt.title(titles[i*3+1]), plt.xticks([]), plt.yticks([])
    plt.subplot(3,3,i*3+3),plt.imshow(images[i*3+2],'gray')
    plt.title(titles[i*3+2]), plt.xticks([]), plt.yticks([])
plt.show()

ModuleNotFoundError: No module named 'matplotlib'

### Split the data between training and testing

Because the csv contains the names for both the photos and grid, it can be split into one training set and one testing set.

When the data is actually accessed, the distinction between inputs and outputs or photos and grids will be made.

In [8]:
train_set, test_set = train_test_split(             data,
                                                    test_size=0.7, 
                                                    random_state=42
                                                   )
print(train_set)

    7092f1985d
43  edba2a6bbe
39  09dbfb528d
35  51684b0d4c
23  c3cbf850c5
45  b1b55738b1
10  2666e3c94b
22  ef642b8b75
18  c3af8775ca
55  f9232fe1b3
20  95fa1a221a
7   113fbb67d4
42  2e024bf09e
14  1e1bdfb2e7
28  df03231a62
51  337c0070cc
38  7ce54e685d


### 
