In [1]:
# Import libraries
import matplotlib.pyplot as plt # Matplotlib is used for plotting
import pandas as pd             # Pandas helps organize data structures
import numpy as np              # Numpy provides mathematical functions like sqrt()
import statsmodels.formula.api as sm  # Statsmodel provides linear regression
from sklearn import metrics 
from sklearn.model_selection import KFold
import skimage.io as io # Loads and displays images
import skimage as ski # Loads and displays images
from scipy.linalg import svd
from pycocotools.coco import COCO # Coco dataset
from sklearn.decomposition import PCA


1. Load image URLS


In [2]:
#Initialize data file annotations
dataDir=''
dataType='val2017'
annFile='./annotations/instances_{}.json'.format(dataType)
# initialize COCO api for instance annotations
coco=COCO(annFile)

paras = ['broccoli', 'apple', 'orange', 'carrot']
catIds = coco.getCatIds(paras);


loading annotations into memory...
Done (t=0.56s)
creating index...
index created!


In [3]:

# display COCO categories
cats = coco.loadCats(catIds)
nms=[cat['name'] for cat in cats]
print('COCO categories: \n{}\n'.format(' '.join(nms)))
print('COCO category IDs:', catIds)

COCO categories: 
apple orange broccoli carrot

COCO category IDs: [53, 55, 56, 57]


In [38]:
#Concatenate all pictures within the categories
imgIds = []
for cat in cats:
    imgIds = imgIds + coco.getImgIds(catIds = cat['id'])
    
#2017 Val only 313 photos
print(len(imgIds))

313


In [40]:
# load and display image
# use url to load image (potentially change to local to improve performance for training)
# image = io.imread(img['coco_url'])

# use local image for loading (much faster)
# image = io.imread('./images/' + img['file_name'])

nsamples = len(imgIds)
nx = 480
ny = 640
ncolors = 3


imgU = np.ndarray(shape=(nsamples, nx * ny * ncolors))
index = 0

for imgId in imgIds:
    
    #Load image from local 
    img = coco.loadImgs(imgId)[0]
    image = io.imread('./images/' + img['file_name'])
    
    #display image
    #plt.axis('off')
    #plt.imshow(image)
    #plt.show()

    #resize image to 480x640
    image_resized = ski.transform.resize(image, (480,640),
                       anti_aliasing=True)
    
    
    #store image as a matrix of RGB floats
    imgMatrix = ski.img_as_float(image_resized)

    #convert matrix into single dimension
    imgMatrix = imgMatrix.reshape(1, nx * ny * ncolors)
    
    #add to our ndarray
    imgU[index] = imgMatrix
    index += 1
    print(index, '/', nsamples)
    




1 / 313
2 / 313
3 / 313
4 / 313
5 / 313
6 / 313
7 / 313
8 / 313
9 / 313
10 / 313
11 / 313
12 / 313
13 / 313
14 / 313
15 / 313
16 / 313
17 / 313
18 / 313
19 / 313
20 / 313
21 / 313
22 / 313
23 / 313
24 / 313
25 / 313
26 / 313
27 / 313
28 / 313
29 / 313
30 / 313
31 / 313
32 / 313
33 / 313
34 / 313
35 / 313
36 / 313
37 / 313
38 / 313
39 / 313
40 / 313
41 / 313
42 / 313
43 / 313
44 / 313
45 / 313
46 / 313
47 / 313
48 / 313
49 / 313
50 / 313
51 / 313
52 / 313
53 / 313
54 / 313
55 / 313
56 / 313
57 / 313
58 / 313
59 / 313
60 / 313
61 / 313
62 / 313
63 / 313
64 / 313
65 / 313
66 / 313
67 / 313
68 / 313
69 / 313
70 / 313
71 / 313
72 / 313
73 / 313
74 / 313
75 / 313
76 / 313
77 / 313
78 / 313
79 / 313
80 / 313
81 / 313
82 / 313
83 / 313
84 / 313
85 / 313
86 / 313
87 / 313
88 / 313
89 / 313
90 / 313
91 / 313
92 / 313
93 / 313
94 / 313
95 / 313
96 / 313
97 / 313
98 / 313
99 / 313
100 / 313
101 / 313
102 / 313
103 / 313
104 / 313
105 / 313
106 / 313
107 / 313
108 / 313
109 / 313
110 / 313
111 / 31

2. (For Efficiency) Do PCA on all images to reduce # of parameters


In [41]:
print(imgU.shape)
pca_scores = PCA(n_components = 20).fit_transform(imgU)

(313, 921600)


In [43]:
print(pca_scores.shape)

#reproduce image from PCA
pca_scores.

(313, 20)


3. Use K-Fold model selection with k=5 to pick best model

In [None]:
k = 5
threshold = 0.5
kf = KFold(n_splits=k, shuffle=True)
train_trrs = []
test_trrs = []

for train_index, test_index in kf.split(df):
    df_train = df.iloc[train_index, :]
    df_test = df.iloc[test_index, :]
    
    # logistic regression
    res = sm.logit('mature ~ Lars2 + Malat1', data=df_train).fit(disp=0)
    # total error rate 
    train_pred_cat = (res.predict(df_train) > threshold).astype(int)
    train_conf_mat = metrics.confusion_matrix(df_train['mature'], train_pred_cat)
    train_trr = error_rate(train_conf_mat)
    train_trrs.append(train_trr)
    
    test_pred_cat = (res.predict(df_test) > threshold).astype(int)
    test_conf_mat = metrics.confusion_matrix(df_test['mature'], test_pred_cat)
    test_trr = error_rate(test_conf_mat)
    test_trrs.append(test_trr)
    
res_table = pd.DataFrame()
res_table['training error rate'] = train_trrs
res_table['test error rate'] = test_trrs
res_table

4. Try on some photos of our own