<a href="https://colab.research.google.com/github/AnnyKong/svm-cnn-idc-detection/blob/master/SVM_with_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Set up Kaggle (now deprecated)

In [0]:
!pip install -q kaggle
!mkdir -p ~/.kaggle
!cp /content/kaggle.json ~/.kaggle/
!ls ~/.kaggle
!chmod 600 /root/.kaggle/kaggle.json

Download & unzip the dataset (now deprecated)

In [0]:
# Download the dataset archive into /content with the Kaggle CLI.
!kaggle datasets download -d paultimothymooney/breast-histopathology-images -p /content
# Extract the archive quietly (-q) into /content/dataset.
!unzip -q -d /content/dataset /content/breast-histopathology-images.zip

New way to download the dataset (git clone from Bitbucket)

In [0]:
# setup dataset param(s)
# Directory of the dataset repository: the setup cell clones the repository
# when this directory is missing and then changes into it.
BASE_DIR = '/content/breast-histopathology'
# Expected image width and height in pixels. Images of any other size are
# skipped when the file lists are built, and flatten_data reshapes every image
# to IMG_DIM * IMG_DIM * 3 values.
IMG_DIM = 50

In [0]:
# setup dataset
! [ ! -d $BASE_DIR ] && git clone https://nick_lrc@bitbucket.org/nick_lrc/breast-histopathology.git
% cd $BASE_DIR

In [0]:
# Setup training param(s)
# Number of images loaded and processed per batch by train_svm and test_svm.
BATCH_SIZE = 128
# Progress-logging interval in batches (presumably for train_svm's progress
# output — verify against train_svm).
LOG_INTERVAL = 20

# hog settings
# Side length of one HOG cell in pixels (passed to hog as pixels_per_cell).
PIXEL_PER_CELL = 5
# Side length of one HOG block in cells (passed to hog as cells_per_block).
CELL_PER_BLOCK = 10

In [0]:
# Dataset statistics (#negative: 196454, #positive: 78768)
import os
import glob
from PIL import Image

def _images_with_expected_size(pattern):
  """Return the paths matching `pattern` whose image is IMG_DIM x IMG_DIM pixels.

  Every file is opened inside a `with` block so its handle is released right
  after the size check rather than whenever the Image object happens to be
  collected; the file lists contain several hundred thousand entries.
  """
  kept = []
  for file in glob.glob(pattern):
    with Image.open(file) as image:
      if image.size == (IMG_DIM, IMG_DIM):
        kept.append(file)
  return kept

# negative samples (directory data/0)
negatives = _images_with_expected_size('data/0/*')

# positive samples (directory data/1)
positives = _images_with_expected_size('data/1/*')

print(f'Negative: {len(negatives)}')
print(f'Positive: {len(positives)}')

In [0]:
# Helper methods
import glob
import fnmatch
import cv2
from functools import reduce

# Find all images
def find_images(*argv):
  """Recursively collect every ``.png`` file below the given directories.

  Args:
    *argv: Directory paths, with or without a trailing '/'. An empty string
      is treated as the current directory ('./').

  Returns:
    A single list holding the matches of all directories, in argument order.
  """
  imgs = []
  for p in argv:
    if not p:
      # An empty path must not become '/', which would scan the whole
      # filesystem; interpret it as the current directory instead.
      p = './'
    elif not p.endswith('/'):
      # Compare by value: an `is` test against a string literal only works by
      # accident of interning and raises a SyntaxWarning on newer Pythons.
      p += '/'
    imgs += glob.glob(p + '**/*.png', recursive=True)
  return imgs

# Shuffle a data set and split it into a training part and a test part
def train_test_split(data, train_ratio=0.8):
  """Shuffle `data` in place, then cut it into a training and a test part.

  Args:
    data: Mutable sequence of samples; its order is randomised in place.
    train_ratio: Fraction of the samples that goes into the first part.

  Returns:
    The list produced by np.split: the first int(train_ratio * len(data))
    shuffled samples, followed by the remaining ones.
  """
  np.random.shuffle(data)
  cut = int(train_ratio * len(data))
  return np.split(data, [cut])

# Merge negative and positive images into one randomly ordered set
def negative_positive_merge(negatives, positives, negative_label=0, positive_label=1):
  """Concatenate both classes and shuffle samples and labels with one permutation.

  Args:
    negatives: Samples that receive `negative_label`.
    positives: Samples that receive `positive_label`.
    negative_label: Label assigned to every entry of `negatives`.
    positive_label: Label assigned to every entry of `positives`.

  Returns:
    (images, labels): two arrays of equal length in the same random order, so
    labels[i] still belongs to images[i].
  """
  merged = np.concatenate([negatives, positives])
  tags = np.array(
      [negative_label for _ in range(len(negatives))]
      + [positive_label for _ in range(len(positives))])
  order = np.random.permutation(len(merged))
  return merged[order], tags[order]

# Load an image
def load_img(path):
  """Read the image at `path` with OpenCV.

  Args:
    path: Path of the image file.

  Returns:
    The array returned by cv2.imread. With the default flags OpenCV delivers
    colour images with channels in B, G, R order.

  Raises:
    IOError: If OpenCV could not read the file. cv2.imread reports failure by
      returning None rather than raising, which would otherwise only surface
      later as an obscure error during feature extraction.
  """
  im = cv2.imread(path)
  if im is None:
    raise IOError('cannot read image: {}'.format(path))
  return im

# Load the images at positions start .. end-1 together with their labels
def load_imgs(imgs, start, end):
  """Load imgs[start] .. imgs[end - 1] and derive each label from the file name.

  Args:
    imgs: Indexable collection of image paths.
    start: Index of the first image to load.
    end: Index one past the last image to load.

  Returns:
    (values, labels): the loaded images and, per image, 0 for a path matching
    '*class0.png' or 1 for a path matching '*class1.png'.

  Raises:
    Exception: If a path matches neither pattern.
  """
  pattern_labels = (('*class0.png', 0), ('*class1.png', 1))
  values, labels = [], []
  for idx in range(start, end):
    path = imgs[idx]
    values.append(load_img(path))
    for pattern, label in pattern_labels:
      if fnmatch.fnmatch(path, pattern):
        labels.append(label)
        break
    else:
      # No pattern matched this path.
      raise Exception('image with no class: ' + path)
  return (values, labels)

# Load a class-balanced sample of images
def load_equal_pn_imgs(imgs, total_num):
  """Load up to `total_num` images, alternating one negative and one positive.

  Args:
    imgs: Collection of image paths; '*class0.png' paths are negatives and
      '*class1.png' paths are positives.
    total_num: Requested number of images. It is capped at twice the size of
      the smaller class and rounded down to an even number, because images are
      loaded in negative/positive pairs.

  Returns:
    (values, labels): loaded images and their labels in the order
    negative (0), positive (1), negative (0), ...
  """
  neg = fnmatch.filter(imgs, '*class0.png')
  pos = fnmatch.filter(imgs, '*class1.png')

  # Number of (negative, positive) pairs that can actually be loaded.
  pairs = max(0, min(total_num >> 1, len(neg), len(pos)))
  values = []
  labels = []
  # Report the number of images really loaded; an odd request is rounded down.
  print('loading {} images'.format(pairs * 2))
  for i in range(pairs):
    values.append(load_img(neg[i]))
    labels.append(0)
    values.append(load_img(pos[i]))
    labels.append(1)
  return (values, labels)

# Print results
def print_data_desc(labels):
  """Print how many labels there are and how many are negative (0) / positive (1).

  The counts are computed with explicit sums. A reduce() without an initial
  value starts from labels[0] itself and skips it, which miscounts the
  negatives by one and raises on an empty sequence.

  Args:
    labels: Iterable sequence of class labels (0 or 1); may be empty.
  """
  negative_count = sum(1 for label in labels if label == 0)
  positive_count = sum(1 for label in labels if label == 1)
  print('total number of images: {}'.format(len(labels)))
  print('number of negative images: {}'.format(negative_count))
  print('number of positive images: {}'.format(positive_count))

# Test accuracy
def test_output(expects, output):
  tp = 0
  fp = 0
  tn = 0
  fn = 0

  for i in range(len(output)):
    e = expects[i]
    o = output[i]
    if e == 0:
      if o == 0:
        tn += 1
      elif o == 1:
        fp += 1
      else:
        print(o)
        raise Exception("unexpected class: {}".format(o))
    elif e == 1:
      if o == 0:
        fn += 1
      elif o == 1:
        tp += 1
      else:
        print(o)
        raise Exception("unexpected class: {}".format(o))
    else:
      raise Exception("unexpected class: {}".format(o))
    
  print("TP: ", tp)
  print("FP: ", fp)
  print("TN: ", tn)
  print("FN: ", fn)
  print("Accuarcy: ", (tp + tn) / (tp + fp + tn + fn))
  return tp, fp, tn, fn

Now start training here

In [0]:
# Training
from sklearn import svm
from skimage import color
from skimage.feature import hog
from sklearn.metrics import confusion_matrix

import time
import numpy as np

# Hog
def hog_features(img):
  """Compute a HOG descriptor of the grayscale version of a colour image.

  The images of this notebook are read with cv2.imread, which delivers the
  channels in B, G, R order, whereas rgb2gray weights them as R, G, B. A
  three-channel input is therefore reversed to RGB before the conversion so
  that the luminance weights hit the right channels. Any other input shape is
  passed to rgb2gray unchanged.

  Args:
    img: Image array as returned by load_img (channels last, BGR order).

  Returns:
    The feature vector returned by skimage's hog for the grayscale image,
    using 8 orientations, PIXEL_PER_CELL x PIXEL_PER_CELL pixel cells,
    CELL_PER_BLOCK x CELL_PER_BLOCK cell blocks and L2 block normalisation.
  """
  img = np.asarray(img)
  if img.ndim == 3 and img.shape[-1] == 3:
    img = img[..., ::-1]  # BGR -> RGB
  img = color.rgb2gray(img)
  return hog(img, orientations=8, pixels_per_cell=(PIXEL_PER_CELL,PIXEL_PER_CELL), cells_per_block=(CELL_PER_BLOCK, CELL_PER_BLOCK),block_norm= 'L2')

# Hog per colour channel
def hog_features_rgb(img):
  """Compute one HOG descriptor per channel (indices 0, 1, 2) and concatenate them.

  Args:
    img: Image array indexable as img[:, :, channel].

  Returns:
    A 1-D array: descriptor of channel 0, then channel 1, then channel 2.
  """
  def channel_hog(channel):
    # Same HOG settings for every channel.
    return hog(channel,
               orientations=8,
               pixels_per_cell=(PIXEL_PER_CELL, PIXEL_PER_CELL),
               cells_per_block=(CELL_PER_BLOCK, CELL_PER_BLOCK),
               block_norm='L2')

  return np.concatenate([channel_hog(img[:, :, idx]) for idx in range(3)])

# Flatten an image into a raw pixel vector
def flatten_data(img):
  """Return a copy of `img` as a 1-D array of IMG_DIM * IMG_DIM * 3 values.

  Args:
    img: Array-like holding exactly IMG_DIM * IMG_DIM * 3 values.

  Returns:
    A 1-D numpy array with the values in their original (row-major) order.
  """
  flat_len = IMG_DIM * IMG_DIM * 3
  pixels = np.array(img)
  return pixels.reshape(flat_len)

# Evaluate predictions sample by sample
def eval_svm(clf, tests, expects):
  """Predict `tests` with `clf` and flag every prediction as right or wrong.

  Args:
    clf: Object with a predict(samples) method.
    tests: Samples handed to clf.predict.
    expects: Expected class per sample, indexable like the predictions.

  Returns:
    A list with one entry per prediction: the result of comparing the
    prediction with the expected class at the same index.
  """
  outputs = clf.predict(tests)
  return [outputs[idx] == expects[idx] for idx in range(len(outputs))]

# Train SVM
def train_svm(clf, data_train, feature_fn, iter=1):
  """Extract features batch by batch and fit `clf` on the complete training set.

  A scikit-learn estimator's fit() starts from scratch on every call, so
  fitting once per batch leaves a model that has only seen the final batch.
  The batches are therefore used only to load images and extract features;
  the features are accumulated and clf.fit is called once per pass.

  All feature vectors of a pass are held in memory at the same time. Pass a
  subset of the data if the full set does not fit.

  Args:
    clf: Estimator with a fit(X, y) method.
    data_train: Sequence of image paths understood by load_imgs.
    feature_fn: Maps one loaded image to its feature vector.
    iter: Number of passes over the data (kept from an AdaBoost experiment
      that was never enabled). Every pass refits on the same samples.

  Returns:
    `clf`, fitted on all samples (left untouched when data_train is empty).
  """
  batch_num = 0
  total = len(data_train)
  for k in range(iter):
    features = []
    targets = []
    for i in range(0, total, BATCH_SIZE):
      end = min(total, i + BATCH_SIZE)
      (values, labels) = load_imgs(data_train, i, end)
      features.extend(feature_fn(im) for im in values)
      targets.extend(labels)

      # Log every LOG_INTERVAL batches and once more on the final batch.
      if batch_num % LOG_INTERVAL == 0 or end == total:
        print('{} Train Iter: {} [{}/{} ({:.0f}%)]'.format(
            time.ctime(time.time()),
            k,
            i,
            total,
            100.0 * i / total))
      batch_num += 1

    # Single fit over everything collected in this pass.
    if features:
      clf.fit(features, targets)

  return clf

#  Test SVM
def test_svm(clf, data, feature_fn):
  result = [[0, 0], [0, 0]]
  for i in range(0, len(data), BATCH_SIZE):
    end = min(len(data), i+BATCH_SIZE)
    (tests, expects) = load_imgs(data, i, end)
    tests = [feature_fn(i) for i in tests]
    cm = confusion_matrix(expects, clf.predict(tests))
    result[0][0] += cm[0][0]
    result[0][1] += cm[0][1]
    result[1][0] += cm[1][0]
    result[1][1] += cm[1][1]
  
  return result

In [0]:
# Setup training (preprocessing)
# Split each class 50/50 into a training and a test half. train_test_split
# shuffles the given list in place before cutting it; splitting per class
# keeps roughly the same class ratio in both halves.
train_n, test_n = train_test_split(negatives, train_ratio=0.5)
train_p, test_p = train_test_split(positives, train_ratio=0.5)
# Merge both classes of each half into one randomly ordered array of paths
# with a matching array of labels (0 = negative, 1 = positive).
data_train, label_train = negative_positive_merge(train_n, train_p)
data_test, label_test = negative_positive_merge(test_n, test_p)

In [0]:
# Run SVM

# Helper method - combine several feature functions
def hyper_data(*argv):
  """Build a feature function that concatenates the outputs of several others.

  Args:
    *argv: Feature functions, each mapping an image to a 1-D sequence.

  Returns:
    A function img -> np.concatenate of every function's output for `img`,
    in the order the functions were given.
  """
  def combined(img):
    return np.concatenate([f(img) for f in argv])
  return combined

# Compute accuracy from a confusion matrix
def accuracy(cm):
  """Return the fraction of correct predictions of a 2x2 confusion matrix.

  Args:
    cm: Matrix indexable as cm[row][col] and laid out as [[tn, fp], [fn, tp]].

  Returns:
    (tn + tp) / (tn + fp + fn + tp).
  """
  tn, fp = cm[0][0], cm[0][1]
  fn, tp = cm[1][0], cm[1][1]
  correct = tn + tp
  total = tn + fp + fn + tp
  return correct / total

# Train and evaluate the SVM
def run_svm():
  """Train an RBF-kernel SVC on data_train and print its result on data_test.

  Prints the label statistics of the training set, then the confusion matrix
  returned by test_svm and the accuracy derived from it.
  """
  feature_fn = flatten_data     # which descriptor (if any) to use
  classifier = svm.SVC(kernel='rbf')

  print_data_desc(label_train)
  train_svm(classifier, data_train, feature_fn)

  confusion = test_svm(classifier, data_test, feature_fn)
  print(confusion)
  print(accuracy(confusion))

# Train the classifier and print the test confusion matrix and accuracy.
run_svm()

In [0]:
# Temp (for debugging)

# Number of samples in the test split (shown as the cell output)
len(data_test)