# SVM - Diabetic Retinopathy

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# io related
# from skimage.io import imread
import os
from glob import glob
import cv2
from matplotlib import pyplot as plt
import matplotlib 

from sklearn.model_selection import train_test_split

from PIL import Image

from torch.utils.data import DataLoader, Dataset
import torchvision.transforms as T

from torchvision.utils import make_grid
from torchvision.utils import save_image

from skimage import io, transform

import torch
from sklearn.metrics import precision_recall_fscore_support, accuracy_score, classification_report, confusion_matrix


%matplotlib inline

from sklearn.svm import SVC
import torchvision.transforms as transforms

In [6]:
def load_datasets(base_location = 'F:/UNSW/2022/T2_2022/COMP9417/Project/Data/train.zip.001/train'):

    training_data_name = "train_1"
    train_data_dir = base_location + '/train'
    train_label_file = base_location + '/trainLabels.csv'

    # Load image mapping
    retina_df = pd.read_csv(train_label_file)
    # Get patient ID
    retina_df['PatientId'] = retina_df['image'].map(lambda x: x.split('_')[0])
    # Get image path
    retina_df['path'] = retina_df['image'].map(lambda x: train_data_dir + '/' + x + '.jpeg')
    # See if data exists in training data set
    retina_df['exists'] = retina_df['path'].map(os.path.exists)
    print(retina_df['exists'].sum(), 'images found of', retina_df.shape[0], 'total')

    # Left right eye categorical variable
    # 1 is left eye, 0 is right eye
    retina_df['eye'] = retina_df['image'].map(lambda x: 1 if x.split('_')[-1]=='left' else 0)

    # Remove NA, and keep only the 'existing' images 
    retina_df.dropna(inplace = True)
    retina_df = retina_df[retina_df['exists']]

    # Split traing and valid sets
    rr_df = retina_df[['PatientId', 'level']].drop_duplicates()
    
    train_ids, valid_ids = train_test_split(rr_df['PatientId'], 
                                    test_size = 0.25, 
                                    random_state = 2018,
                                    stratify = rr_df['level'])
                                    
    raw_train_df = retina_df[retina_df['PatientId'].isin(train_ids)]
    valid_df = retina_df[retina_df['PatientId'].isin(valid_ids)]
    print('Pre-balance: train', raw_train_df.shape[0], 'validation', valid_df.shape[0])
    
    # balance size variance in each class
    #train_df = raw_train_df.groupby(['level', 'eye']).apply(lambda x: x.sample(75, replace = True)).reset_index(drop = True)                                                   
    #print('Post-balance: train', train_df.shape[0], 'validation', valid_df.shape[0])

    return raw_train_df, valid_df

In [7]:
class retinaDataset(Dataset):

    def __init__(self, transforms=None, image_size = 192):
        'Initialization'
        self.image_size = image_size
        self.transforms = transforms        
        self.train_df, self.valid_df = load_datasets()
        
        self.train_df.reset_index(drop = True, inplace = True)
        self.valid_df.reset_index(drop = True, inplace = True)
        
    def __len__(self):
        'Denotes the total number of samples'
        return len(self.train_df)

    def __getitem__(self, index):
        'Generates one sample of data'
        # Select sample
        
        img_path = self.train_df["path"][index]
        
        # print(img_path)

        img = Image.open(img_path)
        
        if self.transforms:
            img = self.transforms(img)

        return img, torch.tensor(self.train_df.iloc[index].level)
    
    def testLen(self):
        return len(self.valid_df)
    def getTest(self, index):
        #gets test set 
        img_path = self.valid_df["path"][index]
        
        img = Image.open(img_path)
        
        if(self.transforms):
            img = self.transforms(img)

        return img, torch.tensor(self.valid_df.iloc[index].level)

In [8]:
def generate_results(xTrain, yTrain, xTest, yTest, train_flag = True):
    num_dec_point = 3
    #train_flag = True

    # generate predictions
    y_pred = svm.predict(xTest)

    if train_flag:
        y_train_pred = svm.predict(xTrain) 
        print('Model Training accuracy is: ', accuracy_score(yTrain, y_train_pred))

    # calculate testing accuracy
    accuracy = accuracy_score(yTest, y_pred)
    print('Model Testing accuracy is: ', accuracy)

    p_mic, r_mic, f1_mic, _ = precision_recall_fscore_support(yTest, 
                            y_pred,
                            average='micro',
                            warn_for=())
    p_mac, r_mac, f1_mac, _ = precision_recall_fscore_support(yTest, 
                        y_pred,
                        average='macro',
                        warn_for=())
    print('micro acc,prec,rec,f1: ',round(accuracy,num_dec_point), round(p_mic,num_dec_point), round(r_mic,num_dec_point), round(f1_mic,num_dec_point),sep="\t")
    print('macro prec,rec,f1: ',round(p_mac,num_dec_point), round(r_mac,num_dec_point), round(f1_mac,num_dec_point),sep="\t")
    print('Confusion Matrix is: ', confusion_matrix(yTest, y_pred))

In [9]:
image_size = 192

my_transforms = transforms.Compose([
    transforms.Resize((image_size,image_size)),
    transforms.ToTensor(),
])

dataset = retinaDataset(transforms=my_transforms, image_size = image_size)

8408 images found of 35126 total
Pre-balance: train 6512 validation 2318


In [10]:
#shuffle dataset
import random

n = 600 #number of total datasets
ratio = 0.8 #training:testing
#total = random.sample(range(len(dataset)), n)
training_number = int(n * ratio)
testing_number = n - training_number

training_items = random.sample(range(len(dataset)), training_number)
testing_items = random.sample(range(dataset.testLen()), testing_number)
print('training size: ', training_number, 'testing size: ', testing_number)

xTrain = []
yTrain = []
xTest = []
yTest = []

# Populate xTrain yTrain
for index, i in enumerate(training_items):#range(100):#len(dataset)):
    if index % 10 == 0:
        print(str(index) + ': train /' + str(len(training_items)))
    item, label = dataset.__getitem__(i)
    oneD = item.flatten()
    xTrain.append(oneD)
    # Append labels
    yTrain.append(label)
    
# Populate xTest yTest
for index, i in enumerate(testing_items):#range(100):# range(dataset.testLen()):
    if index % 10 == 0:
        print(str(index) + ': test /' + str(len(testing_items)))
    item, label = dataset.getTest(i)
    oneD = item.flatten()
    # Append flatten image matrix
    xTest.append(oneD)
    # Append labels
    yTest.append(label) 
    
xTrain = np.vstack(xTrain)
yTrain = np.array(yTrain)
xTest = np.vstack(xTest)
yTest = np.array(yTest)

print('Completed.')

training size:  480 testing size:  120
0: train /480
10: train /480
20: train /480
30: train /480
40: train /480
50: train /480
60: train /480
70: train /480
80: train /480
90: train /480
100: train /480
110: train /480
120: train /480
130: train /480
140: train /480
150: train /480
160: train /480
170: train /480
180: train /480
190: train /480
200: train /480
210: train /480
220: train /480
230: train /480
240: train /480
250: train /480
260: train /480
270: train /480
280: train /480
290: train /480
300: train /480
310: train /480
320: train /480
330: train /480
340: train /480
350: train /480
360: train /480
370: train /480
380: train /480
390: train /480
400: train /480
410: train /480
420: train /480
430: train /480
440: train /480
450: train /480
460: train /480
470: train /480
0: test /120
10: test /120
20: test /120
30: test /120
40: test /120
50: test /120
60: test /120
70: test /120
80: test /120
90: test /120
100: test /120
110: test /120
Completed.


## Fitting and Predicting SVM model

train 480, 120

In [11]:
svm = SVC(kernel='linear', C=0.1, gamma='auto', probability=False) #faster when probability = False

# fit model
svm.fit(xTrain, yTrain)
print(svm)

generate_results(xTrain, yTrain, xTest, yTest, train_flag = True)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Model Training accuracy is:  0.9979166666666667
Model Testing accuracy is:  0.625
micro acc,prec,rec,f1: 	0.625	0.625	0.625	0.625
macro prec,rec,f1: 	0.19	0.194	0.192
Confusion Matrix is:  [[72  7 11  1  1]
 [ 8  0  2  0  0]
 [12  1  3  0  0]
 [ 1  0  1  0  0]
 [ 0  0  0  0  0]]


## History

training = 720, testing 180

In [66]:
svm = SVC(kernel='linear', C=0.1, gamma='auto', probability=False) #faster when probability = False

# fit model
svm.fit(xTrain, yTrain)
print(svm)

generate_results(xTrain, yTrain, xTest, yTest, train_flag = True)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Model Training accuracy is:  0.9986111111111111
Model Testing accuracy is:  0.3611111111111111
micro acc,prec,rec,f1: 	0.361	0.361	0.361	0.361
macro prec,rec,f1: 	0.271	0.269	0.231
Confusion Matrix is:  [[44 25 24 17 13]
 [ 4 12  6  1  3]
 [ 2  7  8  3  1]
 [ 0  4  1  1  1]
 [ 1  1  1  0  0]]


In [67]:
svm = SVC(kernel='poly', C=0.1, gamma='auto', probability=False) #faster when probability = False

# fit model
svm.fit(xTrain, yTrain)
print(svm)

generate_results(xTrain, yTrain, xTest, yTest, train_flag = True)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='poly',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Model Training accuracy is:  0.2013888888888889
Model Testing accuracy is:  0.11666666666666667
micro acc,prec,rec,f1: 	0.117	0.117	0.117	0.117
macro prec,rec,f1: 	0.023	0.2	0.042
Confusion Matrix is:  [[  0   0 123   0   0]
 [  0   0  26   0   0]
 [  0   0  21   0   0]
 [  0   0   7   0   0]
 [  0   0   3   0   0]]


In [57]:
svm = SVC(kernel='linear', C=1, gamma='auto', probability=False) #faster when probability = False

# fit model
svm.fit(xTrain, yTrain)
print(svm)

generate_results(xTrain, yTrain, xTest, yTest, train_flag = True)

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Model Training accuracy is:  1.0
Model Testing accuracy is:  0.2875
micro acc,prec,rec,f1: 	0.288	0.288	0.288	0.288
macro prec,rec,f1: 	0.204	0.329	0.171
Confusion Matrix is:  [[19 17  7  9  9]
 [ 5  2  0  2  0]
 [ 3  1  1  3  1]
 [ 0  0  0  0  0]
 [ 0  0  0  0  1]]


In [58]:
svm = SVC(kernel='linear', C=0.5, gamma='auto', probability=False) #faster when probability = False

# fit model
svm.fit(xTrain, yTrain)
print(svm)

generate_results(xTrain, yTrain, xTest, yTest, train_flag = True)

SVC(C=0.5, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Model Training accuracy is:  1.0
Model Testing accuracy is:  0.2875
micro acc,prec,rec,f1: 	0.288	0.288	0.288	0.288
macro prec,rec,f1: 	0.204	0.329	0.171
Confusion Matrix is:  [[19 17  7  9  9]
 [ 5  2  0  2  0]
 [ 3  1  1  3  1]
 [ 0  0  0  0  0]
 [ 0  0  0  0  1]]


In [59]:
svm = SVC(kernel='linear', C=0.1, gamma='auto', probability=False) #faster when probability = False

# fit model
svm.fit(xTrain, yTrain)
print(svm)

generate_results(xTrain, yTrain, xTest, yTest, train_flag = True)

SVC(C=0.1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)
Model Training accuracy is:  0.996875
Model Testing accuracy is:  0.2875
micro acc,prec,rec,f1: 	0.288	0.288	0.288	0.288
macro prec,rec,f1: 	0.207	0.329	0.172
Confusion Matrix is:  [[19 17  6 10  9]
 [ 5  2  0  2  0]
 [ 3  1  1  3  1]
 [ 0  0  0  0  0]
 [ 0  0  0  0  1]]


In [60]:
svm = SVC(kernel='linear', C=0.01, gamma='auto', probability=False) #faster when probability = False

# fit model
svm.fit(xTrain, yTrain)
print(svm)

generate_results(xTrain, yTrain, xTest, yTest, train_flag = True)

SVC(C=0.01, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
    max_iter=-1, probability=False, random_state=None, shrinking=True,
    tol=0.001, verbose=False)


KeyboardInterrupt: 