# `SVM` testing

In [None]:
"""
Support vector machine model for
# `Completely Automated Public Turing test to tell Computers and Humans Apart`
## **1) Data Preprocessing**
Import the libraries
"""

import os
import cv2
import torch
import string
import random
import pathlib
import numpy as np
import torch.nn as nn
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import transforms
from torch.utils.data import DataLoader

# Pick the compute device: the first CUDA GPU when PyTorch can see one,
# otherwise the CPU. The chosen device (and the GPU name, if any) is echoed.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
    print(device)
    print(torch.cuda.get_device_name(0))
else:
    device = torch.device("cpu")
    print(device)

# Parameters:
NUMBER = list(string.digits)          # the ten digit characters "0".."9"
ALPHABET = [*string.ascii_uppercase]  # the 26 uppercase Latin letters "A".."Z"
TABLE = [*NUMBER, *ALPHABET]          # The table for CAPTCHA: digits first, then letters
LEN_OF_TABLE = len(TABLE)             # in total 10+26 alphanumeric characters
LEN_OF_CAPTCHA = 6                    # each picture contains 6 characters

"""Load the dataset, please modify the path `data_dir` **accordingly**"""

# Mount Google Drive (Colab only) and collect every JPEG in the data folder.
from google.colab import drive

drive.mount('/gdrive')
data_dir = pathlib.Path("/gdrive") / "My Drive" / "Data"
# The dataset is kept as a plain list of image paths.
images = [*data_dir.glob("*.jpg")]
# Report the size of the dataset.
print("Number of images found:", len(images))

"""### Customise the dataset class"""

def captcha_to_vector(captcha_str):
    """Encode a CAPTCHA string as a flat one-hot vector.

    The vector has ``LEN_OF_CAPTCHA * LEN_OF_TABLE`` entries (6 * 36 with the
    constants defined in this file): one ``LEN_OF_TABLE``-wide slot per
    character position. Inside a slot, the entry at the character's index in
    ``TABLE`` (digits 0-9 first, then letters A-Z) is 1.0 and all others are
    0.0. Positions beyond the end of a shorter string stay all-zero.

    Args:
        captcha_str: text of the CAPTCHA; it is upper-cased before encoding.

    Returns:
        ``np.float32`` array of shape ``(LEN_OF_CAPTCHA * LEN_OF_TABLE,)``.

    Raises:
        ValueError: if a character is neither a digit 0-9 nor a letter A-Z.
            Previously such a character either failed with an unbound local
            or silently reused the previous character's index, which
            produced a wrong label.
        IndexError: if the text has more than ``LEN_OF_CAPTCHA`` characters
            (raised by the out-of-range array write).
    """
    captcha_str = captcha_str.upper()
    vector = np.zeros(LEN_OF_TABLE * LEN_OF_CAPTCHA, dtype=np.float32)
    for position, char in enumerate(captcha_str):
        if "0" <= char <= "9":      # digits occupy indices 0..9
            index = ord(char) - ord("0")
        elif "A" <= char <= "Z":    # Latin letters occupy indices 10..35
            index = ord(char) - ord("A") + 10
        else:
            raise ValueError(
                "Unsupported CAPTCHA character {!r} at position {} in {!r}".format(
                    char, position, captcha_str
                )
            )
        vector[position * LEN_OF_TABLE + index] = 1.0
    return vector

def vector_to_captcha(vector, table=None):
    """Decode a sequence of character indices into a CAPTCHA string.

    Note that the input is a sequence of indices into the character table,
    not the one-hot vector produced by ``captcha_to_vector``.

    Example (with the default table of digits followed by A-Z):
        input ``[1, 2, 34, 2, 6, 7]`` -> output ``"12Y267"``

    Args:
        vector: iterable of integer-like indices (anything usable as a list
            index, e.g. ints or elements of an integer tensor).
        table: optional sequence of characters to index into; defaults to the
            module-level ``TABLE``.

    Returns:
        The decoded string, one character per index.

    Raises:
        IndexError: if an index falls outside the table.
    """
    if table is None:
        table = TABLE
    # join() builds the string in one pass instead of repeated concatenation.
    return "".join(table[i] for i in vector)

# Custom dataset
class CustomDataset(torch.utils.data.Dataset):
    """Dataset that turns each CAPTCHA image file into per-character segments.

    Each item is one image, preprocessed with OpenCV and cut into
    ``LEN_OF_CAPTCHA`` vertical strips, together with one one-hot label per
    strip. The label text is taken from the file name: the part before the
    first ``"_"``.

    Args:
        images: sequence of path objects exposing ``.name`` (e.g.
            ``pathlib.Path``) that point at the image files.
        transform: optional callable applied to every image segment.
        target_transform: optional callable applied to every per-character
            label.
        height: height in pixels the image is resized to.
        width: width in pixels the image is resized to.
    """

    # Width in pixels of one character strip. Strip j covers columns
    # (j+1)*25 .. (j+2)*25, i.e. the first 25 columns are skipped.
    _SEGMENT_WIDTH = 25

    def __init__(self, images, transform=None, target_transform=None, height=50, width=200):
        self.transform = transform
        self.target_transform = target_transform
        self.images = images
        self.width  = width
        self.height = height

    @staticmethod
    def _warp(image, matrix):
        """Apply a 3x3 matrix to a 2-D image, keeping its size and padding with white."""
        rows, cols = image.shape
        # INTER_LINEAR must be passed by keyword: the fourth positional
        # parameter of cv2.warpPerspective is `dst`, not `flags`.
        return cv2.warpPerspective(image, matrix, (int(cols), int(rows)),
                                   flags=cv2.INTER_LINEAR,
                                   borderValue=(255, 255, 255))

    def _preprocess(self, image):
        """Run the contrast / binarise / de-noise / warp pipeline on a loaded image."""
        # Increase contrast: segmentation-based so the preprocessing is more complicated
        image = cv2.convertScaleAbs(image, alpha=3, beta=40)
        # Erode noise.
        # NOTE: a 1x1 structuring element makes this erosion an identity
        # operation; it is kept so the pipeline matches the original.
        kernel = np.ones((1, 1), np.uint8)
        image = cv2.erode(image, kernel, iterations=1)
        # Convert the image into grayscale.
        # NOTE(review): cv2.imread returns channels in BGR order, so
        # COLOR_BGR2GRAY looks like the intended code; left unchanged to keep
        # the preprocessing output identical. Confirm before changing.
        image = cv2.cvtColor(image, cv2.COLOR_RGBA2GRAY)
        # Resize the image to ensure the size
        image = cv2.resize(image, (self.width, self.height))
        # Binarization of images (with THRESH_OTSU the threshold is chosen
        # automatically, so the explicit value 20 is not the one applied)
        _, image = cv2.threshold(image, 20, 255, cv2.THRESH_BINARY|cv2.THRESH_OTSU)
        # Method from dsp.stackexchange.com/questions/52089/removing-noisy-lines-from-image-opencv-python
        kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1,3))
        image = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel)
        # Shear transformation, from thepythoncode.com/article/image-transformations-using-opencv-in-python#Image_Shearing
        shear = np.float32([[1, -0.5, 0],
                            [0,    1, 0],
                            [0,    0, 1]])
        image = self._warp(image, shear)
        # Horizontal stretch
        stretch = np.float32([[1.2, 0, 0],
                              [0,   1, 0],
                              [0,   0, 1]])
        return self._warp(image, stretch)

    def __getitem__(self, index):
        """Return ``(segments, labels)`` for the image at ``index``.

        Returns:
            A pair of lists, each of length ``LEN_OF_CAPTCHA``: the image
            strips (after ``transform``, if given) and the matching
            ``LEN_OF_TABLE``-long one-hot labels (after ``target_transform``,
            if given).

        Raises:
            OSError: if OpenCV cannot read the image file.
        """
        path = self.images[index]
        # Get the image with path
        image = cv2.imread(str(path))
        if image is None:
            # cv2.imread signals failure by returning None instead of raising.
            raise OSError("Could not read image file: {}".format(path))
        image = self._preprocess(image)

        label = captcha_to_vector(path.name.split("_")[0])
        img_seg_list = []
        label_lst = []
        # Segmentation
        for j in range(LEN_OF_CAPTCHA):
            left = (j+1)*self._SEGMENT_WIDTH
            right = (j+2)*self._SEGMENT_WIDTH
            im_seg = image[:, left:right]
            # Apply the transform to the image
            if self.transform is not None:
                im_seg = self.transform(im_seg)
            img_seg_list.append(im_seg)
            # One LEN_OF_TABLE-wide slice of the flat one-hot label per character
            char_label = label[j*LEN_OF_TABLE:(j+1)*LEN_OF_TABLE]
            if self.target_transform is not None:
                char_label = self.target_transform(char_label)
            label_lst.append(char_label)
        return img_seg_list, label_lst

    def __len__(self):
        """Return the number of image files in the dataset."""
        return len(self.images)

"""### Split and create datasets"""

# Shuffle once, then cut the list at the 80% mark.
random.shuffle(images)
NUMBER_Images = len(images)
_split_index = int(0.8*NUMBER_Images)

# the part for training: the first 80% of the shuffled images
training = images[:_split_index]
train_data = training

# test data: the remaining 20% of the shuffled images
test_data = images[_split_index:]

print("Training set size:\t", len(train_data))
print("Test set size:\t\t", len(test_data))

# Both datasets convert every image segment to a tensor.
train_set = CustomDataset(train_data, transform=transforms.ToTensor())
test_set  = CustomDataset(test_data,  transform=transforms.ToTensor())

# One CAPTCHA image per batch (BATCH_SIZE=1), drawn in random order.
train_dataloader = DataLoader(dataset=train_set, batch_size=1, shuffle=True)
test_dataloader  = DataLoader(dataset=test_set,  batch_size=1, shuffle=True)

"""## **2) `SVM` Model** ($\in$ segmentation-based algorithms)"""

def get_data(dataloader):
    """Flatten a DataLoader of segmented CAPTCHAs into SVM-ready lists.

    The loader is iterated exactly once, so every sample is visited once.
    (The earlier version called ``next(iter(dataloader))`` on each pass,
    which builds a fresh iterator every time and, with ``shuffle=True``,
    draws a random batch with replacement instead of walking the dataset.)

    Args:
        dataloader: iterable yielding ``(segments, labels)`` batches, where
            both are sequences with one tensor per character position.
            Assumes each segment tensor is ``(batch, 1, height, width)`` and
            each label tensor is a one-hot ``(batch, LEN_OF_TABLE)`` — this
            is what the default collation of ``CustomDataset`` items yields.

    Returns:
        ``(X, Y)``: ``X`` is a list of flattened pixel lists, one per
        character segment; ``Y`` holds the matching one-character strings.
        The segments of one CAPTCHA are adjacent and in positional order, so
        ``group`` can re-assemble them.
    """
    X = []
    Y = []
    print("")
    for batch_no, (segments, labels) in enumerate(dataloader, start=1):
        # Progress indicator: one line per 100 batches.
        if batch_no % 100 == 0:
            print("{}".format(batch_no))
        batch_size = labels[0].shape[0]
        # Walk sample-by-sample so the characters of one CAPTCHA stay together
        # even when the loader uses a batch size larger than 1.
        for sample in range(batch_size):
            for image, label in zip(segments, labels):
                char_index = int(torch.argmax(label[sample]))
                X.append(image[sample].flatten().tolist())
                Y.append(vector_to_captcha([char_index]))
    return X, Y

def group(lst, size=None):
    """Join consecutive characters into CAPTCHA strings.

    Used to turn the flat per-character lists (true and predicted) back into
    whole CAPTCHAs so they can be compared as complete strings.

    Args:
        lst: sliceable sequence of strings (e.g. a list or a NumPy array of
            single characters).
        size: number of characters per group; defaults to the module-level
            ``LEN_OF_CAPTCHA``.

    Returns:
        List of joined strings. If ``len(lst)`` is not a multiple of ``size``
        the last string is shorter.

    Raises:
        ValueError: if ``size`` is not positive.
    """
    if size is None:
        size = LEN_OF_CAPTCHA
    if size <= 0:
        raise ValueError("size must be a positive integer")
    return ["".join(lst[start:start + size]) for start in range(0, len(lst), size)]

"""**Training the support vector machine**"""

# SVC Code from kaggle.com/sanesanyo/digit-recognition-using-svm-with-98-accuracy
# Grid search: fit one SVM per (C, kernel) pair on the character segments and
# report the accuracy over whole CAPTCHAs on the test set.
from sklearn import metrics
from sklearn.svm import SVC

regularization = list(range(1, 10))
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
train_x, train_y = get_data(train_dataloader)
test_x,  test_y  = get_data(test_dataloader)
# The ground-truth CAPTCHAs do not depend on the model, so group them once.
test_y_ = group(test_y)
for c in regularization:
    for k in kernels:
        clf = SVC(C=c, kernel=k)
        clf.fit(train_x, train_y)

        # Use the SVM to recognise new CAPTCHA
        # Code for prediction and accuracy modified from the same Kaggle source
        # Saving the predictions on the test set
        y_predict = clf.predict(test_x)
        # Group the predicted characters together to a CAPTCHA
        y_predict = group(y_predict)
        # Measuring the accuracy of our predictions: a CAPTCHA only counts as
        # correct when all of its characters match.
        accuracy = metrics.accuracy_score(test_y_, y_predict)
        print("Accuracy for SVM with C={} and kernel={}: {:.2f}%".format(c,k,accuracy*100))

cpu
Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
Number of images found: 10000
Training set size:	 8000
Test set size:		 2000

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
2100
2200
2300
2400
2500
2600
2700
2800
2900
3000
3100
3200
3300
3400
3500
3600
3700
3800
3900
4000
4100
4200
4300
4400
4500
4600
4700
4800
4900
5000
5100
5200
5300
5400
5500
5600
5700
5800
5900
6000
6100
6200
6300
6400
6500
6600
6700
6800
6900
7000
7100
7200
7300
7400
7500
7600
7700
7800
7900
8000

100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900
2000
Accuracy for SVM with C=1 and kernel=linear: 29.30%
Accuracy for SVM with C=1 and kernel=poly: 73.25%
Accuracy for SVM with C=1 and kernel=rbf: 78.60%


Accuracy for SVM with C=1 and kernel=linear: 30.05%

Accuracy for SVM with C=1 and kernel=poly: 75.6%

Accuracy for SVM with C=1 and kernel=rbf: 80.2%

Accuracy for SVM with C=2 and kernel=linear: 27.6%

Accuracy for SVM with C=2 and kernel=poly: 75.05%

Accuracy for SVM with C=2 and kernel=rbf: 82.75%

Accuracy for SVM with C=3 and kernel=linear: 26.25%

Accuracy for SVM with C=3 and kernel=poly: 75.10%

Accuracy for SVM with C=3 and kernel=rbf: 81.45%

Accuracy for SVM with C=0.01 and kernel=linear: 44.55%

Accuracy for SVM with C=0.01 and kernel=poly: 63.00%

Accuracy for SVM with C=0.01 and kernel=rbf: 29.60%

Accuracy for SVM with C=100 and kernel=linear: 24.75%

Accuracy for SVM with C=100 and kernel=poly: 71.55%

Accuracy for SVM with C=100 and kernel=rbf: 79.15%
