# Training the model using SVM

In [2]:
#imports:

import cv2
import os.path
from imutils import paths
from helpers import resize_to_fit
import numpy as np
from sklearn import datasets, svm, metrics
from sklearn.model_selection import train_test_split


In [3]:
# Load every extracted letter image and derive its label from the parent folder name.
LETTERS_FOLDER = "extracted_letter_images"
# NOTE(review): not referenced by any cell visible here. The original comment said the
# extension depends on how tensorflow saves models, but this notebook trains an
# sklearn SVM -- confirm whether this constant is still needed.
MODEL_FILENAME = "captcha_model.hdf5"
#MODEL_LABELS_FILENAME = "model_labels.dat"

data = []
labels = []
skipped_files = []  # image paths OpenCV could not decode

print("Gathering data and labels...")
for image_file in paths.list_images(LETTERS_FOLDER):
    image = cv2.imread(image_file)
    if image is None:
        # cv2.imread signals an unreadable/corrupt file by returning None instead of
        # raising; skip it here rather than crashing inside cvtColor below.
        skipped_files.append(image_file)
        continue

    # grayscale
    image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # resize to 20x20 (project helper)
    image = resize_to_fit(image, 20, 20)

    # the label is the name of the folder that directly contains the image
    label = image_file.split(os.path.sep)[-2]

    data.append(image)
    labels.append(label)

if skipped_files:
    print("Warning: skipped {} unreadable image(s), e.g. {}".format(
        len(skipped_files), skipped_files[0]))

if not data:
    # Fail here with a clear message instead of letting a later cell fail on an empty array.
    raise ValueError("No readable images found under '{}'".format(LETTERS_FOLDER))

# Scale 8-bit pixel intensities to [0, 1] floats.
data = np.array(data, dtype="float32") / 255.0
labels = np.array(labels)
print("done")

Gathering data and labels...
done


In [9]:
# Hold out 25% of the samples for testing; random_state pins the shuffle so the
# split is the same on every run.
print("split into training and test data")
(X_train, X_test, Y_train, Y_test) = train_test_split(data, labels, test_size=0.25, random_state=0)

# Flatten each image into one feature row per sample. train_test_split already
# returns NumPy arrays for NumPy input, so no extra np.array() copy is needed, and
# -1 lets NumPy work out the row length from the remaining dimensions.
X_train = X_train.reshape(len(X_train), -1)
X_test = X_test.reshape(len(X_test), -1)

print('Training data and target sizes: \n{}, {}'.format(X_train.shape,Y_train.shape))
print('Test data and target sizes: \n{}, {}'.format(X_test.shape,Y_test.shape))

split into training and test data
Training data and target sizes: 
(29058, 400), (29058,)
Test data and target sizes: 
(9686, 400), (9686,)


In [16]:
# Create the linear SVM classifier (fitting happens in the next cell).
#
# random_state is pinned (matching the random_state=0 used for the train/test split)
# so that repeated runs of the notebook train the same model; with random_state=None
# the solver's internal shuffling differs from run to run.
#
# NOTE(review): the saved output of the fit cell shows max_iter=100000, i.e. it was
# produced before max_iter was raised to the value below -- re-run to refresh it.
# NOTE(review): per the scikit-learn docs, `loss`, `penalty` and `dual` are ignored
# when multi_class='crammer_singer'; they are kept only to leave the call unchanged.
classifier = svm.LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=10000000,
     multi_class='crammer_singer', penalty='l2', random_state=0, tol=0.0001,
     verbose=0)


In [17]:
# Train the SVM on the flattened training images; as the last expression of the
# cell, the fitted estimator returned by fit() is displayed as the cell output.
classifier.fit(X_train, Y_train)



LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=100000,
     multi_class='crammer_singer', penalty='l2', random_state=None,
     tol=0.0001, verbose=0)

In [18]:
# Predict once on the held-out set and reuse those predictions for the accuracy.
# classifier.score(X_test, Y_test) would run a second prediction pass over X_test
# to compute the same number.
clf_predictions = classifier.predict(X_test)
print("Accuracy: {}%".format(metrics.accuracy_score(Y_test, clf_predictions) * 100))

Accuracy: 99.3186041709684%
