In [137]:
import sklearn.svm as svm
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os

In [138]:
def featureExtractor(image):
     dilated = cv2.dilate(image, np.ones((3,3), np.uint8))
     contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
     aspect_ratios = []
     white_ratios = []
     white_edge_ratios = []
     edges = cv2.Canny(image, 200, 200)
     for contour in contours:
          rect = cv2.minAreaRect(contour)  # get the minimum area rectangle
          box = cv2.boxPoints(rect)
          box = np.int0(box)
          #extract the vertices of the rectangle
          x_values = [coord[0] for coord in box]
          y_values = [coord[1] for coord in box]

          min_x = min(x_values)
          max_x = max(x_values)
          min_y = min(y_values)
          max_y = max(y_values)
          width = max_x - min_x
          height = max_y - min_y
          #calculate the area of the rectangle
          area = width * height
          aspect_ratio = width / height
          if aspect_ratio > 1:
               aspect_ratio = 1 / aspect_ratio
          white_pixels = np.sum(image[min_y:max_y, min_x:max_x] > 128)
          white_ratio = white_pixels / area
          white_edge = np.sum(edges[min_y:max_y, min_x:max_x] > 128)
          white_edge_ratio = white_edge / area
          aspect_ratios.append(aspect_ratio)
          white_ratios.append(white_ratio)
          #find the sift features of the pixels in the rectangle
          white_edge_ratios.append(white_edge_ratio)
     return np.mean(aspect_ratios), np.mean(white_ratios),np.mean(white_edge_ratios)
def siftExtractor(image):
     sift_features = []
     sift = cv2.SIFT_create()
     _, deses = sift.detectAndCompute(image, None)
     if deses is None:
          return []
     for des in deses:
          if des is not None:
               sift_features.append(des)
     return  sift_features



In [139]:
def loadImages(amount=200,skip=0):
    dir = "./Preprocessed Dataset"
    images = []
    classes = []
    for folder in os.listdir(dir):
        path = os.path.join(dir, folder)
        i=amount
        j=skip
        for file in os.listdir(path):
            if j>0:
                j-=1
                continue
            img_path = os.path.join(path, file)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            images.append(img)
            classes.append(folder)
            i-=1
            if i==0:
                break
    return images, classes
        

In [140]:
import matplotlib.pyplot as plt
import numpy as np

# Apply the feature extractor to each image
sift_descriptors = []
images, classes = loadImages(400,0)
for i in range(len(images)):
     sift_descriptors.extend(siftExtractor(images[i]))
    




In [141]:
from sklearn.cluster import KMeans
k = 4*28*4  # Number of clusters first 4 is number of forms of a letter,28 is number of letters,4 is number of fonts
kmeans = KMeans(n_clusters=k, random_state=42)
kmeans.fit(sift_descriptors)


In [142]:
def bag_of_words_histogram(image,kmeans):
    feature_vector = np.zeros(k+1)
    for desc in siftExtractor(image):
        if desc is not None:
            desc = desc.astype(np.double)  # Convert desc to 'double'
            desc = desc.reshape(1, -1)
            kmeans_pred = kmeans.predict(desc)[0]
            feature_vector[kmeans_pred] += 1
    return feature_vector

In [143]:
x_train = []
y_train = classes

for i in range(len(images)):
    x_train.append(bag_of_words_histogram(images[i],kmeans))


In [144]:
x_test = []
images, classes = loadImages(200,400)
y_test = classes
for i in range(len(images)):
    x_test.append(bag_of_words_histogram(images[i],kmeans))

In [169]:
svm_model = svm.SVC(gamma= 0.0005, C= 0.0005, kernel='linear')
svm_model.fit(np.array(x_train), y_train)
print(svm_model.score(np.array(x_train), y_train),svm_model.score(np.array(x_test), y_test))

0.999375 0.99625


In [154]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'C': [0.0005, 0.0007,0.0001, 0.007,0.005,0.001,0.07 ,0.01,0.05,0.7,0.1,0.5],
    'gamma': [0.0005, 0.0007,0.0001, 0.007,0.005,0.001,0.07 ,0.01,0.05,0.7,0.1,0.5],
    'kernel': ['linear']
}

# Create a GridSearchCV object
grid = GridSearchCV(svm.SVC(), param_grid, refit=True, verbose=2, cv=3)

# Fit the model to the data
grid.fit(np.array(x_train), y_train)

# Print the best parameters
print(grid.best_params_)

# Score the model
print(grid.score(np.array(x_train), y_train), grid.score(np.array(x_test), y_test))

Fitting 3 folds for each of 144 candidates, totalling 432 fits
[CV] END ..............C=0.0005, gamma=0.0005, kernel=linear; total time=   0.0s
[CV] END ..............C=0.0005, gamma=0.0005, kernel=linear; total time=   0.0s
[CV] END ..............C=0.0005, gamma=0.0005, kernel=linear; total time=   0.0s
[CV] END ..............C=0.0005, gamma=0.0007, kernel=linear; total time=   0.0s
[CV] END ..............C=0.0005, gamma=0.0007, kernel=linear; total time=   0.0s
[CV] END ..............C=0.0005, gamma=0.0007, kernel=linear; total time=   0.0s
[CV] END ..............C=0.0005, gamma=0.0001, kernel=linear; total time=   0.0s
[CV] END ..............C=0.0005, gamma=0.0001, kernel=linear; total time=   0.0s
[CV] END ..............C=0.0005, gamma=0.0001, kernel=linear; total time=   0.0s
[CV] END ...............C=0.0005, gamma=0.007, kernel=linear; total time=   0.0s
[CV] END ...............C=0.0005, gamma=0.007, kernel=linear; total time=   0.0s
[CV] END ...............C=0.0005, gamma=0.007,

In [136]:
from joblib import dump

# Save the model
dump(svm_model, 'svm_model2.joblib') 

['svm_model.joblib']

In [145]:
dump(kmeans, 'kmeans_model.joblib') 

['kmeans_model.joblib']