In [75]:
import sklearn.svm as svm
import numpy as np
import matplotlib.pyplot as plt
import cv2
import os

In [76]:
def featureExtractor(image):
    """Summarise an image by the mean of its per-contour box statistics.

    The per-contour measurements are produced by ``featureExtractor2``; this
    function only averages each of the three returned lists.

    Parameters
    ----------
    image : numpy.ndarray
        Image to analyse. Expected to be single-channel 8-bit, as produced by
        ``cv2.imread(..., cv2.IMREAD_GRAYSCALE)`` elsewhere in this notebook.

    Returns
    -------
    tuple
        ``(mean aspect ratio, mean white-pixel ratio, mean edge-pixel ratio)``.
        All three are NaN when no contour yields a box with non-zero area.
    """
    aspect_ratios, white_ratios, white_edge_ratios = featureExtractor2(image)
    if not aspect_ratios:
        # Nothing measurable: report NaN explicitly instead of letting
        # np.mean([]) emit a RuntimeWarning.
        return np.nan, np.nan, np.nan
    return np.mean(aspect_ratios), np.mean(white_ratios), np.mean(white_edge_ratios)


def featureExtractor2(image):
    """Measure every external contour of ``image`` and return the raw lists.

    The image is dilated with a 3x3 kernel, its external contours are found,
    and for each contour the axis-aligned extent of its minimum-area rectangle
    (clipped to the image bounds) is measured.

    Parameters
    ----------
    image : numpy.ndarray
        Image to analyse. Expected to be single-channel 8-bit, as produced by
        ``cv2.imread(..., cv2.IMREAD_GRAYSCALE)`` elsewhere in this notebook.

    Returns
    -------
    tuple of list
        ``(aspect_ratios, white_ratios, white_edge_ratios)``, one entry per
        contour whose box has non-zero area:

        * aspect ratio: shorter side / longer side of the box, in (0, 1];
        * white ratio: fraction of box pixels in ``image`` brighter than 128;
        * white edge ratio: fraction of box pixels set in the Canny edge map.
    """
    dilated = cv2.dilate(image, np.ones((3, 3), np.uint8))
    contours, _ = cv2.findContours(dilated, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    edges = cv2.Canny(image, 200, 200)
    img_height, img_width = image.shape[:2]

    aspect_ratios = []
    white_ratios = []
    white_edge_ratios = []
    for contour in contours:
        # Corners of the minimum-area (possibly rotated) rectangle.
        # np.int0 was removed in NumPy 2.0; astype(np.intp) is its equivalent.
        box = cv2.boxPoints(cv2.minAreaRect(contour)).astype(np.intp)

        # A rotated rectangle can poke outside the image. Clip the corners so
        # that a negative coordinate is not interpreted as "count from the
        # end" when slicing, and so the area only covers real pixels.
        xs = np.clip(box[:, 0], 0, img_width)
        ys = np.clip(box[:, 1], 0, img_height)
        min_x, max_x = int(xs.min()), int(xs.max())
        min_y, max_y = int(ys.min()), int(ys.max())

        width = max_x - min_x
        height = max_y - min_y
        area = width * height
        if area == 0:
            # Degenerate box: every ratio below would divide by zero.
            continue

        # Always express the aspect ratio as short side / long side.
        aspect_ratio = min(width, height) / max(width, height)
        white_pixels = np.sum(image[min_y:max_y, min_x:max_x] > 128)
        white_edge = np.sum(edges[min_y:max_y, min_x:max_x] > 128)

        aspect_ratios.append(aspect_ratio)
        white_ratios.append(white_pixels / area)
        white_edge_ratios.append(white_edge / area)
    return aspect_ratios, white_ratios, white_edge_ratios
def siftExtractor(image):
    """Compute the SIFT descriptors of ``image``.

    Returns a list holding one descriptor per row of the descriptor matrix
    produced by ``detectAndCompute``, or an empty list when no descriptors
    were produced.
    """
    detector = cv2.SIFT_create()
    _keypoints, descriptors = detector.detectAndCompute(image, None)
    if descriptors is None:
        return []
    return [row for row in descriptors if row is not None]
def rectangleExtractor(image):
    """Cut ``image`` into one crop per external contour of its closed version.

    The image is morphologically closed with a 20x20 rectangular kernel, the
    external contours of the result are found, and for each contour the
    axis-aligned extent of its minimum-area rectangle is cropped out of the
    original ``image``.

    Parameters
    ----------
    image : numpy.ndarray
        Image to split into regions.

    Returns
    -------
    list of numpy.ndarray
        The non-empty crops, each converted to grayscale (if it had three
        dimensions) and to ``uint8``. The list is empty when no contour
        produces a non-empty crop.
    """
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (20, 20))
    processed = cv2.morphologyEx(image, cv2.MORPH_CLOSE, kernel)
    contours, _ = cv2.findContours(processed, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    sub_images = []
    for contour in contours:
        # Corners of the minimum-area (possibly rotated) rectangle.
        # np.int0 was removed in NumPy 2.0; astype(np.intp) is its equivalent.
        box = cv2.boxPoints(cv2.minAreaRect(contour)).astype(np.intp)

        # Clamp at 0: a negative corner would otherwise be treated by the
        # slice below as an offset from the end of the axis.
        min_x = max(int(box[:, 0].min()), 0)
        max_x = max(int(box[:, 0].max()), 0)
        min_y = max(int(box[:, 1].min()), 0)
        max_y = max(int(box[:, 1].max()), 0)

        sub_image = image[min_y:max_y, min_x:max_x]
        # Check for emptiness first; cv2.cvtColor rejects empty input.
        if sub_image.size == 0:
            continue
        if len(sub_image.shape) == 3:
            sub_image = cv2.cvtColor(sub_image, cv2.COLOR_BGR2GRAY)
        if sub_image.dtype != np.uint8:
            sub_image = sub_image.astype(np.uint8)
        sub_images.append(sub_image)
    # The original fell off the end here and therefore always returned None.
    return sub_images


          


In [77]:
# NOTE: `dir` shadows the builtin, but the name is kept because a later cell
# reads it when loading the hold-out images.
dir = "./Preprocessed Dataset"
images = []
classes = []
# Number of directory entries read from each class folder.
TRAIN_FILES_PER_CLASS = 200

# Each sub-folder of `dir` is one class; its name is used as the label.
for folder in os.listdir(dir):
    path = os.path.join(dir, folder)
    if not os.path.isdir(path):
        # A stray file in the dataset root would make os.listdir(path) raise.
        continue
    # Take the first TRAIN_FILES_PER_CLASS entries of the listing. Entries are
    # counted whether or not they load, so the hold-out cell, which skips the
    # same number of entries, never overlaps with this selection.
    for file in os.listdir(path)[:TRAIN_FILES_PER_CLASS]:
        img_path = os.path.join(path, file)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals an unreadable file by returning None; keeping
            # it would crash the feature extraction further down.
            print(f"Skipping unreadable image: {img_path}")
            continue
        images.append(img)
        classes.append(folder)
        

In [78]:
import matplotlib.pyplot as plt
import numpy as np

# Containers for per-image features. Only `sift_descriptors` and
# `spread_classes` are filled by the loop below.
aspect_ratios = []
white_ratios = []
edge_ratios = []
sift_descriptors = []
spread_classes = []

# Pool the SIFT descriptors of every image into one flat list and record one
# label per image.
for image, label in zip(images, classes):
    sift_descriptors.extend(siftExtractor(image))
    spread_classes.append(label)
    




In [79]:
from sklearn.cluster import KMeans
# Number of clusters fitted on the pooled SIFT descriptors (4 * 26 = 104).
k = 26 * 4
kmeans = KMeans(random_state=42, n_clusters=k)
kmeans.fit(sift_descriptors)


In [80]:
def bag_of_words_histogram(image,kmeans):
    """Build the bag-of-visual-words histogram of ``image``.

    Every SIFT descriptor of the image is assigned to its nearest cluster of
    ``kmeans`` and the assignments are counted per cluster.

    Parameters
    ----------
    image : numpy.ndarray
        Image passed on to ``siftExtractor``.
    kmeans : fitted clustering model
        Must expose ``n_clusters`` and ``predict`` (e.g. a fitted
        ``sklearn.cluster.KMeans``).

    Returns
    -------
    numpy.ndarray
        Float vector of length ``kmeans.n_clusters + 1``. Entry ``i`` is the
        number of descriptors assigned to cluster ``i``; the last entry is
        always zero and is kept only so the vector length matches the
        ``k + 1`` layout this function has always produced.
    """
    # Size the histogram from the model that is actually used instead of the
    # notebook-level global `k`, so the two can never disagree.
    n_bins = kmeans.n_clusters + 1
    descriptors = [desc for desc in siftExtractor(image) if desc is not None]
    if not descriptors:
        # No keypoints found: all-zero histogram.
        return np.zeros(n_bins)
    # Predict all descriptors in one call (as doubles, like the original
    # per-descriptor conversion) rather than once per descriptor.
    words = kmeans.predict(np.asarray(descriptors, dtype=np.double))
    return np.bincount(words, minlength=n_bins).astype(np.float64)

In [81]:
# One bag-of-words histogram per loaded image, in the same order as `images`.
feature_vectors = [bag_of_words_histogram(image, kmeans) for image in images]


In [107]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the histograms for testing, stratified by class label.
x_train, x_test, y_train, y_test = train_test_split(
    np.array(feature_vectors),
    classes,
    test_size=0.2,
    random_state=42,
    stratify=classes,
)
# `gamma` is intentionally not passed: SVC only uses it for the 'rbf', 'poly'
# and 'sigmoid' kernels, so it had no effect on this linear model.
svm_model = svm.SVC(C=0.7, kernel='linear')
svm_model.fit(x_train, y_train)
# Accuracy on the training split, then on the test split.
print(svm_model.score(x_train, y_train), svm_model.score(x_test, y_test))

1.0 0.9875


In [88]:
# Classify the first loaded image as a quick sanity check.
feature_vector = bag_of_words_histogram(images[0], kmeans)
single_prediction = svm_model.predict([feature_vector])
print(single_prediction)


['IBM Plex Sans Arabic']


In [108]:
from sklearn.metrics import classification_report
# NOTE(review): this report covers every entry of `feature_vectors`, i.e. the
# same data that was split into train and test for fitting `svm_model`, so the
# numbers include samples the model was trained on.
all_predictions = svm_model.predict(np.array(feature_vectors))
print(classification_report(classes, all_predictions))

                      precision    recall  f1-score   support

IBM Plex Sans Arabic       1.00      1.00      1.00       200
            Lemonada       1.00      0.99      1.00       200
              Marhey       1.00      1.00      1.00       200
    Scheherazade New       1.00      0.99      1.00       200

            accuracy                           1.00       800
           macro avg       1.00      1.00      1.00       800
        weighted avg       1.00      1.00      1.00       800



In [109]:
# Rebuild `images` / `classes` from files that the first loading cell did not
# read: per class folder, skip the first HOLDOUT_SKIP directory entries and
# take the next HOLDOUT_FILES_PER_CLASS.
# NOTE(review): this relies on os.listdir returning the entries in the same
# order as it did in the first loading cell; the order is not guaranteed.
images = []
classes = []
HOLDOUT_SKIP = 200
HOLDOUT_FILES_PER_CLASS = 100
for folder in os.listdir(dir):
    path = os.path.join(dir, folder)
    if not os.path.isdir(path):
        # A stray file in the dataset root would make os.listdir(path) raise.
        continue
    holdout_files = os.listdir(path)[HOLDOUT_SKIP:HOLDOUT_SKIP + HOLDOUT_FILES_PER_CLASS]
    for file in holdout_files:
        img_path = os.path.join(path, file)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals an unreadable file by returning None; keeping
            # it would crash the feature extraction in the next cell.
            print(f"Skipping unreadable image: {img_path}")
            continue
        images.append(img)
        classes.append(folder)

In [110]:
# Recompute the histograms for the images currently held in `images`.
feature_vectors = [bag_of_words_histogram(image, kmeans) for image in images]

In [111]:
# Mean accuracy of the trained SVM on the current `feature_vectors` / `classes`.
holdout_accuracy = svm_model.score(np.array(feature_vectors), classes)
holdout_accuracy

0.975