In [1]:
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import os
import pandas as pd
from skimage import io
from sklearn import svm
from sklearn.metrics import accuracy_score
import cv2
from skimage.feature import graycomatrix, graycoprops
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn import metrics
from tqdm import tqdm
from PIL import Image, ImageEnhance
import time
import pickle

In [2]:
#uncomment the following line if hinge_and_cold directory is not included in the project
# !git clone https://github.com/Hero2323/hinge_and_cold_feature_extraction.git
os.chdir('hinge_and_cold_feature_extraction')
os.chdir('src')
#!pip install imutils
import hinge_feature_extraction as hinge
import cold_feature_extraction as cold
os.chdir('../../')

In [3]:
# 
male_paths = os.listdir('./input/cmp23-handwritten-males-vs-females/Males/Males')
female_paths = os.listdir('./input/cmp23-handwritten-males-vs-females/Females/Females')
image_labels = []
image_paths = []
for i in range(len(male_paths)):
    if male_paths[i] == 'M152.jpg':
        continue
    image_paths.append(os.path.join('./input/cmp23-handwritten-males-vs-females/Males/Males', male_paths[i]))
    image_labels.append(0)
for i in range(len(female_paths)):
    if female_paths[i] == 'F87.jpg':
        continue
    image_paths.append(os.path.join('./input/cmp23-handwritten-males-vs-females/Females/Females', female_paths[i]))
    image_labels.append(1)

In [4]:
# Split the paths into train and test
from sklearn.model_selection import train_test_split
train_paths, test_paths, train_labels, test_labels = train_test_split(image_paths, image_labels, test_size=0.25, random_state=42)
print("Toal Train size: {}".format(len(train_paths)))
print("Test size: {}".format(len(test_paths)))

Toal Train size: 270
Test size: 91


In [5]:
# Read the image and apply thresholding to it
# Then apply morphological operations to get rid of the noise
train_images, test_images = [], []
for i in tqdm(range(len(train_paths))): #use skimage's imread
    try:
        img = cv2.imread(train_paths[i], 0)
#         kernel = np.ones((3, 3))
#         img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel)
#         img = cv2.medianBlur(img, 3)
#         (thresh, img) = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        train_images.append(img)
    except  Exception as e:
        print(e , 'in - ', train_paths[i])
        break
for i in tqdm(range(len(test_paths))): #use skimage's imread
    try:
        img = cv2.imread(test_paths[i], 0)
#         kernel = np.ones((3, 3))
#         img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel)
#         img = cv2.medianBlur(img, 3)
#         (thresh, img) = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        test_images.append(img)
    except  Exception as e:
        print(e , 'in - ', test_paths[i])
        break


100%|██████████| 270/270 [00:50<00:00,  5.37it/s]
100%|██████████| 91/91 [00:15<00:00,  5.73it/s]


In [37]:
#GLCM Preprocessing
glcm_train_images, glcm_test_images = [], []
for i in tqdm(range(len(train_paths))): #use skimage's imread
    try:
        img = cv2.imread(train_paths[i], 0)
        kernel = np.ones((3, 3))
        img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel)
        img = cv2.medianBlur(img, 3)
        (thresh, img) = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        glcm_train_images.append(img)
    except  Exception as e:
        print(e , 'in - ', train_paths[i])
        break
for i in tqdm(range(len(test_paths))): #use skimage's imread
    try:
        img = cv2.imread(test_paths[i], 0)
        kernel = np.ones((3, 3))
        img = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel)
        img = cv2.medianBlur(img, 3)
        (thresh, img) = cv2.threshold(img, 128, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
        glcm_test_images.append(img)
    except  Exception as e:
        print(e , 'in - ', test_paths[i])
        break


100%|██████████| 270/270 [00:43<00:00,  6.26it/s]
100%|██████████| 91/91 [00:15<00:00,  6.03it/s]


In [38]:
# GLCM Classifier
def get_glcm_feature(img_list):
    features = []
    glcm_texture_features = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']
    used_features = ['energy','homogeneity']
    for img in tqdm(img_list):
        glcm = graycomatrix(img, distances=[1], angles=[0, np.pi / 2, np.pi, 3 * np.pi / 2], levels=256, normed=True)
        curr_img_features = []
        for feature in used_features:
            extracted_feature = np.array(graycoprops(glcm, feature), dtype='float64').flatten()
            curr_img_features.append(np.sum(extracted_feature))
        features.append(np.array(curr_img_features, dtype='float64').flatten())
    features = np.array(features, dtype='float64')
    return features
# train_glcm = get_glcm_feature(train_images)
# test_glcm = get_glcm_feature(test_images)
train_glcm = get_glcm_feature(glcm_train_images)
test_glcm = get_glcm_feature(glcm_test_images)

100%|██████████| 270/270 [00:52<00:00,  5.13it/s]
100%|██████████| 91/91 [00:18<00:00,  5.00it/s]


In [41]:
# GLCM Using SVM
glcm_svm = svm.SVC()
glcm_svm.fit(train_glcm, train_labels)
glcm_svm_predictions = glcm_svm.predict(test_glcm)
accuracy_score(test_labels, glcm_svm_predictions)

0.6263736263736264

In [52]:
# GLCM Using Random Forest Classifier
glcm_rfc = RandomForestClassifier(n_estimators= 500)
glcm_rfc.fit(train_glcm, train_labels)
glcm_rfc_predictions = glcm_rfc.predict(test_glcm)
accuracy_score(test_labels, glcm_rfc_predictions)

0.6153846153846154

In [12]:
# Hinge Classifier
hinge_classifier = hinge.Hinge((10, 3, False, True))
train_hinge_features = []
for img in tqdm(train_images):
    train_hinge_feature = hinge_classifier.get_hinge_features(img)
    train_hinge_features.append(train_hinge_feature)
test_hinge_features = []
for img in tqdm(test_images):
    test_hinge_feature = hinge_classifier.get_hinge_features(img)
    test_hinge_features.append(test_hinge_feature)

100%|██████████| 270/270 [04:07<00:00,  1.09it/s]
100%|██████████| 91/91 [01:19<00:00,  1.15it/s]


In [68]:
# Hinge using SVM
hinge_svm = svm.SVC()
hinge_svm.fit(train_hinge_features, train_labels)
hinge_rfc_predictions = hinge_svm.predict(test_hinge_features)
accuracy_score(test_labels, hinge_rfc_predictions)

0.6263736263736264

In [103]:
# Hinge using Random Forest Classifier
average_acc = 0
hinge_rfc = RandomForestClassifier(n_estimators= 500)
for _ in tqdm(range(25)):
    hinge_rfc.fit(train_hinge_features, train_labels)
    hinge_rfc_predictions = hinge_rfc.predict(test_hinge_features)
    average_acc += accuracy_score(test_labels, hinge_rfc_predictions)
average_acc /= 25
print(average_acc)

100%|██████████| 25/25 [01:51<00:00,  4.47s/it]

0.756923076923077





In [17]:
# Cold Classifier
cold_classifier = cold.Cold((10, 3, False, True))
train_cold_features = []
for img in tqdm(train_images):
    train_cold_feature = cold_classifier.get_cold_features(img)
    train_cold_features.append(train_cold_feature)
test_cold_features = []
for img in tqdm(test_images):
    test_cold_feature = cold_classifier. get_cold_features(img)
    test_cold_features.append(test_cold_feature)

  rhos_log_space = np.log10(rhos)
100%|██████████| 270/270 [04:25<00:00,  1.02it/s]
100%|██████████| 91/91 [01:28<00:00,  1.03it/s]


In [81]:
# Cold using SVM
cold_svm = svm.SVC()
cold_svm.fit(train_cold_features, train_labels)
cold_svm_predictions = cold_svm.predict(test_cold_features)
accuracy_score(test_labels, cold_svm_predictions)

0.6263736263736264

In [84]:
# Cold using Random Forest Classifer
average_acc = 0
cold_rfc = RandomForestClassifier(n_estimators= 1000)
for _ in tqdm(range(25)):
    cold_rfc.fit(train_cold_features, train_labels)
    cold_rfc_predictions = cold_rfc.predict(test_cold_features)
    average_acc += accuracy_score(test_labels, cold_rfc_predictions)
average_acc /= 25
print(average_acc)

100%|██████████| 25/25 [02:49<00:00,  6.78s/it]

0.738901098901099





In [None]:
# import shutil
# os.mkdir('test')
# for path in test_paths:
#     shutil.copy(path, 'test')

In [21]:
# Add up the hinge & cold features together
train_hinge_cold_features = []
for i in range(len(train_hinge_features)):
    train_hinge_cold_features.append(np.concatenate((train_hinge_features[i], train_cold_features[i])))
train_hinge_cold_features = np.array(train_hinge_cold_features, dtype='float64')
test_hinge_cold_features = []
for i in range(len(test_hinge_features)):
    test_hinge_cold_features.append(np.concatenate((test_hinge_features[i], test_cold_features[i])))
test_hinge_cold_features = np.array(test_hinge_cold_features, dtype='float64')

In [101]:
# Cold & Hinge using Random Forest Classifier
average_acc = 0
train_hinge_cold_rfc = RandomForestClassifier(n_estimators= 500)
for _ in tqdm(range(25)):
    train_hinge_cold_rfc.fit(train_hinge_cold_features, train_labels)
    train_hinge_cold_predictions = train_hinge_cold_rfc.predict(test_hinge_cold_features)
    average_acc += accuracy_score(test_labels, train_hinge_cold_predictions)
average_acc /= 25
print(average_acc)

100%|██████████| 25/25 [04:05<00:00,  9.82s/it]

0.7714285714285718





In [72]:
# Hinge with PCA
hinge_pca = PCA(n_components=50)
new_train_hinge_features = hinge_pca.fit_transform(train_hinge_features)
new_test_hinge_features = hinge_pca.fit_transform(test_hinge_features)
new_hinge_rcf = RandomForestClassifier(n_estimators= 1000)
new_hinge_rcf.fit(new_train_hinge_features, train_labels)
new_hinge_predictions = new_hinge_rcf.predict(new_test_hinge_features)
accuracy_score(test_labels, new_hinge_predictions)

0.6263736263736264

In [85]:
# Cold with PCA
cold_pca = PCA(n_components=50)
new_train_cold_features = hinge_pca.fit_transform(train_cold_features)
new_test_cold_features = hinge_pca.fit_transform(test_cold_features)
new_cold_rcf = RandomForestClassifier(n_estimators= 1000)
new_cold_rcf.fit(new_train_cold_features, train_labels)
new_cold_predictions = new_cold_rcf.predict(new_test_cold_features)
accuracy_score(test_labels, new_cold_predictions)

0.5934065934065934

In [88]:
#Decision Tree classifier
from sklearn import tree

tree_classifier = tree.DecisionTreeClassifier()

tree_classifier = tree_classifier.fit(train_glcm, train_labels)
glcm_dt_predictions = tree_classifier.predict(test_glcm)
print(accuracy_score(test_labels, glcm_rfc_predictions))

tree_classifier = tree_classifier.fit(train_hinge_features, train_labels)
hinge_dt_predictions = tree_classifier.predict(test_hinge_features)
print(accuracy_score(test_labels, hinge_dt_predictions))

tree_classifier = tree_classifier.fit(train_cold_features, train_labels)
cold_dt_predictions = tree_classifier.predict(test_cold_features)
print(accuracy_score(test_labels, cold_dt_predictions))


0.6153846153846154
0.5604395604395604
0.6373626373626373


In [104]:
# Save the best model --> Hinge
filename = 'hinge_rfc_checkpoint.ckpt'
pickle.dump(hinge_rfc, open(filename, 'wb'))

In [100]:
hinge_cold_clf = 'hinge_cold_checkpoint.ckpt'
pickle.dump(train_hinge_cold_rfc, open(hinge_cold_clf, 'wb'))   

In [34]:
if os.path.exists('test'):
    test_img_paths = []
    prediction_file = open('results.txt', 'a', buffering=1)
    timing_file = open('time.txt', 'a', buffering=1)
    for img in os.listdir('test'):
        test_img_paths.append('test/' + img)
    for img_path in test_img_paths:
        test_img = cv2.imread(img_path, 0)
        start = time.time()
        prediction = 1 - hinge_rfc.predict(np.reshape(hinge_classifier.get_hinge_features(test_img), (1, 780)))
        duration = time.time() - start
        if duration == 0 :
            duration = 0.001
        prediction_file.write(str(prediction[0]) + '\n')
        timing_file.write(str(round(duration, 2)) + '\n')
    prediction_file.close()
    timing_file.close()

In [None]:
# # Crop the white parts of the image by storing the index of the first (top left) black pixel and
# # the index of the last (bottom right) black pixel
# first_black_idx = [1e6, 1e6]
# last_black_idx = [0, 0]
# # first, get the idx of the top left black pixel
# # for i in range(len(train_images)):
# for i in range(1):
#     i = 1
#     done_first = False
#     done_second = False
#     for j in range(train_images[i].shape[0]):
#         for k in range(train_images[i].shape[1]):
#             if train_images[i][j][k] != 255:
#                 first_black_idx[0] = j
#                 first_black_idx[1] = k
#                 done_first = True
#                 break
#         if done_first:
#             break
#     for j in range(train_images[i].shape[0] - 1, 0, -1):
#         for k in range(train_images[i].shape[1] - 1, 0, -1):
#             if train_images[i][j][k] != 255:
#                 last_black_idx[0] = j
#                 last_black_idx[1] = k
#                 done_second = True
#                 break
#         if done_second:
#             break
