In [32]:
import cv2
import os 
import numpy as np 
from sklearn.model_selection import train_test_split
from skimage.feature import local_binary_pattern, hog
from tqdm import tqdm 
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import classification_report

In [8]:
# Root folder of the dataset; the loading cell joins 'Positive' and 'Negative' onto this path
DATASET_PATH = './dataset/'
# Every image is resized to IMG_WIDTH x IMG_HEIGHT before feature extraction
IMG_WIDTH = 224
IMG_HEIGHT = 224
# Number of images per batch requested from the feature generators
BATCH_SIZE = 32

In [9]:
print('Getting file paths and labels')

positive_path = os.path.join(DATASET_PATH, 'Positive')
negative_path = os.path.join(DATASET_PATH, 'Negative')

image_paths = []
labels = []

# Label 1 = files under 'Positive', label 0 = files under 'Negative'.
# os.listdir returns entries in arbitrary order, so the names are sorted:
# otherwise the seeded split below could differ between machines/filesystems.
for class_dir, class_label in ((positive_path, 1), (negative_path, 0)):
    for filename in sorted(os.listdir(class_dir)):
        image_paths.append(os.path.join(class_dir, filename))
        labels.append(class_label)

# NumPy arrays so the generators can fancy-index them with shuffled indices
image_paths = np.array(image_paths)
labels = np.array(labels)

# 75/25 split, stratified so both classes keep the same proportion in each part
X_train_paths, X_test_paths, y_train, y_test = train_test_split(
    image_paths, labels, test_size=0.25, random_state=42, stratify=labels
)

print(f'Training set size: {len(X_train_paths)}')
print(f'Testing set size: {len(X_test_paths)}')

Getting file paths and labels
Training test size: 30000
Testing test size: 10000


# Getting the best descriptor and detector

## descriptor only (no keypoint detector): using LBP

In [10]:
def feature_generator_lbp(image_paths, labels, batch_size):
    """Endlessly yield shuffled mini-batches of LBP histogram features.

    Each image is read, resized to (IMG_WIDTH, IMG_HEIGHT), converted to
    grayscale and described by a normalised 10-bin histogram of its
    uniform LBP codes (P=8, R=1).

    Parameters
    ----------
    image_paths : np.ndarray
        Array of image file paths (indexed with an index array below).
    labels : np.ndarray
        Array of labels aligned with ``image_paths``.
    batch_size : int
        Maximum number of images per yielded batch; the last batch of a
        pass may be smaller.

    Yields
    ------
    (np.ndarray, np.ndarray)
        Features of shape (n_images_in_batch, 10) and the matching labels.

    Raises
    ------
    OSError
        If an image cannot be read by ``cv2.imread``.
    """
    num_samples = len(image_paths)

    while True:
        # New random order for every pass over the data
        indices = np.arange(num_samples)
        np.random.shuffle(indices)

        shuffled_paths = image_paths[indices]
        shuffled_labels = labels[indices]

        for i in range(0, num_samples, batch_size):
            batch_paths = shuffled_paths[i:i+batch_size]
            batch_labels = shuffled_labels[i:i+batch_size]

            batch_features = []

            # leave=False: clear the bar after each batch so a long training
            # run is not buried under one finished bar per batch
            for img_path in tqdm(batch_paths, desc='Batch Progress', leave=False):
                image = cv2.imread(img_path)
                if image is None:
                    # cv2.imread signals failure by returning None; fail here
                    # with the path instead of an opaque error inside resize
                    raise OSError(f'could not read image: {img_path}')
                image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))

                gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

                # gray_image_eq = cv2.equalizeHist(gray_image)

                lbp = local_binary_pattern(gray_image, P=8, R=1, method='uniform')

                # Bin edges 0..10 -> 10 bins
                (hist, _) = np.histogram(lbp.ravel(), bins = np.arange(0, 11), range=(0, 10))

                hist = hist.astype('float')

                # Normalise to frequencies; the epsilon guards against division by zero
                hist /= (hist.sum() + 1e-6)

                batch_features.append(hist)

            yield np.array(batch_features), np.array(batch_labels)

In [11]:
# Verification: pull a single batch to confirm the LBP pipeline runs end to end
train_gen_lbp = feature_generator_lbp(X_train_paths, y_train, BATCH_SIZE)

print('fetching one batch of feature vectors to test')
sample_batch_features, sample_batch_labels = next(train_gen_lbp) 

print('pipeline complete, ready for training')
print(f'shape of one batch of features: {sample_batch_features.shape}') # (BATCH_SIZE, 10): one 10-bin histogram per image
print(f'shape of one batch of labels: {sample_batch_labels.shape}')  # (BATCH_SIZE,)
print(f'example feature vector (first image in batch):\n {sample_batch_features[0]}') # 10 normalised bin frequencies

fetching one batch of feature vectors to test


Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 78.31it/s]

pipeline complete, ready for training
shape of one batch of features: (32, 10)
shape of one batch of labels: (32,)
example feature vector (first image in batch:
 [0.01747848 0.04125478 0.0455397  0.15909997 0.26143973 0.21257175
 0.09659997 0.04878827 0.05024314 0.06698422])





## detector with descriptor (fast + brief) - not worth exploring

In [12]:
def feature_generator_fast(image_paths, labels, batch_size):
    """Endlessly yield shuffled mini-batches of mean FAST+BRIEF descriptors.

    Keypoints come from a FAST detector (threshold 5, non-max suppression
    off); BRIEF descriptors are computed at those keypoints and averaged
    column-wise into one vector per image. Images without descriptors get a
    32-element zero vector.

    Because this is a generator, the banner print and the detector setup run
    on the first ``next()`` call, not when the function is called.

    Parameters
    ----------
    image_paths : np.ndarray
        Array of image file paths.
    labels : np.ndarray
        Array of labels aligned with ``image_paths``.
    batch_size : int
        Maximum number of images per yielded batch.

    Yields
    ------
    (np.ndarray, np.ndarray)
        Per-image feature vectors for the batch and the matching labels.

    Raises
    ------
    OSError
        If an image cannot be read by ``cv2.imread``.
    """
    print("--- RUNNING THE NEW, CORRECTED FAST GENERATOR V2 ---")
    fast = cv2.FastFeatureDetector_create(nonmaxSuppression=False)
    fast.setThreshold(5)
    # NOTE(review): cv2.xfeatures2d presumably requires the opencv-contrib build - confirm
    brief = cv2.xfeatures2d.BriefDescriptorExtractor_create()

    num_samples = len(image_paths)

    while True:
        # New random order for every pass over the data
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        shuffled_paths = image_paths[indices]
        shuffled_labels = labels[indices]

        for i in range(0, num_samples, batch_size):
            batch_paths = shuffled_paths[i:i + batch_size]
            batch_labels = shuffled_labels[i:i + batch_size]

            batch_features = []

            print(f'processing batch at index: {i}')
            # leave=False: do not leave one finished bar behind per batch
            for img_path in tqdm(batch_paths, desc='Batch Progress', leave=False):
                image = cv2.imread(img_path)
                if image is None:
                    # cv2.imread signals failure by returning None; fail here
                    # with the path instead of an opaque error inside resize
                    raise OSError(f'could not read image: {img_path}')
                image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))

                gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

                keypoints = fast.detect(gray_image, None)

                # compute() may drop keypoints it cannot describe, hence the reassignment
                keypoints, descriptors = brief.compute(gray_image, keypoints)

                if descriptors is not None:
                    # NOTE(review): BRIEF descriptors are presumably bit-packed bytes, so a
                    # byte-wise mean is only a coarse summary - confirm this is intended
                    feature_vector = np.mean(descriptors, axis=0)
                else:
                    feature_vector = np.zeros(32)

                batch_features.append(feature_vector)

            yield np.array(batch_features), np.array(batch_labels)

In [13]:
# Verification: pull a single batch to confirm the FAST+BRIEF pipeline runs end to end
train_gen_fast = feature_generator_fast(X_train_paths, y_train, BATCH_SIZE)

print('fetching one batch of feature vectors to test')
sample_fast_batch_features, sample_fast_batch_labels = next(train_gen_fast) 

print('pipeline complete, ready for training')
print(f'shape of one batch of features: {sample_fast_batch_features.shape}') # (BATCH_SIZE, 32): one mean descriptor per image
print(f'shape of one batch of labels: {sample_fast_batch_labels.shape}')  # (BATCH_SIZE,)
print(f'example feature vector (first image in batch):\n {sample_fast_batch_features[0]}') # 32 values

fetching one batch of feature vectors to test
--- RUNNING THE NEW, CORRECTED FAST GENERATOR V2 ---
processing batch at index: 0


Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 104.36it/s]

pipeline complete, ready for training
shape of one batch of features: (32, 32)
shape of one batch of labels: (32,)
example feature vector (first image in batch:
 [133.9202346  121.7659824  134.4627566  125.57360704 140.17888563
 133.57595308 131.46686217 143.62463343 138.28797654 117.98064516
 132.4340176  126.78592375 125.69090909 121.54252199 121.71964809
 136.15894428 116.30205279 119.74604106 131.04633431 126.45337243
 133.3888563  140.28211144 119.38181818 125.10205279 136.45865103
 124.67331378 129.57653959 131.30146628 124.86568915 132.28621701
 118.66686217 124.2856305 ])





### FAST is suspiciously fast, since a detector + descriptor combo usually takes a while,
so an isolated evaluation will be done

In [None]:
# SAMPLE_SIZE = 100

# fast = cv2.FastFeatureDetector_create(threshold=5, nonmaxSuppression=False)
# brief = cv2.xfeatures2d.BriefDescriptorExtractor_create()

# def get_descriptor_count(image_path):
#     img = cv2.imread(image_path)
#     img = cv2.resize(img, (224, 224))
#     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    
#     keypoints_found = fast.detect(gray, None)
#     keypoints_kept, descriptors = brief.compute(gray, keypoints_found)
    
#     return len(descriptors) if descriptors is not None else 0

# print("--- Analyzing Cracked Images ---")
# positive_path = os.path.join(DATASET_PATH, 'Positive')
# positive_files = [os.path.join(positive_path, fname) for fname in os.listdir(positive_path)[:SAMPLE_SIZE]]
# positive_counts = [get_descriptor_count(path) for path in tqdm(positive_files, desc="Cracked")]

# print("\n--- Analyzing Uncracked Images ---")
# negative_path = os.path.join(DATASET_PATH, 'Negative')
# negative_files = [os.path.join(negative_path, fname) for fname in os.listdir(negative_path)[:SAMPLE_SIZE]]
# negative_counts = [get_descriptor_count(path) for path in tqdm(negative_files, desc="Uncracked")]

# print("\n\n--- FINAL DIAGNOSTIC REPORT ---")
# print(f"Average descriptors for CRACKED images: {np.mean(positive_counts):.2f}")
# print(f"Average descriptors for UNCRACKED images: {np.mean(negative_counts):.2f}")
# print(f"\nYour 'hero' image had {positive_counts[0]} descriptors.")

--- Analyzing Cracked Images ---


Cracked: 100%|██████████| 100/100 [00:00<00:00, 112.08it/s]



--- Analyzing Uncracked Images ---


Uncracked: 100%|██████████| 100/100 [00:00<00:00, 116.85it/s]



--- FINAL DIAGNOSTIC REPORT ---
Average descriptors for CRACKED images: 2593.14
Average descriptors for UNCRACKED images: 2032.97

Your 'hero' image had 1375 descriptors.





--- Analyzing Cracked Images ---

Cracked: 100%|██████████| 10000/10000 [01:02<00:00, 159.63it/s]

--- Analyzing Uncracked Images ---

Uncracked: 100%|██████████| 10000/10000 [00:44<00:00, 224.80it/s]


--- FINAL DIAGNOSTIC REPORT ---

Average descriptors for CRACKED images: 3583.86

Average descriptors for UNCRACKED images: 2135.19

Your 'hero' image had 1375 descriptors.


Very bad result: the uncracked images also produce a very large number of descriptors, which shows that FAST + BRIEF is a bad combo

## orb

In [15]:
def feature_generator_orb(image_paths, labels, batch_size):
    """Endlessly yield shuffled mini-batches of mean ORB descriptors.

    ORB (at most 500 features) detects and describes keypoints on the
    resized grayscale image; the descriptors are averaged column-wise into
    one vector per image. Images without descriptors get a 32-element zero
    vector.

    Parameters
    ----------
    image_paths : np.ndarray
        Array of image file paths.
    labels : np.ndarray
        Array of labels aligned with ``image_paths``.
    batch_size : int
        Maximum number of images per yielded batch.

    Yields
    ------
    (np.ndarray, np.ndarray)
        Per-image feature vectors for the batch and the matching labels.

    Raises
    ------
    OSError
        If an image cannot be read by ``cv2.imread``.
    """
    orb = cv2.ORB_create(nfeatures=500)

    num_samples = len(image_paths)

    while True:
        # New random order for every pass over the data
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        shuffled_paths = image_paths[indices]
        shuffled_labels = labels[indices]

        for i in range(0, num_samples, batch_size):
            batch_paths = shuffled_paths[i:i + batch_size]
            batch_labels = shuffled_labels[i:i + batch_size]

            batch_features = []

            # leave=False: do not leave one finished bar behind per batch
            for img_path in tqdm(batch_paths, desc='Batch Progress', leave=False):
                image = cv2.imread(img_path)
                if image is None:
                    # cv2.imread signals failure by returning None; fail here
                    # with the path instead of an opaque error inside resize
                    raise OSError(f'could not read image: {img_path}')
                image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
                gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

                keypoints, descriptors = orb.detectAndCompute(gray_image, None)

                if descriptors is not None:
                    feature_vector = np.mean(descriptors, axis=0)
                else:
                    # No keypoints found: fall back to an all-zero vector of the same length
                    feature_vector = np.zeros(32)

                batch_features.append(feature_vector)

            yield np.array(batch_features), np.array(batch_labels)

In [16]:
# Verification: pull a single batch to confirm the ORB pipeline runs end to end
train_gen_orb = feature_generator_orb(X_train_paths, y_train, BATCH_SIZE)

print('fetching one batch of feature vectors to test')
sample_orb_batch_features, sample_orb_batch_labels = next(train_gen_orb) 

print('pipeline complete, ready for training')
print(f'shape of one batch of features: {sample_orb_batch_features.shape}') # (BATCH_SIZE, 32): one mean descriptor per image
print(f'shape of one batch of labels: {sample_orb_batch_labels.shape}')  # (BATCH_SIZE,)
print(f'example feature vector (first image in batch):\n {sample_orb_batch_features[0]}') # 32 values; all zeros when no descriptors were found

fetching one batch of feature vectors to test


Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 154.45it/s]

pipeline complete, ready for training
shape of one batch of features: (32, 32)
shape of one batch of labels: (32,)
example feature vector (first image in batch:
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0.])





## akaze

In [17]:
def feature_generator_akaze(image_paths, labels, batch_size):
    """Endlessly yield shuffled mini-batches of mean AKAZE descriptors.

    AKAZE (default settings) detects and describes keypoints on the resized
    grayscale image; the descriptors are averaged column-wise into one
    vector per image. Images without descriptors get a 61-element zero
    vector.

    Parameters
    ----------
    image_paths : np.ndarray
        Array of image file paths.
    labels : np.ndarray
        Array of labels aligned with ``image_paths``.
    batch_size : int
        Maximum number of images per yielded batch.

    Yields
    ------
    (np.ndarray, np.ndarray)
        Per-image feature vectors for the batch and the matching labels.

    Raises
    ------
    OSError
        If an image cannot be read by ``cv2.imread``.
    """
    akaze = cv2.AKAZE_create()

    num_samples = len(image_paths)

    while True:
        # New random order for every pass over the data
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        shuffled_paths = image_paths[indices]
        shuffled_labels = labels[indices]

        for i in range (0, num_samples, batch_size):
            batch_paths = shuffled_paths[i:i + batch_size]
            batch_labels = shuffled_labels[i:i + batch_size]

            batch_features = []

            # leave=False: do not leave one finished bar behind per batch
            for img_path in tqdm(batch_paths, desc='Batch Progress', leave=False):
                image = cv2.imread(img_path)
                if image is None:
                    # cv2.imread signals failure by returning None; fail here
                    # with the path instead of an opaque error inside resize
                    raise OSError(f'could not read image: {img_path}')
                image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
                gray_img = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

                keypoints, descriptors = akaze.detectAndCompute(gray_img, None)

                if descriptors is not None:
                    feature_vectors = np.mean(descriptors, axis=0)
                else:
                    # No keypoints found: fall back to an all-zero vector of the same length
                    feature_vectors = np.zeros(61)

                batch_features.append(feature_vectors)

            yield np.array(batch_features), np.array(batch_labels)

In [18]:
# Verification: pull a single batch to confirm the AKAZE pipeline runs end to end
train_gen_akaze = feature_generator_akaze(X_train_paths, y_train, BATCH_SIZE)

print('fetching one batch of feature vectors to test')
sample_akaze_batch_features, sample_akaze_batch_labels = next(train_gen_akaze) 

print('pipeline complete, ready for training')
print(f'shape of one batch of features: {sample_akaze_batch_features.shape}') # (BATCH_SIZE, 61): one mean descriptor per image
print(f'shape of one batch of labels: {sample_akaze_batch_labels.shape}')  # (BATCH_SIZE,)
print(f'example feature vector (first image in batch):\n {sample_akaze_batch_features[0]}') # 61 values

fetching one batch of feature vectors to test


Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 94.12it/s]

pipeline complete, ready for training
shape of one batch of features: (32, 61)
shape of one batch of labels: (32,)
example feature vector (first image in batch:
 [106.13333333  41.4        102.46666667  65.13333333  88.71111111
  25.6        137.44444444 123.84444444 132.57777778 124.02222222
 146.71111111  31.97777778  20.95555556  67.37777778 234.64444444
 131.77777778 143.15555556 163.91111111 124.28888889 225.17777778
  96.82222222 156.68888889  99.02222222  72.75555556   8.44444444
   8.77777778  15.71111111  77.91111111 107.66666667  73.13333333
 133.53333333 128.11111111 142.15555556 126.37777778 126.71111111
 145.53333333 103.62222222  72.37777778 107.62222222 134.82222222
 143.06666667 161.         173.24444444 122.64444444 118.48888889
 146.42222222  41.62222222 103.73333333  42.97777778 113.55555556
  72.46666667  20.55555556 100.37777778 113.97777778 115.53333333
 116.06666667  92.28888889 209.44444444 165.24444444 202.35555556
  30.86666667])





## HOG

In [19]:
def feature_generator_hog(image_paths, labels, batch_size):
    """Endlessly yield shuffled mini-batches of HOG feature vectors.

    Each image is read, resized to (IMG_WIDTH, IMG_HEIGHT), converted to
    grayscale and described with scikit-image's ``hog`` (9 orientations,
    8x8-pixel cells, 2x2-cell blocks, sqrt transform, L1 block norm).

    Parameters
    ----------
    image_paths : np.ndarray
        Array of image file paths.
    labels : np.ndarray
        Array of labels aligned with ``image_paths``.
    batch_size : int
        Maximum number of images per yielded batch.

    Yields
    ------
    (np.ndarray, np.ndarray)
        Per-image HOG vectors for the batch and the matching labels.

    Raises
    ------
    OSError
        If an image cannot be read by ``cv2.imread``.
    """
    num_samples = len(image_paths)

    while True:
        # New random order for every pass over the data
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        shuffled_paths = image_paths[indices]
        shuffled_labels = labels[indices]

        for i in range(0, num_samples, batch_size):
            batch_paths = shuffled_paths[i: i + batch_size]
            batch_labels = shuffled_labels[i: i + batch_size]

            batch_features = []

            for img_path in batch_paths:
                image = cv2.imread(img_path)
                if image is None:
                    # cv2.imread signals failure by returning None; fail here
                    # with the path instead of an opaque error inside resize
                    raise OSError(f'could not read image: {img_path}')
                image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
                gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

                hog_features = hog(
                    gray_image,
                    orientations=9,
                    pixels_per_cell=(8,8),
                    cells_per_block=(2,2),
                    transform_sqrt=True,
                    block_norm='L1'
                )

                batch_features.append(hog_features)

            yield np.array(batch_features), np.array(batch_labels)

In [20]:
# Verification: pull a single batch to confirm the HOG pipeline runs end to end
train_gen_hog = feature_generator_hog(X_train_paths, y_train, BATCH_SIZE)

print('fetching one batch of feature vectors to test')
sample_hog_batch_features, sample_hog_batch_labels = next(train_gen_hog) 

print('pipeline complete, ready for training')
print(f'shape of one batch of features: {sample_hog_batch_features.shape}') # (BATCH_SIZE, 26244): one HOG vector per image
print(f'shape of one batch of labels: {sample_hog_batch_labels.shape}')  # (BATCH_SIZE,)
print(f'example feature vector (first image in batch):\n {sample_hog_batch_features[0]}') # 26244 values

fetching one batch of feature vectors to test
pipeline complete, ready for training
shape of one batch of features: (32, 26244)
shape of one batch of labels: (32,)
example feature vector (first image in batch:
 [0.06275973 0.01151257 0.01430782 ... 0.         0.02787323 0.00924073])


## MSER

In [22]:
def feature_generator_mser(image_paths, labels, batch_size):
    """Endlessly yield shuffled mini-batches of a single MSER-based feature.

    MSER regions are detected on the resized grayscale image and
    ``cv2.contourArea`` is summed over all of them, giving one scalar per
    image (wrapped in a one-element list so batches are 2-D).

    Parameters
    ----------
    image_paths : np.ndarray
        Array of image file paths.
    labels : np.ndarray
        Array of labels aligned with ``image_paths``.
    batch_size : int
        Maximum number of images per yielded batch.

    Yields
    ------
    (np.ndarray, np.ndarray)
        Features of shape (n_images_in_batch, 1) and the matching labels.

    Raises
    ------
    OSError
        If an image cannot be read by ``cv2.imread``.
    """
    mser = cv2.MSER_create()

    num_samples = len(image_paths)

    while True:
        # New random order for every pass over the data
        indices = np.arange(num_samples)
        np.random.shuffle(indices)
        shuffled_paths = image_paths[indices]
        shuffled_labels = labels[indices]

        for i in range(0, num_samples, batch_size):
            batch_paths = shuffled_paths[i:i + batch_size]
            batch_labels = shuffled_labels[i:i + batch_size]

            batch_features = []

            for img_path in batch_paths:
                image = cv2.imread(img_path)
                if image is None:
                    # cv2.imread signals failure by returning None; fail here
                    # with the path instead of an opaque error inside resize
                    raise OSError(f'could not read image: {img_path}')
                image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
                gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

                regions, _ = mser.detectRegions(gray_image)

                total_area = 0

                if regions is not None:
                    # NOTE(review): detectRegions presumably returns the pixel sets of each
                    # region rather than ordered contours, so contourArea may not be the true
                    # region area - confirm. Kept unchanged so that it stays identical to the
                    # MSER branch of extract_test_features.
                    total_area = sum(cv2.contourArea(region) for region in regions)

                batch_features.append([total_area])

            yield np.array(batch_features), np.array(batch_labels)
                    

In [23]:
# Verification: pull a single batch to confirm the MSER pipeline runs end to end
train_gen_mser = feature_generator_mser(X_train_paths, y_train, BATCH_SIZE)

print('fetching one batch of feature vectors to test')
sample_mser_batch_features, sample_mser_batch_labels = next(train_gen_mser) 

print('pipeline complete, ready for training')
print(f'shape of one batch of features: {sample_mser_batch_features.shape}') # (BATCH_SIZE, 1): one summed-area value per image
print(f'shape of one batch of labels: {sample_mser_batch_labels.shape}')  # (BATCH_SIZE,)
print(f'example feature vector (first image in batch):\n {sample_mser_batch_features[0]}') # a single value

fetching one batch of feature vectors to test
pipeline complete, ready for training
shape of one batch of features: (32, 1)
shape of one batch of labels: (32,)
example feature vector (first image in batch:
 [2470127.5])


# pre test
A quick test of all the local-descriptor pipelines, so that they can be screened before doing rigorous training and model evaluation

In [None]:
# SAMPLE_SIZE = 10000

# def get_descriptor_count_orb(image_path):
#     orb = cv2.ORB_create()
#     img = cv2.imread(image_path)
#     img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
#     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
#     _, descriptors = orb.detectAndCompute(gray, None)
#     return len(descriptors) if descriptors is not None else 0

# def get_descriptor_count_akaze(image_path):
#     akaze = cv2.AKAZE_create()
#     img = cv2.imread(image_path)
#     img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
#     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
#     _, descriptors = akaze.detectAndCompute(gray, None)
#     return len(descriptors) if descriptors is not None else 0

# def get_area_mser(image_path):
#     mser = cv2.MSER_create()
#     img = cv2.imread(image_path)
#     img = cv2.resize(img, (IMG_WIDTH, IMG_HEIGHT))
#     gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
#     regions, _ = mser.detectRegions(gray)
#     return sum(cv2.contourArea(r) for r in regions) if regions is not None else 0

# results = {}
# diagnostics_to_run = {
#     'ORB': get_descriptor_count_orb,
#     'AKAZE': get_descriptor_count_akaze,
#     'MSER': get_area_mser
# }

# positive_files = [os.path.join(positive_path, fname) for fname in os.listdir(positive_path)[:SAMPLE_SIZE]]
# negative_files = [os.path.join(negative_path, fname) for fname in os.listdir(negative_path)[:SAMPLE_SIZE]]

# for name, helper_function in diagnostics_to_run.items():
#     print(f'analyzing {name}')
    
#     positive_counts = [helper_function(path) for path in tqdm(positive_files, desc=f'Cracked ({name})')]
#     negative_counts = [helper_function(path) for path in tqdm(negative_files, desc=f'Uncracked ({name})')]
    
#     results[name] = {
#         'cracked_avg': np.mean(positive_counts),
#         'uncracked_avg': np.mean(negative_counts)
#     }

# print('final pre-test diagnostic report')

# for name, data in results.items():
#     cracked_avg = data['cracked_avg']
#     uncracked_avg = data['uncracked_avg']
    
#     ratio = cracked_avg / (uncracked_avg + 1e-6)
    
#     print(f'algorithm: {name}')
#     print(f'Avg features on CRACKED images: {cracked_avg}')
#     print(f'Avg features on UNCRACKED images: {uncracked_avg}')
#     print(f'Discriminative ratio (Cracked/Uncracked): {ratio}')

analyzing ORB






[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A

analyzing AKAZE






[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A

analyzing MSER






[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A[A[A[A



[A

final pre-test diagnostic report
algorithm: ORB
Avg features on CRACKED images: 161.965
Avg features on UNCRACKED images: 42.0077
Discriminative ratio (Cracked/Uncracked): 3.8556025715380144
algorithm: AKAZE
Avg features on CRACKED images: 41.587
Avg features on UNCRACKED images: 4.1175
Discriminative ratio (Cracked/Uncracked): 10.100058263495262
algorithm: MSER
Avg features on CRACKED images: 286262.9398
Avg features on UNCRACKED images: 935.64615
Discriminative ratio (Cracked/Uncracked): 305.9521374550067





```
final pre-test diagnostic report
algorithm: ORB
Avg features on CRACKED images: 161.965
Avg features on UNCRACKED images: 42.0077
Discriminative ratio (Cracked/Uncracked): 3.8556025715380144
algorithm: AKAZE
Avg features on CRACKED images: 41.587
Avg features on UNCRACKED images: 4.1175
Discriminative ratio (Cracked/Uncracked): 10.100058263495262
algorithm: MSER
Avg features on CRACKED images: 286262.9398
Avg features on UNCRACKED images: 935.64615
Discriminative ratio (Cracked/Uncracked): 305.9521374550067
```

## result:
ORB will be excluded from the final evaluation. The final score test will be a competition between our top candidates: AKAZE, MSER, LBP, and HOG

### Why FAST and ORB are excluded?
- FAST -> very close to a ratio of 1: not worth pursuing, because for almost every real feature there is one noise feature
- ORB -> a score of 3.8 to 4: the signal is now stronger than the noise, BUT there is still a lot of noise that could confuse the model. These features are a logically weak but potentially usable signal; if nothing stronger were available they could be used, but since there are 2 other algorithms much stronger than this one, ORB is sidelined
  
### Why is AKAZE included when MSER wins by a landslide?
This is a competition between two fundamentally different types of evidence: a simple but powerful quantity (MSER) vs a rich and detailed quality (potentially AKAZE)

MSER's single feature (total_area) is powerful but potentially brittle. It is only a one dimensional view of the problem. Whereas AKAZE's feature vector is a rich, high-dimensional summary of the image's key characteristics. This detailed description might be more robust to confusing scenarios (like shadow or stains) and could allow a model to learn a more nuanced decision boundary

We are including AKAZE because we need to test the hypothesis: "is it better to have one simple, overwhelmingly strong feature (MSER), or a collection of detailed, high-quality features that tell a richer story (AKAZE)?" Hence the need for a score test as the final judge

# Feature Selection

In [35]:
# One training step consumes one generator batch. Floor division means a
# trailing partial batch is not counted as a step.
NUM_TRAINING_STEPS = len(X_train_paths) // BATCH_SIZE

# Candidate pipelines for the baseline comparison, keyed by the name used in
# the progress messages and in the final report.
pipeline_generators = dict(
    LBP=feature_generator_lbp,
    HOG=feature_generator_hog,
    AKAZE=feature_generator_akaze,
    MSER=feature_generator_mser,
)

def extract_test_features(extractor_name, paths):
    """Extract features for every image in ``paths`` with the named pipeline.

    The per-image computation mirrors the corresponding training generator
    (same resize, grayscale conversion and extractor parameters).

    Parameters
    ----------
    extractor_name : str
        One of 'LBP', 'HOG', 'AKAZE' or 'MSER'.
    paths : iterable of str
        Image file paths, processed in the given order (no shuffling).

    Returns
    -------
    np.ndarray
        One feature row per image, in the same order as ``paths``.

    Raises
    ------
    ValueError
        If ``extractor_name`` is not a supported pipeline. Previously an
        unknown name silently produced an empty array.
    OSError
        If an image cannot be read by ``cv2.imread``.
    """
    supported = ('LBP', 'HOG', 'AKAZE', 'MSER')
    if extractor_name not in supported:
        raise ValueError(f'unknown extractor {extractor_name!r}; expected one of {supported}')

    print(f'extracting test features for {extractor_name}')

    test_features = []

    # Build only the OpenCV object this pipeline actually uses
    akaze = cv2.AKAZE_create() if extractor_name == 'AKAZE' else None
    mser = cv2.MSER_create() if extractor_name == 'MSER' else None

    for img_path in tqdm(paths, desc=f'Testing {extractor_name}'):
        image = cv2.imread(img_path)
        if image is None:
            # cv2.imread signals failure by returning None; fail here
            # with the path instead of an opaque error inside resize
            raise OSError(f'could not read image: {img_path}')
        image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        if extractor_name == 'LBP':
            # Normalised 10-bin histogram of uniform LBP codes
            lbp = local_binary_pattern(gray_image, P=8, R=1, method='uniform')
            hist, _ = np.histogram(lbp.ravel(), bins=np.arange(0, 11), range=(0, 10))
            hist = hist.astype('float')
            hist /= (hist.sum() + 1e-6)
            test_features.append(hist)
        elif extractor_name == 'HOG':
            hog_features = hog(gray_image, orientations=9, pixels_per_cell=(8, 8), cells_per_block=(2, 2), transform_sqrt=True, block_norm='L1')
            test_features.append(hog_features)
        elif extractor_name == 'AKAZE':
            # Column-wise mean of the descriptors, or zeros when none were found
            _, descriptors = akaze.detectAndCompute(gray_image, None)
            if descriptors is not None:
                feature_vector = np.mean(descriptors, axis=0)
            else:
                feature_vector = np.zeros(61)
            test_features.append(feature_vector)
        else:  # 'MSER' (name validated above)
            regions, _ = mser.detectRegions(gray_image)
            total_area = sum(cv2.contourArea(r) for r in regions) if regions is not None else 0
            test_features.append([total_area])

    return np.array(test_features)

# Per-pipeline classification reports (dicts), keyed by pipeline name.
# A later cell dumps this dict to JSON.
results = {}

# Train one fresh linear SGD classifier per feature pipeline, then score it on the held-out test paths
for pipeline_name, generator_function in pipeline_generators.items():
    print(f'training baseline model for: {pipeline_name}')
    
    # NOTE(review): per the scikit-learn docs partial_fit presumably runs a single pass per call,
    # so max_iter/tol may have no effect here - confirm.
    # NOTE(review): features are fed unscaled; SGD is usually sensitive to feature scale - confirm
    # this is acceptable for the baseline.
    model = SGDClassifier(max_iter=1000, tol=1e-3, random_state=42)
    train_generator = generator_function(X_train_paths, y_train, BATCH_SIZE)
    
    # Incremental training: one partial_fit call per generator batch
    for _ in tqdm(range(int(NUM_TRAINING_STEPS)), desc=f'Training {pipeline_name}'):
        X_batch, y_batch = next(train_generator)
        # classes is passed on every call so the label set is known from the first batch
        model.partial_fit(X_batch, y_batch, classes=np.array([0, 1]))
    
    X_test_features = extract_test_features(pipeline_name, X_test_paths)
    y_pred = model.predict(X_test_features)
    # output_dict=True returns nested dicts instead of a formatted string, so the scores can be compared and saved
    report = classification_report(y_test, y_pred, target_names=['No Crack (0)', 'Crack (1)'], output_dict=True)
    results[pipeline_name] = report

print('final baseline comparison report')

# The winner is the pipeline with the highest F1-score on the 'Crack (1)' class
best_pipeline = max(results, key=lambda p: results[p]['Crack (1)']['f1-score'])

# Hand-formatted table per pipeline, with the best one tagged 'WINNER'
for pipeline_name, report in results.items():
    # f1_crack is not used by the prints below
    f1_crack = report['Crack (1)']['f1-score']
    print("==========================================")
    print(f"               {pipeline_name} {'WINNER' if pipeline_name == best_pipeline else ''}")
    print("==========================================")
    print(f"              precision    recall  f1-score   support")
    print(f"No Crack (0)      {report['No Crack (0)']['precision']:.2f}         {report['No Crack (0)']['recall']:.2f}      {report['No Crack (0)']['f1-score']:.2f}      {report['No Crack (0)']['support']}")
    print(f"   Crack (1)      {report['Crack (1)']['precision']:.2f}         {report['Crack (1)']['recall']:.2f}      {report['Crack (1)']['f1-score']:.2f}      {report['Crack (1)']['support']}")
    # The accuracy row reuses the macro-avg support for its sample count
    print(f"\n   Accuracy                           {report['accuracy']:.2f}     {report['macro avg']['support']}")
    print(f"   Macro Avg      {report['macro avg']['precision']:.2f}         {report['macro avg']['recall']:.2f}      {report['macro avg']['f1-score']:.2f}      {report['macro avg']['support']}")
    print(f"Weighted Avg      {report['weighted avg']['precision']:.2f}         {report['weighted avg']['recall']:.2f}      {report['weighted avg']['f1-score']:.2f}      {report['weighted avg']['support']}")

training baseline model for: LBP


Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 118.12it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 121.20it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 122.13it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 160.46it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 132.59it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 121.03it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 133.31it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 105.23it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 117.39it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 101.64it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 140.71it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 126.57it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 123.13it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 164.76it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 122.96it/s]
Batch Progress: 100%|████

extracting test features for LBP


Testing LBP: 100%|██████████| 10000/10000 [01:31<00:00, 109.34it/s]


training baseline model for: HOG


Training HOG: 100%|██████████| 937/937 [03:40<00:00,  4.24it/s]


extracting test features for HOG


Testing HOG: 100%|██████████| 10000/10000 [01:25<00:00, 117.24it/s]


training baseline model for: AKAZE


Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 288.60it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 352.00it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 341.78it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 356.99it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 348.74it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 358.34it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 336.27it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 350.85it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 353.18it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 358.68it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 375.72it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 351.56it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 367.88it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 350.23it/s]
Batch Progress: 100%|██████████| 32/32 [00:00<00:00, 354.34it/s]
Batch Progress: 100%|████

extracting test features for AKAZE


Testing AKAZE: 100%|██████████| 10000/10000 [00:28<00:00, 354.67it/s]


training baseline model for: MSER


Training MSER: 100%|██████████| 937/937 [01:28<00:00, 10.58it/s]


extracting test features for MSER


Testing MSER: 100%|██████████| 10000/10000 [00:28<00:00, 351.53it/s]

final baseline comparison report
               LBP 
              precision    recall  f1-score   support
No Crack (0)      0.66         0.86      0.75      5000.0
   Crack (1)      0.80         0.56      0.66      5000.0

   Accuracy                           0.71     10000.0
   Macro Avg      0.73         0.71      0.71      10000.0
Weighted Avg      0.73         0.71      0.71      10000.0
               HOG 
              precision    recall  f1-score   support
No Crack (0)      0.87         0.97      0.92      5000.0
   Crack (1)      0.96         0.86      0.91      5000.0

   Accuracy                           0.91     10000.0
   Macro Avg      0.92         0.91      0.91      10000.0
Weighted Avg      0.92         0.91      0.91      10000.0
               AKAZE WINNER
              precision    recall  f1-score   support
No Crack (0)      0.92         0.96      0.94      5000.0
   Crack (1)      0.96         0.92      0.94      5000.0

   Accuracy                           0.




In [36]:
import json

In [40]:
# Load the cached baseline report if one exists; otherwise write the freshly
# computed `results` to disk.
results_filename = './Result/baseline_results.json'

try:
    with open(results_filename, 'r') as f:
        loaded_results = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    # Only a missing or unparsable cache triggers a save. The previous bare
    # `except:` also swallowed KeyError, KeyboardInterrupt, etc. and then
    # overwrote the cache.
    print(f'saving to {results_filename}')

    # Create the output folder first so the write cannot fail on a fresh checkout
    os.makedirs(os.path.dirname(results_filename), exist_ok=True)
    with open(results_filename, 'w') as f:
        json.dump(results, f, indent=4)

    print('saved successfully')
else:
    lbp_f1_score = loaded_results['LBP']['Crack (1)']['f1-score']
    print(f"The LBP F1-score for cracks was: {lbp_f1_score}")

The LBP F1-score for cracks was: 0.6625939849624061
