In [9]:
import cv2
import os 
import numpy as np 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import classification_report, confusion_matrix 
from tqdm import tqdm 
from skimage import feature 
from sklearn.ensemble import RandomForestClassifier 
from sklearn.svm import SVC 
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import GaussianNB

In the initial phases, `SGDClassifier` was used for its efficiency in handling large datasets via incremental learning (`partial_fit`). However, now that the primary dataset has a concentrated size (~2,650 images), memory constraints are no longer a concern.

Therefore, this notebook transitions to the normal batch training method using standard solvers (e.g. logistic regression, SVC, and exact decision tree construction). Unlike SGD, which approximates the optimal solution, these solvers calculate the global optimum (or stable local optima) by analyzing the entire dataset simultaneously. This pivot is intended to ensure maximum stability, reproducibility, and precision, eliminating the variance associated with stochastic gradient estimation or smaller datasets.

In [3]:
# Root folder of the dataset; the 'Positive' and 'Negative' class
# subdirectories are resolved relative to it.
DATASET_PATH = 'dataset_augmented'

# Target resolution in pixels: every image is resized to this square size
# before feature extraction.
IMG_WIDTH = IMG_HEIGHT = 224

In [5]:
print('Getting file paths and labels')

# Same extension whitelist the HOG loader applies, so stray non-image files
# in the class folders never receive a label.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

positive_path = os.path.join(DATASET_PATH, 'Positive')
negative_path = os.path.join(DATASET_PATH, 'Negative')

image_paths = []
labels = []

# os.listdir() returns entries in arbitrary order. Sorting fixes the order of
# the path list, so the seeded split below selects the same files on every
# machine and every run.
for class_dir, class_label in ((positive_path, 1), (negative_path, 0)):
    for filename in sorted(os.listdir(class_dir)):
        if not filename.lower().endswith(IMAGE_EXTENSIONS):
            continue
        image_paths.append(os.path.join(class_dir, filename))
        labels.append(class_label)

image_paths = np.array(image_paths)
labels = np.array(labels)

# Stratified 75/25 split over file paths (1 = Positive, 0 = Negative).
X_train_paths, X_test_paths, y_train, y_test = train_test_split(
    image_paths, labels, test_size=0.25, random_state=42, stratify=labels
)

print(f'Training test size: {len(X_train_paths)}')
print(f'Testing test size: {len(X_test_paths)}')

Getting file paths and labels
Training test size: 1987
Testing test size: 663


In [6]:
def load_and_extract_hog(path, label):
    """Load every image in a directory and compute its HOG descriptor.

    Files are selected by extension (.jpg, .jpeg, .png; case-insensitive) and
    processed in sorted filename order. Each image is resized to
    IMG_WIDTH x IMG_HEIGHT, converted to grayscale, and described with a
    9-orientation HOG using 16x16-pixel cells and 2x2-cell blocks.

    Args:
        path: Directory containing the images of a single class.
        label: Class label attached to every image loaded from ``path``.

    Returns:
        A ``(data, labels)`` tuple of two parallel lists: one HOG feature
        vector per image that could be read, and ``label`` repeated once per
        vector. Files for which ``cv2.imread`` returns ``None`` are skipped.
    """
    # os.listdir() returns entries in arbitrary order. Sorting pins the order
    # of the extracted rows, which a seeded train/test split downstream relies
    # on to be reproducible.
    files = sorted(
        f for f in os.listdir(path)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )

    data = []
    labels = []

    for f in tqdm(files, desc=f"Loading label {label}"):
        img_path = os.path.join(path, f)
        image = cv2.imread(img_path)
        if image is None:
            # Best effort: an unreadable file is skipped rather than aborting the run.
            continue

        image = cv2.resize(image, (IMG_WIDTH, IMG_HEIGHT))
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        hog_feat = feature.hog(
            gray,
            orientations=9,
            pixels_per_cell=(16, 16),
            cells_per_block=(2, 2),
            transform_sqrt=True,
            block_norm='L1'
        )

        data.append(hog_feat)
        labels.append(label)

    return data, labels

In [7]:
print("--- 1. PREPARING DATA (Extracting Features) ---")

# One extraction pass per class folder; the integer is the class label
# attached to every image in that folder (1 = Positive, 0 = Negative).
pos_data, pos_labels = load_and_extract_hog(
    os.path.join(DATASET_PATH, 'Positive'), 1
)
neg_data, neg_labels = load_and_extract_hog(
    os.path.join(DATASET_PATH, 'Negative'), 0
)

# Positives first, then negatives, so row i of X pairs with entry i of y.
X = np.array([*pos_data, *neg_data])
y = np.array([*pos_labels, *neg_labels])

print(f"\nFeature Matrix Shape: {X.shape}")
print(f"Labels Shape: {y.shape}")

# Stratified 75/25 hold-out split with a fixed seed.
# NOTE(review): DATASET_PATH names an augmented dataset. If it holds several
# augmented variants of one source image, a random split can place variants on
# both sides and inflate test scores. TODO confirm, and split by source image if so.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
    stratify=y,
)

--- 1. PREPARING DATA (Extracting Features) ---


Loading label 1: 100%|██████████| 1325/1325 [01:02<00:00, 21.36it/s] 
Loading label 0: 100%|██████████| 1325/1325 [01:12<00:00, 18.32it/s]


Feature Matrix Shape: (2650, 6084)
Labels Shape: (2650,)





In [15]:
print("\n--- 2. STARTING MODEL SELECTION ---")

# Candidate classifiers, all fitted on the same training features.
models = {
    "SGD (Previous Baseline)": SGDClassifier(max_iter=1000, tol=1e-3, random_state=42),
    "Logistic Regression": LogisticRegression(max_iter=1000, random_state=42),
    "KNN (k=5)": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=42),
    # probability=True was dropped: it makes fit() run an extra internal
    # 5-fold cross-validation to calibrate probabilities, but this comparison
    # only calls predict(), whose output does not depend on that calibration.
    # It also keeps both SVC candidates configured the same way.
    "SVM (Linear)": SVC(kernel='linear', C=1.0, random_state=42),
    "SVM (RBF Kernel)": SVC(kernel='rbf', C=1.0, gamma='scale', random_state=42)
}

# Maps model name -> classification_report dict computed on the test split.
results = {}

for name, model in models.items():
    print(f'\nTraining {name}')
    model.fit(X_train, y_train)

    print(f'\nEvaluating {name}')
    y_pred = model.predict(X_test)

    # output_dict=True returns the report as nested dicts, so individual
    # metrics can be looked up by class name when the results are compared.
    report = classification_report(y_test, y_pred, target_names=['No Crack (0)', 'Crack (1)'], output_dict=True)
    results[name] = report


--- 2. STARTING MODEL SELECTION ---

Training SGD (Previous Baseline)

Evaluating SGD (Previous Baseline)

Training Logistic Regression

Evaluating Logistic Regression

Training KNN (k=5)

Evaluating KNN (k=5)

Training Naive Bayes

Evaluating Naive Bayes

Training Random Forest

Evaluating Random Forest

Training SVM (Linear)

Evaluating SVM (Linear)

Training SVM (RBF Kernel)

Evaluating SVM (RBF Kernel)


In [16]:
print('\n\n--- FINAL MODEL SELECTION RESULTS (HOG 16x16) ---')

# The winner is the model with the highest macro-averaged F1 on the test split;
# on a tie, max() keeps the first model in insertion order.
best_model = max(
    results,
    key=lambda model_name: results[model_name]['macro avg']['f1-score'],
)

banner = "=========================================="

for name in results:
    report = results[name]
    winner_tag = 'WINNER' if name == best_model else ''

    # Pull the four headline metrics out of the nested report dict.
    acc = report['accuracy']
    crack_f1 = report['Crack (1)']['f1-score']
    no_crack_f1 = report['No Crack (0)']['f1-score']
    macro_f1 = report['macro avg']['f1-score']

    print(banner)
    print(f"               {name} {winner_tag}")
    print(banner)
    print(f"   Accuracy                           {acc:.2f}")
    print(f"   Crack F1-Score                     {crack_f1:.2f}")
    print(f"   No Crack F1-Score                  {no_crack_f1:.2f}")
    print(f"   Macro Avg F1                       {macro_f1:.2f}")
    print("\n")



--- FINAL MODEL SELECTION RESULTS (HOG 16x16) ---
               SGD (Previous Baseline) 
   Accuracy                           0.68
   Crack F1-Score                     0.62
   No Crack F1-Score                  0.72
   Macro Avg F1                       0.67


               Logistic Regression 
   Accuracy                           0.73
   Crack F1-Score                     0.72
   No Crack F1-Score                  0.74
   Macro Avg F1                       0.73


               KNN (k=5) 
   Accuracy                           0.54
   Crack F1-Score                     0.21
   No Crack F1-Score                  0.68
   Macro Avg F1                       0.44


               Naive Bayes 
   Accuracy                           0.63
   Crack F1-Score                     0.53
   No Crack F1-Score                  0.70
   Macro Avg F1                       0.61


               Random Forest WINNER
   Accuracy                           0.77
   Crack F1-Score                     0.78


```
==========================================
               Random Forest WINNER
==========================================
   Accuracy                           0.77
   Crack F1-Score                     0.78
   No Crack F1-Score                  0.76
   Macro Avg F1                       0.77
   ```
Random Forest gives the best result metric-wise: it has the highest macro-average F1 score of all the models compared.