In [1]:
import cv2  # Handle grayscale loading
import numpy as np  # Array operations
import os  # Directory traversal
from sklearn.model_selection import train_test_split  # Data splitting
from sklearn.svm import SVC  # SVM classifier
from sklearn.metrics import accuracy_score  # Accuracy metric
from skimage.feature import hog  # HOG feature extraction
from sklearn.preprocessing import StandardScaler  # Feature normalization
import pickle  # Model serialization
import logging  # Logging setup
from datetime import datetime  # Timestamp logs

# Sub-action 1: Setup unique log file
log_dir = "activity_logs"  # Reuse directory
if not os.path.exists(log_dir):
    os.makedirs(log_dir)  # Why: Prevent directory errors
train_log = f'{log_dir}/model_build_{datetime.now().strftime("%Y%m%d_%H%M%S")}.log'
logging.basicConfig(filename=train_log, level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Sub-action 2: Constants 
DATA_PATH = "ready_face_data"  # From Phase 1 output
RESIZE_DIMS = (100, 100)  # Fixed size
MODEL_SAVE = "svm_face_model.pkl"  # Unique save name

def start_train_logging():  # Unique function
    """Sub-action: Kick off training logs."""
    logging.info("Initiating model build phase.")

def pull_features_from_gray(gray_input):  # tweaked HOG params
    """Sub-action: Compute HOG descriptors from grayscale input."""
    # Why: HOG captures edge orientations for face uniqueness
    hog_desc, _ = hog(gray_input, orientations=8, pixels_per_cell=(10, 10),  
                      cells_per_block=(2, 2), visualize=True, feature_vector=True)
    print(hog_desc)
    return hog_desc  # Output feature array


def gather_dataset_features(data_path, dims):  # Renamed
    """
    Main action: Traverse directories, load grayscale, extract features, assign labels.
    Sub-action: Validate data path.
    Testable: Log shapes, check non-empty.
    """
    if not os.path.exists(data_path):
        logging.error(f"Dataset path {data_path} absent.")
        raise FileNotFoundError(f"Dataset path {data_path} absent.")

    feature_list = []  # Collect all HOG features
    tag_list = []  # Corresponding student tags
    for subdir in os.listdir(data_path):  # Sub-action: Loop student subdirs
        subdir_path = os.path.join(data_path, subdir)
        if os.path.isdir(subdir_path):
            tag = subdir.split('_')[0]  # Sub-action: Extract tag from folder
            logging.info(f"Processing subdir {subdir} with tag {tag}")  # Log progress
            for file_entry in os.listdir(subdir_path):  # Sub-action: Loop files
                if file_entry.lower().endswith('.jpg'):
                    file_path = os.path.join(subdir_path, file_entry)
                    try:
                        loaded_gray = cv2.imread(file_path, cv2.IMREAD_GRAYSCALE)  # Sub-action: Load as gray
                        if loaded_gray is None:
                            logging.warning(f"Load failed for {file_entry}")
                            continue
                        adjusted_size = cv2.resize(loaded_gray, dims, interpolation=cv2.INTER_LINEAR)  # Sub-action: Resize
                        extracted_feats = pull_features_from_gray(adjusted_size)  # Sub-action: Get HOG
                        feature_list.append(extracted_feats)  # Add to list
                        tag_list.append(tag)  # Add tag
                        logging.debug(f"Feature added for {file_entry}, shape: {extracted_feats.shape}")  # Debug log
                    except Exception as proc_err:
                        logging.error(f"Processing error {file_entry}: {str(proc_err)}")  # Error log
    feature_array = np.array(feature_list)  # Sub-action: Convert to array
    tag_array = np.array(tag_list)
    logging.info(f"Dataset gathered: Features {feature_array.shape}, Tags {len(tag_array)}")  # Test log
    if len(feature_array) == 0:  # Sub-action: Empty check
        raise ValueError("No features gathered - check data path")
    return feature_array, tag_array  # Return for training

def fit_and_evaluate_svm(features, tags):  # Renamed
    """Main action: Normalize, split, fit SVM, evaluate.
    Sub-action: Scale features.
    Testable: Assert accuracy threshold.
    """
    normalizer = StandardScaler()  # Sub-action: Init scaler
    scaled_features = normalizer.fit_transform(features)  # Sub-action: Apply scaling
    logging.info(f"Scaled features shape: {scaled_features.shape}")  # Log shape

    train_feats, test_feats, train_tags, test_tags = train_test_split(scaled_features, tags, test_size=0.2, random_state=202)  # Sub-action: Split
    svm_classifier = SVC(kernel='rbf', probability=True, random_state=202)  # Sub-action: Init SVM
    svm_classifier.fit(train_feats, train_tags)  # Sub-action: Fit model
    predicted_tags = svm_classifier.predict(test_feats)  # Sub-action: Predict on test
    calc_accuracy = accuracy_score(test_tags, predicted_tags)  # Sub-action: Compute accuracy
    logging.info(f"SVM fit complete, accuracy: {calc_accuracy * 100:.2f}%")  # Log result

    # Sub-action: Test accuracy threshold (SRS req)
    assert calc_accuracy >= 0.95, f"Accuracy {calc_accuracy} below 95% - retrain needed"  # Test assertion
    print(f"Test: Model accuracy = {calc_accuracy * 100:.2f}%")  # Testable print

    tag_mapping = {t: i for i, t in enumerate(np.unique(tags))}  # Sub-action: Create label map
    save_bundle = {'svm_model': svm_classifier, 'feature_scaler': normalizer, 'tag_dict': tag_mapping}  # Unique keys
    with open(MODEL_SAVE, 'wb') as save_handle:  # Sub-action: Serialize
        pickle.dump(save_bundle, save_handle)
    logging.info(f"Model serialized to {MODEL_SAVE}")  # Log save
    return save_bundle, calc_accuracy  # Return for verification

if __name__ == "__main__":
    start_train_logging()  # Init logs
    try:
        feat_data, tag_data = gather_dataset_features(DATA_PATH, RESIZE_DIMS)  # Gather data
        model_info, acc_score = fit_and_evaluate_svm(feat_data, tag_data)  # Train
        print(f"Test: Training done - Accuracy {acc_score * 100:.2f}%, Model saved")  # Final test
    except Exception as main_err:
        logging.error(f"Main training error: {str(main_err)}")  # Catch-all

[0.2269072  0.2269072  0.13337903 ... 0.00887762 0.02113509 0.11827905]
[0.19502038 0.10015458 0.12970958 ... 0.0036391  0.02829301 0.05421362]
[0.22513158 0.03074129 0.22513158 ... 0.06585744 0.12107377 0.11444179]
[0.20453156 0.07887632 0.10833154 ... 0.03099074 0.14888167 0.08109303]
[0.29011537 0.14751022 0.29011537 ... 0.00823185 0.27209211 0.33108972]
[0.29136204 0.29136204 0.13604125 ... 0.         0.01019349 0.        ]
[0.07507157 0.         0.         ... 0.00254239 0.00503519 0.17027945]
[0.10995097 0.         0.00378243 ... 0.01041189 0.         0.01077333]
[0.12447274 0.021884   0.02838413 ... 0.17322929 0.02601332 0.0417574 ]
[0.05005176 0.02381888 0.01402057 ... 0.31723807 0.13994062 0.06886828]
[0.07470163 0.22865989 0.35784626 ... 0.01499797 0.35535383 0.22810871]
[0.2598838  0.18968453 0.2598838  ... 0.04176659 0.02432642 0.1582252 ]
[0.23089171 0.04147254 0.15293419 ... 0.00424888 0.0522335  0.20141107]
[0.37607811 0.02751029 0.01107178 ... 0.00525138 0.00215012 0.00