In [1]:
# Import necessary libraries
import os                        # Library to interact with the operating system
import cv2                       # OpenCV library for computer vision tasks
import numpy as np               # NumPy library for numerical operations
from sklearn.datasets import fetch_lfw_people   # Function to load the LFW dataset
from sklearn.model_selection import train_test_split   # Function to split dataset into train and test subsets

In [37]:
# Function to perform image augmentation using OpenCV
def augment_image(image):
    # Check if the image is grayscale (2-dimensional)
    if image.ndim == 2:  # Grayscale image
        # Convert the grayscale image to RGB format
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)

    rows, cols, _ = image.shape   # Get the dimensions of the image

    # Random rotation between -10 to 10 degrees
    random_angle = np.random.randint(-10, 11)   # Generate a random angle between -10 and 10 degrees
    M = cv2.getRotationMatrix2D((cols / 2, rows / 2), random_angle, 1)   # Get the rotation matrix for the random angle
    augmented_image = cv2.warpAffine(image, M, (cols, rows))   # Apply the rotation to the image using warpAffine

    # Random horizontal flipping
    if np.random.rand() > 0.5:   # Generate a random number between 0 and 1, and check if it's greater than 0.5
        flipped_image = cv2.flip(augmented_image, 1)  # 1 means horizontal flip. Flip the image horizontally
    else:
        flipped_image = augmented_image   # Keep the image as is (no horizontal flip)

    # Random brightness adjustment
    brightness_factor = np.random.uniform(0.7, 1.3)   # Generate a random brightness factor between 0.7 and 1.3
    hsv_image = cv2.cvtColor(flipped_image, cv2.COLOR_RGB2HSV)   # Convert the RGB image to HSV color space
    hsv_image[:, :, 2] = hsv_image[:, :, 2] * brightness_factor   # Adjust the brightness (V channel) by the brightness factor
    augmented_image = cv2.cvtColor(hsv_image, cv2.COLOR_HSV2RGB)   # Convert the HSV image back to RGB color space

    return augmented_image   # Return the augmented image

In [38]:
def augment_lfw_people_dataset(lfw_people, target_count=10, output_directory='lfw_augmented'):
    # Create a new directory for the augmented dataset
    augmented_dir = os.path.join(lfw_people.target_names[0], output_directory)   # Create the full path of the augmented directory
    os.makedirs(augmented_dir, exist_ok=True)   # Create the augmented directory if it doesn't exist

    # Loop through each label in the dataset
    for label_idx, label_name in enumerate(lfw_people.target_names):
        label_dir = os.path.join(augmented_dir, label_name)   # Create the full path of the label subdirectory
        os.makedirs(label_dir, exist_ok=True)   # Create the label subdirectory if it doesn't exist

        # Get images belonging to the current label
        label_images = lfw_people.images[lfw_people.target == label_idx]   # Fetch the images with the current label

        # Check if the label folder already has enough images (>= target_count)
        if len(label_images) >= target_count:   # If the label already has enough images
            selected_images = label_images[:target_count]   # Select the first target_count number of images
        else:
            # If the label folder has fewer images, duplicate and augment the existing images
            selected_images = []
            while len(selected_images) < target_count:
                for image in label_images:
                    selected_images.append(image)   # Augment the image and add to selected_images
                    if len(selected_images) == target_count:   # Check if we have enough augmented images
                        break

        # Perform augmentation for images with count < target_count
        for idx, image in enumerate(selected_images):
            image_path = os.path.join(label_dir, f'{label_name}_{idx}.png')   # Create the full path of the augmented image
            image = cv2.convertScaleAbs(image, alpha=(255.0))
            cv2.imwrite(image_path, image)   # Write the augmented image to the specified path

In [39]:
# Load the LFW dataset
lfw_people = fetch_lfw_people(min_faces_per_person=2, resize=0.4)   # Fetch the LFW dataset with specified parameters

# Augment the LFW dataset
augment_lfw_people_dataset(lfw_people, target_count=10)   # Augment the dataset with 10 images per class

In [40]:
lfw_people.target_names.shape

(1680,)

In [41]:
#This is splitting code:
import os
import shutil
import random

# Define the directory containing your data
data_directory = './Aaron Peirsol/lfw_augmented'

# Define the output directories for training and test data
train_directory = 'training_data'
test_directory = 'test_data'

# Create the output directories if they don't exist
os.makedirs(train_directory, exist_ok=True)
os.makedirs(test_directory, exist_ok=True)

# Loop through each person's data directory
for person_dir in os.listdir(data_directory):
    person_path = os.path.join(data_directory, person_dir)

    # Get all the image filenames in the person's directory
    image_files = [f for f in os.listdir(person_path) if f.endswith('.png')]

    # Shuffle the image files randomly
    random.shuffle(image_files)

    # Calculate the split index for training and test sets
    split_index = int(0.8 * len(image_files))

    # Split the image files into training and test sets
    train_images = image_files[:split_index]
    test_images = image_files[split_index:]

    # Move the training images to the training data directory
    for train_img in train_images:
        src = os.path.join(person_path, train_img)
        # dest = os.path.join(train_directory, person_dir, train_img)
        
        dest_dir = os.path.join(train_directory, person_dir)
        dest_dir = dest_dir.replace(" ", "_")
        dest = os.path.join(dest_dir, train_img)
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(src, dest)

    # Move the test images to the test data directory
    for test_img in test_images:
        src = os.path.join(person_path, test_img)
        # dest = dest.replace(" ", "_")
        dest_dir = os.path.join(test_directory, person_dir)
        dest_dir = dest_dir.replace(" ", "_")
        dest = os.path.join(dest_dir, test_img)
        os.makedirs(dest_dir, exist_ok=True)
        shutil.copy(src, dest)

print("Data split completed successfully!")

Data split completed successfully!


In [2]:
# Reading the images
# Define the output directories for training and test data
train_directory = 'training_data'
test_directory = 'test_data'

In [3]:
# Define the output directories for training and test data
train_directory = './training_data'
test_directory = './test_data'

x_train = []
y_train = []
# Loop through train data
for person_dir in os.listdir(train_directory):
    train_image = os.path.join(train_directory, person_dir)
    # Get all the image filenames in the person's directory
    image_files = [f for f in os.listdir(train_image) if f.endswith('.png')]
    x_train.extend(image_files)
    for label in image_files:
        y_train.append(person_dir)
    # label_image.extend(temp_label)

In [4]:
import numpy as np

In [5]:
from PIL import Image
arr = []
for idx, image_path in enumerate(x_train):
    image_path = f"./training_data/{y_train[idx]}/"+image_path
    image = Image.open(image_path)
    arr.append(np.asarray(image))
x_train = np.array(arr)

In [6]:
x_test = []
y_test = []
# Loop through train data
for person_dir in os.listdir(test_directory):
    test_image = os.path.join(test_directory, person_dir)
    # Get all the image filenames in the person's directory
    image_files = [f for f in os.listdir(test_image) if f.endswith('.png')]
    x_test.extend(image_files)
    for label in image_files:
        y_test.append(person_dir)
    # label_image.extend(temp_label)
    
arr = []
for idx, image_path in enumerate(x_test):
    image_path = f"./test_data/{y_test[idx]}/"+image_path
    image = Image.open(image_path)
    arr.append(np.asarray(image))
x_test = np.array(arr)

In [7]:
x_test.shape, len(y_test)

((3360, 50, 37), 3360)

In [8]:
x_train.shape, len(y_train)

((13440, 50, 37), 13440)

In [9]:
# X_train = X_train.reshape(-1, 50 * 37 * 3)
# X_test = X_test.reshape(-1, 50 * 37 * 3)

In [10]:
from skimage.feature import hog, local_binary_pattern
import xgboost as xgb
from sklearn.metrics import classification_report, accuracy_score
from skimage import exposure
  
def local_binary_patterns(image):
    
    # settings for LBP
    radius = 3
    n_points = 8 * radius
    
    image = cv2.cvtColor(image,cv2.COLOR_GRAY2RGB)
    
    # Convert the image depth to CV_8U
    image8bit = cv2.normalize(image, None, 0, 255, cv2.NORM_MINMAX).astype('uint8')

    # Convert the array to grayscale and reshape it to (height, width)
    gray1 = cv2.cvtColor(image8bit, cv2.COLOR_BGR2GRAY)

    lbp = local_binary_pattern(gray1, n_points, radius, method = "uniform")

    return lbp

def extract_color_histogram(image):
    import matplotlib.pyplot as plt
    image = image.astype("float32")
    
    image = cv2.cvtColor(image, cv2.COLOR_GRAY2RGB)
    

    # Calculate the histogram
    hist = cv2.calcHist([image], [0], None, [256], [0, 256])
    
    # Normalize the histogram
    cv2.normalize(hist, hist)
    
    # Flatten the histogram into a 1D feature vector
    features = hist.flatten()
    
    return features

# Function to extract combined features (HOG, SIFT, and LBP) from an image
def extract_combined_features(image):
    lbp_features = local_binary_patterns(image)
    hist_features = extract_color_histogram(lbp_features)
    return hist_features
    

In [11]:
# Extract combined features from the training and testing images
X_train_features = np.array([extract_combined_features(image) for image in x_train])
X_test_features = np.array([extract_combined_features(image) for image in x_test])

In [12]:
X_train_features.shape

(13440, 256)

In [13]:
from time import time
from sklearn.decomposition import PCA
def pca_implementation(X_train, X_test):
    # Compute a PCA (eigenfaces) on the face dataset (treated as unlabeled
    # dataset): unsupervised feature extraction / dimensionality reduction
    n_components = 60

    print("Extracting the top %d eigenfaces from %d faces"
          % (n_components, X_train.shape[0]))
    t0 = time()
    pca = PCA(n_components=n_components, svd_solver='randomized',
              whiten=True).fit(X_train)
    print("done in %0.3fs" % (time() - t0))

    print("Projecting the input data on the eigenfaces orthonormal basis")
    t0 = time()
    X_train_pca = pca.transform(X_train)
    X_test_pca = pca.transform(X_test)
    print("done in %0.3fs" % (time() - t0))
    return X_train_pca, X_test_pca

In [14]:
X_train, X_test= pca_implementation(X_train_features, X_test_features)

Extracting the top 60 eigenfaces from 13440 faces
done in 0.167s
Projecting the input data on the eigenfaces orthonormal basis
done in 0.016s


In [15]:
X_train.shape

(13440, 60)

In [16]:
from sklearn import preprocessing
le = preprocessing.LabelEncoder()
yy_train = le.fit_transform(y_train)

In [17]:
yy_test = le.fit_transform(y_test)

In [18]:
yy_test.shape, yy_train.shape

((3360,), (13440,))

#### Xg boost

In [59]:
# Train an XGBoost model
model = xgb.XGBClassifier(n_estimators=250,
                          max_depth=8,
                          objective= 'binary:logistic',
                          seed=27, 
                          tree_method='gpu_hist', gpu_id=0)
model.fit(X_train, yy_train)

In [60]:
X_test.shape

(3360, 60)

In [61]:
# Predict using the XGBoost model
y_pred = model.predict(X_test)

In [62]:
accuracy = accuracy_score(yy_test, y_pred)

# Print the results
print("Accuracy Score:", accuracy)

Accuracy Score: 0.8464285714285714


In [40]:
print("Classification Report:")
# Calculate classification report and accuracy score
classification_report_output = classification_report(yy_test, y_pred)
print(classification_report_output)

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       1.00      0.50      0.67         2
           2       1.00      1.00      1.00         2
           3       0.50      0.50      0.50         2
           4       0.00      0.00      0.00         2
           5       0.00      0.00      0.00         2
           6       0.50      1.00      0.67         2
           7       0.50      0.50      0.50         2
           8       0.50      0.50      0.50         2
           9       0.50      1.00      0.67         2
          10       1.00      1.00      1.00         2
          11       0.00      0.00      0.00         2
          12       1.00      1.00      1.00         2
          13       0.67      1.00      0.80         2
          14       0.50      0.50      0.50         2
          15       0.67      1.00      0.80         2
          16       0.00      0.00      0.00         2
    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


------

#### Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression()
model.fit(X_train, yy_train)

y_pred = model.predict(X_test)

Accuracy: 0.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  score = y_true == y_pred


In [21]:
accuracy = accuracy_score(yy_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.63125


-----

#### Random Forest

In [22]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(n_estimators=250, max_depth=8)
model.fit(X_train, yy_train)

# Predict
y_pred = model.predict(X_test)

# Score
accuracy_score(yy_test, y_pred)

  score = y_true == y_pred


0.0

In [23]:
accuracy_score(yy_test, y_pred)

0.2571428571428571