In [55]:
import cv2
import numpy as np
import matplotlib.pyplot as plt
import os
import random
import pandas as pd
import seaborn as sns

from sklearn.metrics import confusion_matrix

from skimage.feature import local_binary_pattern
from skimage.feature import hog

from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.decomposition import PCA

In [56]:
# Mount Google Drive at /content/drive; the image folders used by the
# following cells are addressed under that mount point.
# NOTE(review): google.colab is presumably only importable inside Google Colab — confirm
# before running this notebook elsewhere.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [57]:
# Paths to the original and preprocessed image folders (paired by position, one pair per emotion)
original_folder_paths = ['/content/drive/MyDrive/positiveem/positiveem', '/content/drive/MyDrive/negativeem', '/content/drive/MyDrive/neutralem']
preprocessed_folder_paths = ['/content/drive/MyDrive/preprocessed_positive', '/content/drive/MyDrive/preprocessed_negative', '/content/drive/MyDrive/preprocessed_neutral']

# Extensions treated as images. Matching is done on the lower-cased file name
# so that files such as "IMG_001.JPG" are not silently skipped.
IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')  # Add more file types if needed

# Create preprocessed folders if they don't exist
for folder in preprocessed_folder_paths:
    os.makedirs(folder, exist_ok=True)

# The CLAHE parameters never change, so the operator is created once instead
# of once per image.
clahe = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))

# Loop through each folder and preprocess images:
# grayscale -> 5x5 Gaussian blur -> CLAHE -> save under the same file name.
for original_folder, preprocessed_folder in zip(original_folder_paths, preprocessed_folder_paths):
    for filename in os.listdir(original_folder):
        if not filename.lower().endswith(IMAGE_EXTENSIONS):
            continue

        # Read the image
        image_path = os.path.join(original_folder, filename)
        image = cv2.imread(image_path)

        # cv2.imread signals failure by returning None rather than raising
        if image is None:
            print(f"Failed to load image: {image_path}")
            continue

        # Convert to Grayscale
        gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

        # Apply Gaussian Blur
        blurred_image = cv2.GaussianBlur(gray_image, (5, 5), 0)

        # CLAHE
        clahe_image = clahe.apply(blurred_image)

        # Save the preprocessed image and report a failed write instead of
        # discarding the status flag
        save_path = os.path.join(preprocessed_folder, filename)
        success = cv2.imwrite(save_path, clahe_image)
        if not success:
            print(f"Failed to save image: {save_path}")


In [None]:
# Function to extract multiple features
def extract_features(image_path, face_cascade):
    """Compute edge, LBP and HOG feature maps for the first face found in an image.

    The image is read as grayscale, faces are detected with ``face_cascade``
    and only the first detection is used.

    Args:
        image_path: Path of the image file to read.
        face_cascade: Detector exposing ``detectMultiScale`` (a
            ``cv2.CascadeClassifier`` in this notebook).

    Returns:
        A tuple ``(edges, lbp, hog_image)`` for the first detected face, or
        ``(None, None, None)`` when the image cannot be read or no face is
        detected. Always returning a 3-tuple lets callers unpack the result
        and test for ``None`` instead of hitting a ``TypeError``.
    """
    no_features = (None, None, None)

    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    if image is None:
        # cv2.imread returns None for missing or unreadable files
        return no_features

    faces = face_cascade.detectMultiScale(image, 1.1, 4)
    if len(faces) == 0:
        return no_features

    # Only the first detection is used
    x, y, w, h = faces[0]
    face_crop = image[y:y+h, x:x+w]

    # 1. Edge Detection
    edges = cv2.Canny(face_crop, 100, 200)

    # 2. Local Binary Patterns (LBP)
    radius, n_points = 1, 8
    lbp = local_binary_pattern(face_crop, n_points, radius, method="uniform")

    # 3. Histogram of Oriented Gradients (HOG); only the visualisation image
    # is kept, the descriptor vector is discarded
    _, hog_image = hog(face_crop, orientations=8, pixels_per_cell=(16, 16),
                       cells_per_block=(1, 1), visualize=True)

    return edges, lbp, hog_image

# Function to batch extract features from images
def batch_extract_features(original_folder, face_cascade, feature_folder):
    """Extract and save edge/LBP/HOG feature images for every image in a folder.

    Each image file in ``original_folder`` is passed to ``extract_features``;
    the three results are written under ``feature_folder/edges``,
    ``feature_folder/lbp`` and ``feature_folder/hog`` using the source file
    name. Files that fail are reported and skipped (best-effort batch).

    Args:
        original_folder: Folder containing the input images.
        face_cascade: Face detector forwarded to ``extract_features``.
        feature_folder: Root output folder; the three sub-folders are created
            if missing.
    """
    print(f"Processing images in folder: {original_folder}")

    # Make sure the output sub-folders exist: cv2.imwrite does not create
    # missing directories.
    output_dirs = {
        name: os.path.join(feature_folder, name)
        for name in ('edges', 'lbp', 'hog')
    }
    for directory in output_dirs.values():
        os.makedirs(directory, exist_ok=True)

    for filename in os.listdir(original_folder):
        # Case-insensitive match so ".JPG"/".PNG" files are processed too
        if not filename.lower().endswith(('.jpg', '.png', '.jpeg')):
            continue

        print(f"Processing file: {filename}")
        image_path = os.path.join(original_folder, filename)
        try:
            edges, lbp, hog_image = extract_features(image_path, face_cascade)

            # Check if the features are empty before claiming success
            if edges is None or lbp is None or hog_image is None:
                print(f"Features are empty for: {filename}")
                continue
            print(f"Extracted features from: {filename}")

            # NOTE(review): lbp and hog_image come from skimage and are presumably
            # floating-point arrays; saving them as 8-bit images may discard most of
            # their value range — TODO confirm and rescale to 0-255 if so.
            write_results = [
                cv2.imwrite(os.path.join(output_dirs[name], filename), data)
                for name, data in (('edges', edges), ('lbp', lbp), ('hog', hog_image))
            ]
            if all(write_results):
                print(f"Saved features for: {filename}")
            else:
                print(f"Failed to save one or more features for: {filename}")

        except Exception as e:
            # Deliberate best-effort: one bad file must not abort the batch
            print(f"Skipping file: {filename}, due to error: {e}")


# Load Haar Cascade for face detection
cascade_path = '/content/haarcascade_frontalface_default.xml'
face_cascade = cv2.CascadeClassifier(cascade_path)

# A cascade that failed to load reports empty(); fail fast here instead of
# letting every image be skipped later with a detection error.
if face_cascade.empty():
    raise FileNotFoundError(f"Could not load Haar cascade from: {cascade_path}")

# Paths to your preprocessed image folders and new feature folders
preprocessed_folder_paths = ['/content/drive/MyDrive/preprocessed_positive', '/content/drive/MyDrive/preprocessed_negative', '/content/drive/MyDrive/preprocessed_neutral']
feature_folder_paths = ['/content/drive/MyDrive/feature_positive', '/content/drive/MyDrive/feature_negative', '/content/drive/MyDrive/feature_neutral']

# Create feature folders and their per-feature sub-folders if they don't exist.
# Each sub-folder is checked individually so that an existing parent folder
# with missing sub-folders is repaired as well.
for folder in feature_folder_paths:
    for subfolder in ('edges', 'lbp', 'hog'):
        os.makedirs(os.path.join(folder, subfolder), exist_ok=True)

# Batch extract features
for preprocessed_folder, feature_folder in zip(preprocessed_folder_paths, feature_folder_paths):
    batch_extract_features(preprocessed_folder, face_cascade, feature_folder)



In [None]:
# Function to display example images for each feature type
def display_example_features(feature_folder_paths):
    """Show one randomly chosen example image per feature type and folder.

    For every feature type (edges, lbp, hog) and every folder in
    ``feature_folder_paths``, a random image from ``<folder>/<feature>`` is
    read as grayscale and displayed with matplotlib. Folders that are missing
    or contain no images, and images that cannot be read, are reported and
    skipped instead of raising.

    Args:
        feature_folder_paths: Iterable of feature root folders, each expected
            to contain ``edges``, ``lbp`` and ``hog`` sub-folders.
    """
    features = ['edges', 'lbp', 'hog']
    for feature in features:
        for folder in feature_folder_paths:
            feature_folder = os.path.join(folder, feature)
            if not os.path.isdir(feature_folder):
                print(f"Feature folder not found: {feature_folder}")
                continue

            candidates = [
                os.path.join(feature_folder, filename)
                for filename in os.listdir(feature_folder)
                if filename.lower().endswith(('.jpg', '.png', '.jpeg'))
            ]
            if not candidates:
                # random.choice raises IndexError on an empty sequence
                print(f"No feature images found in: {feature_folder}")
                continue

            example_image_path = random.choice(candidates)
            example_image = cv2.imread(example_image_path, cv2.IMREAD_GRAYSCALE)
            if example_image is None:
                print(f"Failed to load image: {example_image_path}")
                continue

            plt.figure(figsize=(5, 5))
            plt.title(f"{feature.capitalize()} from {folder.split('/')[-1]}")
            plt.imshow(example_image, cmap='gray')
            plt.axis('off')
            plt.show()

# Display example features
# One random image is shown per feature type for each folder in feature_folder_paths.
display_example_features(feature_folder_paths)


In [None]:
# Initialize variables
feature_types = ['edges', 'lbp', 'hog']
emotions = ['positive', 'negative', 'neutral']
feature_data = {}
labels = {}

# Fit the label encoder ONCE on the full set of class names. Fitting it per
# emotion (a single-class list) would encode every label as 0.
# LabelEncoder sorts its classes, so the codes are
# negative -> 0, neutral -> 1, positive -> 2.
label_encoder = LabelEncoder()
label_encoder.fit(emotions)

# Loop through each emotion and feature type to read the images
for emotion in emotions:
    for feature_type in feature_types:
        folder_path = f'/content/drive/MyDrive/feature_{emotion}/{feature_type}'
        key = f'{emotion}_{feature_type}'

        # Flattened feature vectors and their emotion labels for this folder
        feature_list = []
        label_list = []

        # Read each file in the folder
        for filename in os.listdir(folder_path):
            if not filename.endswith(('.jpg', '.jpeg', '.png')):
                continue

            img_path = os.path.join(folder_path, filename)
            img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
            if img is None:
                # cv2.imread returns None on failure; skip instead of crashing on .flatten()
                print(f"Failed to load image: {img_path}")
                continue

            # Flatten the image and append to feature list
            feature_list.append(img.flatten())

            # Append emotion label
            label_list.append(emotion)

        # Face crops differ in size, so the flattened vectors have different
        # lengths. Building a ragged array with np.array(...) raises ValueError
        # on NumPy >= 1.24, therefore store them explicitly in a 1-D object
        # array (one vector per element).
        samples = np.empty(len(feature_list), dtype=object)
        for index, vector in enumerate(feature_list):
            samples[index] = vector

        feature_data[key] = samples
        labels[key] = label_encoder.transform(label_list)

# Show the shapes of the loaded feature data for verification
for key in feature_data.keys():
    print(f"Shape of {key}: {feature_data[key].shape}, Label shape: {labels[key].shape}")

In [61]:
# Function to get the minimum shape among all feature arrays
def get_minimum_shape(feature_data):
    """Return the smallest element count found among all feature vectors.

    Args:
        feature_data: Mapping whose values are iterables of arrays.

    Returns:
        The minimum ``.size`` over every vector of every value, or
        ``float('inf')`` when there are no vectors at all.
    """
    vector_sizes = (
        vector.size
        for feature_array in feature_data.values()
        for vector in feature_array
    )
    return min(vector_sizes, default=float('inf'))

# Get the minimum shape (smallest flattened vector length across all feature sets)
min_shape = get_minimum_shape(feature_data)
# Bare expression so the notebook shows the value as the cell output
min_shape


2500

In [None]:
# Reshape and pad feature data to uniform shape (2500,)
def reshape_and_pad_feature_data(feature_data, target_shape=2500):
    """Bring every feature vector to exactly ``target_shape`` elements.

    Vectors with more than ``target_shape`` elements are cut to their first
    ``target_shape`` entries; all others are extended at the end with zeros.

    Args:
        feature_data: Mapping from a key to an iterable of feature vectors.
        target_shape: Length every vector should end up with.

    Returns:
        A new dict with the same keys, each mapped to a NumPy array built
        from the resized vectors.
    """
    def _fit_to_length(vector):
        # Too long: keep the leading part. Otherwise: zero-pad on the right.
        if vector.size > target_shape:
            return vector[:target_shape]
        missing = target_shape - vector.size
        return np.pad(vector, (0, missing), 'constant', constant_values=0)

    return {
        key: np.array([_fit_to_length(vector) for vector in vectors])
        for key, vectors in feature_data.items()
    }

# Reshape and pad the feature data to have uniform shape of 2500
# (2500 is the default target_shape of reshape_and_pad_feature_data)
reshaped_feature_data = reshape_and_pad_feature_data(feature_data)

# Let's check the shape of the reshaped data to confirm that it's uniform
reshaped_feature_shapes = {key: data.shape for key, data in reshaped_feature_data.items()}
# Bare expression so the notebook shows the dict as the cell output
reshaped_feature_shapes


In [None]:
# A single MinMaxScaler instance; fit_transform fits it on each dataset in
# turn, so every feature dataset is scaled independently of the others.
scaler = MinMaxScaler()

# Min-max scaled copy of every reshaped feature dataset, keyed like the input
normalized_feature_data = {
    dataset_key: scaler.fit_transform(dataset)
    for dataset_key, dataset in reshaped_feature_data.items()
}

# Shapes of the normalized datasets, to confirm scaling left them unchanged
normalized_data_shapes = {
    dataset_key: dataset.shape
    for dataset_key, dataset in normalized_feature_data.items()
}
normalized_data_shapes


In [64]:
# Initialize the label_data dictionary
label_data = {}

# Integer class codes: 0 for positive, 1 for negative, and 2 for neutral.
# Dict order also fixes the concatenation order (positive, negative, neutral).
emotion_codes = {'positive': 0, 'negative': 1, 'neutral': 2}

# Combined (all emotions) feature arrays, keyed by feature type
combined_features = {}

for feature_type in ('edges', 'lbp', 'hog'):
    per_emotion_labels = []
    per_emotion_features = []

    for emotion, code in emotion_codes.items():
        key = f'{emotion}_{feature_type}'
        samples = feature_data[key]

        # Size each label array from the feature array it describes, so the
        # labels stay aligned even if one feature type has a different number
        # of samples than the others.
        label_data[key] = np.full((len(samples), ), code)

        per_emotion_labels.append(label_data[key])
        per_emotion_features.append(samples)

    # Combine the label and feature arrays for this feature type
    label_data[f'combined_{feature_type}'] = np.concatenate(per_emotion_labels)
    combined_features[feature_type] = np.concatenate(per_emotion_features)

    # Labels and features must describe the same samples, one-to-one
    assert len(label_data[f'combined_{feature_type}']) == len(combined_features[feature_type])

# Names used by the following cells
label_edges = label_data['combined_edges']
label_lbp = label_data['combined_lbp']
label_hog = label_data['combined_hog']

combined_edges = combined_features['edges']
combined_lbp = combined_features['lbp']
combined_hog = combined_features['hog']

# Update feature_data dictionary to include the combined features
feature_data['combined_edges'] = combined_edges
feature_data['combined_lbp'] = combined_lbp
feature_data['combined_hog'] = combined_hog


In [None]:
def _flatten_samples(samples):
    """Flatten every sample; stack into a 2D array when all lengths agree.

    When the flattened samples have different lengths a rectangular 2D array
    cannot be built (NumPy >= 1.24 raises ValueError for ragged input), so a
    1-D object array holding one vector per element is returned instead.
    """
    flat = [np.asarray(sample).flatten() for sample in samples]
    if len({vector.size for vector in flat}) <= 1:
        return np.array(flat)

    ragged = np.empty(len(flat), dtype=object)
    for index, vector in enumerate(flat):
        ragged[index] = vector
    return ragged


# Convert to 2D arrays suitable for machine learning algorithms. A printed
# shape of the form (n,) instead of (n, d) means the samples still have
# different lengths and need padding first.
combined_edges_2D = _flatten_samples(combined_edges)
combined_lbp_2D = _flatten_samples(combined_lbp)
combined_hog_2D = _flatten_samples(combined_hog)

print("New shape of combined_edges:", combined_edges_2D.shape)
print("New shape of combined_lbp:", combined_lbp_2D.shape)
print("New shape of combined_hog:", combined_hog_2D.shape)


In [None]:
# Peek at the shape of the first five samples of every combined feature set
sample_shapes_edges = [sample.shape for sample in combined_edges[:5]]
sample_shapes_lbp = [sample.shape for sample in combined_lbp[:5]]
sample_shapes_hog = [sample.shape for sample in combined_hog[:5]]

for set_name, first_shapes in (
    ('combined_edges', sample_shapes_edges),
    ('combined_lbp', sample_shapes_lbp),
    ('combined_hog', sample_shapes_hog),
):
    print(f"Sample shapes for {set_name}:", first_shapes)


In [None]:
def pad_arrays(array_list, target_shape):
    """Zero-pad every array to ``target_shape`` elements and stack the results.

    Each input is flattened first, so multi-dimensional inputs are padded as
    one long vector instead of being padded along every axis.

    Args:
        array_list: Iterable of arrays (any shape).
        target_shape: Number of elements each padded vector must have.

    Returns:
        A 2D NumPy array with one padded vector per input array (an empty
        array when ``array_list`` is empty).

    Raises:
        ValueError: If an input has more than ``target_shape`` elements.
    """
    padded_array_list = []
    for arr in array_list:
        flat = np.asarray(arr).ravel()
        if flat.size > target_shape:
            raise ValueError(
                f"Cannot pad array of size {flat.size} to smaller target size {target_shape}"
            )
        padded_arr = np.pad(flat, (0, target_shape - flat.size), 'constant', constant_values=0)
        padded_array_list.append(padded_arr)
    return np.array(padded_array_list)

# Longest flattened sample in each feature set; this becomes the padded length
max_shape_edges, max_shape_lbp, max_shape_hog = (
    max(sample.size for sample in feature_set)
    for feature_set in (combined_edges, combined_lbp, combined_hog)
)

# Zero-pad every sample up to the longest one of its own feature set
padded_edges, padded_lbp, padded_hog = (
    pad_arrays(feature_set, longest)
    for feature_set, longest in (
        (combined_edges, max_shape_edges),
        (combined_lbp, max_shape_lbp),
        (combined_hog, max_shape_hog),
    )
)

print("Padded shapes:", padded_edges.shape, padded_lbp.shape, padded_hog.shape)


In [68]:
# Upper bound on the number of principal components kept per feature set
N_COMPONENTS = 95

# One fitted PCA per feature type. Re-fitting a single shared instance would
# overwrite the edges and lbp projections, making it impossible to transform
# new samples for those feature types later.
pca_models = {}
pca_outputs = {}

# NOTE(review): PCA is fitted on all samples here, before the
# train/validation/test split performed later in the notebook, so held-out
# samples influence the projection — consider fitting on the training split only.
for feature_type, padded in (('edges', padded_edges), ('lbp', padded_lbp), ('hog', padded_hog)):
    # PCA cannot return more components than min(n_samples, n_features)
    n_components = min(N_COMPONENTS, *padded.shape)

    # random_state makes the result reproducible if a randomized solver is selected
    model = PCA(n_components=n_components, random_state=42)
    pca_outputs[feature_type] = model.fit_transform(padded)
    pca_models[feature_type] = model

pca_edges = pca_outputs['edges']
pca_lbp = pca_outputs['lbp']
pca_hog = pca_outputs['hog']

# Kept for backward compatibility: the shared instance previously ended up fitted on HOG
pca = pca_models['hog']

# Update the feature_data dictionary with the PCA-reduced data
feature_data['combined_edges'] = pca_edges
feature_data['combined_lbp'] = pca_lbp
feature_data['combined_hog'] = pca_hog


In [None]:
# Inputs: pca_edges, pca_lbp, pca_hog (PCA-transformed feature sets) and
# label_data['combined_<feature_type>'] (the matching combined label arrays).

# Fractions passed to train_test_split: 15% of all samples go to the test set,
# then 0.1765 (about 0.15 / 0.85) of the remainder goes to validation, which
# gives roughly a 70:15:15 split.
TEST_FRACTION = 0.15
VAL_FRACTION_OF_REMAINDER = 0.1765

# Dictionary to store the split data
split_data = {}

feature_sets = {
    'edges': pca_edges,
    'lbp': pca_lbp,
    'hog': pca_hog
}

# The loop variable is deliberately NOT called feature_data: that name is the
# notebook-wide dictionary of feature arrays and must not be overwritten.
for feature_type, features in feature_sets.items():
    X = features
    y = label_data[f'combined_{feature_type}']

    # Split the data into training, validation, and test sets (70:15:15),
    # stratified so each split keeps the class proportions
    X_temp, X_test, y_temp, y_test = train_test_split(
        X, y, test_size=TEST_FRACTION, random_state=42, stratify=y)
    X_train, X_val, y_train, y_val = train_test_split(
        X_temp, y_temp, test_size=VAL_FRACTION_OF_REMAINDER, random_state=42, stratify=y_temp)

    # Store the split data
    split_data[f'{feature_type}_X_train'] = X_train
    split_data[f'{feature_type}_X_val'] = X_val
    split_data[f'{feature_type}_X_test'] = X_test
    split_data[f'{feature_type}_y_train'] = y_train
    split_data[f'{feature_type}_y_val'] = y_val
    split_data[f'{feature_type}_y_test'] = y_test

    # Print the shapes of the train, validation, and test sets to verify
    print(f"Train shape for {feature_type}: {X_train.shape}, Train label shape: {y_train.shape}")
    print(f"Validation shape for {feature_type}: {X_val.shape}, Validation label shape: {y_val.shape}")
    print(f"Test shape for {feature_type}: {X_test.shape}, Test label shape: {y_test.shape}")


In [None]:
# Per-feature-type SVM models and their validation accuracies
trained_models = {}
model_accuracies = {}

feature_types = ['edges', 'lbp', 'hog']

for feature_type in feature_types:
    # Training and validation data for this feature type
    X_train, y_train, X_val, y_val = (
        split_data[f'{feature_type}_{part}']
        for part in ('X_train', 'y_train', 'X_val', 'y_val')
    )

    # Linear-kernel SVM as a starting point; fit() returns the fitted estimator
    svm_model = SVC(kernel='linear').fit(X_train, y_train)
    trained_models[feature_type] = svm_model

    # Accuracy of the predictions on the validation set
    y_val_pred = svm_model.predict(X_val)
    accuracy = accuracy_score(y_val, y_val_pred)
    model_accuracies[feature_type] = accuracy

    print(f"Validation Accuracy for {feature_type}: {accuracy}")


In [71]:
from sklearn.ensemble import RandomForestClassifier

# Trained Random Forest models, one per feature type
rf_models = {}

# Train and validate the Random Forest model for each feature type
for feature_type in ['edges', 'lbp', 'hog']:
    X_train = split_data[f'{feature_type}_X_train']
    y_train = split_data[f'{feature_type}_y_train']
    X_val = split_data[f'{feature_type}_X_val']
    y_val = split_data[f'{feature_type}_y_val']

    # A fresh estimator per feature type. Re-fitting one shared instance and
    # storing it under every key would leave all three entries pointing at
    # the same object, i.e. the model trained last (hog).
    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)  # 100 trees in the forest
    rf_model.fit(X_train, y_train)

    rf_val_accuracy = rf_model.score(X_val, y_val)
    rf_models[feature_type] = rf_model  # Store the trained model

    print(f"Validation Accuracy for {feature_type} using Random Forest: {rf_val_accuracy}")


Validation Accuracy for edges using Random Forest: 0.5333333333333333
Validation Accuracy for lbp using Random Forest: 0.5333333333333333
Validation Accuracy for hog using Random Forest: 0.6


In [None]:
from sklearn.model_selection import GridSearchCV

# Define the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Seeded estimator so the search gives the same result on every run; an
# unseeded forest can pick different "best" parameters each time.
rf = RandomForestClassifier(random_state=42)

# Initialize the grid search (3-fold cross-validation, accuracy as the score)
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2, scoring='accuracy')

# Best parameters and score per feature type, kept so they are not lost when
# the next iteration re-fits the search
grid_search_results = {}

# Perform grid search on each feature type
for feature_type in ['edges', 'lbp', 'hog']:
    X_train = split_data[f'{feature_type}_X_train']
    y_train = split_data[f'{feature_type}_y_train']

    # Fit the grid search
    grid_search.fit(X_train, y_train)

    # Get the best parameters and score
    best_params = grid_search.best_params_
    best_score = grid_search.best_score_
    grid_search_results[feature_type] = {
        'best_params': dict(best_params),
        'best_score': best_score,
    }

    print(f"Best parameters for {feature_type}: {best_params}")
    print(f"Best score for {feature_type}: {best_score}")


In [73]:
# Optimized Random Forest models, one per feature type
optimized_rf_models = {}

# Hyperparameters used for each feature type.
# NOTE(review): these values are hard-coded, presumably copied from a
# grid-search run — confirm they still match the current search output.
best_params = {
    'edges': {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 200},
    'lbp': {'max_depth': 10, 'min_samples_leaf': 4, 'min_samples_split': 5, 'n_estimators': 100},
    'hog': {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 50}
}

# Train each model on its training split and score it on its test split
for feature_type in best_params:
    X_train, y_train = split_data[f'{feature_type}_X_train'], split_data[f'{feature_type}_y_train']
    X_test, y_test = split_data[f'{feature_type}_X_test'], split_data[f'{feature_type}_y_test']

    optimized_rf_model = RandomForestClassifier(random_state=42, **best_params[feature_type])
    optimized_rf_model.fit(X_train, y_train)
    optimized_rf_models[feature_type] = optimized_rf_model

    test_accuracy = optimized_rf_model.score(X_test, y_test)
    print(f"Test Accuracy for {feature_type} using optimized Random Forest: {test_accuracy}")


Test Accuracy for edges using optimized Random Forest: 0.6
Test Accuracy for lbp using optimized Random Forest: 0.3333333333333333
Test Accuracy for hog using optimized Random Forest: 0.4666666666666667


In [None]:
def plot_confusion_matrix(y_true, y_pred, title, labels=None):
    """Plot a confusion matrix as an annotated heatmap.

    Args:
        y_true: True class labels.
        y_pred: Predicted class labels.
        title: Title shown above the heatmap.
        labels: Optional list of class labels fixing the rows/columns of the
            matrix and used as tick labels. With the default ``None`` the
            matrix only contains classes that occur in ``y_true`` or
            ``y_pred``, so on a small test set a class can silently be
            missing from the plot; pass the full label list to avoid that.
    """
    cm = confusion_matrix(y_true, y_pred, labels=labels)
    # 'auto' is seaborn's default tick labelling, used when no labels are given
    tick_labels = labels if labels is not None else 'auto'
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues',
                xticklabels=tick_labels, yticklabels=tick_labels)
    plt.title(title)
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()

# Test-set predictions of the optimized Random Forest models, per feature type
rf_test_predictions = {}

# Predict on each test split, keep the predictions and plot the confusion matrix
for feature_type in ['edges', 'lbp', 'hog']:
    X_test, y_test = split_data[f'{feature_type}_X_test'], split_data[f'{feature_type}_y_test']

    best_rf_model = optimized_rf_models[feature_type]
    y_pred = best_rf_model.predict(X_test)
    rf_test_predictions[feature_type] = y_pred

    plot_title = f'Confusion Matrix for {feature_type} using Optimized Random Forest'
    plot_confusion_matrix(y_test, y_pred, plot_title)



In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of each optimized Random Forest model on its own test
# split, keyed by feature type
confusion_matrices = {
    feature_type: confusion_matrix(
        split_data[f'{feature_type}_y_test'],
        optimized_rf_models[feature_type].predict(split_data[f'{feature_type}_X_test']),
    )
    for feature_type in ['edges', 'lbp', 'hog']
}

confusion_matrices


In [None]:
# Number of most important features shown per plot
TOP_N = 10

# Loop through all feature types and plot their top feature importances.
# The importances are read from the trained optimized Random Forest models;
# the models must NOT be replaced by placeholder values here, otherwise the
# plots show random numbers and the trained models are lost.
# The models were trained on PCA-transformed data, so each feature index
# refers to a principal component rather than to a pixel.
for feature_type in ['edges', 'lbp', 'hog']:
    feature_importances = optimized_rf_models[feature_type].feature_importances_
    sorted_indices = np.argsort(feature_importances)[::-1]

    # May be shorter than TOP_N when the model has fewer features
    top_indices = sorted_indices[:TOP_N]
    positions = range(len(top_indices))

    plt.figure(figsize=(12, 6))
    plt.title(f"Feature Importances for {feature_type.upper()} using Random Forest")
    plt.bar(positions, feature_importances[top_indices])
    plt.xticks(positions, top_indices)
    plt.xlabel("Feature Index")
    plt.ylabel("Importance")
    plt.show()