In [1]:
import numpy as np
import matplotlib.pyplot as plt
from skimage.feature import hog
from skimage import exposure # The exposure module provides functions for image intensity adjustment and histogram equalization.توفر وحدة التعريض وظائف لضبط شدة الصورة ومعادلة الرسم البياني.
import os
from skimage import io, color, feature, exposure #This line imports multiple modules from the skimage library: io (input/output functions for reading and saving images), color (functions for color space conversion), feature (various image feature extraction techniques), and exposure (functions for image intensity adjustment).
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, classification_report 
import cv2 
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import shuffle

import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation

# Feature Extraction using HOG


In [2]:
# Extract HOG features from an image and build a displayable HOG visualization
def extract_hog_features(image):
    """Compute HOG features and a contrast-stretched HOG visualization.

    Parameters
    ----------
    image : array-like
        Image as a 2-D grayscale array, or a 3-D array whose last axis holds
        1 (gray), 3 (RGB) or 4 (RGBA) channels.

    Returns
    -------
    hog_features : ndarray
        Flat HOG descriptor computed with ``feature.hog`` default parameters.
    hog_image_rescaled : ndarray
        HOG visualization after intensity rescaling, intended for display.
    """
    # Local import keeps this function self-contained
    from skimage import img_as_float

    image = np.asarray(image)

    # rgb2gray only accepts 3-channel input, so route the other layouts explicitly
    if image.ndim == 2:
        # Already grayscale: only normalize to float like rgb2gray would
        gray_image = img_as_float(image)
    elif image.ndim == 3 and image.shape[-1] == 1:
        # Single-channel image stored with a trailing channel axis
        gray_image = img_as_float(image[..., 0])
    elif image.ndim == 3 and image.shape[-1] == 4:
        # Drop the alpha channel before converting to grayscale
        gray_image = color.rgb2gray(color.rgba2rgb(image))
    else:
        gray_image = color.rgb2gray(image)

    # hog_features is the descriptor; hog_image is a picture of the gradient orientations
    hog_features, hog_image = feature.hog(gray_image, visualize=True)

    # Stretch the (0, 10) input intensity range over the output range so faint
    # gradients become visible when the HOG image is displayed
    hog_image_rescaled = exposure.rescale_intensity(hog_image, in_range=(0, 10))

    return hog_features, hog_image_rescaled

In [3]:
# Root folder of the dataset: every sub-folder is treated as one class.
# The DATASET_PATH environment variable overrides the default location, so the
# notebook can run on machines where the original absolute path does not exist.
dataset_path = os.environ.get(
    'DATASET_PATH',
    '/Users/farrahtharwat/Desktop/Uni/Level 3/Semester 2/Advanced ML/Project/Advanced-Machine-Learning/Decision Trees',
)
if not os.path.isdir(dataset_path):
    raise FileNotFoundError(
        "Dataset folder not found: %r. Set the DATASET_PATH environment variable "
        "to the folder that contains one sub-folder per class." % dataset_path
    )

# Only files with these extensions are read; stray files such as .DS_Store are skipped
image_extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tif', '.tiff', '.webp')

# Number of image/HOG previews drawn per class; set to None to preview every image
max_previews_per_class = 3

# One class per visible sub-folder. Sorting makes the sample order (and therefore
# the seeded train/test split later on) independent of the filesystem's listing order.
class_folders = sorted(
    entry.path
    for entry in os.scandir(dataset_path)
    if entry.is_dir() and not entry.name.startswith('.')
)
features_list = []
labels_list = []

for class_folder in class_folders:
    # The folder name is used as the class label
    class_name = os.path.basename(class_folder)
    previews_shown = 0

    for image_filename in sorted(os.listdir(class_folder)):
        if not image_filename.lower().endswith(image_extensions):
            continue
        image_path = os.path.join(class_folder, image_filename)

        # Load the image and bring it to a fixed 64x64 size so every HOG vector has the same length
        image = io.imread(image_path)
        resized_image = cv2.resize(image, (64, 64))

        hog_features, hog_image = extract_hog_features(resized_image)

        # Show the resized image next to its HOG visualization for the first few samples of each class
        if max_previews_per_class is None or previews_shown < max_previews_per_class:
            fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(2, 2), sharex=True, sharey=True)

            ax1.axis('off')
            ax1.imshow(resized_image, cmap=plt.cm.gray)

            ax2.axis('off')
            ax2.imshow(hog_image, cmap=plt.cm.gray)

            plt.show()
            # Release the figure so a long loop does not accumulate open figures
            plt.close(fig)
            previews_shown += 1

        features_list.append(hog_features)
        labels_list.append(class_name)

FileNotFoundError: [Errno 2] No such file or directory: '/Users/farrahtharwat/Desktop/Uni/Level 3/Semester 2/Advanced ML/Project/Advanced-Machine-Learning/Decision Trees'

In [None]:
# Stop here with a clear message if the loading cell produced no samples;
# otherwise the failure only surfaces later inside train_test_split.
if len(features_list) == 0:
    raise ValueError(
        "No images were loaded: features_list is empty. "
        "Check dataset_path and that each class folder contains images."
    )

# Convert lists to NumPy arrays
features_array = np.array(features_list)
labels_array = np.array(labels_list)

# Use LabelEncoder to convert class names into numeric labels
label_encoder = LabelEncoder()
numeric_labels = label_encoder.fit_transform(labels_array)

print(features_array)  # Print the HOG features array
print(numeric_labels)  # Print the numeric labels
 

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
# This step is independent of the preprocessing / feature extraction above.
X_train, X_test, y_train, y_test = train_test_split(
    features_array,
    numeric_labels,
    test_size=0.2,
    random_state=78,
)

# DECISION TREE MODEL


In [None]:
# Train the Decision Tree classifier.
# random_state fixes the estimator's internal randomness so that re-running the
# cell on the same split yields the same tree and the same accuracy.
clf = DecisionTreeClassifier(
    criterion="gini",
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=78,
)
clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Model accuracy: fraction of test samples classified correctly.
# The name `x` is read by the model-comparison bar chart later in the notebook.
x = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", x)

# CONFUSION MATRIX


In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Build the matrix over every encoded class so its layout does not depend on
# which classes happen to appear in this particular test split.
class_names = label_encoder.classes_
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))

# Draw the matrix with class names on both axes instead of bare encoded integers
plt.figure(figsize=(8, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

# ROC_CURVE

In [None]:
from sklearn.metrics import roc_curve, auc

# Class-membership probabilities; column i corresponds to clf.classes_[i]
y_pred_proba = clf.predict_proba(X_test)

# roc_curve only accepts binary targets. With exactly two classes, score the
# second class as the positive one (a single curve, as before). Otherwise draw
# one one-vs-rest curve per class.
is_binary = len(clf.classes_) == 2
roc_columns = [1] if is_binary else list(range(len(clf.classes_)))

# Plot ROC curve(s)
plt.figure()
for column in roc_columns:
    positive_class = clf.classes_[column]
    # 1 where the sample belongs to the class being scored, 0 for every other class
    is_positive = (y_test == positive_class).astype(int)
    fpr, tpr, _ = roc_curve(is_positive, y_pred_proba[:, column])
    roc_auc = auc(fpr, tpr)
    if is_binary:
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    else:
        class_name = label_encoder.inverse_transform([positive_class])[0]
        plt.plot(fpr, tpr, lw=2, label='%s vs rest (area = %0.2f)' % (class_name, roc_auc))

# Diagonal reference line: performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()


# DECISION TREE MODEL WITH NORMALIZATION

In [None]:
from sklearn.preprocessing import StandardScaler
# Create an unfitted StandardScaler; it is fitted in the following cells
scaler = StandardScaler()

In [None]:
# Fit the scaler on every sample in features_array (the full dataset, not only a training subset).
# NOTE(review): held-out samples therefore contribute to the fitted statistics — confirm this is intended.
scaler.fit(features_array)

In [None]:
# Standardize the feature vectors; fit_transform re-fits the scaler on features_array before transforming it.
# NOTE(review): features_scaled is not referenced by any later cell visible in this notebook — confirm whether
# the models below were meant to train on it.
features_scaled = scaler.fit_transform(features_array)

# CROSS-VALIDATION

In [None]:
from sklearn.model_selection import cross_val_score, KFold, train_test_split

# Number of folds for cross-validation
num_folds = 5  # You can adjust this number based on your preference

# Shuffled K-fold splitter; the seed makes the fold assignment repeatable
kf = KFold(n_splits=num_folds, shuffle=True, random_state=56)

# Train and score one classifier per fold.
# Fold-local names are used on purpose: the notebook-level X_train / y_train /
# clf / y_pred stay consistent with X_test / y_test for the evaluation cells.
accuracy_scores = []
for train_index, test_index in kf.split(features_array):
    X_fold_train, X_val = features_array[train_index], features_array[test_index]
    y_fold_train, y_val = numeric_labels[train_index], numeric_labels[test_index]

    # Same hyperparameters as the main model; random_state makes each fold's tree reproducible
    fold_clf = DecisionTreeClassifier(
        criterion="gini",
        max_depth=15,
        min_samples_split=10,
        min_samples_leaf=2,
        random_state=56,
    )
    fold_clf.fit(X_fold_train, y_fold_train)

    # Accuracy on the fold's validation part
    fold_pred = fold_clf.predict(X_val)
    accuracy_scores.append(metrics.accuracy_score(y_val, fold_pred))

# Print the accuracy scores for each fold
print("Cross-Validation Accuracy Scores:", accuracy_scores)

# Mean and standard deviation of the per-fold accuracies
mean_accuracy = np.mean(accuracy_scores)
std_accuracy = np.std(accuracy_scores)

print("Mean Accuracy:", mean_accuracy)
print("Standard Deviation of Accuracy:", std_accuracy)

In [None]:
# Split the dataset into training and testing sets (same 80/20 split and seed as the first model)
X_train, X_test, y_train, y_test = train_test_split(
    features_array,
    numeric_labels,
    test_size=0.2,
    random_state=78,
)

# This section trains the "with normalization" model, so the features must actually
# be standardized. The scaler is fitted on the training part only and then applied
# to both parts, so no statistics from the test samples reach the training data.
train_scaler = StandardScaler().fit(X_train)
X_train = train_scaler.transform(X_train)
X_test = train_scaler.transform(X_test)

In [None]:
# Train the Decision Tree classifier with the chosen hyperparameters.
# random_state fixes the estimator's internal randomness so that re-running the
# cell on the same split yields the same tree and the same accuracy.
clf = DecisionTreeClassifier(
    criterion="gini",
    max_depth=15,
    min_samples_split=10,
    min_samples_leaf=2,
    random_state=78,
)
clf.fit(X_train, y_train)

# Predict the response for the test dataset
y_pred = clf.predict(X_test)

# Model accuracy: fraction of test samples classified correctly.
# accuracy_dt_cv is read by the model-comparison bar chart below.
accuracy_dt_cv = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy_dt_cv)

# PLOTTING TWO MODELS

In [None]:
# Compare the test accuracy of the two Decision Tree runs side by side
model_names = ['Decision Tree', 'Decision Tree with Normalization']
model_accuracies = [x, accuracy_dt_cv]

plt.bar(x=model_names, height=model_accuracies)
plt.ylabel("Accuracy")
plt.title("Model Accuracy Comparison")
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Build the matrix over every encoded class so its layout does not depend on
# which classes happen to appear in this particular test split.
class_names = label_encoder.classes_
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))

# Draw the matrix with class names on both axes instead of bare encoded integers
plt.figure(figsize=(8, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Greens',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()

In [None]:
from sklearn.metrics import roc_curve, auc

# Class-membership probabilities; column i corresponds to clf.classes_[i]
y_pred_proba = clf.predict_proba(X_test)

# roc_curve only accepts binary targets. With exactly two classes, score the
# second class as the positive one (a single curve, as before). Otherwise draw
# one one-vs-rest curve per class.
is_binary = len(clf.classes_) == 2
roc_columns = [1] if is_binary else list(range(len(clf.classes_)))

# Plot ROC curve(s)
plt.figure()
for column in roc_columns:
    positive_class = clf.classes_[column]
    # 1 where the sample belongs to the class being scored, 0 for every other class
    is_positive = (y_test == positive_class).astype(int)
    fpr, tpr, _ = roc_curve(is_positive, y_pred_proba[:, column])
    roc_auc = auc(fpr, tpr)
    if is_binary:
        plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    else:
        class_name = label_encoder.inverse_transform([positive_class])[0]
        plt.plot(fpr, tpr, lw=2, label='%s vs rest (area = %0.2f)' % (class_name, roc_auc))

# Diagonal reference line: performance of a random classifier
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) Curve')
plt.legend(loc="lower right")
plt.show()