In [1]:
import numpy as np
from data_loading import load_data
from logistic_regression import LogisticRegression
from skimage.feature import hog
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold


# Correct paths to your dataset
csv_file = r"./archive/data.csv"
base_folder = r"./archive/dataset"

# Load images and labels
X, y = load_data(csv_file, base_folder)


### Feature Engineering ###
# Step 1: Convert RGB images to grayscale
def rgb_to_grayscale(images):
    return np.dot(images[..., :3], [0.2989, 0.5870, 0.1140])


X_gray = rgb_to_grayscale(X)


# Step 2: Extract HOG features
def extract_hog_features(images):
    hog_features = []
    for img in images:
        features = hog(img, pixels_per_cell=(8, 8), cells_per_block=(2, 2), feature_vector=True)
        hog_features.append(features)
    return np.array(hog_features)


X_hog = extract_hog_features(X_gray)


# Step 3: Apply PCA for dimensionality reduction
def apply_pca(features, n_components=100):
    pca = PCA(n_components=n_components)
    return pca.fit_transform(features)


X_pca = apply_pca(X_hog, n_components=100)


# Step 4: Normalize the features
def normalize_features(features):
    scaler = StandardScaler()
    return scaler.fit_transform(features)


X_normalized = normalize_features(X_pca)

### Prepare labels ###
# Encode labels to numerical values
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# One-hot encode the labels for softmax regression
y_one_hot = np.eye(len(np.unique(y_encoded)))[y_encoded]

### Train/test split ###
# Split into training, validation, and test sets
X_train_full, X_test, y_train_full, y_test = train_test_split(X_normalized, y_one_hot, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=42)

y_test_labels = np.argmax(y_test, axis=1)  # True labels for test data

Current system time: 2024-10-12 20:35:58.253350


KeyboardInterrupt: 

In [1]:
### K-Fold Cross-Validation ###
def cross_validate_model(X, y, num_folds=5):
    kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
    fold_accuracies = []

    for train_index, val_index in kf.split(X):
        x_train, x_val = X[train_index], X[val_index]
        y_train, y_val = y[train_index], y[val_index]

        # Train the model
        model = LogisticRegression(input_size=x_train.shape[1], num_classes=y_train.shape[1], learning_rate=0.1,
                                   regularization=0.001)
        model.train(x_train, y_train, epochs=1000)

        # Validate the model
        y_val_pred = model.predict(x_val)
        val_accuracy = np.mean(np.argmax(y_val, axis=1) == y_val_pred)
        fold_accuracies.append(val_accuracy)
        print(f"Validation Accuracy for fold: {val_accuracy}")

    avg_accuracy = np.mean(fold_accuracies)
    print(f"Average Cross-Validation Accuracy: {avg_accuracy}")
    return avg_accuracy


# Perform cross-validation with 5 folds
cross_validation_accuracy = cross_validate_model(X_normalized, y_one_hot, num_folds=5)

### Final Model Training with the Best Hyperparameters ###
# Use the best hyperparameters directly
final_model = LogisticRegression(input_size=X_train_full.shape[1], num_classes=y_train_full.shape[1], learning_rate=0.1,
                                 regularization=0.001)
final_model.train(X_train_full, y_train_full, epochs=1000)

# Test the final model
y_test_pred = final_model.predict(X_test)
final_accuracy = np.mean(np.argmax(y_test, axis=1) == y_test_pred)
print(f"Final Test Accuracy: {final_accuracy}")

NameError: name 'X_normalized' is not defined