Best PCA, Grid Search on SVC, RF, KNN

In [None]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
from skimage.feature import hog
from skimage.filters import prewitt_h, prewitt_v
from skimage.morphology import closing, square
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

# Plot cumulative explained variance to help choose the PCA dimensionality
def best_pca(X):
    """Plot cumulative explained variance against the number of PCA components.

    A PCA with all components kept is fitted on ``X`` and the running sum of
    its explained variance ratios is drawn, so a suitable ``n_components``
    can be read off the curve.

    Args:
        X: Data passed directly to ``PCA.fit``.

    Returns:
        None. The figure is displayed with ``plt.show()``.
    """
    # Only the fitted variance ratios are needed, so the data is not projected.
    pca = PCA()
    pca.fit(X)

    cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

    # Entry i of the cumulative sum covers i + 1 components; plot against
    # 1..n so the x-axis really is the number of components.
    component_counts = np.arange(1, len(cumulative_variance) + 1)

    plt.figure(figsize=(10, 6))
    plt.plot(component_counts, cumulative_variance, marker='o', linestyle='-')
    plt.xlabel('Number of Components')
    plt.ylabel('Cumulative Explained Variance Ratio')
    plt.title('Cumulative Explained Variance Ratio vs. Number of Components')
    plt.grid(True)
    plt.show()
    return

# Grid-search an SVC on scaled, PCA-reduced features
def best_svc(n_components, X_train, X_val, y_train, y_val):
    """Tune an SVC behind a scaler + PCA pipeline and report its scores.

    Runs a 5-fold ``GridSearchCV`` over ``C`` and ``gamma`` on the training
    data, prints the best parameters and best cross-validation score, then
    prints the search's score on the validation data.

    Args:
        n_components: Value forwarded to ``PCA`` as its first argument.
        X_train: Training features.
        X_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.

    Returns:
        None. Results are printed.
    """
    search_space = {
        'svc__C': [0.1, 1, 10, 100, 1000],
        'svc__gamma': [0.001, 0.01, 0.1, 1, 10],
    }
    steps = [
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components)),
        ('svc', SVC()),
    ]

    # Cross-validated search over every (C, gamma) pair, using all cores.
    search = GridSearchCV(Pipeline(steps), search_space, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)

    print("Best Parameters:", search.best_params_)
    print("Best Score:", search.best_score_)

    # Score the fitted search on the held-out validation split.
    val_accuracy = search.score(X_val, y_val)
    print("Validation Accuracy:", val_accuracy)
    return


# Grid-search a Random Forest classifier
def best_rf(X_train, X_val, y_train, y_val, random_state=None):
    """Tune a Random Forest with grid search and report its scores.

    Runs a 5-fold ``GridSearchCV`` (accuracy scoring) over the number of
    trees, maximum depth and split criterion, prints the best parameters and
    best cross-validation score, then prints the accuracy on the validation
    data.

    Args:
        X_train: Training features.
        X_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.
        random_state: Optional seed forwarded to ``RandomForestClassifier``.
            The default ``None`` keeps the forest unseeded, as before; pass
            an int to make the selected parameters and scores reproducible.

    Returns:
        None. Results are printed.
    """
    param_grid = {
        'n_estimators': [100, 250, 500],
        'max_depth': [None, 5, 10, 20],
        'criterion': ['gini', 'entropy'],
    }

    # Seeding the forest is what makes repeated searches comparable.
    rf_classifier = RandomForestClassifier(random_state=random_state)

    grid_search = GridSearchCV(
        estimator=rf_classifier,
        param_grid=param_grid,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    print("Best Parameters:", grid_search.best_params_)
    print("Best Score:", grid_search.best_score_)

    # Score the fitted search on the held-out validation split.
    accuracy = grid_search.score(X_val, y_val)
    print("Validation Accuracy:", accuracy)
    return



# Grid-search a KNN classifier on scaled, PCA-reduced features
def best_knn(n_components, X_train, X_val, y_train, y_val):
    """Tune a KNN classifier behind a scaler + PCA pipeline and report scores.

    Runs a 5-fold ``GridSearchCV`` (accuracy scoring) over the neighbour
    count and weighting scheme, prints the best parameters and best
    cross-validation score, then prints the best estimator's score on the
    validation data.

    Args:
        n_components: Value forwarded to ``PCA`` as its first argument.
        X_train: Training features.
        X_val: Validation features.
        y_train: Training labels.
        y_val: Validation labels.

    Returns:
        None. Results are printed.
    """
    search_space = {
        'knn__n_neighbors': [2, 3, 5, 7],
        'knn__weights': ['uniform', 'distance'],
    }
    knn_pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components)),
        ('knn', KNeighborsClassifier()),
    ])

    search = GridSearchCV(
        knn_pipeline,
        search_space,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # The validation score below comes from the best estimator found.
    winner = search.best_estimator_

    print("Best parameters:", search.best_params_)
    print("Best Score:", search.best_score_)

    val_accuracy = winner.score(X_val, y_val)
    print("Accuracy on the validation set:", val_accuracy)

    return