In [1]:

import requests
import os
import pandas as pd

# GitHub repository owner and name
owner = "AMoazeni"
repo = "Machine-Learning-Image-Classification"

# Function to fetch contents of a directory from GitHub API
def get_github_directory_contents(owner, repo, path):
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print("Failed to fetch directory contents from GitHub API:", response.text)
        return []

# Function to download images
def download_images(image_urls, directory, name):
    os.makedirs(directory, exist_ok=True)
    for i, url in enumerate(image_urls):
        response = requests.get(url)
        if response.status_code == 200:
            image_path = os.path.join(directory, f"{name}_{i}.jpg")
            with open(image_path, 'wb') as f:
                f.write(response.content)
        else:
            print(f"Failed to download image from URL {url}")

# Fetch URLs of the cat images
url_cat = "Data/training_set/cats"
cat_set = get_github_directory_contents(owner, repo, url_cat)
cat_images = [content['download_url'] for content in cat_set if content['type'] == 'file' and content['name'].endswith('.jpg')]

# Fetch URLs of the dog images
url_dog = "Data/test_set/dogs"
dog_set = get_github_directory_contents(owner, repo, url_dog)
dog_images = [content['download_url'] for content in dog_set if content['type'] == 'file' and content['name'].endswith('.jpg')]

# Download cat images
download_images(cat_images[:1000], "cat_set", "cat")

# Download dog images
download_images(dog_images[:1000], "dog_set", "dog")

# Create dataframes with image paths
cat_image_paths = [f"cat_set/cat_{i}.jpg" for i in range(1000)]
dog_image_paths = [f"dog_set/dog_{i}.jpg" for i in range(1000)]

cat_df = pd.DataFrame({"image_path": cat_image_paths, "Target": 1})
dog_df = pd.DataFrame({"image_path": dog_image_paths, "Target": 0})


In [2]:
print(len(cat_df), len(dog_df))

1000 1000


concattenate the dataset and shuffle it.

In [3]:
# Concatenate the dataframes
cat_dog_df = pd.concat([cat_df, dog_df], ignore_index=True)

# Shuffle the combined dataframes
cat_dog_df = cat_dog_df.sample(frac=1).reset_index(drop=True)

preprocess image applying prewitt and morphological edge detection

In [4]:
import numpy as np
import cv2
from skimage.filters import prewitt_h, prewitt_v
from skimage.morphology import closing, square
from skimage.color import rgb2gray

# Function to apply Prewitt operator
def prewitt_operator(image):
    # Convert image to grayscale
    gray_image = rgb2gray(image)
    # Apply Prewitt operator horizontally and vertically
    edges_horizontal = prewitt_h(gray_image)
    edges_vertical = prewitt_v(gray_image)
    # Combine horizontal and vertical edges
    edges = np.sqrt(edges_horizontal**2 + edges_vertical**2)
    return edges

# Function to apply morphological edge detection
def morphological_edge_detection(image):
    # Convert image to grayscale
    gray_image = rgb2gray(image)
    # Apply morphological closing
    closed_image = closing(gray_image, square(3))
    # Compute edges by subtracting the closed image from the original
    edges = gray_image - closed_image
    return edges

create list with images (2000,128,128)(2000) and flat the keys

In [5]:
def preprocess_image(image_paths):
    edge_images = []
    for img_path in image_paths:
        # Load the image
        img = cv2.imread(img_path)
        if img is None:
            print(f"Failed to load image from path: {img_path}")
            continue
        # Apply Prewitt operator
        prewitt_edges = prewitt_operator(img)
        # Apply morphological edge detection
        morphological_edges = morphological_edge_detection(img)
        # Combine both edge images (optional)
        combined_edges = prewitt_edges + morphological_edges  # You can use other operations as well
        # Ensure all images have the same shape
        combined_edges_resized = cv2.resize(combined_edges, (128, 128))  # Adjust desired_width and desired_height as needed
        # Flatten the image
        flattened_image = combined_edges_resized.ravel()
        # Append the flattened image to the list
        edge_images.append(flattened_image)
    return edge_images

store X and y set sizes(2000, 16384) (2000)

In [6]:
# Process images and store combined edge images
cat_dog_df_edge = preprocess_image(cat_dog_df['image_path'])

In [7]:
X = np.array(cat_dog_df_edge)
# Extract labels from the concatenated dataframe
y = cat_dog_df["Target"]

In [8]:
X.shape, y.shape

((2000, 16384), (2000,))

split the dataset training 80% and test 20%

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

performing pipeline PCA and Logistic Regression

In [18]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
pipe_lr = make_pipeline(PCA(n_components=50),
                        LogisticRegression(penalty='l2',
                                           max_iter=1000))
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
test_acc = pipe_lr.score(X_test, y_test)
print(f'Test accuracy: {test_acc:.3f}')

Test accuracy: 0.560


performing Grid Search on PCA and SVC

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline


# Define the pipeline with a scaler, PCA, and SVC
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('svc', SVC())
])

# Define the parameter grid
param_grid = {
    'pca__n_components': [50, 100],  # Number of components for PCA
    'svc__C': [0.1, 1, 10, 100, 1000],  # Values for C
    'svc__gamma': [0.001, 0.01, 0.1, 1, 10]  # Values for gamma
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model on the test set
accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", accuracy)


Best Parameters: {'pca__n_components': 50, 'svc__C': 1, 'svc__gamma': 0.001}
Best Score: 0.60375
Test Accuracy: 0.6025


performing pipeline PCA and SVC with best parameters

In [19]:
# Define the pipeline with a scaler, PCA, and SVC with fixed hyperparameters
pipe_SVC = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=50)),
    ('svc', SVC(C=1.0, gamma=0.001, kernel='rbf'))
])

# Fit the pipeline to the training data
pipe_SVC.fit(X_train, y_train)

In [20]:
from sklearn.metrics import accuracy_score

# Make predictions on the test set
y_pred = pipe_SVC.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.5925


performing GridSearch with Random Forest

In [21]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid
param_grid = {
        'n_estimators': [100, 250, 500],
        'max_depth': [None, 5, 10, 20],
        'criterion': ['gini', 'entropy']
        }

# Instantiate the Random Forest classifier
rf_classifier = RandomForestClassifier()

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model on the test set
accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", accuracy)


Best Parameters: {'criterion': 'entropy', 'max_depth': 10, 'n_estimators': 250}
Best Score: 0.631875
Test Accuracy: 0.6075


Applying best parameters to Random Forest

In [24]:
# Define a function to train a Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def train_random_forest(X_train, y_train):
    rf_classifier = RandomForestClassifier(n_estimators=250, max_depth=10, criterion='gini')
    rf_classifier.fit(X_train, y_train)

    return rf_classifier

# Train the Random Forest classifier
rf_classifier = train_random_forest(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.61


Performing knn

In [25]:
from sklearn.neighbors import KNeighborsClassifier

# Initialize and train k-NN classifier
knn_classifier = KNeighborsClassifier(n_neighbors=2)
knn_classifier.fit(X_train, y_train)

# Predict labels for test set
y_pred = knn_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.5225
