In [1]:
cat_train = "/content/cat"
dog_train = "/content/dog"
cat_test = "/content/cat test"
dog_test = "/content/dog test"

In [4]:
import os
# List the files in the directories
img_cat = [os.path.join(cat_train, file) for file in os.listdir(cat_train)[:1000]]
img_dog = [os.path.join(dog_train, file) for file in os.listdir(dog_train)[:1000]]

#img_cat_test = os.listdir(cat_test)
#img_dog_test = os.listdir(dog_test)
# Print the number of cat and dog images
print(len(img_cat), len(img_dog))

1000 1000


In [5]:
import pandas as pd
cat_df = pd.DataFrame({"image_path": img_cat, "Target": 1})
dog_df = pd.DataFrame({"image_path": img_dog, "Target": 0})
print(len(cat_df), len(dog_df))

1000 1000


In [6]:
# Concatenate the dataframes
cat_dog_df = pd.concat([cat_df, dog_df], ignore_index=True)

# Shuffle the combined dataframes
cat_dog_df = cat_dog_df.sample(frac=1).reset_index(drop=True)
cat_dog_df.head()

Unnamed: 0,image_path,Target
0,/content/cat/466.jpg,1
1,/content/dog/1192.jpg,0
2,/content/dog/1975.jpg,0
3,/content/cat/46.jpg,1
4,/content/dog/1859.jpg,0


In [7]:
import os
import cv2
import matplotlib.pyplot as plt
import numpy as np
from skimage.feature import hog

# Function to preprocess an image using HOG with fixed size
def preprocess_image_hog(image_path, target_size=(224, 224)):
    features_imag=[]
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # Read the image in grayscale
    if img is None:
        print(f"Error: Unable to read image at {image_path}")
        return None

    # Resize the image to the target size
    img_resized = cv2.resize(img, target_size)

    # Compute HOG features
    hog_features = hog(img_resized,
                       orientations=9,
                       pixels_per_cell=(8, 8),
                       cells_per_block=(2, 2),
                       block_norm='L2-Hys',
                       visualize=False)
    # Flatten the image
    flattened_image = hog_features.ravel()
    # Append the flattened image to the list
    features_imag.append(flattened_image)
    return features_imag

In [8]:
# Create a new column to store HOG features
cat_dog_df['hog_features'] = None

# Apply HOG feature extraction to each image
for index, row in cat_dog_df.iterrows():
    image_path = row['image_path']
    hog_features = preprocess_image_hog(image_path)
    cat_dog_df.at[index, 'hog_features'] = hog_features


# Filter out rows with None values in the 'hog_features' column
filtered_cat_dog_df = pd.DataFrame(cat_dog_df.dropna(subset=['hog_features']))

# Display the DataFrame with filtered HOG features
print(filtered_cat_dog_df.head())

Error: Unable to read image at /content/cat/850.jpg
Error: Unable to read image at /content/cat/.ipynb_checkpoints
Error: Unable to read image at /content/cat/936.jpg
Error: Unable to read image at /content/cat/660.jpg
Error: Unable to read image at /content/cat/140.jpg
Error: Unable to read image at /content/dog/1308.jpg
Error: Unable to read image at /content/dog/1866.jpg
              image_path  Target  \
0   /content/cat/466.jpg       1   
1  /content/dog/1192.jpg       0   
2  /content/dog/1975.jpg       0   
3    /content/cat/46.jpg       1   
4  /content/dog/1859.jpg       0   

                                        hog_features  
0  [[0.11408510133356714, 0.0, 0.0968044141656458...  
1  [[0.2423017604933486, 0.07411301200428547, 0.0...  
2  [[0.23412080375844677, 0.08609831702301472, 0....  
3  [[0.27927833109173844, 0.014070276392474696, 0...  
4  [[0.18572837335831202, 0.13349969286645502, 0....  


In [9]:
import pandas as pd
# Extract labels from the concatenated dataframe
y = filtered_cat_dog_df["Target"]
array_data = np.array(filtered_cat_dog_df["hog_features"])
X = np.concatenate(array_data)

X.shape, y.shape

((1993, 26244), (1993,))

In [10]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

performing pipeline PCA and Logistic Regression

In [16]:
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
pipe_lr = make_pipeline(PCA(n_components=100),
                        LogisticRegression(penalty='l2',
                                           max_iter=1000))
pipe_lr.fit(X_train, y_train)
y_pred = pipe_lr.predict(X_test)
test_acc = pipe_lr.score(X_test, y_test)
print(f'Test accuracy: {test_acc:.3f}')

Test accuracy: 0.677


performing Grid Search on PCA and SVC

In [17]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline


# Define the pipeline with a scaler, PCA, and SVC
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA()),
    ('svc', SVC())
])

# Define the parameter grid
param_grid = {
    'pca__n_components': [50, 100],  # Number of components for PCA
    'svc__C': [0.1, 1, 10, 100, 1000],  # Values for C
    'svc__gamma': [0.001, 0.01, 0.1, 1, 10]  # Values for gamma
}

# Perform grid search
grid_search = GridSearchCV(pipeline, param_grid, cv=5, n_jobs=-1)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model on the test set
accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", accuracy)


KeyboardInterrupt: 

performing pipeline PCA and SVC with best parameters

In [88]:
# Define the pipeline with a scaler, PCA, and SVC with fixed hyperparameters
pipe_SVC = Pipeline([
    ('scaler', StandardScaler()),
    ('pca', PCA(n_components=200)),
    ('svc', SVC(C=100.0, gamma=0.0001, kernel='rbf'))
])

# Fit the pipeline to the training data
pipe_SVC.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pipe_SVC.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.6917293233082706


performing GridSearch with Random Forest

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Define the parameter grid
param_grid = {
        'n_estimators': [100, 250, 500],
        'max_depth': [None, 5, 10, 20],
        'criterion': ['gini', 'entropy']
        }

# Instantiate the Random Forest classifier
rf_classifier = RandomForestClassifier()

# Instantiate GridSearchCV
grid_search = GridSearchCV(estimator=rf_classifier, param_grid=param_grid, cv=5, scoring='accuracy', n_jobs=-1)

# Perform grid search
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Evaluate the model on the test set
accuracy = grid_search.score(X_test, y_test)
print("Test Accuracy:", accuracy)


Applying best parameters to Random Forest

In [63]:
# Define a function to train a Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

def train_random_forest(X_train, y_train):
    rf_classifier = RandomForestClassifier(n_estimators=200, max_depth=10, criterion='entropy')
    rf_classifier.fit(X_train, y_train)

    return rf_classifier

# Train the Random Forest classifier
rf_classifier = train_random_forest(X_train, y_train)

# Make predictions on the test set
y_pred = rf_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.6541353383458647


Performing knn

In [98]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score


# Define the pipeline
pipe_spm = Pipeline([
    ('scaler', StandardScaler()),  # Feature scaling
    ('pca', PCA(n_components=50)),  # Dimensionality reduction
    ('knn', KNeighborsClassifier(n_neighbors=3))  # KNN
])

# Fit the pipeline
pipe_spm.fit(X_train, y_train)

# Predict labels for test set
y_pred = pipe_spm.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Test Accuracy:", accuracy)

Test Accuracy: 0.631578947368421


In [106]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

tree = DecisionTreeClassifier(criterion='entropy',
                              random_state=1,
                              max_depth=10)


tree = tree.fit(X_train, y_train)
y_train_pred = tree.predict(X_train)
y_test_pred = tree.predict(X_test)
tree_train = accuracy_score(y_train, y_train_pred)
tree_test = accuracy_score(y_test, y_test_pred)
print(f'Decision tree train/test accuracies '
      f'{tree_train:.3f}/{tree_test:.3f}')

Decision tree train/test accuracies 0.965/0.629


In [107]:
ada = AdaBoostClassifier(estimator=tree,
                         n_estimators=500,
                         learning_rate=0.1,
                         random_state=1)

ada = ada.fit(X_train, y_train)
y_train_pred = ada.predict(X_train)
y_test_pred = ada.predict(X_test)
ada_train = accuracy_score(y_train, y_train_pred)
ada_test = accuracy_score(y_test, y_test_pred)
print(f'AdaBoost train/test accuracies '
      f'{ada_train:.3f}/{ada_test:.3f}')

AdaBoost train/test accuracies 1.000/0.576
