In [1]:
from PIL import Image
import pandas as pd
import numpy as np
import os
import cv2
from sklearn.model_selection import train_test_split
from google.colab import drive

drive.mount('/content/drive')
Train = '/content/drive/My Drive/Train'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
file_names = os.listdir(Train)
df = []
target_size = (64, 64)

for file_name in file_names:
  try:

    with Image.open(os.path.join(Train, file_name)) as img:
      width, height = img.size

      img_resized = img.resize(target_size)
      img_array = np.array(img_resized).astype('float32') / 255.0


      df.append({
                'file_name': file_name,
                'width': width,
                'height': height,
                'species': 'cat' if 'cat' in file_name else 'dog',
                'resized_image': img_array
            })
  except Exception as e:
        # Handle exceptions if image cannot be opened
        print(f"Error processing {file_name}: {e}")

In [3]:
data = pd.DataFrame(df)

In [4]:
data
# print(f"Shape of data: {data.shape}")

Unnamed: 0,file_name,width,height,species,resized_image
0,cat_179.jpg,350,200,cat,"[[[0.7372549, 0.6, 0.627451], [0.76862746, 0.6..."
1,cat_180.jpg,1600,898,cat,"[[[0.6784314, 0.7411765, 0.8392157], [0.674509..."
2,cat_229.jpg,225,225,cat,"[[[0.5294118, 0.37254903, 0.7254902], [0.52941..."
3,cat_235.jpg,640,429,cat,"[[[0.56078434, 0.6156863, 0.68235296], [0.5529..."
4,cat_276.jpg,1536,1024,cat,"[[[0.11764706, 0.13333334, 0.13725491], [0.109..."
...,...,...,...,...,...
552,dog_401.jpg,133,200,dog,"[[[0.95686275, 0.95686275, 0.95686275], [0.956..."
553,dog_35.jpg,640,635,dog,"[[[0.5137255, 0.8235294, 0.8039216], [0.513725..."
554,dog_429.jpg,259,194,dog,"[[[0.1882353, 0.23921569, 0.02745098], [0.2666..."
555,dog_514.jpg,1200,900,dog,"[[[0.7607843, 0.827451, 0.6156863], [0.7568627..."


In [5]:
from sklearn.model_selection import train_test_split
import tensorflow as tf
from sklearn.utils import shuffle

X = []
y = []

def load_images_and_labels(Train):
  for filename in os.listdir(Train):
    image_path = os.path.join(Train, filename)

    if 'cat' in filename.lower():
      label = 1
    elif 'dog' in filename.lower():
      label = 0
    else:
      continue


    try:
            img = tf.keras.utils.load_img(image_path, target_size=(64, 64))
            img_array = tf.keras.utils.img_to_array(img)
            img_vector = img_array.flatten()

            X.append(img_vector)
            y.append(label)
    except Exception as e:
            print(f"Error loading image {image_path}: {e}")

Why Normalize Images?
1. Uniform Scale:
Images often have pixel values in different ranges (e.g., 0 to 255). Normalizing pixel values (e.g., scaling them to the range [0, 1]) ensures that all features are on the same scale, which helps algorithms learn more effectively.

2. Improved Convergence:
Normalization can lead to faster and more stable convergence in training, especially for gradient-based algorithms like neural networks. It helps the model to learn more efficiently.

3. Prevention of Numerical Issues:
Large or small feature values might cause numerical instability during computations. Normalizing values can mitigate such problems.

How Does Splitting Data Help in Model Evaluation?
1. Generalization Assessment:
Splitting data into training and testing sets allows you to evaluate the model on unseen data. This helps you understand how well the model generalizes to new, unseen examples.

2. Prevention of Overfitting:
If a model performs well on the training set but poorly on the testing set, it might be overfitting. Splitting data helps ensure that the model is not just memorizing the training data but can perform well on new data.

3. Model Selection and Tuning:
Splitting data allows you to train models on one subset and evaluate them on another. Techniques like cross-validation can further help in selecting the best model and tuning hyperparameters.

In [6]:
class LogisticRegression:
    def __init__(self, lr=0.01,n_iters=1000):
      self.lr = lr
      self.n_iters = n_iters
      self.weight = None
      self.bias = None

    def fit(self, X, y):
      n_sample, n_features = X.shape

      #init parameters
      self.weight = np.zeros(n_features)
      self.bias = 0

      #gradient descent

      for _ in range(self.n_iters):
        # approximate y with linear combination of weights and x, plus bias
        linear_model = np.dot(X, self.weight) + self.bias

        # apply sigmoid function
        y_predicted = self._sigmoid(linear_model)

        # compute gradients
        dw = (1 / n_sample) * np.dot(X.T,(y_predicted - y))
        db = (1/ n_sample) * np.sum(y_predicted - y)

        #update parameters
        self.weight -= self.lr * dw
        self.bias -= self.lr * db

    def predict(self, X):
      linear_model = np.dot(X, self.weight) + self.bias
      y_predicted = self._sigmoid(linear_model)
      y_predicted_cls = [1 if i > 0.5 else 0 for i in y_predicted]
      return y_predicted_cls

    def _sigmoid(self, x):
      return 1/(1 + np.exp(-x))

In [7]:
load_images_and_labels(Train)

X, y = shuffle(X, y, random_state=42)
X = np.array(X)
y = np.array(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
def accuracy(y_true, y_pred):
  accuracy = np.sum(y_true == y_pred) / len(y_true)
  return accuracy

In [16]:
regressor = LogisticRegression(lr=0.0001, n_iters=1000)
regressor.fit(X_train, y_train)
predictions = regressor.predict(X_test)
lr_acc = accuracy(y_test, predictions)
print(f"LR accuracy: {lr_acc * 100:.2f}%")

  return 1/(1 + np.exp(-x))


LR accuracy: 57.14%


Why Use the Sigmoid Activation Function for Binary Classification?
1. Output Range: The sigmoid function maps any input to a value between 0 and 1. This property makes it suitable for binary classification problems where the output is a probability, indicating the likelihood of the input belonging to a particular class.

2. Interpretability: The sigmoid function provides output in the form of probabilities, which can be directly interpreted as the likelihood of an instance belonging to the positive class (class 1). This is crucial for binary classification tasks.

3. Gradient Behavior: The sigmoid function has a well-defined gradient which facilitates backpropagation during training. Although the gradient can become very small (leading to vanishing gradients), it's still useful for many binary classification problems.

Other Activation Functions and Their Applications
**1. ReLU (Rectified Linear Unit):

Formula:
ReLU
(
𝑥
)
=
max
⁡
(
0
,
𝑥
)
ReLU(x)=max(0,x)
Application: Commonly used in hidden layers of neural networks, especially in deep learning. It helps in mitigating the vanishing gradient problem and allows models to learn faster and perform better.
**2. Tanh (Hyperbolic Tangent):

Formula:
tanh
(
𝑥
)
=
𝑒
𝑥
−
𝑒
−
𝑥
𝑒
𝑥
+
𝑒
−
𝑥
tanh(x)=
e
x
 +e
−x

e
x
 −e
−x

​

Application: Similar to the sigmoid function but outputs values in the range [-1, 1]. It is often used in hidden layers where zero-centered data can help with faster convergence.
**3. Softmax:

Formula:
Softmax
(
𝑥
𝑖
)
=
𝑒
𝑥
𝑖
∑
𝑗
𝑒
𝑥
𝑗
Softmax(x
i
​
 )=
∑
j
​
 e
x
j
​


e
x
i
​


​

Application: Used in the output layer for multi-class classification problems. It converts logits (raw prediction values) into probabilities for each class, which sum to 1.
**4. Leaky ReLU:

Formula:
Leaky ReLU
(
𝑥
)
=
max
⁡
(
0.01
𝑥
,
𝑥
)
Leaky ReLU(x)=max(0.01x,x)
Application: Addresses the problem of dying ReLUs (where neurons become inactive and always output zero) by allowing a small, non-zero gradient when
𝑥
<
0
x<0.
**5. ELU (Exponential Linear Unit):

Formula:
ELU
(
𝑥
)
=
{
𝑥
if
𝑥
>
0
𝛼
(
𝑒
𝑥
−
1
)
if
𝑥
≤
0
ELU(x)={
x
α(e
x
 −1)
​
  
if x>0
if x≤0
​

Application: Similar to ReLU but with an exponential term for negative inputs, which helps reduce the problem of vanishing gradients and speeds up learning.

Why Use Cross-Entropy Loss Instead of Mean Squared Error for Classification?
**1. Probabilistic Interpretation: Cross-Entropy Loss, also known as Log Loss, is specifically designed for classification problems where the output represents probabilities. It measures the performance of a classification model whose output is a probability value between 0 and 1.

**2. Gradient Behavior: Cross-Entropy Loss provides better gradients for classification tasks compared to Mean Squared Error (MSE). This is because the gradient of Cross-Entropy Loss with respect to the model's output is proportional to the difference between the predicted probability and the actual label, leading to more effective weight updates.

**3. Handling Imbalanced Classes: Cross-Entropy Loss tends to work better with imbalanced classes. It penalizes wrong predictions more heavily, which helps in learning from class imbalance.

**4. Convergence: Cross-Entropy Loss generally leads to faster convergence in classification problems compared to MSE. This is due to its nature of penalizing wrong predictions more effectively.

**5. MSE Limitation: MSE is typically used for regression tasks where the output is continuous. For classification tasks, especially binary classification, MSE does not align well with the probabilistic nature of classification and can lead to suboptimal performance.

**Evaluating KNN with the Same Metrics as Logistic Regression**

To evaluate the KNN model using the same metrics as Logistic Regression, follow these steps:

Train and Predict with KNN:

In [17]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)

knn_acc = accuracy(y_test, y_pred_knn)
print(f"k-NN accuracy: {knn_acc * 100:.2f}%")


k-NN accuracy: 55.36%


Performance of K-Nearest Neighbors (KNN) in Classification Tasks
K-Nearest Neighbors (KNN) is a simple and intuitive classification algorithm that classifies data points based on the majority label among their k nearest neighbors in the feature space.

How KNN Works:

Distance Calculation:
For a given test sample, KNN calculates the distance between this sample and all training samples. Common distance metrics include Euclidean distance, Manhattan distance, and Minkowski distance.

Neighbor Selection:
The algorithm selects the k nearest training samples to the test sample.

Majority Voting:
The class label of the test sample is determined by the majority vote of its k nearest neighbors. The most frequent class label among these neighbors is assigned to the test sample.

Performance Characteristics:

Non-parametric:
KNN does not assume any specific distribution for the data, making it flexible but potentially computationally expensive.

Lazy Learner:
KNN does not explicitly train a model; instead, it performs computation during the prediction phase, which can be slow with large datasets.

**Comparison with Logistic Regression**

**1. Model Assumptions:**

Logistic Regression:
Assumes a linear relationship between the input features and the log-odds of the outcome. It models the probability of the positive class directly and uses a sigmoid function for binary classification.

KNN:
Makes no assumptions about the form of the decision boundary. It is a non-parametric method that bases its classification on the local neighborhood of each test point.

**2. Complexity and Computation:**

Logistic Regression:
Generally has lower computational complexity during prediction because it uses a model with learned parameters. Training involves optimization but prediction is straightforward.

KNN:
Computationally intensive during prediction, especially with large datasets, as it requires calculating distances to all training samples. Training is faster because it simply stores the data.

**3. Handling Non-linearity:**

Logistic Regression:
Limited to linear decision boundaries unless extended with polynomial features or interaction terms.

KNN:
Can handle complex and non-linear decision boundaries because it relies on the local structure of the data.

**4. Sensitivity to Outliers:

Logistic Regression:
Sensitive to outliers, as they can affect the fitted decision boundary.

KNN:
Sensitive to noisy or irrelevant features and outliers, as these can influence the distance calculations and neighbor selection.

In [19]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

# Create and train the Decision Tree model
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_dt = dt_model.predict(X_test)
dt_acc = accuracy(y_test, y_pred_dt)
print(f"Decision Tree accuracy: {dt_acc * 100:.2f}%")

Decision Tree accuracy: 56.25%


In [21]:
from sklearn.ensemble import RandomForestClassifier

# Create and train the Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf_model.predict(X_test)
rf_accuracy = accuracy(y_test, y_pred_rf)
print(f"Random Forest accuracy: {rf_accuracy * 100:.2f}%")

Random Forest accuracy: 65.18%


Comparison of Models for Image Classification
Based on your project where you've implemented Logistic Regression (LR), K-Nearest Neighbors (KNN), Decision Trees (DT), and Random Forest (RF) on image data, and observed that RF had the highest accuracy, here’s a detailed comparison:

1. Performance of Each Model
Logistic Regression (LR)
Description: Logistic Regression is a linear model that works well for binary classification problems. It might struggle with complex patterns in image data unless you use feature engineering or dimensionality reduction.
Accuracy: Often lower for raw image data without pre-processing.
K-Nearest Neighbors (KNN)
Description: KNN is a non-parametric algorithm that classifies data points based on the majority class among their nearest neighbors. It can be computationally expensive and slower with large image datasets.
Accuracy: Can be high for small datasets but might degrade with larger or more complex data.
Decision Trees (DT)
Description: Decision Trees split the data based on feature values and can capture complex patterns. They can be prone to overfitting if not properly tuned.
Accuracy: Can be effective with proper tuning, but may not perform as well as ensemble methods like Random Forest.
Random Forest (RF)
Description: Random Forest is an ensemble method that builds multiple decision trees and averages their predictions to improve performance and reduce overfitting.
Accuracy: Typically provides high accuracy and generalizes well on diverse datasets, including image data.
2. Why Random Forest Outperforms the Others
**1. Complexity Handling:

RF: Combines multiple decision trees to handle complex patterns and interactions in the data, making it well-suited for image classification tasks.
LR, KNN, DT: May struggle with the high-dimensional and complex nature of image data. LR is linear, KNN can be slow and sensitive to dimensionality, and DT may overfit.
**2. Overfitting:

RF: Reduces overfitting by averaging predictions across multiple trees, which helps in generalizing better to unseen data.
LR, KNN, DT: Can overfit, especially without proper regularization or tuning.
**3. Feature Importance:

RF: Provides feature importance metrics, which can help in understanding which features (or image segments) are most influential in classification.
LR, KNN, DT: May not provide clear insights into feature importance, especially for raw pixel values.
3. Implementation Comparison
Ease of Implementation:

LR: Easier to implement but may require significant pre-processing and feature engineering for image data.
KNN: Simple to implement but can become slow with large datasets. Requires careful selection of the number of neighbors (k).
DT: Easy to understand and visualize but requires tuning hyperparameters to avoid overfitting.
RF: More complex due to multiple trees but typically provides robust performance with fewer hyperparameters to tune compared to DT.
4. Hyperparameter Tuning for Random Forest
Important Hyperparameters to Tune:

n_estimators: Number of trees in the forest. Increasing this usually improves accuracy but also increases computation time.
max_depth: Maximum depth of each tree. Controls the complexity of the trees.
min_samples_split: Minimum number of samples required to split an internal node. Helps in preventing overfitting.
min_samples_leaf: Minimum number of samples required to be at a leaf node. A higher value can reduce overfitting.

5. Summary
Accuracy: Random Forest generally outperforms Logistic Regression, KNN, and Decision Trees in image classification tasks due to its ability to handle complex data and reduce overfitting.
Implementation: Random Forest might be more complex to set up initially but often provides the best performance for image data compared to the other models.
Hyperparameter Tuning: Essential for optimizing Random Forest performance, and can significantly impact the accuracy and effectiveness of the model.
If Random Forest consistently provides the highest accuracy on your dataset, it is likely the most suitable model for your image classification task. Make sure to report these findings and observations in your project to demonstrate the effectiveness of Random Forest in comparison to other models.






