In [3]:
import zipfile
import os
import cv2
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from tqdm import tqdm
from sklearn.decomposition import PCA

In [2]:

# Paths to the zip files
train_zip = '/kaggle/input/dogs-vs-cats/train.zip'
test_zip = '/kaggle/input/dogs-vs-cats/test1.zip'

# Extract directories
train_dir = '/kaggle/working/train'
test_dir = '/kaggle/working/test'

# Unzipping the training dataset
with zipfile.ZipFile(train_zip, 'r') as zip_ref:
    zip_ref.extractall(train_dir)

# Unzipping the test dataset
with zipfile.ZipFile(test_zip, 'r') as zip_ref:
    zip_ref.extractall(test_dir)

print("Data unzipped!")

Data unzipped!


In [4]:
# Paths to the directories
train_dir = '/kaggle/working/train/train'
test_dir = '/kaggle/working/test'

# Check the files in the train and test directories
train_images = os.listdir(train_dir)
test1_dir = os.path.join(test_dir, 'test1')

# Check contents of test1
test_images = os.listdir(test1_dir)

# Print the number of images and a few examples to verify
print(f"Number of training images: {len(train_images)}")
print(f"Number of test images: {len(test_images)}")

# Display a few examples
print("First 5 training images:", train_images[:5])
print("First 5 test images:", test_images[:5])


Number of training images: 25000
Number of test images: 12500
First 5 training images: ['dog.6650.jpg', 'dog.4529.jpg', 'dog.5302.jpg', 'dog.1894.jpg', 'dog.5846.jpg']
First 5 test images: ['3057.jpg', '4893.jpg', '454.jpg', '11706.jpg', '3471.jpg']


In [5]:

# Function to load and preprocess images
def load_and_preprocess_images(image_paths, target_size=(128, 128)):
    images = []
    for image_path in image_paths:
        # Load image using OpenCV
        img = cv2.imread(image_path)
        
        # Check if the image was loaded successfully
        if img is None:
            print(f"Warning: Could not load image {image_path}")
            continue
        
        # Resize the image
        img_resized = cv2.resize(img, target_size)
        
        # Normalize pixel values to [0, 1]
        img_normalized = img_resized / 255.0
        
        # Flatten the image to a 1D array (for SVM)
        img_flattened = img_normalized.flatten()
        
        images.append(img_flattened)
    
    return np.array(images)

# Correct path to training images
train_dir = '/kaggle/working/train/train'
train_image_paths = [os.path.join(train_dir, img) for img in train_images]
X_train = load_and_preprocess_images(train_image_paths)

# Check the shape of the first preprocessed image
print(f"Shape of first preprocessed image: {X_train[0].shape}")
print(f"Total number of training images: {X_train.shape[0]}")

# Correct path to test images
test1_dir = '/kaggle/working/test/test1'
test_image_paths = [os.path.join(test1_dir, img) for img in test_images]
X_test = load_and_preprocess_images(test_image_paths)

print(f"Total number of test images: {X_test.shape[0]}")

Shape of first preprocessed image: (49152,)
Total number of training images: 25000
Total number of test images: 12500


In [None]:

# Step 1: Extract labels
def extract_labels(image_paths):
    labels = []
    for path in tqdm(image_paths, desc="Extracting labels"):
        filename = os.path.basename(path)
        if filename.startswith('cat'):
            labels.append(0)  # 0 for cats
        elif filename.startswith('dog'):
            labels.append(1)  # 1 for dogs
        else:
            print(f"Warning: Unknown label in filename {filename}")
    return np.array(labels)

# Extract labels for training images
y_train = extract_labels(train_image_paths)

# Step 2: Split into training and validation sets
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

print(f"Training set size: {X_train_split.shape[0]}")
print(f"Validation set size: {X_val.shape[0]}")

# Optional: Reduce dimensionality with PCA in batches
print("Applying PCA to reduce feature dimensions...")
batch_size = 32 # Adjust batch size based on memory
pca = PCA(n_components=100, random_state=42)

# Fit PCA on training data in batches
for start in tqdm(range(0, len(X_train_split), batch_size), desc="Fitting PCA"):
    end = min(start + batch_size, len(X_train_split))
    if start == 0:
        pca.partial_fit(X_train_split[start:end])  # First batch initializes PCA
    else:
        pca.partial_fit(X_train_split[start:end])  # Incrementally update PCA

# Transform training and validation sets in batches
def transform_in_batches(data, pca, batch_size):
    transformed_data = []
    for start in tqdm(range(0, len(data), batch_size), desc="Transforming data"):
        end = min(start + batch_size, len(data))
        transformed_data.append(pca.transform(data[start:end]))
    return np.vstack(transformed_data)

X_train_pca = transform_in_batches(X_train_split, pca, batch_size)
X_val_pca = transform_in_batches(X_val, pca, batch_size)

# Step 3: Train the SVM classifier with batch processing
print("Training the SVM classifier...")
svm_model = SVC(kernel='linear', C=1.0, random_state=42)

# To simulate batch training, downsample the dataset for each step.
# SVM does not inherently support batch updates, so you must use subsets:
for start in tqdm(range(0, len(X_train_pca), batch_size), desc="Training SVM"):
    end = min(start + batch_size, len(X_train_pca))
    svm_model.fit(X_train_pca[start:end], y_train_split[start:end])

# Step 4: Evaluate on the validation set
val_accuracy = svm_model.score(X_val_pca, y_val)
print(f"Validation Accuracy: {val_accuracy:.2f}")

Extracting labels: 100%|██████████| 25000/25000 [00:00<00:00, 949951.98it/s]
