# Explore here

In [1]:
!pip install kagglehub
import kagglehub

path = kagglehub.dataset_download("salader/dogs-vs-cats")

print("Path to dataset files:", path)

Path to dataset files: /home/vscode/.cache/kagglehub/datasets/salader/dogs-vs-cats/versions/1


In [2]:
import os
import shutil
import random
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import zipfile 

2025-07-20 21:18:05.442569: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-20 21:18:05.443734: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-20 21:18:05.447423: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-07-20 21:18:05.456869: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1753046285.472455   12840 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1753046285.47

# --- Configuration ---

In [3]:
IMAGE_WIDTH, IMAGE_HEIGHT = 200, 200
BATCH_SIZE = 32
EPOCHS = 50

# Define local directories for our structured dataset
BASE_DATA_DIR = 'dogs_vs_cats_dataset'
TRAIN_DIR = os.path.join(BASE_DATA_DIR, 'train')
VALIDATION_DIR = os.path.join(BASE_DATA_DIR, 'validation')

# Model saving path
MODEL_SAVE_PATH = 'trained_models/dogs_vs_cats_vgg_like.h5'
CHECKPOINT_FILEPATH = 'trained_models/best_model_checkpoint.h5'

# --- Step 1: Loading the dataset using kagglehub ---

In [4]:
download_path = None

try:
    download_path = kagglehub.dataset_download("salader/dogs-vs-cats")
    print(f"Dataset downloaded to: {download_path}")

    # Based on your latest 'ls -l' output, the raw images are directly within
    # 'dogs_vs_cats/train/' organized into 'cats/' and 'dogs/' subfolders.
    # We will use this 'train' folder from the cache as our source.
    SOURCE_TRAIN_CATS_DIR = os.path.join(download_path, 'dogs_vs_cats', 'train', 'cats')
    SOURCE_TRAIN_DOGS_DIR = os.path.join(download_path, 'dogs_vs_cats', 'train', 'dogs')

    if not os.path.isdir(SOURCE_TRAIN_CATS_DIR) or not os.path.isdir(SOURCE_TRAIN_DOGS_DIR):
        print(f"CRITICAL ERROR: Expected source directories not found or incomplete:")
        print(f"  Cats source: {SOURCE_TRAIN_CATS_DIR}")
        print(f"  Dogs source: {SOURCE_TRAIN_DOGS_DIR}")
        print("Please manually verify the structure of the KaggleHub downloaded dataset.")
        exit()

    print(f"Identified source directories for images:")
    print(f"  Cats: {SOURCE_TRAIN_CATS_DIR}")
    print(f"  Dogs: {SOURCE_TRAIN_DOGS_DIR}")

except Exception as e:
    print(f"Error during dataset download or initial path identification: {e}")
    print("Please ensure 'kagglehub' is installed and your Kaggle API credentials are correctly set up (kaggle.json in ~/.kaggle/).")
    exit()

# --- Prepare local directory structure for ImageDataGenerator ---
print("\n--- Preparing local directory structure for ImageDataGenerator ---")

# Clean up existing structure if it exists to avoid old files
if os.path.exists(BASE_DATA_DIR):
    print(f"Removing existing '{BASE_DATA_DIR}' directory for a clean start...")
    shutil.rmtree(BASE_DATA_DIR)

os.makedirs(os.path.join(TRAIN_DIR, 'dogs'), exist_ok=True)
os.makedirs(os.path.join(TRAIN_DIR, 'cats'), exist_ok=True)
os.makedirs(os.path.join(VALIDATION_DIR, 'dogs'), exist_ok=True)
os.makedirs(os.path.join(VALIDATION_DIR, 'cats'), exist_ok=True)

# List all image files from the source 'cats' and 'dogs' directories
all_cat_files = [f for f in os.listdir(SOURCE_TRAIN_CATS_DIR) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]
all_dog_files = [f for f in os.listdir(SOURCE_TRAIN_DOGS_DIR) if f.lower().endswith(('.jpg', '.jpeg', '.png'))]

random.shuffle(all_cat_files)
random.shuffle(all_dog_files)

print(f"Found {len(all_cat_files)} cat images and {len(all_dog_files)} dog images in source.")

# Define split ratio (e.g., 80% train, 20% validation)
train_split_ratio = 0.8

# Distribute cat images
num_train_cats = int(len(all_cat_files) * train_split_ratio)
for i, img_name in enumerate(all_cat_files):
    src_path = os.path.join(SOURCE_TRAIN_CATS_DIR, img_name)
    if i < num_train_cats:
        dst_path = os.path.join(TRAIN_DIR, 'cats', img_name)
    else:
        dst_path = os.path.join(VALIDATION_DIR, 'cats', img_name)
    try:
        shutil.copy(src_path, dst_path)
    except Exception as e:
        print(f"ERROR: Could not copy cat image '{img_name}': {e}")

# Distribute dog images
num_train_dogs = int(len(all_dog_files) * train_split_ratio)
for i, img_name in enumerate(all_dog_files):
    src_path = os.path.join(SOURCE_TRAIN_DOGS_DIR, img_name)
    if i < num_train_dogs:
        dst_path = os.path.join(TRAIN_DIR, 'dogs', img_name)
    else:
        dst_path = os.path.join(VALIDATION_DIR, 'dogs', img_name)
    try:
        shutil.copy(src_path, dst_path)
    except Exception as e:
        print(f"ERROR: Could not copy dog image '{img_name}': {e}")

print("Dataset successfully structured into local train/validation directories.")
print(f"  Training Cats: {len(os.listdir(os.path.join(TRAIN_DIR, 'cats')))}")
print(f"  Training Dogs: {len(os.listdir(os.path.join(TRAIN_DIR, 'dogs')))}")
print(f"  Validation Cats: {len(os.listdir(os.path.join(VALIDATION_DIR, 'cats')))}")
print(f"  Validation Dogs: {len(os.listdir(os.path.join(VALIDATION_DIR, 'dogs')))}")


Dataset downloaded to: /home/vscode/.cache/kagglehub/datasets/salader/dogs-vs-cats/versions/1
Identified source directories for images:
  Cats: /home/vscode/.cache/kagglehub/datasets/salader/dogs-vs-cats/versions/1/dogs_vs_cats/train/cats
  Dogs: /home/vscode/.cache/kagglehub/datasets/salader/dogs-vs-cats/versions/1/dogs_vs_cats/train/dogs

--- Preparing local directory structure for ImageDataGenerator ---
Removing existing 'dogs_vs_cats_dataset' directory for a clean start...
Found 10000 cat images and 10000 dog images in source.
Dataset successfully structured into local train/validation directories.
  Training Cats: 8000
  Training Dogs: 8000
  Validation Cats: 2000
  Validation Dogs: 2000


# --- Step 2: Visualize the input information ---

In [8]:
def plot_sample_images(directory, class_name, num_images=9):
    """Plots a grid of sample images from a specified directory and class."""
    print(f"Displaying {num_images} sample {class_name} images from {directory}...")
    class_path = os.path.join(directory, class_name)
    image_files = [f for f in os.listdir(class_path) if f.endswith('.jpg')]
    random.shuffle(image_files) # Shuffle to get different samples each time

    plt.figure(figsize=(8, 8))
    plt.suptitle(f"Sample {class_name.capitalize()} Images", fontsize=16)
    for i in range(min(num_images, len(image_files))):
        img_path = os.path.join(class_path, image_files[i])
        img = mpimg.imread(img_path)
        plt.subplot(3, 3, i + 1)
        plt.imshow(img)
        plt.axis('off')
    plt.show()

plot_sample_images(TRAIN_DIR, 'dogs')
plot_sample_images(TRAIN_DIR, 'cats')

# --- Create ImageDataGenerator objects ---
print("\n--- Setting up ImageDataGenerator ---")

# Data Augmentation for training data
train_datagen = ImageDataGenerator(
    rescale=1./255,             # Normalize pixel values to [0, 1]
    rotation_range=20,          # Randomly rotate images by 20 degrees
    width_shift_range=0.2,      # Randomly shift images horizontally
    height_shift_range=0.2,     # Randomly shift images vertically
    shear_range=0.2,            # Apply shear transformations
    zoom_range=0.2,             # Apply random zoom
    horizontal_flip=True,       # Randomly flip images horizontally
    fill_mode='nearest'         # Fill in newly created pixels
)

# Only rescaling for validation data (no augmentation)
validation_datagen = ImageDataGenerator(rescale=1./255)

# Flow images from directories
train_generator = train_datagen.flow_from_directory(
    TRAIN_DIR,
    target_size=(IMAGE_WIDTH, IMAGE_HEIGHT),
    batch_size=BATCH_SIZE,
    class_mode='binary', # For 2 classes (dog/cat)
    shuffle=True # Shuffle training data
)

validation_generator = validation_datagen.flow_from_directory(
    VALIDATION_DIR,
    target_size=(IMAGE_WIDTH, IMAGE_HEIGHT),
    batch_size=BATCH_SIZE,
    class_mode='binary',
    shuffle=False # Do not shuffle validation data for consistent evaluation
)

print(f"Class indices: {train_generator.class_indices}") # {'cats': 0, 'dogs': 1} or vice-versa


Displaying 9 sample dogs images from dogs_vs_cats_dataset/train...


<Figure size 800x800 with 0 Axes>

Displaying 9 sample cats images from dogs_vs_cats_dataset/train...


<Figure size 800x800 with 0 Axes>


--- Setting up ImageDataGenerator ---
Found 0 images belonging to 2 classes.
Found 0 images belonging to 2 classes.
Class indices: {'cats': 0, 'dogs': 1}
