In [10]:
from google.colab import files

# This line will create a button to upload a file.
print("Click the button below to upload your kaggle.json file.")
files.upload()

Click the button below to upload your kaggle.json file.


Saving kaggle.json to kaggle (1).json


{'kaggle (1).json': b'{"username":"aditya2831","key":"085d7cbfcba8807e77211ed05b1ced15"}'}

In [11]:
!mkdir -p ~/.kaggle

In [12]:
!cp kaggle.json ~/.kaggle/

In [13]:
!chmod 600 ~/.kaggle/kaggle.json

In [14]:
!kaggle competitions download -c dogs-vs-cats

dogs-vs-cats.zip: Skipping, found more recently modified local copy (use --force to force download)


In [16]:
!unzip -o dogs-vs-cats.zip

Archive:  dogs-vs-cats.zip
  inflating: sampleSubmission.csv    
  inflating: test1.zip               
  inflating: train.zip               


In [17]:
# Unzip the training images
!unzip -o -q train.zip -d .

# Unzip the test images
!unzip -o -q test1.zip -d .

print("✅ All files are now correctly unzipped!")

✅ All files are now correctly unzipped!


In [18]:
import os
import cv2
import numpy as np
from skimage.feature import hog
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [19]:
IMG_SIZE = (64, 128)
DATA_SUBSET = 1000

In [20]:
train_dir = 'train/'

In [21]:
all_filenames = os.listdir(train_dir)
# Shuffle to get a random subset
np.random.shuffle(all_filenames)

In [22]:
cat_files = [f for f in all_filenames if f.startswith('cat')][:DATA_SUBSET]
dog_files = [f for f in all_filenames if f.startswith('dog')][:DATA_SUBSET]
image_files = cat_files + dog_files

In [23]:
features = []
labels = []

In [24]:
print(f"Extracting HOG features from {len(image_files)} images...")
# tqdm provides a progress bar
for image_file in tqdm(image_files):
    # Create label: 0 for cat, 1 for dog
    label = 0 if 'cat' in image_file else 1

    # Read and resize image
    image_path = os.path.join(train_dir, image_file)
    image = cv2.imread(image_path)
    if image is None:
        print(f"Warning: Could not read image {image_path}. Skipping.")
        continue

    image_resized = cv2.resize(image, IMG_SIZE)

    # Convert to grayscale for HOG
    gray_image = cv2.cvtColor(image_resized, cv2.COLOR_BGR2GRAY)

    # Extract HOG features
    hog_features = hog(gray_image, orientations=9, pixels_per_cell=(8, 8),
                       cells_per_block=(2, 2), block_norm='L2-Hys',
                       visualize=False)

    features.append(hog_features)
    labels.append(label)

Extracting HOG features from 2000 images...


100%|██████████| 2000/2000 [00:14<00:00, 135.64it/s]


In [25]:
X = np.array(features)
y = np.array(labels)

print("\nFeature extraction complete!")
print("Shape of feature matrix (X):", X.shape)
print("Shape of labels vector (y):", y.shape)


Feature extraction complete!
Shape of feature matrix (X): (2000, 3780)
Shape of labels vector (y): (2000,)


In [26]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

print(f"\nData split complete:")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")


Data split complete:
Training samples: 1600
Testing samples: 400


In [27]:
from sklearn.svm import LinearSVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

print("Creating model pipeline...")
# A pipeline chains steps together. First, it will scale the data, then it will train the SVM.
svm_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('svm', LinearSVC(C=1.0, random_state=42, dual=False, max_iter=10000))
])

print("Training the SVM model... This might take a moment.")
# The .fit() command starts the training process.
svm_pipeline.fit(X_train, y_train)

print("Model training complete! ✅")

Creating model pipeline...
Training the SVM model... This might take a moment.
Model training complete! ✅


In [28]:
from sklearn.metrics import accuracy_score, classification_report

print("Evaluating the model on the test data...")

# Use the trained pipeline to make predictions on the test set.
y_pred = svm_pipeline.predict(X_test)

# Calculate the accuracy score.
accuracy = accuracy_score(y_test, y_pred)
print(f"\nModel Accuracy: {accuracy * 100:.2f}%")

# Print a detailed report showing performance for both 'Cat' and 'Dog' classes.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Cat', 'Dog']))

Evaluating the model on the test data...

Model Accuracy: 64.50%

Classification Report:
              precision    recall  f1-score   support

         Cat       0.66      0.61      0.63       200
         Dog       0.64      0.68      0.66       200

    accuracy                           0.65       400
   macro avg       0.65      0.65      0.64       400
weighted avg       0.65      0.65      0.64       400



In [30]:
import pandas as pd
from skimage.feature import hog
import subprocess

In [31]:
# --- SETUP: Ensure the test1 directory exists ---
test_dir = 'test1/'
if not os.path.isdir(test_dir):
    print(f"Directory '{test_dir}' not found. Unzipping 'test1.zip'...")
    subprocess.run(['unzip', '-o', '-q', 'test1.zip', '-d', '.'])
    print("Unzipping complete.")

In [32]:
# We are using the 'svm_pipeline' you already trained
IMG_SIZE = (64, 128)

In [33]:
# --- PREDICTION ON TEST SET ---
print("\nPreparing to make predictions on the official test set...")
test_filenames = os.listdir(test_dir)
# Sort files by their ID number to match the submission format
test_filenames.sort(key=lambda x: int(x.split('.')[0]))

results = []
print(f"Making predictions on {len(test_filenames)} test images...")

for filename in tqdm(test_filenames):
    # Extract ID from filename (e.g., '123.jpg' -> '123')
    image_id = filename.split('.')[0]

    # Preprocess the image
    image_path = os.path.join(test_dir, filename)
    image = cv2.imread(image_path)
    label = 1 # Default label if image can't be read
    if image is not None:
        image_resized = cv2.resize(image, IMG_SIZE)
        gray_image = cv2.cvtColor(image_resized, cv2.COLOR_BGR2GRAY)

        # Extract HOG features
        hog_features = hog(gray_image, orientations=9, pixels_per_cell=(8, 8),
                           cells_per_block=(2, 2), block_norm='L2-Hys')

        # Reshape for a single prediction
        hog_features_reshaped = hog_features.reshape(1, -1)

        # Predict using the trained pipeline
        prediction = svm_pipeline.predict(hog_features_reshaped)
        label = prediction[0]

    results.append({'id': int(image_id), 'label': int(label)})


Preparing to make predictions on the official test set...
Making predictions on 12500 test images...


100%|██████████| 12500/12500 [01:19<00:00, 157.24it/s]


In [34]:
# --- CREATE AND SAVE DATAFRAME ---
submission_df = pd.DataFrame(results)
submission_df.to_csv('submission.csv', index=False)

print("\n✅ submission.csv file created successfully!")
print("Here are the first 5 rows of your submission file:")
print(submission_df.head())


✅ submission.csv file created successfully!
Here are the first 5 rows of your submission file:
   id  label
0   1      1
1   2      1
2   3      1
3   4      1
4   5      0
