In [1]:
pip install tensorflow scikit-learn pandas numpy opencv-python

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.2
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
# Import libraries
import os
import numpy as np
import cv2
import joblib
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [3]:
# --- Training-stage settings ---
# Folder the training image paths are built from.
TRAIN_DIR = "train"

# Size images are resized to before feature extraction
# (smaller size for faster processing).
IMG_SIZE = (128, 128)

# Images per predict() call, and the MobileNetV2 feature dimension.
BATCH_SIZE, FEATURE_DIM = 256, 1280

In [4]:
# Feature extractor: MobileNetV2 (faster than VGG) with ImageNet weights.
# include_top=False omits the top (classification) layers; pooling='avg'
# applies global average pooling so each image yields a single feature vector.
model = MobileNetV2(
    input_shape=(*IMG_SIZE[:2], 3),
    include_top=False,
    pooling='avg',
    weights='imagenet',
)

def extract_features(image_paths, feature_model=None, desc="Extracting Features"):
    """Extract one feature vector per image file, batch by batch.

    Each image is read with OpenCV, resized to ``IMG_SIZE``, passed through
    ``preprocess_input`` and fed to the network ``BATCH_SIZE`` images at a
    time while a tqdm progress bar reports progress.

    Args:
        image_paths: Iterable of image file paths.
        feature_model: Object whose ``predict(batch, verbose=0)`` returns one
            feature row per image. Defaults to the module-level ``model``.
        desc: Label shown on the progress bar.

    Returns:
        2-D array with one row per entry of ``image_paths``, in the same
        order. For an empty ``image_paths`` an empty array of shape
        ``(0, FEATURE_DIM)`` is returned instead of letting ``np.vstack``
        fail on an empty list.

    Notes:
        An unreadable file does not abort the run: a warning is printed and
        an all-zero array stands in for the image, so rows stay aligned with
        ``image_paths``.
    """
    if feature_model is None:
        feature_model = model

    image_paths = list(image_paths)
    if len(image_paths) == 0:
        return np.empty((0, FEATURE_DIM), dtype=np.float32)

    height, width = IMG_SIZE[0], IMG_SIZE[1]
    # cv2.resize takes dsize as (width, height), the reverse of the
    # (height, width) order used for the network's input_shape. Identical for
    # square sizes, but keeps non-square IMG_SIZE values consistent.
    dsize = (width, height)

    features = []
    for start in tqdm(range(0, len(image_paths), BATCH_SIZE), desc=desc):
        batch_images = []

        for path in image_paths[start:start + BATCH_SIZE]:
            img = cv2.imread(path)
            if img is None:
                print(f"Warning: Could not read {path}, using zeros")
                batch_images.append(np.zeros((height, width, 3)))
                continue

            # NOTE(review): cv2.imread returns channels in BGR order, while the
            # ImageNet weights were presumably trained on RGB input - confirm.
            # If the order is switched, it must be switched for training and
            # inference alike and the SVM refitted on the new features.
            img = cv2.resize(img, dsize)
            batch_images.append(preprocess_input(img))

        batch_features = feature_model.predict(np.array(batch_images), verbose=0)
        features.append(batch_features)

    return np.vstack(features)

In [5]:
# Prepare training data
# Paths are built as cat.<i>.jpg and dog.<i>.jpg for i in [0, N_PER_CLASS).
# The per-class count lives in one constant and the labels are derived from
# the list lengths, so paths and labels cannot drift apart if it is changed.
N_PER_CLASS = 12500

print("Loading image paths...")
cat_files = [os.path.join(TRAIN_DIR, f"cat.{i}.jpg") for i in range(N_PER_CLASS)]
dog_files = [os.path.join(TRAIN_DIR, f"dog.{i}.jpg") for i in range(N_PER_CLASS)]

all_files = cat_files + dog_files
labels = [0] * len(cat_files) + [1] * len(dog_files)  # 0=cat, 1=dog
assert len(labels) == len(all_files), "every image path needs exactly one label"

Loading image paths...


In [6]:
# Extract features
# One feature row per entry of all_files, in the same order as labels.
features = extract_features(all_files)

# Train SVM
print("Training SVM...")
# Hold out 20% of the samples for validation; the fixed seed keeps the split
# identical across reruns.
X_train, X_val, y_train, y_val = train_test_split(
    features, labels, test_size=0.2, random_state=42
)

svm = LinearSVC(
    C=0.1,           # Regularization parameter
    max_iter=5000,   # Increased iterations for convergence
    # The full data set is 12500 cats / 12500 dogs, so this only evens out
    # whatever small imbalance the random split leaves in y_train.
    class_weight='balanced',
    # Seed the solver as well, so a rerun reproduces the same fitted model.
    random_state=42,
)
svm.fit(X_train, y_train)

Extracting Features: 100%|█████████████████████████████████████████████████████████████| 98/98 [15:37<00:00,  9.57s/it]


Training SVM...


In [7]:
# Score the fitted classifier on the training and validation partitions
train_acc, val_acc = svm.score(X_train, y_train), svm.score(X_val, y_val)
print(
    f"\nTraining Accuracy: {train_acc:.4f}\n"
    f"Validation Accuracy: {val_acc:.4f}"
)

# Persist the trained classifier to disk with joblib
joblib.dump(svm, 'svm_model.pkl')
print("Model saved as 'svm_model.pkl'")


Training Accuracy: 0.9951
Validation Accuracy: 0.9542
Model saved as 'svm_model.pkl'


In [8]:
# --- Inference-stage settings ---
# Folder scanned for test images, and the file name the trained SVM was saved under.
TEST_DIR, MODEL_PATH = "test1", "svm_model.pkl"

# Re-stated here with the same values as the training-stage settings.
IMG_SIZE = (128, 128)
BATCH_SIZE = 256

In [15]:
def process_test_images(test_dir=None):
    """Collect the test images in numeric order and extract their features.

    Args:
        test_dir: Folder to scan for ``.jpg`` files. Defaults to the
            module-level ``TEST_DIR``.

    Returns:
        Tuple ``(test_ids, features)``. ``test_ids`` is the list of file
        names without extension (as strings), ordered by their integer value.
        ``features`` is the result of ``extract_features`` for those files,
        so row ``k`` belongs to ``test_ids[k]``.

    Raises:
        ValueError: If the part of a ``.jpg`` file name before its first dot
            is not an integer (raised by the numeric sort key).
    """
    if test_dir is None:
        test_dir = TEST_DIR

    test_files = [f for f in os.listdir(test_dir) if f.endswith('.jpg')]

    # Sort numerically: 1.jpg, 2.jpg, ... 10000.jpg
    # (a plain string sort would place 10.jpg before 2.jpg).
    test_files.sort(key=lambda x: int(x.split('.')[0]))

    test_paths = [os.path.join(test_dir, f) for f in test_files]
    test_ids = [os.path.splitext(f)[0] for f in test_files]

    # Reuse the training-time extractor instead of a private copy of its loop:
    # the copy called an undefined `feature_model`, and sharing one code path
    # guarantees test images are preprocessed exactly like training images.
    features = extract_features(test_paths)

    return test_ids, features

In [16]:
# Process test images
test_ids, test_features = process_test_images()

# Predict
print("Making predictions...")
predictions = svm.predict(test_features)

Processing Test Images: 100%|██████████████████████████████████████████████████████████| 49/49 [08:57<00:00, 10.96s/it]


Making predictions...


In [18]:
# Assemble the submission table: one row per test image (id, predicted label)
import pandas as pd

submission = pd.DataFrame({'id': test_ids}).assign(label=predictions)

# Write it without the index column, then show the first ten rows
submission.to_csv('submission.csv', index=False)
print("Submission saved as 'submission.csv'")
print(f"Sample predictions:\n{submission.head(10)}")

Submission saved as 'submission.csv'
Sample predictions:
   id  label
0   1      1
1   2      1
2   3      1
3   4      1
4   5      0
5   6      0
6   7      0
7   8      0
8   9      0
9  10      0
