<a href="https://colab.research.google.com/github/Deekshith-Proj/CVD/blob/main/Untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Kaggle CLI setup: create the credentials directory and copy the API token into it.
# kaggle.json is read from the current working directory, so it must be placed there first.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Download the dogs-vs-cats competition archive into the current working directory.
# NOTE(review): the Kaggle CLI usually expects the token file to be private
# (chmod 600 ~/.kaggle/kaggle.json) — confirm whether that step is needed here.
!kaggle competitions download -c dogs-vs-cats
import zipfile
# Unpack the downloaded competition archive, then train.zip and test1.zip
# (presumably produced by the first extraction), all into /content.
# Each archive is opened in a `with` block so its file handle is closed even if
# extraction fails part-way through.
for archive_path in ('/content/dogs-vs-cats.zip', '/content/train.zip', '/content/test1.zip'):
    with zipfile.ZipFile(archive_path, 'r') as zip_ref:
        zip_ref.extractall('/content')

Downloading dogs-vs-cats.zip to /content
 99% 807M/812M [00:06<00:00, 183MB/s]
100% 812M/812M [00:06<00:00, 128MB/s]


In [1]:
# CELL A: config & helpers
import os, gc
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import EfficientNetB0, ResNet50, MobileNetV2
from tensorflow.keras.applications.efficientnet import preprocess_input as eff_pre
from tensorflow.keras.applications.resnet50 import preprocess_input as res_pre
from tensorflow.keras.applications.mobilenet_v2 import preprocess_input as mob_pre

# Dataset location and input-pipeline settings shared by the cells below.
DATA_DIR = "/content/train"   # adjust to match where the training images live
IMG_SIZE = (224, 224)         # if memory is tight, try (160, 160)
BATCH_SIZE = 16               # reduce further if memory use is still high
SEED = 42                     # reused for dataset shuffling and the sklearn estimators

print(f"Config: {DATA_DIR} {IMG_SIZE} batch: {BATCH_SIZE}")


Config: /content/train (224, 224) batch: 16


In [8]:
# CELL B: count images and sanity-check folder layout
cats_dir = os.path.join(DATA_DIR, "cats")
dogs_dir = os.path.join(DATA_DIR, "dogs")


def _count_images(folder):
    """Return how many entries of `folder` have a .jpg/.jpeg/.png name (case-insensitive)."""
    image_suffixes = ('.jpg', '.jpeg', '.png')
    return sum(1 for entry in os.listdir(folder) if entry.lower().endswith(image_suffixes))


# Per-class counts; n_samples is the total used to size the feature files later.
n_cats = _count_images(cats_dir)
n_dogs = _count_images(dogs_dir)
n_samples = n_cats + n_dogs
print("cats:", n_cats, "dogs:", n_dogs, "total:", n_samples)


cats: 12500 dogs: 12500 total: 25000


In [9]:

# Shuffled, integer-labelled dataset with labels inferred from DATA_DIR.
train_ds = tf.keras.utils.image_dataset_from_directory(
    DATA_DIR,
    labels='inferred',
    label_mode='int',
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    seed=SEED,
    shuffle=True,
)

# Sanity check: report the batch count and the shapes of a single batch.
n_batches = tf.data.experimental.cardinality(train_ds).numpy()
print("Number of batches:", n_batches)
for imgs, labels in train_ds.take(1):
    print("Batch images shape:", imgs.shape, "Batch labels shape:", labels.shape)


Found 25000 files belonging to 2 classes.
Number of batches: 1563
Batch images shape: (16, 224, 224, 3) Batch labels shape: (16,)


In [10]:
# CELL C: streaming tf.data for inference (no caching)
def make_dataset_for_predict(data_dir, image_size=IMG_SIZE, batch_size=BATCH_SIZE, shuffle=False):
    """Build a batched, prefetched image dataset for streaming inference (no caching).

    Args:
        data_dir: Directory handed to `image_dataset_from_directory`; labels are inferred from it.
        image_size: Size the images are loaded at.
        batch_size: Number of images per batch.
        shuffle: Forwarded to the loader; defaults to False.

    Returns:
        A dataset of `(images cast to float32, integer labels)` batches.
    """
    def _as_float32(images, labels):
        # Only the dtype changes here; pixel values are not divided by 255 because
        # the per-model preprocessing functions are applied later.
        return tf.cast(images, tf.float32), labels

    raw_ds = tf.keras.utils.image_dataset_from_directory(
        data_dir,
        labels='inferred',
        label_mode='int',
        batch_size=batch_size,
        image_size=image_size,
        shuffle=shuffle,
    )
    float_ds = raw_ds.map(_as_float32, num_parallel_calls=tf.data.AUTOTUNE)
    return float_ds.prefetch(tf.data.AUTOTUNE)

# Unshuffled dataset reused by every feature extractor, so all of them read the
# images in the same order.
predict_ds = make_dataset_for_predict(DATA_DIR, IMG_SIZE, BATCH_SIZE, shuffle=False)


Found 25000 files belonging to 2 classes.


In [11]:
# CELL D: build feature extractors (global avg pooled)
def build_extractor(model_cls):
    """Instantiate `model_cls` as a headless, average-pooled ImageNet feature extractor.

    Args:
        model_cls: A Keras application constructor (e.g. `ResNet50`).

    Returns:
        The model built with ImageNet weights, no classification top, global
        average pooling, and an input shape of `IMG_SIZE` plus 3 channels.
    """
    return model_cls(
        include_top=False,
        weights="imagenet",
        pooling="avg",
        input_shape=tuple(IMG_SIZE) + (3,),
    )

# instantiate
# Build the three backbones in a fixed order: EfficientNetB0, ResNet50, MobileNetV2.
efficientnet, resnet, mobilenet = (
    build_extractor(backbone) for backbone in (EfficientNetB0, ResNet50, MobileNetV2)
)
print("Built extractors.")


Built extractors.


In [12]:
# CELL E: extract features to a memmap file (streaming)
def extract_to_memmap(model, preprocess_fn, ds, out_path, n_samples, batch_size=BATCH_SIZE, dtype=np.float32):
    """Stream `ds` through `model` and write the features into a memmap file.

    The feature width is taken from the first predicted batch, so the dataset is
    traversed exactly once and no batch is predicted twice.

    Args:
        model: Model exposing `predict_on_batch(batch)` that returns a 2-D
            `(batch, feat_dim)` array (e.g. a Keras model).
        preprocess_fn: Callable applied to each float32 image batch before prediction.
        ds: Iterable of `(images, labels)` batches whose images expose `.numpy()`.
        out_path: File backing the `(n_samples, feat_dim)` memmap; it is overwritten.
        n_samples: Number of rows to allocate; should equal the number of images in `ds`.
        batch_size: Unused (batching is dictated by `ds`); kept so existing calls
            that pass it keep working.
        dtype: Element type of the memmap file.

    Returns:
        `out_path`.

    Raises:
        ValueError: If `ds` yields no batches, or yields more than `n_samples` rows.
    """
    import warnings

    mm = None
    idx = 0
    for batch_imgs, _ in ds:
        # astype() returns a fresh array, so preprocess_fn may modify it in place
        # without touching the tensor's own buffer.
        imgs_pre = preprocess_fn(batch_imgs.numpy().astype(np.float32))
        # predict_on_batch is the single-batch entry point; Model.predict is meant
        # for whole datasets and adds per-call overhead when used inside a loop.
        feats = np.asarray(model.predict_on_batch(imgs_pre))

        if mm is None:
            # Allocate the output lazily, once the feature width is known.
            feat_dim = feats.shape[1]
            mm = np.memmap(out_path, dtype=dtype, mode='w+', shape=(n_samples, feat_dim))
            print(f"Created memmap {out_path} shape {(n_samples, feat_dim)}")

        b = feats.shape[0]
        if idx + b > n_samples:
            raise ValueError(
                f"dataset produced more than n_samples={n_samples} rows "
                f"(at least {idx + b}); refusing to write past the end of {out_path}"
            )
        mm[idx:idx + b] = feats
        idx += b

    if mm is None:
        raise ValueError("dataset yielded no batches; nothing was written")

    # Flush once at the end, then drop the mapping so the file is released.
    mm.flush()
    del mm
    gc.collect()

    if idx != n_samples:
        # Rows that were never written keep their initial contents, which would
        # silently corrupt downstream training.
        warnings.warn(
            f"only {idx} of {n_samples} rows were written to {out_path}",
            RuntimeWarning,
        )
    print("Done writing to", out_path)
    return out_path

# Example usage will be in next cell


In [None]:
# CELL F: run extraction sequentially to avoid memory spikes
# Run the three extractions strictly one after another to avoid memory spikes.
n = n_samples  # alias kept from the original cell; not used below
out_dir = "/content/features"
os.makedirs(out_dir, exist_ok=True)

# One feature file per backbone. The path names are referenced by later cells.
eff_path = os.path.join(out_dir, "feat_eff.dat")
res_path = os.path.join(out_dir, "feat_res.dat")
mob_path = os.path.join(out_dir, "feat_mob.dat")

# IMPORTANT: every extractor reads the same predict_ds created earlier, so row i
# of each feature file refers to the same image.
for extractor, preprocess_fn, feat_path in (
    (efficientnet, eff_pre, eff_path),
    (resnet, res_pre, res_path),
    (mobilenet, mob_pre, mob_path),
):
    extract_to_memmap(extractor, preprocess_fn, predict_ds, feat_path, n_samples)
    # NOTE(review): the extractor objects stay referenced by their module-level
    # names (predict_image uses them later), so this presumably releases little
    # memory — confirm it is still wanted.
    tf.keras.backend.clear_session()
    gc.collect()


[1m1/1[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m16s[0m 16s/step
Created memmap /content/features/feat_eff.dat shape (25000, 1280)


In [None]:
# CELL G: combine feature memmaps into a single memmap by slicing (streaming)
import numpy.lib.format as nplf

def combine_to_memmap(paths, out_path, n_samples, dtype=np.float32, chunk_rows=4096):
    """Concatenate per-model feature files column-wise into a single memmap file.

    Each input file is interpreted as a row-major `(n_samples, feat_dim)` array
    of `dtype`; `feat_dim` is derived from the file's element count. The inputs
    are opened with an explicit dtype because `np.memmap` otherwise defaults to
    uint8 and would expose the raw bytes instead of the stored values.

    Args:
        paths: Feature files to combine, in the column order wanted in the output.
        out_path: File backing the combined `(n_samples, sum(feat_dims))` memmap;
            it is overwritten.
        n_samples: Number of rows stored in every input file.
        dtype: Element type of the input files and of the output file.
        chunk_rows: Number of rows copied per step, bounding peak memory use.

    Returns:
        `out_path`.

    Raises:
        ValueError: If `paths` is empty, `n_samples` or `chunk_rows` is not
            positive, or an input's element count is not a multiple of `n_samples`.
    """
    paths = list(paths)
    if not paths:
        raise ValueError("paths must contain at least one feature file")
    if n_samples <= 0:
        raise ValueError("n_samples must be a positive integer")
    if chunk_rows <= 0:
        raise ValueError("chunk_rows must be a positive integer")

    # Open every input read-only and view it as (n_samples, feat_dim).
    sources = []
    for p in paths:
        flat = np.memmap(p, dtype=dtype, mode='r')
        if flat.size % n_samples != 0:
            raise ValueError(
                f"{p} holds {flat.size} values, which is not a multiple of n_samples={n_samples}"
            )
        sources.append(flat.reshape(n_samples, flat.size // n_samples))

    total_dim = sum(src.shape[1] for src in sources)
    out_mm = np.memmap(out_path, dtype=dtype, mode='w+', shape=(n_samples, total_dim))

    start = 0
    for src in sources:
        d = src.shape[1]
        # Copy a bounded number of rows at a time instead of the whole matrix.
        for row in range(0, n_samples, chunk_rows):
            stop = min(row + chunk_rows, n_samples)
            out_mm[row:stop, start:start + d] = src[row:stop]
        start += d

    out_mm.flush()
    return out_path

# Merge the per-model feature files (EfficientNet, ResNet, MobileNet column order)
# into one file alongside them.
combined_path = "/content/features/feat_combined.dat"
feature_paths = [eff_path, res_path, mob_path]
combine_to_memmap(feature_paths, combined_path, n_samples)
print(f"Combined memmap at {combined_path}")


In [None]:
# CELL H: use memmap for sklearn training — load in chunks to train or use partial_fit
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Open the combined feature file as float32 and view it as (n_samples, total_dim).
# A memmap opened without `shape` is 1-D and np.memmap does not accept -1 as an
# "infer this axis" size, so the width is derived from the element count.
combined_flat = np.memmap(combined_path, dtype=np.float32, mode='r')
if n_samples <= 0 or combined_flat.size % n_samples != 0:
    raise ValueError(
        f"{combined_path} holds {combined_flat.size} float32 values, "
        f"which does not divide into n_samples={n_samples} rows"
    )
total_dim = combined_flat.size // n_samples
combined_mm = combined_flat.reshape(n_samples, total_dim)

# One label per feature row: 0 = cat, 1 = dog. Built once and reused below.
# NOTE(review): assumes the feature rows are ordered all cats first, then all dogs
# (i.e. the unshuffled dataset walks the class folders in that order) — confirm.
y_all = np.concatenate([np.zeros(n_cats, dtype=np.int64), np.ones(n_dogs, dtype=np.int64)])

# Stratified 80/20 split over row indices, so the features can stay on disk until needed.
idxs = np.arange(n_samples)
train_idxs, test_idxs = train_test_split(idxs, test_size=0.2, random_state=SEED, stratify=y_all)
y_train = y_all[train_idxs]
y_test = y_all[test_idxs]


def load_rows(mm, indices):
    """Return the rows of `mm` selected by `indices` as an in-memory copy."""
    return mm[indices]


X_train = load_rows(combined_mm, train_idxs)
X_test = load_rows(combined_mm, test_idxs)

# Fit the scaler on the training rows only, then apply it to both splits.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Base classifiers, all trained on the same scaled features and labels.
rf = RandomForestClassifier(n_estimators=200, random_state=SEED)
svm = SVC(kernel='rbf', probability=True, random_state=SEED)
knn = KNeighborsClassifier(n_neighbors=5)
lr = LogisticRegression(max_iter=2000, random_state=SEED)

for base_model in (rf, svm, knn, lr):
    base_model.fit(X_train, y_train)

print("Trained base models on memmap-derived arrays.")


In [None]:
# Evaluate the stacked (meta) classifier on the held-out split.
# The metric functions are imported here because no other visible cell imports them.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

# NOTE(review): `meta` (the stacked classifier), `test_meta` (its inputs for the
# held-out rows) and `y_test` (their labels) are not created by this cell — confirm
# that the cell which fits `meta` and builds `test_meta` exists and runs before this one.
y_pred = meta.predict(test_meta)
y_prob = meta.predict_proba(test_meta)[:,1]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_prob))
print("\nConfusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred))


In [None]:
# Predict single image ---
from tensorflow.keras.utils import load_img, img_to_array

def predict_image(path):
    """Classify a single image file as cat or dog with the stacked ensemble.

    The image is loaded at `IMG_SIZE`, passed through the three backbones, scaled
    with the fitted `scaler`, scored by the four base classifiers, and their
    class-1 probabilities are fed to `meta`. The verdict is printed.

    Args:
        path: Path of the image file to classify.

    Returns:
        The class-1 ("dog") probability produced by `meta`. Earlier versions
        returned nothing, so callers that ignore the result are unaffected.
    """
    img = load_img(path, target_size=IMG_SIZE)
    arr = np.expand_dims(img_to_array(img).astype(np.float32), axis=0)

    # Preprocess-then-predict per backbone, the same sequence extract_to_memmap
    # applies, concatenated in EfficientNet, ResNet, MobileNet order.
    # Each preprocess function gets its own copy in case it modifies its input.
    # NOTE(review): this replaces calls to `extract_batch_features`, which is not
    # defined in any visible cell — confirm it did nothing beyond these two steps.
    backbones = ((efficientnet, eff_pre), (resnet, res_pre), (mobilenet, mob_pre))
    feats = np.concatenate(
        [model.predict(preprocess_fn(arr.copy()), verbose=0) for model, preprocess_fn in backbones],
        axis=1,
    )
    feats = scaler.transform(feats)

    # One column per base classifier: its probability for class 1.
    meta_feats = np.column_stack([
        rf.predict_proba(feats)[:,1],
        svm.predict_proba(feats)[:,1],
        knn.predict_proba(feats)[:,1],
        lr.predict_proba(feats)[:,1]
    ])

    prob = meta.predict_proba(meta_feats)[0,1]
    print(f"Dog probability: {prob:.3f}")
    # The emoji literals were mis-encoded in the file; restored to the intended characters.
    print("Prediction:", "Dog 🐶" if prob >= 0.5 else "Cat 🐱")
    return prob
