# 🧠 07 - Feature Extraction with ResNet50

This notebook performs high-level feature extraction from real-world facial images using a pretrained **ResNet50** model.  
Each image is transformed into a **2048-dimensional feature vector**, capturing key visual patterns such as facial structure, emotion cues, and textures.

These image embeddings will serve as the input for downstream tasks like:

- 🎯 Emotion classification modeling
- 📊 Clustering / t-SNE visualization
- 🧠 Model explainability using SHAP

By leveraging a powerful transfer learning backbone (ResNet50 pretrained on ImageNet), we significantly reduce the need for custom feature engineering and enable fast, scalable training for real-world applications.


In [1]:

from google.colab import auth
auth.authenticate_user()

from google.cloud import storage
import pandas as pd
import os

# GCS Config
project_id = 'exalted-summer-454012-d2'
bucket_name = 'boothill2001-dataset'
source_path = 'dataset/final_emotion_dataset.parquet'
local_parquet_path = '/content/final_emotion_dataset.parquet'

# Download from GCS
client = storage.Client(project=project_id)
bucket = client.bucket(bucket_name)
blob = bucket.blob(source_path)
blob.download_to_filename(local_parquet_path)

print(f"✅ Downloaded final dataset from GCS → {local_parquet_path}")


✅ Downloaded final dataset from GCS → /content/final_emotion_dataset.parquet


In [2]:

df = pd.read_parquet("/content/final_emotion_dataset.parquet")
print(f"✅ Loaded {len(df)} samples")
df.head()


✅ Loaded 31783 samples


Unnamed: 0,filename,age,gender,dominant_emotion,dominant_race
0,1000092795.jpg,31,"{'Woman': np.float32(8.204366), 'Man': np.floa...",neutral,asian
1,10002456.jpg,30,"{'Woman': np.float32(2.7630906), 'Man': np.flo...",neutral,white
2,1000268201.jpg,29,"{'Woman': np.float32(4.2036314), 'Man': np.flo...",sad,white
3,1000344755.jpg,33,"{'Woman': np.float32(8.685013), 'Man': np.floa...",fear,white
4,1000366164.jpg,41,"{'Woman': np.float32(0.96645916), 'Man': np.fl...",fear,white


In [3]:

import numpy as np
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.models import Model
from tqdm import tqdm
import PIL
import io

# Load ResNet50
base_model = ResNet50(weights='imagenet', include_top=False, pooling='avg')
model = Model(inputs=base_model.input, outputs=base_model.output)

print("✅ ResNet50 model loaded.")


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5
[1m94765736/94765736[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 0us/step
✅ ResNet50 model loaded.


In [4]:

def load_and_preprocess_from_gcs(gcs_path, bucket):
    blob = bucket.blob(gcs_path)
    img_data = blob.download_as_bytes()
    img = PIL.Image.open(io.BytesIO(img_data)).resize((224, 224)).convert("RGB")
    x = image.img_to_array(img)
    x = np.expand_dims(x, axis=0)
    return preprocess_input(x)

features = []
errors = []

for idx, row in tqdm(df.iterrows(), total=len(df)):
    try:
        x = load_and_preprocess_from_gcs(row['filename'], bucket)
        feat = model.predict(x, verbose=0).flatten()
        features.append(feat)
    except Exception as e:
        errors.append((row['filename'], str(e)))

print(f"✅ Extracted features for {len(features)} images. Errors: {len(errors)}")


100%|██████████| 31783/31783 [12:50<00:00, 41.26it/s]

✅ Extracted features for 0 images. Errors: 31783





In [5]:

features_array = np.array(features)
np.save("/content/image_vectors.npy", features_array)
print("✅ Saved extracted features to image_vectors.npy")


✅ Saved extracted features to image_vectors.npy


In [6]:

vector_blob = bucket.blob("features/image_vectors.npy")
vector_blob.upload_from_filename("/content/image_vectors.npy")
print("☁️ Uploaded image vectors to GCS: features/image_vectors.npy")


☁️ Uploaded image vectors to GCS: features/image_vectors.npy
