In [15]:
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.layers import GlobalMaxPool2D
from tensorflow.keras.preprocessing.image import load_img, img_to_array
# from tensorflow.keras.preprocessing import image 

import numpy as np
from tensorflow.keras.models import Sequential
from numpy.linalg import norm
from tqdm import tqdm 

In [16]:
model = ResNet50(weights='imagenet', include_top=False, input_shape=(224, 224, 3))
model.trainable = False 
model = Sequential([model, GlobalMaxPool2D()])

In [17]:
def feature_extraction(img, model):
    
    # image = load_img(img, target_size=(224, 224))
    image = img 
    image = image.resize((224, 224))
    image_array = img_to_array(image)
    expanded_image_array = np.expand_dims(image_array, axis=0)
    processed_input = preprocess_input(expanded_image_array)
    extracted_feature = model.predict(processed_input).flatten()
    normalized_extracted_feature = extracted_feature/norm(extracted_feature)
    
    return normalized_extracted_feature

In [18]:
import pandas as pd
from PIL import Image
import io

In [19]:
# Read the Parquet file
df = pd.read_parquet('data/first.parquet')

In [None]:
import os
feature_list =[]

from tqdm import tqdm
total_iterations = df.shape[0]
with tqdm(total=total_iterations, desc="Outer loop") as pbar:
    for i in range(df.shape[0]):
        image_file = df['image'].iloc[i]  # Access the first image
        image_data =image_file['bytes']
        # Decode the image
        image = Image.open(io.BytesIO(image_data))
        # image = Image.open(ds["image"][i])
        feature_list.append(feature_extraction(image, model))
        pbar.update(1)

  0%|          | 0/1456 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 214ms/step


  0%|          | 1/1456 [00:00<07:42,  3.14it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 199ms/step


  0%|          | 2/1456 [00:00<07:01,  3.45it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step


  0%|          | 3/1456 [00:00<07:43,  3.13it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 159ms/step


  0%|          | 4/1456 [00:01<07:03,  3.43it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 223ms/step


  0%|          | 5/1456 [00:01<07:24,  3.26it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 158ms/step


  0%|          | 6/1456 [00:01<06:52,  3.51it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step


  0%|          | 7/1456 [00:02<06:55,  3.49it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 167ms/step


  1%|          | 8/1456 [00:02<06:48,  3.54it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step


  1%|          | 9/1456 [00:02<06:56,  3.47it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step


  1%|          | 10/1456 [00:02<07:00,  3.44it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step


  1%|          | 11/1456 [00:03<06:42,  3.59it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 202ms/step


  1%|          | 12/1456 [00:03<06:51,  3.51it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 224ms/step


  1%|          | 13/1456 [00:03<07:02,  3.42it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 216ms/step


  1%|          | 14/1456 [00:04<07:16,  3.31it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  1%|          | 15/1456 [00:04<07:03,  3.40it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 177ms/step


  1%|          | 16/1456 [00:04<06:48,  3.52it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 120ms/step


  1%|          | 17/1456 [00:04<06:30,  3.69it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step


  1%|          | 18/1456 [00:05<06:37,  3.62it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step


  1%|▏         | 19/1456 [00:05<06:56,  3.45it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 171ms/step


  1%|▏         | 20/1456 [00:06<08:50,  2.71it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step


  1%|▏         | 21/1456 [00:06<08:23,  2.85it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 233ms/step


  2%|▏         | 22/1456 [00:06<08:20,  2.86it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 218ms/step


  2%|▏         | 23/1456 [00:06<07:56,  3.01it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step


  2%|▏         | 24/1456 [00:07<07:45,  3.07it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step


  2%|▏         | 25/1456 [00:07<07:13,  3.30it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 175ms/step


  2%|▏         | 26/1456 [00:07<07:04,  3.37it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 242ms/step


  2%|▏         | 27/1456 [00:08<07:16,  3.28it/s]


KeyboardInterrupt: 

In [14]:
import os
feature_list =[]

from tqdm.auto import tqdm
for i in tqdm(range(df.shape[0])):
    image_file = df['image'].iloc[i]  # Access the first image
    image_data =image_file['bytes']
    # Decode the image
    image = Image.open(io.BytesIO(image_data))
    # image = Image.open(ds["image"][i])
    feature_list.append(feature_extraction(image, model))
    

  0%|          | 0/1456 [00:00<?, ?it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 156ms/step


  0%|          | 1/1456 [00:00<07:02,  3.45it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step


  0%|          | 2/1456 [00:00<05:48,  4.18it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 111ms/step


  0%|          | 3/1456 [00:00<05:27,  4.44it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 130ms/step


  0%|          | 4/1456 [00:00<05:32,  4.37it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step


  0%|          | 5/1456 [00:01<05:03,  4.79it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 218ms/step


  0%|          | 6/1456 [00:01<05:39,  4.27it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 192ms/step


  0%|          | 7/1456 [00:01<06:04,  3.98it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 152ms/step


  1%|          | 8/1456 [00:01<05:56,  4.06it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 197ms/step


  1%|          | 9/1456 [00:02<06:10,  3.91it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step


  1%|          | 10/1456 [00:02<06:33,  3.68it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step


  1%|          | 11/1456 [00:02<06:01,  4.00it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 178ms/step


  1%|          | 12/1456 [00:02<06:15,  3.85it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 200ms/step


  1%|          | 13/1456 [00:03<06:33,  3.67it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 204ms/step


  1%|          | 14/1456 [00:03<06:43,  3.57it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 187ms/step


  1%|          | 15/1456 [00:03<07:03,  3.40it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 140ms/step


  1%|          | 16/1456 [00:04<06:40,  3.60it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 181ms/step


  1%|          | 17/1456 [00:04<06:39,  3.60it/s]

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 173ms/step


  1%|          | 18/1456 [00:04<06:16,  3.82it/s]


KeyboardInterrupt: 

In [None]:
# short_image = images[:100]

In [None]:
# np.array(short_image).shape
# np.array(images).shape

(21526,)

In [12]:
print(np.array(feature_list).shape)

(1456, 2048)


In [13]:
import pickle 

pickle.dump(feature_list, open('features2.pkl', 'wb'))