In [None]:

import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_csv("/content/merged_electronics_dataset.csv", on_bad_lines='skip')

print(df.head())
print(df.tail())
print(df.shape)



                                                name        main_category  \
0  Redmi 10 Power (Power Black, 8GB RAM, 128GB St...  tv, audio & cameras   
1  OnePlus Nord CE 2 Lite 5G (Blue Tide, 6GB RAM,...  tv, audio & cameras   
2  OnePlus Bullets Z2 Bluetooth Wireless in Ear E...  tv, audio & cameras   
3  Samsung Galaxy M33 5G (Mystique Green, 6GB, 12...  tv, audio & cameras   
4  OnePlus Nord CE 2 Lite 5G (Black Dusk, 6GB RAM...  tv, audio & cameras   

      sub_category                                              image  \
0  All Electronics  https://m.media-amazon.com/images/I/81eM15lVcJ...   
1  All Electronics  https://m.media-amazon.com/images/I/71AvQd3Vzq...   
2  All Electronics  https://m.media-amazon.com/images/I/51UhwaQXCp...   
3  All Electronics  https://m.media-amazon.com/images/I/81I3w4J6yj...   
4  All Electronics  https://m.media-amazon.com/images/I/71V--WZVUI...   

                                                link no_of_ratings  \
0  https://www.amazon.in/Red

In [None]:
print(df.columns)


Index(['name', 'main_category', 'sub_category', 'image', 'link',
       'no_of_ratings', 'discount_price', 'actual_price', 'review_rating',
       'review_text'],
      dtype='object')


In [None]:
# Create 'brand' column by taking the first word of 'name'
df['brand'] = df['name'].apply(lambda x: str(x).split()[0])

# Check
print(df[['name', 'brand']].head())


                                                name    brand
0  Redmi 10 Power (Power Black, 8GB RAM, 128GB St...    Redmi
1  OnePlus Nord CE 2 Lite 5G (Blue Tide, 6GB RAM,...  OnePlus
2  OnePlus Bullets Z2 Bluetooth Wireless in Ear E...  OnePlus
3  Samsung Galaxy M33 5G (Mystique Green, 6GB, 12...  Samsung
4  OnePlus Nord CE 2 Lite 5G (Black Dusk, 6GB RAM...  OnePlus


In [None]:
print(df.columns)


Index(['name', 'main_category', 'sub_category', 'image', 'link',
       'no_of_ratings', 'discount_price', 'actual_price', 'review_rating',
       'review_text', 'brand'],
      dtype='object')


In [None]:
df_clean = df.dropna(subset=['image', 'brand','name']).reset_index(drop=True)
print(df_clean.shape)


(5010, 11)


In [None]:
df_clean = df_clean[['brand', 'image','name']]


In [None]:
#download images
import os
import requests
from tqdm import tqdm

# Folder to save images
image_dir = '/content/images'
os.makedirs(image_dir, exist_ok=True)

# Download images
df_clean['image_path'] = None  # new column for local image path

for idx, row in tqdm(df_clean.iterrows(), total=df_clean.shape[0]):
    url = row['image']
    if pd.isna(url):
        continue
    try:
        response = requests.get(url, timeout=5)
        ext = url.split('.')[-1].split('?')[0]  # get jpg/png extension
        file_path = os.path.join(image_dir, f"{idx}.{ext}")
        with open(file_path, 'wb') as f:
            f.write(response.content)
        df_clean.at[idx, 'image_path'] = file_path
    except:
        pass

# Keep only rows where download succeeded
df_clean = df_clean.dropna(subset=['image_path']).reset_index(drop=True)
print(df_clean.shape)


100%|██████████| 5010/5010 [17:16<00:00,  4.84it/s]

(5002, 4)





In [None]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, GlobalAveragePooling2D, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# Correct imports for loading and converting images
from tensorflow.keras.utils import load_img, img_to_array


In [None]:
# Encode brands
le_brand = LabelEncoder()
df_clean['brand_encoded'] = le_brand.fit_transform(df_clean['brand'])
num_brands = df_clean['brand_encoded'].nunique()
print("Number of brands:", num_brands)


Number of brands: 1045


In [None]:
df_clean['image_path'] = df_clean.index.map(lambda i: f"/content/images/{i}.jpg")


In [None]:
# Convert brand_encoded to strings
df_clean['brand_encoded_str'] = df_clean['brand_encoded'].astype(str)


In [None]:
from PIL import Image

valid_paths = []
for path in df_clean['image_path']:
    try:
        img = Image.open(path)
        img.verify()  # checks if image can be opened
        valid_paths.append(path)
    except:
        pass

# Keep only rows with valid images
df_clean = df_clean[df_clean['image_path'].isin(valid_paths)].reset_index(drop=True)
print("Number of valid images:", len(df_clean))


NameError: name 'df_clean' is not defined

In [None]:
#prepare image data generators
# Data augmentation for brand classification
datagen = ImageDataGenerator(
    rescale=1./255,
    validation_split=0.2,  # 80% train, 20% validation
    horizontal_flip=True,
    rotation_range=20,
    zoom_range=0.2
)

train_gen = datagen.flow_from_dataframe(
    dataframe=df_clean,
    x_col='image_path',
    y_col='brand_encoded_str',  # use string labels
    target_size=(224,224),
    batch_size=32,
    class_mode='categorical',   # categorical works with string labels
    subset='training'
)

val_gen = datagen.flow_from_dataframe(
    dataframe=df_clean,
    x_col='image_path',
    y_col='brand_encoded_str',
    target_size=(224,224),
    batch_size=32,
    class_mode='categorical',
    subset='validation'
)

#CNN will learn to classify brands, not individual products yet.

Found 2506 validated image filenames belonging to 826 classes.
Found 626 validated image filenames belonging to 826 classes.


In [None]:
#build the cnn transfer learning
# Pretrained CNN as feature extractor
base_model = EfficientNetB0(include_top=False, input_shape=(224,224,3), weights='imagenet')
base_model.trainable = False  # freeze pretrained layers

x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.5)(x)
output = Dense(num_brands, activation='softmax')(x)

model = Model(inputs=base_model.input, outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
model.summary()


Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb0_notop.h5
[1m16705208/16705208[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 0us/step


In [None]:
# After filtering invalid images
num_brands = df_clean['brand_encoded_str'].nunique()
print("Number of brands after filtering:", num_brands)

# Rebuild the model output layer
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.layers import GlobalAveragePooling2D, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

base_model = EfficientNetB0(include_top=False, input_shape=(224,224,3), weights='imagenet')
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.5)(x)
output = Dense(num_brands, activation='softmax')(x)
model = Model(inputs=base_model.input, outputs=output)

# Compile
model.compile(optimizer=Adam(learning_rate=1e-4), loss='categorical_crossentropy', metrics=['accuracy'])


Number of brands after filtering: 826


In [None]:
history = model.fit(
    train_gen,
    validation_data=val_gen,
    epochs=10,
    steps_per_epoch=train_gen.samples // train_gen.batch_size,
    validation_steps=val_gen.samples // val_gen.batch_size
)

#After training, the CNN can predict the brand from a new product image

  self._warn_if_super_not_called()


Epoch 1/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m169s[0m 1s/step - accuracy: 0.0026 - loss: 6.7503 - val_accuracy: 0.0000e+00 - val_loss: 6.8174
Epoch 2/10
[1m 1/78[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m13s[0m 177ms/step - accuracy: 0.0000e+00 - loss: 6.4896



[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 93ms/step - accuracy: 0.0000e+00 - loss: 6.4896 - val_accuracy: 0.0000e+00 - val_loss: 6.8234
Epoch 3/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 499ms/step - accuracy: 0.0123 - loss: 6.3221 - val_accuracy: 0.0066 - val_loss: 6.8211
Epoch 4/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 85ms/step - accuracy: 0.0625 - loss: 5.6615 - val_accuracy: 0.0066 - val_loss: 6.8263
Epoch 5/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 499ms/step - accuracy: 0.0249 - loss: 5.9363 - val_accuracy: 0.0280 - val_loss: 6.5177
Epoch 6/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 95ms/step - accuracy: 0.0625 - loss: 5.7605 - val_accuracy: 0.0362 - val_loss: 6.5079
Epoch 7/10
[1m78/78[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 516ms/step - accuracy: 0.0364 - loss: 

In [None]:
# Save the entire CNN model (architecture + weights)
model.save('/content/brand_cnn_model.h5')




In [None]:
import pickle

# Save LabelEncoder
with open('/content/brand_encoder.pkl', 'wb') as f:
    pickle.dump(embeddings, f)


NameError: name 'embeddings' is not defined

We built a CNN for brand classification.

We used EfficientNetB0 as the base, which is pretrained on ImageNet, so the convolutional layers are not trained from scratch ,they provide learned feature extraction.

We added a GlobalAveragePooling2D layer, a Dropout layer, and a Dense layer with softmax on top to classify into your num_brands classes.

So, we designed the classification head (pooling, dropout, dense) from scratch, but the core CNN features come from EfficientNetB0, which is pretrained.

✅ In short: the brand-prediction CNN head is our design, but the convolutional backbone is transfer learning, not from scratch.

In [None]:
import os
from PIL import Image

valid_files = []
for filename in os.listdir('/content/images'):
    path = os.path.join('/content/images', filename)
    try:
        img = Image.open(path)
        img.verify()  # Check if image is readable
        valid_files.append(path)
    except:
        print(f"Removing corrupted image: {path}")
        os.remove(path)  # Delete corrupted file immediately


FileNotFoundError: [Errno 2] No such file or directory: '/content/images'

In [None]:
df_clean = df_clean[df_clean['image_path'].isin(valid_files)].reset_index(drop=True)
print("Remaining valid images:", df_clean.shape[0])


In [None]:
df_clean = df_clean[['name', 'image']].copy()
print(df_clean.head())


In [None]:
import os

# Keep only rows with actual image files
df_clean = df_clean[df_clean['image_path'].apply(os.path.isfile)].reset_index(drop=True)

print("Remaining images:", df_clean.shape[0])


In [None]:
#Step 5: Extract image embeddings for similarity search
#We will create feature vectors for all images to compare images within the same brand.

from tensorflow.keras.preprocessing import image
import numpy as np
from tqdm import tqdm
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D

embedding_model = Model(inputs=base_model.input, outputs=GlobalAveragePooling2D()(base_model.output))
embeddings = {}

for idx, row in tqdm(df_clean.iterrows(), total=df_clean.shape[0]):
    img_path = row['image_path']
    img = image.load_img(img_path, target_size=(224,224))
    img_array = image.img_to_array(img)/255.0
    img_array = np.expand_dims(img_array, axis=0)
    emb = embedding_model.predict(img_array, verbose=0)
    embeddings[row['name']] = emb.flatten()




In [None]:
#Predict brand + find most similar product

def predict_product(img_path, top_k=1):
    # 1️⃣ Predict brand
    img = image.load_img(img_path, target_size=(224,224))
    x = image.img_to_array(img)/255.0
    x = np.expand_dims(x, axis=0)

    brand_pred = model.predict(x)#brand_pred: array of size (1, num_brands) → probability for each brand.
    brand_idx = np.argmax(brand_pred)#brand_idx: integer → index of predicted brand.
    brand_name = le_brand.inverse_transform([brand_idx])[0] #brand_name: string → the predicted brand.

    # 2️⃣ Find embeddings of products with the same brand
    brand_products = df_clean[df_clean['brand'] == brand_name]['name'].tolist()
    brand_embeddings = np.array([embeddings[name] for name in brand_products])

    # 3️⃣ Compute similarity with input image embedding
    img_emb = embedding_model.predict(x).flatten().reshape(1, -1)
    similarities = cosine_similarity(img_emb, brand_embeddings).flatten()

    # 4️⃣ Pick most similar product
    best_idx = np.argmax(similarities)
    predicted_product = brand_products[best_idx]

    return brand_name, predicted_product


In [None]:
# Extract brand as first word of the product name
df['brand'] = df['name'].apply(lambda x: str(x).split()[0])

# Keep only the columns you need
df_clean = df.dropna(subset=['image', 'name']).reset_index(drop=True)

# Add the brand column to df_clean
df_clean['brand'] = df['brand']


In [None]:
brand, product = predict_product('/content/images/0.jpg')
print("Predicted brand:", brand)
print("Predicted product:", product)


In [None]:
model.save('brand_classifier.h5')


In [None]:
# embeddings = {product_name: embedding_vector}
import pickle

with open('embeddings.pkl', 'wb') as f:
    pickle.dump(embeddings, f)


In [None]:
import pickle

with open('le_brand.pkl', 'wb') as f:
    pickle.dump(le_brand, f)


Input image
     │
     ▼
[ CNN predicts brand ]
     │
     ▼
Filter dataset to same brand
     │
     ▼
Compute cosine similarity with embeddings
     │
     ▼
Return most visually similar product


1️⃣ What the hybrid approach is

 goal:

Give the system a product image → it tells you exactly which product it is.

The problem with a direct approach:

If you try to classify all products directly with a CNN, it has to learn thousands of unique classes (e.g., 5,000+ product names).

This is very hard, because many products look very similar. Accuracy drops a lot.

Hybrid solution:
It uses two stages:

2️⃣ Stage 1: CNN predicts the brand

 train a CNN to classify brands only, not individual products.

Example: Given an image of a phone, the CNN predicts Samsung or OnePlus.

Why this works:

There are fewer brands than products (e.g., 50 brands vs. 5,000 products).

CNN focuses on brand-level visual features like logo, style, or design patterns.

✅ This reduces the complexity and makes the model more accurate.

3️⃣ Stage 2: Image similarity search for exact product

After knowing the brand, we don’t need to compare the input image with all products—only products of that brand.

For each product image, we extract a feature vector (embedding) using the CNN (without the classification layer) and GlobalAveragePooling2D.

These embeddings are numerical representations of images.

Images that look visually similar have closer embeddings in the vector space.

We compute similarity between the input image embedding and all embeddings of that brand using cosine similarity.

The product with the highest similarity score is returned as the predicted product.

✅ This allows you to identify the exact product, even if the CNN only learned brand-level features.

4️⃣ Why this is a hybrid approach

Hybrid = combines two methods:

CNN classification → predict the brand (coarse-level classification)

Image similarity search → pick the exact product (fine-grained recognition)

This approach reduces the number of classes the CNN must handle and still allows you to distinguish individual products.

5️⃣ Why it solves your problem

Direct classification on thousands of product names → too hard, low accuracy.

Hybrid method:

CNN handles the easy task (brand-level classification).

Similarity search handles the hard task (distinguishing individual products visually).

Result → more accurate and scalable solution for product recognition.

Results are not good ---New approach


✔ Extracts deep visual features
✔ Embeds all products in the same vector space
✔ Compares similarity using cosine distance
✔ Predicts brand + product name based on nearest embedding

In [None]:
import os, glob, pickle
from tqdm import tqdm
from PIL import Image
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications import EfficientNetB3
from tensorflow.keras.applications.efficientnet import preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import GlobalAveragePooling2D
from sklearn.metrics.pairwise import cosine_similarity

# ---------------- Configuration ----------------
CSV_PATH = "/content/merged_electronics_dataset.csv"
IMAGE_DIR = "/content/images"
INPUT_SIZE = 300
RANDOM_SEED = 42

np.random.seed(RANDOM_SEED)
tf.random.set_seed(RANDOM_SEED)

# ---------------- Load CSV ----------------
df = pd.read_csv(CSV_PATH, on_bad_lines='skip')
if 'name' not in df.columns or 'image' not in df.columns:
    raise ValueError("CSV must contain 'name' and 'image' columns.")

# ---------------- Map CSV rows to existing images ----------------
existing_images = sorted(glob.glob(os.path.join(IMAGE_DIR, "*.jpg")))  # adjust extension if needed
df = df.iloc[:len(existing_images)].copy()
df['image_path'] = existing_images

# ---------------- Extract brand ----------------
df['brand'] = df['name'].astype(str).apply(lambda x: str(x).split()[0].strip())
df = df.dropna(subset=['name','brand','image_path']).reset_index(drop=True)
print("Images and rows:", len(df))

# ---------------- Build embedding model ----------------
base_model = EfficientNetB3(include_top=False, input_shape=(INPUT_SIZE, INPUT_SIZE, 3), weights='imagenet')
embedding_output = GlobalAveragePooling2D()(base_model.output)
embedding_model = Model(inputs=base_model.input, outputs=embedding_output)
print("Embedding model created.")

# ---------------- Create embeddings ----------------
print("Creating embeddings for all product images...")
embeddings = {}
product_rows = []

for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        img = Image.open(row['image_path']).convert('RGB').resize((INPUT_SIZE, INPUT_SIZE))
        arr = np.expand_dims(preprocess_input(np.array(img, dtype=np.float32)), 0)
        emb = embedding_model.predict(arr, verbose=0)
        embeddings[row['name']] = emb.flatten()
        product_rows.append({'name': row['name'], 'brand': row['brand'], 'image_path': row['image_path']})
    except:
        continue

with open("/content/embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

products_df = pd.DataFrame(product_rows)
products_df.to_csv("/content/products_index.csv", index=False)
print("Saved embeddings and product index.")

# ---------------- Prediction function (top-1) ----------------
def predict_product(img_path):
    img = Image.open(img_path).convert('RGB').resize((INPUT_SIZE, INPUT_SIZE))
    arr = np.expand_dims(preprocess_input(np.array(img, dtype=np.float32)), 0)
    emb = embedding_model.predict(arr, verbose=0).reshape(1, -1)

    # Compare with all products
    candidate_names = list(embeddings.keys())
    candidate_embs = np.array([embeddings[n] for n in candidate_names])
    sims = cosine_similarity(emb, candidate_embs).flatten()

    top_idx = sims.argmax()  # top-1
    top_name = candidate_names[top_idx]
    top_brand = products_df[products_df['name'] == top_name]['brand'].values[0]
    top_score = float(sims[top_idx])

    return top_brand, top_name, top_score

# ---------------- Quick test ----------------
sample_image = sorted(glob.glob(os.path.join(IMAGE_DIR, "*.jpg")))[0]
brand, name, score = predict_product(sample_image)

print("Sample prediction:")
print("Brand:", brand)
print("Product Name:", name)
print("Similarity:", score)


Images and rows: 0
Embedding model created.
Creating embeddings for all product images...


0it [00:00, ?it/s]

Saved embeddings and product index.





IndexError: list index out of range