In [5]:
import pandas as pd
import requests
from PIL import Image
from io import BytesIO
import numpy as np
import torch
from torchvision.models import mobilenet_v3_large, MobileNet_V3_Large_Weights
import torchvision.transforms as transforms
from sklearn.metrics.pairwise import cosine_similarity

# Read the product sheet from the cleaned workbook into a DataFrame
file_path = 'C:\\Users\\occid\\cleaned_data_no_duplicates.xlsx'
sheet_name = 'Sheet1'
df = pd.read_excel(file_path, sheet_name=sheet_name)

# Build MobileNetV3-Large with its ImageNet weights. nn.Module.eval() returns
# the module itself, so construction and the switch to evaluation mode can be
# written as a single expression.
model = mobilenet_v3_large(weights=MobileNet_V3_Large_Weights.IMAGENET1K_V1).eval()

# Preprocessing applied to every downloaded image before it reaches the model:
# resize to 224x224, convert to a tensor, then normalise each channel with the
# mean/std values below.
preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    ),
])

# Function to fetch and preprocess an image from a URL
def process_image_from_url(url):
    """Download the image at ``url`` and turn it into a model-ready tensor.

    The image is requested with a 10 second timeout, decoded, converted to
    RGB, passed through the module-level ``preprocess`` pipeline and given a
    leading batch dimension.

    Args:
        url: Address of the image to download.

    Returns:
        The preprocessed tensor with a batch dimension added at position 0,
        or ``None`` when the response status is not 200 or when any exception
        is raised while fetching or decoding. A message is printed in both
        failure cases.
    """
    try:
        response = requests.get(url, timeout=10)

        # Guard clause: anything other than a plain 200 counts as a failed fetch
        if response.status_code != 200:
            print(f"Failed to fetch image from {url}")
            return None

        raw_bytes = BytesIO(response.content)
        rgb_image = Image.open(raw_bytes).convert('RGB')
        tensor = preprocess(rgb_image)
        return tensor.unsqueeze(0)  # add the batch dimension
    except Exception as e:
        # Best-effort: network and decoding errors are reported, not raised,
        # so one bad URL does not stop the whole run
        print(f"Error fetching image from {url}: {e}")
        return None

# Function to extract feature vector from an image tensor using MobileNet
def extract_features(image_tensor):
    """Run ``image_tensor`` through the module-level ``model`` and return a NumPy array.

    Args:
        image_tensor: Batched input tensor, as produced by
            ``process_image_from_url``.

    Returns:
        The model output with dimension 0 squeezed away, converted to a
        NumPy array.

    NOTE(review): the returned vector is whatever ``model``'s forward pass
    yields; for a stock torchvision classifier that is presumably the
    classification-head output rather than a pooled embedding - confirm this
    is the intended feature.
    """
    # Gradients are never needed here, so skip autograd bookkeeping
    with torch.no_grad():
        model_output = model(image_tensor)

    unbatched = model_output.squeeze(0)  # drop the batch dimension
    return unbatched.numpy()

# Image URLs and their barcodes, taken column-wise from the loaded sheet
image_urls = df['image'].tolist()
barcodes = df['barcode'].tolist()

# Download each image and run it through the model. Rows whose image could not
# be fetched or decoded are skipped, so `valid_barcodes` stays aligned
# one-to-one with the collected feature vectors.
feature_vectors = []
valid_barcodes = []
for url, barcode in zip(image_urls, barcodes):
    image_tensor = process_image_from_url(url)
    if image_tensor is None:
        continue
    feature_vector = extract_features(image_tensor)
    feature_vectors.append(feature_vector)
    valid_barcodes.append(barcode)

# np.vstack on an empty list fails with an opaque "need at least one array"
# error; when every download failed (e.g. no network), say so explicitly.
if not feature_vectors:
    raise ValueError(
        "No images could be downloaded and processed; "
        "cannot build a similarity matrix."
    )

# Stack the per-image vectors into one array, one row per processed image
features = np.vstack(feature_vectors)

# Pairwise cosine similarity between all rows of `features`
similarity_matrix = cosine_similarity(features)

# Label both axes with the barcodes of the images that were actually processed
similarity_df = pd.DataFrame(similarity_matrix, columns=valid_barcodes, index=valid_barcodes)

# Save the similarity matrix to a file for review.
# NOTE(review): an .xlsx sheet holds at most 16,384 columns, so this write
# fails if more images than that were processed - confirm the dataset size.
output_file = "C:\\Users\\occid\\Image_Similarity_Matrix_MobileNet.xlsx"
similarity_df.to_excel(output_file)

print(f"Image similarity matrix saved to {output_file}")


Failed to fetch image from https://cdn-images.kiotviet.vn/khotonghuynhphuong/aac760da977740ddb985ae8292ae6415.png
Failed to fetch image from https://bizweb.dktcdn.net/100/363/802/files/8847e9b9dgjki.jpg
Error fetching image from https://api.balance.ari.com.vn/api/v1/supermarket/util/download?key=supermarket-service/product/lcg9rf0v/nuoc-cot-gung-mat-ong-350ml-jXPySN.png: HTTPSConnectionPool(host='api.balance.ari.com.vn', port=443): Max retries exceeded with url: /api/v1/supermarket/util/download?key=supermarket-service/product/lcg9rf0v/nuoc-cot-gung-mat-ong-350ml-jXPySN.png (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x0000019071C54550>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Failed to fetch image from https://cdn-images.kiotviet.vn/ccshop10/8fd7c0e649e1429184ddde98041ae23c.jpg
Error fetching image from https://cdn.nhanh.vn/cdn/store1/36027/ps/20210106/623202132356_bvlgari_omnia_amethyste__edt.jpg: HTTPSConnectionPool(

Error fetching image from http://douongcaocap.vn/wp-content/uploads/2017/10/Vang-Montes-Limited-Selection-Pinot-Noir-13.png: HTTPConnectionPool(host='no.access', port=80): Max retries exceeded with url: / (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x000001901FB10B20>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Error fetching image from https://cdn.nhanh.vn/cdn/store1/36027/ps/20200227/2749202094925_85216751_2917044995023515_7701405733627101184_n.jpg: HTTPSConnectionPool(host='cdn.nhanh.vn', port=443): Max retries exceeded with url: /cdn/store1/36027/ps/20200227/2749202094925_85216751_2917044995023515_7701405733627101184_n.jpg (Caused by NewConnectionError('<urllib3.connection.HTTPSConnection object at 0x000001901FF23CA0>: Failed to establish a new connection: [Errno 11001] getaddrinfo failed'))
Failed to fetch image from https://cdn2-retail-images.kiotviet.vn/tteuro1964/51f79217ece44e3994c0e547ca39098f.png
Error fetching i

In [2]:
import pandas as pd
import numpy as np

# Load the full similarity matrix; the first column of the sheet holds the
# row labels
file_path = 'C:\\Users\\occid\\Image_Similarity_Matrix_MobileNet.xlsx'
similarity_matrix = pd.read_excel(file_path, index_col=0)

# Pairs scoring strictly above this value are reported
similarity_threshold = 0.9

# Where the pair list is written (relative to the working directory)
pairs_output_file = 'similar_image_pairs_mobilenet0.9.csv'

# Column labels, as strings, identify the images on both axes
image_ids = similarity_matrix.columns.astype(str).tolist()

# Work on the raw NumPy array for efficient computation
matrix_values = similarity_matrix.values

# Row i and column j are both labelled via `image_ids`, which is only
# meaningful for a square matrix
if matrix_values.shape[0] != matrix_values.shape[1]:
    raise ValueError(
        f"Expected a square similarity matrix, got shape {matrix_values.shape}"
    )

# Keep only the strict upper triangle (k=1 drops the diagonal) so each
# unordered pair is considered once and self-similarity is ignored, then
# locate the entries above the threshold. np.nonzero yields indices in
# row-major order, i.e. the same order a nested i/j loop would visit them.
above_threshold = np.triu(matrix_values > similarity_threshold, k=1)
row_idx, col_idx = np.nonzero(above_threshold)

similar_pairs = [
    (image_ids[i], image_ids[j], matrix_values[i, j])
    for i, j in zip(row_idx, col_idx)
]

# Create a DataFrame for better visualization
similar_pairs_df = pd.DataFrame(similar_pairs, columns=["Image ID 1", "Image ID 2", "Similarity Score"])

# Save the results and report the file that was actually written
similar_pairs_df.to_csv(pairs_output_file, index=False)
print(f"Similar pairs saved to '{pairs_output_file}'")


Similar pairs saved to 'similar_image_pairs.csv'
