In [1]:
import setuptools.dist

In [14]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd
import os

In [3]:
# Load the CSV file
df = pd.read_csv('Coles_cleaned.csv')

In [4]:
# Load DistilBERT tokenizer and model
pretrained_weights = 'distilbert-base-uncased'
tokenizer = DistilBertTokenizer.from_pretrained(pretrained_weights)
bert_model = TFDistilBertModel.from_pretrained(pretrained_weights)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [None]:
# Extract product names for tokenization
product_names = df['item_name'].fillna("").tolist()

In [7]:
# Generate embeddings for product names
product_embeddings = []
for name in product_names:
    # Tokenize product names
    tokenized = tokenizer.encode(name, add_special_tokens=True, max_length=20, truncation=True)
    padded = tokenized + [0] * (20 - len(tokenized))  # Pad to max length of 20
    input_tensor = tf.convert_to_tensor([padded])  # Convert to tensor
    
    # Pass through DistilBERT model
    output = bert_model(input_tensor)
    embedding = tf.reduce_mean(output.last_hidden_state, axis=1).numpy()  # Mean pooling
    product_embeddings.append(embedding[0])

product_embeddings = np.array(product_embeddings)  # Convert to NumPy array

In [12]:
# Parameters
batch_size = 32  # Number of product names per batch
embedding_save_path = "product_embeddings"  # Directory to save the embeddings

In [15]:
product_embeddings = []
for i in range(0, len(product_names), batch_size):
    # Get the current batch
    batch = product_names[i:i + batch_size]
    
    # Tokenize the batch
    tokenized_batch = tokenizer(batch, padding=True, truncation=True, max_length=20, return_tensors="tf")
    
    # Generate embeddings using DistilBERT
    output = bert_model(tokenized_batch['input_ids'])
    mean_pooled = tf.reduce_mean(output.last_hidden_state, axis=1).numpy()  # Mean pooling

    # Append to the embeddings list
    product_embeddings.extend(mean_pooled)

    # Save each batch inside the folder
    batch_file_path = os.path.join(embedding_save_path, f"batch_{i // batch_size}.npy")
    np.save(batch_file_path, mean_pooled)

# Convert all embeddings to a NumPy array
product_embeddings = np.array(product_embeddings)

# Save all embeddings to a single file inside the folder
all_embeddings_file_path = os.path.join(embedding_save_path, "all_embeddings.npy")
np.save(all_embeddings_file_path, product_embeddings)

# Load saved embeddings (for reuse)
loaded_embeddings = np.load(all_embeddings_file_path)

print("Embeddings shape:", loaded_embeddings.shape)
print(f"Embeddings saved in folder: {embedding_save_path}")

Embeddings shape: (20608, 768)
Embeddings saved in folder: product_embeddings


In [16]:
product_embeddings = np.load("product_embeddings/all_embeddings.npy")

In [17]:
# Compute similarity matrix
similarity_matrix = cosine_similarity(product_embeddings)

In [18]:
# Define a recommendation function
def recommend_products(product_name, top_n=5):
    # Find the index of the input product
    if product_name not in product_names:
        return f"Product '{product_name}' not found in the dataset."
    
    idx = product_names.index(product_name)
    # Get indices of top N similar products (excluding the input product itself)
    similar_indices = np.argsort(similarity_matrix[idx])[::-1][1:top_n + 1]
    # Fetch product names based on indices
    recommendations = [product_names[i] for i in similar_indices]
    return recommendations

In [20]:
# Example recommendation for the first product
example_product = product_names[0]
recommendations = recommend_products(example_product, top_n=5)
print(f"Recommendations for '{example_product}': {recommendations}")

Recommendations for 'Coles Boneless Pork Leg Roast | approx 2.1kg': ['Coles Boneless Pork Shoulder Roast | approx 2.6kg', 'Coles Pork Belly Roast Boneless | approx 1.3kg', 'Coles Lamb Boneless Shoulder Roast | approx 1.3kg', 'Coles Butcher Lamb Leg Roast Boneless | approx 1.08kg', 'Coles Lamb Whole Lamb Leg Roast | approx 2.8kg']
