In [None]:
%pip install -r data-requirements.txt

In [None]:
import json
import os
import typing as t

# data prep
import pandas as pd
import numpy as np
import pyodbc

# for creating image vector embeddings
from PIL import Image
from img2vec_pytorch import Img2Vec

# for creating semantic (text-based) vector embeddings
from sentence_transformers import SentenceTransformer

# for Redis
import redis
from redis.commands.search.field import (
    NumericField,
    TagField,
    TextField,
    VectorField,
)
from redis.commands.search.indexDefinition import IndexDefinition, IndexType
from redis.commands.search.query import Query

from dotenv import load_dotenv

# Load connection info from the .env file one directory above the notebook.
load_dotenv('../.env')

# Fail fast with a readable message. Without this check a missing DB_LIMIT
# surfaces as an opaque int(None) TypeError, and any other missing value is
# silently formatted as the text "None" wherever it is interpolated.
_missing_settings = [
    name
    for name in ('DB_SERVER', 'DB_NAME', 'DB_USERNAME', 'DB_PASSWORD', 'DB_LIMIT')
    if os.environ.get(name) is None
]
if _missing_settings:
    raise RuntimeError(
        "Missing required setting(s): " + ", ".join(_missing_settings)
        + ". Define them in ../.env or in the process environment."
    )

DB_SERVER = os.environ['DB_SERVER']
DB_NAME = os.environ['DB_NAME']
DB_USERNAME = os.environ['DB_USERNAME']
DB_PASSWORD = os.environ['DB_PASSWORD']

# DB_LIMIT is used as a row limit and as a batch size further down, so it has
# to be a positive integer.
try:
    DB_LIMIT = int(os.environ['DB_LIMIT'])
except ValueError as exc:
    raise ValueError(
        f"DB_LIMIT must be an integer, got {os.environ['DB_LIMIT']!r}"
    ) from exc
if DB_LIMIT < 1:
    raise ValueError(f"DB_LIMIT must be a positive integer, got {DB_LIMIT}")


In [None]:
# connect to database, load in data
# NOTE(review): TrustServerCertificate=yes accepts the server certificate without
# validating it -- confirm this is acceptable outside local/demo use.
connection_string = f'DRIVER={{ODBC Driver 18 for SQL Server}};SERVER={DB_SERVER};DATABASE={DB_NAME};UID={DB_USERNAME};PWD={DB_PASSWORD};Encrypt=yes;TrustServerCertificate=yes'
conn = pyodbc.connect(connection_string)

try:
    # DB_LIMIT is interpolated into the statement text; it is an int (converted
    # with int() where it is loaded), so it cannot carry SQL fragments.
    product_query = f'SELECT TOP {DB_LIMIT} [id],[gender],[masterCategory],[subCategory],[articleType],[baseColour],[season],[year],[usage],[productDisplayName] FROM [aidemo].[styles]'

    df = pd.read_sql_query(product_query, conn)
finally:
    # Close the connection even when the query raises, so a failed run does not
    # leave a connection open in the kernel.
    conn.close()

# Display the DataFrame summary
df.info()

In [None]:
def _describe_product(row):
    """Return the lower-cased text for one product row that is later embedded."""
    fragments = (
        f"name {row['productDisplayName']}",
        f"category {row['masterCategory']}",
        f"subcategory {row['subCategory']}",
        f"color {row['baseColour']}",
        f"gender {row['gender']}",
    )
    return " ".join(fragments).lower()


# One text blob per product, built row by row.
df["product_text"] = df.apply(_describe_product, axis=1)

# Downstream code refers to the key column as "product_id".
df.rename(columns={"id": "product_id"}, inplace=True)

df.info()


In [None]:
# Peek at the text stored under index label 0 to sanity-check what will be embedded
df.loc[0, "product_text"]

In [None]:
# Checkpoint used for the text embeddings (bert variant)
TEXT_MODEL_NAME = 'sentence-transformers/all-distilroberta-v1'

# Image embedder (Resnet-18), created with cuda=False
img2vec = Img2Vec(cuda=False)

# Text embedder
model = SentenceTransformer(TEXT_MODEL_NAME)

In [None]:
def get_batch(seq, size):
    """Lazily yield consecutive slices of ``seq`` holding at most ``size`` items.

    The start offsets are computed when the function is called, so an invalid
    ``size`` (for example 0) or an unsized ``seq`` fails immediately rather
    than on first iteration.
    """
    offsets = range(0, len(seq), size)
    return (seq[start:start + size] for start in offsets)

def generate_image_vectors(products, image_base_path, batch_size=1000):
    """Create an image embedding for every product whose JPEG can be loaded.

    Args:
        products: DataFrame with a ``product_id`` column. The image for a
            product is read from ``<image_base_path>/<product_id>.jpg``.
        image_base_path: Directory that holds the product images.
        batch_size: Number of products handed to the embedding model per call.

    Returns:
        dict mapping product_id to the vector returned by ``img2vec.get_vec``.
        Products whose image is missing or cannot be decoded are left out.
    """
    output_dict = {}

    for batch in get_batch(products, batch_size):
        product_ids = batch['product_id'].values.tolist()
        images = []
        converted = []
        skipped = 0

        for _id in product_ids:
            img_path = f"{image_base_path}/{_id}.jpg"
            try:
                # The context manager closes the file handle; convert() returns
                # a converted copy, so the result stays usable afterwards.
                with Image.open(img_path) as source:
                    img = source.convert('RGB').resize((224, 224))
            except Exception:
                # Best effort: skip images that are missing or unreadable.
                # Unlike a bare ``except``, this lets KeyboardInterrupt and
                # SystemExit propagate so the run can still be interrupted.
                skipped += 1
                continue
            images.append(img)
            converted.append(_id)

        # Only call the model when there is something to embed; a batch in
        # which no image could be loaded would otherwise pass it an empty list.
        if images:
            vec_list = img2vec.get_vec(images)
            output_dict.update(zip(converted, vec_list))

        # Report what was actually embedded, not the nominal batch size.
        print(f"Processed {len(converted)} product images ({skipped} skipped)")

    return output_dict

def generate_text_vectors(products_df):
    """Create a float32 text embedding for every row of ``products_df``.

    Args:
        products_df: DataFrame with ``product_id`` and ``product_text`` columns.

    Returns:
        dict mapping product_id to a 1-D ``np.float32`` array. If a product_id
        occurs more than once, the last occurrence wins.
    """
    product_ids = products_df["product_id"].tolist()
    texts = products_df["product_text"].tolist()

    text_vectors = {}
    if texts:
        # A single encode() call over the whole list lets the model batch the
        # work internally instead of running one forward pass per row.
        # NOTE(review): batched results may differ from one-at-a-time encoding
        # by floating-point rounding -- confirm that is acceptable.
        embeddings = model.encode(texts)
        for product_id, embedding in zip(product_ids, embeddings):
            # astype() copies, so each stored vector is an independent array.
            text_vectors[product_id] = embedding.astype(np.float32)

    print(f"Processed {len(text_vectors)} product text fields")
    return text_vectors

def combine_vector_dicts(txt_vectors, img_vectors, products):
    """Pair up the text and image vectors of each product.

    Walks ``products`` row by row and builds one record per product holding
    both vectors as plain lists plus the product id. A row is skipped when any
    lookup raises ``KeyError`` (for example when a product has no image
    vector).

    Returns:
        list of dicts with the keys ``text_vector``, ``img_vector`` and
        ``product_id``, in row order.
    """
    combined = []
    for _, product in products.iterrows():
        try:
            pid = product["product_id"]
            record = {
                "text_vector": txt_vectors[pid].tolist(),
                "img_vector": img_vectors[pid].tolist(),
                "product_id": pid,
            }
        except KeyError:
            # At least one lookup failed for this row; leave it out.
            continue
        combined.append(record)
    return combined

def write_product_vector_json(vector_dict):
    """Serialize ``vector_dict`` to JSON and write it to ./product_vectors.json.

    The payload is serialized before the file is opened, so a serialization
    error leaves any existing file untouched.
    """
    payload = json.dumps(vector_dict)
    with open("./product_vectors.json", "w") as out_file:
        out_file.write(payload)

def write_product_metadata_json(metadata):
    """Serialize ``metadata`` to JSON and write it to ./product_metadata.json.

    The payload is serialized before the file is opened, so a serialization
    error leaves any existing file untouched.
    """
    payload = json.dumps(metadata)
    with open("./product_metadata.json", "w") as out_file:
        out_file.write(payload)

def create_product_metadata(metadata_df, image_base_path):
    """Build one metadata record per row of ``metadata_df``.

    Args:
        metadata_df: DataFrame holding ``product_id`` and the descriptive
            columns listed in the mapping below.
        image_base_path: Prefix for the ``image_url`` of every product, which
            is formed as ``<image_base_path>/<product_id>.jpg``.

    Returns:
        list of ``{"product_id": ..., "product_metadata": {...}}`` dicts in
        row order. ``keywords``, ``brand`` and ``age_group`` are always set to
        the empty string.
    """
    # (output field, source column) pairs, in the order they are emitted
    field_to_column = (
        ("name", "productDisplayName"),
        ("gender", "gender"),
        ("master_category", "masterCategory"),
        ("sub_category", "subCategory"),
        ("article_type", "articleType"),
        ("base_color", "baseColour"),
        ("season", "season"),
        ("year", "year"),
        ("usage", "usage"),
    )

    catalog = []
    for _, record in metadata_df.iterrows():
        pid = record["product_id"]
        details = {field: record[column] for field, column in field_to_column}
        details["image_url"] = "/".join((image_base_path, str(pid) + ".jpg"))
        details["keywords"] = ''
        details["brand"] = ''
        details["age_group"] = ''
        catalog.append({"product_id": pid, "product_metadata": details})

    return catalog


In [None]:
#process vector and metadata for products
data_path = "../app/vecsim_app/static/images"  # directory the image files are read from
images_base_path = "/images"  # prefix written into each product's image_url

# Slice once so every step below works on the same set of products.
limited_df = df[:DB_LIMIT]

image_vectors = generate_image_vectors(limited_df, data_path, DB_LIMIT)
text_vectors = generate_text_vectors(limited_df)

# Stop with a clear message instead of an IndexError on the dimension lookup.
if not image_vectors:
    raise ValueError(f"No image vectors were generated; check that {data_path!r} contains <product_id>.jpg files")
if not text_vectors:
    raise ValueError("No text vectors were generated; the product DataFrame is empty")

vector_dict = combine_vector_dicts(text_vectors, image_vectors, limited_df)

# Embedding sizes, read from the first vector of each kind.
image_dim = len(next(iter(image_vectors.values())))
text_dim = len(next(iter(text_vectors.values())))

metadata = create_product_metadata(limited_df, images_base_path)
#optional write to file system
write_product_metadata_json(metadata)
write_product_vector_json(vector_dict)