In [11]:
import pandas as pd

In [12]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
import pandas as pd

In [14]:
import pickle
with open("/content/drive/MyDrive/graph.pkl", "rb") as f:
    graph = pickle.load(f)

In [15]:
metadata = pd.read_csv("/content/drive/MyDrive/metadata.csv")

In [16]:
metadata = metadata[['pid','product_url','image','uniq_id','brand','brand_id', 'retail_price', 'discounted_price', 'product_rating',
           'overall_rating', 'is_FK_Advantage_product', 'product_name',
           'product_category_tree', 'product_specifications', 'description']]

In [17]:
df_final = pd.read_csv("/content/drive/MyDrive/pocre_embedding.csv")

In [18]:

TARGET_COLUMN = "overall_rating"

X_train = df_final.drop(columns=[TARGET_COLUMN])[:10000]  # Features
y_train = df_final[TARGET_COLUMN][:10000]
X_test = df_final.drop(columns=[TARGET_COLUMN])[10000:]  # Features
# y_test =  df_final[TARGET_COLUMN][10000:]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [19]:
import torch
import time

In [20]:

class GraphSageItem:
    def __init__(self, A, X, Y, learning_rate, n_iterations, n_n_1, n_n_2, n_n_3):
        self.H = X
        self.Y = Y
        self.A = self.A = {k: torch.tensor(list(v), dtype=torch.long) for k, v in A.items()}

        self.n_records = self.H.shape[0]
        self.n_features = self.H.shape[1]
        self.n_iterations = n_iterations
        self.learning_rate = learning_rate

        # Initialize Weights using Xavier Initialization
        self.W1 = torch.nn.Parameter(torch.empty(self.n_features, n_n_1))
        torch.nn.init.xavier_uniform_(self.W1)

        self.W2 = torch.nn.Parameter(torch.empty(n_n_1, n_n_2))
        torch.nn.init.xavier_uniform_(self.W2)

        self.W3 = torch.nn.Parameter(torch.empty(n_n_2, n_n_3))
        torch.nn.init.xavier_uniform_(self.W3)

        # Adam optimizer variables
        self.m1, self.v1 = torch.zeros_like(self.W1), torch.zeros_like(self.W1)
        self.m2, self.v2 = torch.zeros_like(self.W2), torch.zeros_like(self.W2)
        self.m3, self.v3 = torch.zeros_like(self.W3), torch.zeros_like(self.W3)

        self.beta1, self.beta2 = 0.9, 0.999  # Momentum coefficients
        self.epsilon = 1e-8  # Small constant to prevent division by zero
        self.t = 0  # Time step counter


    def mean_aggregate(self, H, sampled_neighbors):
        H_new = torch.zeros_like(H)
        num_nodes = H.shape[0]  # Total number of nodes in H

        for v in sampled_neighbors:
            if v >= num_nodes:  # Skip if v is out of bounds
                continue

            neighbors = list(sampled_neighbors[v])  # Convert set to list
            valid_neighbors = [n for n in neighbors if n < num_nodes]  # Keep only valid indices

            if valid_neighbors:  # Avoid division by zero
                H_new[v] = torch.mean(H[torch.tensor(valid_neighbors, dtype=torch.long)], dim=0)

        return H_new




    def relu(self, H):
        return torch.where(H > 0, H, 0.01 * H)

    def relu_derivative(self, H):
        return torch.where(H > 0, torch.ones_like(H), torch.full_like(H, 0.01))

    def predictor(self, predict_in, predict_W):
        return predict_in @ predict_W  # Matrix multiplication in PyTorch

    def forward_propagation(self):
        A_H = self.mean_aggregate(self.H, self.A)
        self.H1 = A_H @ self.W1
        self.relu_H1 = self.relu(self.H1)

        A_H1 = self.mean_aggregate(self.relu_H1, self.A)
        self.H2 = A_H1 @ self.W2
        self.relu_H2 = self.relu(self.H2)

        self.y_cap = self.predictor(self.relu_H2, self.W3)
    def backward_propagation(self):
        for i in range(self.n_iterations):
            start_time = time.time()
            self.forward_propagation()

            # Compute gradients
            dL_dy_cap = (2 * (self.Y - self.y_cap)) / self.n_records
            dy_cap_dreluH2 = self.W3
            dreluH2_dH2 = self.relu_derivative(self.H2)
            dH2_dreluH1 = self.W2
            dreluH1_dH1 = self.relu_derivative(self.H1)

            dL_dW1 = (self.H.T @ ((((dL_dy_cap @ dy_cap_dreluH2.T) * dreluH2_dH2) @ dH2_dreluH1.T) * dreluH1_dH1)) / self.n_records
            dL_dW2 = (self.relu_H1.T @ ((dL_dy_cap @ dy_cap_dreluH2.T) * dreluH2_dH2)) / self.n_records
            dL_dW3 = self.relu_H2.T @ dL_dy_cap

            # Update weights using Adam
            self.update_weights(dL_dW1, dL_dW2, dL_dW3)

            epoch_end = time.time()
            epoch_time = epoch_end - start_time
            print(f"Epoch {i+1}/{self.n_iterations} - Time: {epoch_time:.4f} sec")

    def update_weights(self, dL_dW1, dL_dW2, dL_dW3):
        """ Updates weights using the Adam optimizer. """
        self.t += 1  # Increment time step

        # **W1 Update**
        self.m1 = self.beta1 * self.m1 + (1 - self.beta1) * dL_dW1
        self.v1 = self.beta2 * self.v1 + (1 - self.beta2) * (dL_dW1 ** 2)
        m1_hat = self.m1 / (1 - self.beta1 ** self.t)
        v1_hat = self.v1 / (1 - self.beta2 ** self.t)

        with torch.no_grad():  # Prevent gradient tracking during update
            self.W1 -= self.learning_rate * m1_hat / (torch.sqrt(v1_hat) + self.epsilon)

        # **W2 Update**
        self.m2 = self.beta1 * self.m2 + (1 - self.beta1) * dL_dW2
        self.v2 = self.beta2 * self.v2 + (1 - self.beta2) * (dL_dW2 ** 2)
        m2_hat = self.m2 / (1 - self.beta1 ** self.t)
        v2_hat = self.v2 / (1 - self.beta2 ** self.t)

        with torch.no_grad():
            self.W2 -= self.learning_rate * m2_hat / (torch.sqrt(v2_hat) + self.epsilon)

        # **W3 Update**
        self.m3 = self.beta1 * self.m3 + (1 - self.beta1) * dL_dW3
        self.v3 = self.beta2 * self.v3 + (1 - self.beta2) * (dL_dW3 ** 2)
        m3_hat = self.m3 / (1 - self.beta1 ** self.t)
        v3_hat = self.v3 / (1 - self.beta2 ** self.t)

        with torch.no_grad():
            self.W3 -= self.learning_rate * m3_hat / (torch.sqrt(v3_hat) + self.epsilon)


    def get_embedding(self):
        return self.relu_H2

X = torch.tensor(X_train.values, dtype=torch.float32)
Y = torch.tensor(y_train.values, dtype=torch.float32)
Y = Y.unsqueeze(1)
A = graph.copy()
n_n_1 = int(X.shape[1])
n_n_2 = int(X.shape[1])
n_n_3 = 1
learning_rate = 0.001
n_iterations = 10
graphsageitem = GraphSageItem(A, X, Y, learning_rate, n_iterations, n_n_1, n_n_2, n_n_3)


In [21]:
graphsageitem.backward_propagation()

Epoch 1/10 - Time: 18.8520 sec
Epoch 2/10 - Time: 18.0059 sec
Epoch 3/10 - Time: 18.7052 sec
Epoch 4/10 - Time: 18.3507 sec
Epoch 5/10 - Time: 20.8390 sec
Epoch 6/10 - Time: 17.5446 sec
Epoch 7/10 - Time: 18.4504 sec
Epoch 8/10 - Time: 18.2202 sec
Epoch 9/10 - Time: 19.4157 sec
Epoch 10/10 - Time: 22.1084 sec


In [23]:
item_Embeddings = graphsageitem.get_embedding()

In [None]:
# trained_embeddings.to_csv('trained_embeddings.csv',index=False)

In [None]:
# trained_embeddings = pd.read_csv('trained_embeddings.csv',index=False)

In [97]:
import torch
from torch.nn.functional import cosine_similarity

class Local_Sensitive_Hashing:
    def __init__(self, threshold, num_vectors):
        self.threshold = threshold
        self.num_vectors = num_vectors

    def separation(self):
        buckets = {}
        for item_index, item in enumerate(self.items):
            bucket_key = ""
            for vector in self.random_vector:
                similarity = cosine_similarity(item.unsqueeze(0), vector.unsqueeze(0))
                bucket_key += "1" if similarity.item() > self.threshold else "0"
                # print(similarity.item())

            if bucket_key not in buckets:
                buckets[bucket_key] = []
            buckets[bucket_key].append(item_index)

        return buckets

    def fit(self, items):
        num_columns = items.shape[1]
        self.items = items  # Expecting a PyTorch tensor
        self.random_vector = torch.randn(self.num_vectors, num_columns)  # Torch tensor
        self.buckets = self.separation()

    def predict(self, query_embedding):
        bucket_key = ""
        for vector in self.random_vector:
            similarity = cosine_similarity(query_embedding, vector)
            bucket_key += "1" if similarity > self.threshold else "0"

        return self.buckets.get(bucket_key)

    def get_buckets(self):
        return self.buckets

    def sort_by_similarity(self, query_embedding, bucket_indices, X):
        if len(bucket_indices) == 0:
            return []

        similarities = cosine_similarity(query_embedding.unsqueeze(0), X[bucket_indices])
        sorted_indices = torch.argsort(similarities, descending=True)

        return torch.tensor(bucket_indices, dtype=torch.long)[sorted_indices][:5]

    def recommend_products(self, query_embedding, X):
        bucket_indices = self.predict(query_embedding)
        sorted_indices = self.sort_by_similarity(query_embedding, bucket_indices, X)

        return sorted_indices[0][:5]

# Example Usage:
threshold = 0.01
num_vectors = 8
lsh_model = Local_Sensitive_Hashing(threshold, num_vectors)
lsh_model.fit(item_Embeddings)


In [27]:
from sentence_transformers import SentenceTransformer
EMBEDDING_MODEL = SentenceTransformer("sentence-transformers/all-mpnet-base-v2")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.4k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [29]:
X_test = torch.tensor(X_test.values, dtype=torch.float32)

In [117]:
metadata.iloc[0]['product_url']

'http://www.flipkart.com/alisha-solid-women-s-cycling-shorts/p/itmeh2ffvzetthbb?pid=SRTEH2FF9KEDEFGF'

In [123]:
query = list(item_Embeddings[0])

In [124]:
import numpy as np
def recommender(query, embedding_model, lsh_model, item_embedding,metadata):

    if isinstance(query, str):
        query_embedding = torch.tensor(embedding_model.encode(query)).unsqueeze(0)

    elif isinstance(query, list):
        query_embedding = torch.tensor(query, dtype=torch.float32).unsqueeze(0)
    else:
        return "Invalid input. Please enter a product name or an embedding."
    recommended_items = np.array(lsh_model.recommend_products(query_embedding, item_embedding))
    return [metadata.iloc[index]['product_url'] for index in recommended_items]

recommender(query,EMBEDDING_MODEL,lsh_model,item_Embeddings,metadata)

['http://www.flipkart.com/fabiya-women-s-bikini-panty/p/itmejq4zhcgqtywz?pid=PANEJQ5YHAHQYD4F',
 'http://www.flipkart.com/ipg-cotton-solid-women-s-dupatta/p/itmegny45nghyxmg?pid=DUPEGNY4FJZUFGGQ',
 'http://www.flipkart.com/indistar-self-design-viscose-women-s-stole/p/itmehdjks9fzqt6y?pid=SCFEHDJKTM826ZRA',
 'http://www.flipkart.com/acm-pouch-swipe-elite-plus/p/itmejgyywrztzmzf?pid=ACCEJGYYMTQR4JXY',
 'http://www.flipkart.com/viha-net-embroidered-semi-stitched-salwar-suit-dupatta-material/p/itmeh3phkyuukspk?pid=FABEH3PHASHTTANZ']

In [115]:
import gradio as gr
import torch
import ast  # For safely converting string to a list

def gradio_recommendation(query):
    try:
        # Try to parse the input as a list
        if isinstance(query, str) and query.startswith("[") and query.endswith("]"):
            query = ast.literal_eval(query)  # Convert string to list
        return recommender(query, EMBEDDING_MODEL, lsh_model, item_Embeddings, metadata)
    except Exception as e:
        return f"Error: {str(e)}"

ui = gr.Interface(
    fn=gradio_recommendation,
    inputs=gr.Textbox(placeholder="Enter a product name or an embedding (as a list)"),
    outputs="text",
    title="LSH-Based Product Recommender",
    description="Enter a product name (text) or embedding (list), and get similar product recommendations.",
)

ui.launch()


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://8799f8f105ef46c418.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [34]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.22.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.11-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.8.0 (from gradio)
  Downloading gradio_client-1.8.0-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.1-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.6 

In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# def normalize(array):
#     return (array - np.min(array)) / (np.max(array) - np.min(array) + 1e-9)

# def sort_by_similarity_and_attributes(query_embedding, bucket_indices, X, metadata, weights):

#     bucket_embeddings = X[bucket_indices]  # Fetch bucket embeddings
#     similarities = cosine_similarity([query_embedding], bucket_embeddings)[0]  # Compute similarity

#     # Retrieve additional attributes
#     prices = metadata.loc[bucket_indices, "price"].values
#     discount = metadata.loc[bucket_indices, "discount"].values  # Assume higher value means newer

#     # Normalize attributes
#     norm_prices = normalize(prices)  # Lower price is better, so use (1 - normalized price)
#     norm_discount = normalize(discount)  # More recent is better

#     # Compute final ranking score
#     final_scores = (
#         weights["similarity"] * similarities +
#         weights["price"] * (1 - norm_prices) +
#         weights["discount"] * norm_discount
#     )

#     # Sort by final score (descending)
#     sorted_indices = np.argsort(final_scores)[::-1]

#     return [bucket_indices[i] for i in sorted_indices]  # Return sorted indices

# # Define weights for each factor (sum should be 1)
# weights = {
#     "similarity": 0.6,  # 50% importance to similarity
#     "price": 0.3,       # 20% importance to price (lower is better)
#     "discount":0.1
# }

# # Sort bucket items considering similarity + additional attributes
# sorted_indices = sort_by_similarity_and_attributes(query_embedding, bucket_indices, X, metadata, weights)

# # Print metadata of sorted items
# for idx in sorted_indices:
#     print(metadata.iloc[idx])


In [None]:
# from sklearn.metrics.pairwise import cosine_similarity
# import numpy as np

# def normalize(array):
#     """Min-max normalization to scale values between 0 and 1"""
#     return (array - np.min(array)) / (np.max(array) - np.min(array) + 1e-9)

# def sort_by_similarity_and_attributes(query_embedding, bucket_indices, X, metadata, user_preferences):
#     """
#     Sorts items based on similarity and additional attributes (price, popularity, recency).

#     Parameters:
#     - query_embedding: (1D numpy array) Embedding of the query product
#     - bucket_indices: (List or array) Indices of items in the bucket
#     - X: (2D numpy array) Dataset containing all embeddings
#     - metadata: (Pandas DataFrame) Contains product details (price, popularity, recency, category)
#     - user_preferences: (Dict) User-defined importance of similarity, price, popularity, recency

#     Returns:
#     - Sorted indices based on final ranking score
#     """
#     bucket_embeddings = X[bucket_indices]  # Fetch bucket embeddings
#     similarities = cosine_similarity([query_embedding], bucket_embeddings)[0]  # Compute similarity

#     # Retrieve additional attributes
#     prices = metadata.loc[bucket_indices, "price"].values
#     popularity = metadata.loc[bucket_indices, "popularity"].values
#     recency = metadata.loc[bucket_indices, "recency"].values
#     categories = metadata.loc[bucket_indices, "category"].values

#     # **Step 1: Filter items by the same category as the query product**
#     query_category = metadata.loc[bucket_indices[0], "category"]  # Assume first item is the query
#     filtered_indices = [i for i, cat in zip(bucket_indices, categories) if cat == query_category]

#     if not filtered_indices:
#         print("⚠ No items found in the same category. Showing all items.")
#         filtered_indices = bucket_indices  # Use full bucket if no category match

#     # **Step 2: Normalize attributes**
#     norm_prices = normalize(metadata.loc[filtered_indices, "price"].values)
#     norm_popularity = normalize(metadata.loc[filtered_indices, "popularity"].values)
#     norm_recency = normalize(metadata.loc[filtered_indices, "recency"].values)
#     similarities_filtered = similarities[[bucket_indices.index(i) for i in filtered_indices]]

#     # **Step 3: Compute final ranking score based on user preferences**
#     final_scores = (
#         user_preferences["similarity"] * similarities_filtered +
#         user_preferences["price"] * (1 - norm_prices) +  # Lower price is better
#         user_preferences["popularity"] * norm_popularity +
#         user_preferences["recency"] * norm_recency
#     )

#     # **Step 4: Sort by final score (descending)**
#     sorted_indices = np.argsort(final_scores)[::-1]

#     return [filtered_indices[i] for i in sorted_indices]  # Return sorted indices

# # **🔹 Get user preferences (adjust weights dynamically)**
# user_preferences = {
#     "similarity": 0.4,   # Reduced similarity weight
#     "price": 0.3,        # Increased price weight (cheaper is better)
#     "popularity": 0.2,   # Keep popularity factor
#     "recency": 0.1       # Keep recency factor
# }

# # **🔹 Sort bucket items considering similarity + additional attributes**
# sorted_indices = sort_by_similarity_and_attributes(query_embedding, query_bucket, X, metadata, user_preferences)

# # **🔹 Print metadata of sorted items**
# print("🔹 Recommended Products:")
# for idx in sorted_indices:
#     print(metadata.iloc[idx])


In [108]:
query

tensor([[ 1.7281e-01, -3.9376e-04,  1.0157e-01,  2.2259e-01,  2.7685e-01,
         -1.5954e-03,  2.5096e-01,  2.0724e-01,  2.0892e-01,  1.5455e-01,
         -2.6606e-03,  2.4616e-01, -1.7586e-03,  2.3979e-01,  1.7420e-01,
         -2.0923e-03, -2.2541e-03, -1.9108e-03, -2.3071e-03, -1.0503e-03,
         -2.5498e-03,  2.2197e-01, -1.6787e-03, -2.9925e-03, -2.5939e-03,
          1.8560e-01, -2.2199e-03, -1.2200e-03,  1.0356e-01, -1.3715e-03,
         -8.9116e-04,  1.8168e-01, -1.5964e-03, -1.0560e-03, -2.8544e-03,
          2.8379e-01,  2.9645e-01, -2.2130e-03,  2.3982e-01,  1.2618e-01,
         -6.7096e-04,  2.5307e-01, -7.1954e-04, -7.8887e-04, -1.7040e-03,
         -1.6482e-03,  7.9684e-02,  1.1348e-01, -8.7080e-04,  1.2891e-01,
         -2.2185e-03,  1.5517e-01, -1.5313e-03, -1.8035e-03,  2.0709e-01,
          1.6508e-01, -1.7558e-03,  3.1696e-01, -9.4234e-04, -2.1809e-03,
          2.6273e-01, -2.5267e-03,  2.2659e-01,  1.3252e-01,  2.7735e-01,
         -2.1189e-03, -1.5725e-03, -1.