# How It Works

Preprocessing: Combines relevant features into a single column (clean_text).

Embedding Generation: Creates vector representations of product descriptions using Sentence-BERT.

Cosine Similarity: Matches the query to the most similar product embeddings.

Recommendation: Returns the top-k recommendations based on similarity scores.

# Load Dataset

In [None]:
import pandas as pd

# Read the product catalogue from the Colab working directory into a DataFrame.
df = pd.read_csv("/content/data.csv")
# Last expression of the cell: the notebook renders the first five rows.
df.head()

Unnamed: 0,id,slug,title,imgs,brand,category,vendor,used,address,availability,...,discounted_price,specifications,description,delivery_fee,delivery_details,warranty,warranty_type,average_rating,num_ratings,reviews
0,0,https://www.mega.pk/mobiles_products/23522/Not...,Nothing Phone 1 8GB RAM 256GB Storage Non PTA ...,['https://www.mega.pk/items_images/Nothing+Pho...,,Mobile,MEGA.PK,0,"Office 11, 12, 14 Basement Ahmed Center, I-8 M...",,...,,"{'RAM': '8GB', 'Memory quantity': '', 'Interna...",,,,,,,,[]
1,1,https://www.mega.pk/mobiles_products/23458/Opp...,Oppo F21 Pro 8GB Ram 128GB Storage 5G PTA Appr...,['https://www.mega.pk/items_images/Oppo+F21+Pr...,OPPO,Mobile,MEGA.PK,0,"Office 11, 12, 14 Basement Ahmed Center, I-8 M...",,...,,"{'RAM': '8gb', 'Memory quantity': '', 'Interna...",,,,,,,,[]
2,2,https://www.mega.pk/mobiles_products/24393/Tec...,Tecno Spark 10,['https://www.mega.pk/items_images/Tecno+Spark...,Tecno,Mobile,MEGA.PK,0,"Office 11, 12, 14 Basement Ahmed Center, I-8 M...",Coming Soon,...,,"{'RAM': '4GB,8GB', 'Memory quantity': '', 'Int...",,,,1 year,,,,[]
3,3,https://www.mega.pk/mobiles_products/24259/Viv...,Vivo V27 5G,['https://www.mega.pk/items_images/Vivo+V27+5G...,Vivo,Mobile,MEGA.PK,0,"Office 11, 12, 14 Basement Ahmed Center, I-8 M...",Coming Soon,...,,"{'RAM': '8GB,12GB', 'Memory quantity': '', 'In...",,,,1 year,,,,[]
4,4,https://www.mega.pk/mobiles_products/24204/App...,Apple Iphone 15 Pro Max,['https://www.mega.pk/items_images/Apple+Iphon...,Apple,Mobile,MEGA.PK,0,"Office 11, 12, 14 Basement Ahmed Center, I-8 M...",Coming Soon,...,,"{'RAM': '8GB', 'Memory quantity': '', 'Interna...",,,,,,,,[]


# Preprocessing & Cleaning

In [None]:
# Combine relevant features into a single text column
def clean_text(row):
    """Build the lower-cased text blob that is embedded for one product row.

    The title, brand and category are joined with the non-empty entries of
    the ``specifications`` cell, which holds the string form of a dict
    (for example ``"{'RAM': '8GB', 'Memory quantity': ''}"``).

    Args:
        row: A mapping-like row (``pandas.Series`` or ``dict``) with the keys
            ``title``, ``brand``, ``category`` and ``specifications``.

    Returns:
        ``"<title> <brand> <category> <specs>"`` in lower case. Missing
        fields contribute an empty string and ``<specs>`` is a
        space-separated list of ``key:value`` pairs. A specifications cell
        that cannot be parsed into a dict contributes no specs instead of
        raising.
    """
    # Local import keeps the function self-contained within this cell.
    import ast

    def _field(name):
        value = row[name]
        return value if pd.notna(value) else ""

    raw_specs = row['specifications']
    if isinstance(raw_specs, dict):
        spec_items = raw_specs
    elif pd.notna(raw_specs):
        # literal_eval only accepts Python literals, so text coming from the
        # CSV can never execute code the way eval() could.
        try:
            parsed = ast.literal_eval(raw_specs)
        except (ValueError, SyntaxError, TypeError):
            parsed = None
        spec_items = parsed if isinstance(parsed, dict) else {}
    else:
        spec_items = {}

    # Skip empty spec values so they do not add noise to the embedding text.
    specs = ' '.join(f"{k}:{v}" for k, v in spec_items.items() if v)
    return f"{_field('title')} {_field('brand')} {_field('category')} {specs}".lower()

# Build the text that will be embedded, one string per product row.
df['clean_text'] = df.apply(clean_text, axis=1)
# clean_text() always returns a string, so dropna() on its own never removes
# a row. Also drop rows whose text is only whitespace, i.e. products that
# have no title, brand, category or specifications to describe them.
df = df.dropna(subset=['clean_text'])
df = df[df['clean_text'].str.strip() != ""]


# Create Embeddings

In [None]:
from sentence_transformers import SentenceTransformer

# Load a pre-trained Sentence-BERT model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Generate embeddings for product descriptions.
# All texts are encoded in a single call so the model can batch them, which
# is much faster than invoking encode() once per row through Series.apply.
product_embeddings = model.encode(df['clean_text'].tolist(), show_progress_bar=True)

# Store one 1-D vector per row; passing df.index keeps vectors aligned with
# their rows even when the index is not a plain 0..n-1 range.
df['embeddings'] = pd.Series(list(product_embeddings), index=df.index)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

# Save and Load Embeddings

In [None]:
# Save the DataFrame, including its 'embeddings' column, to a pickle file
df.to_pickle('product_embeddings.pkl')

# Load the DataFrame back from the same file (lets a later session skip re-encoding)
# NOTE(review): unpickling can execute arbitrary code — only load files you created yourself.
df = pd.read_pickle('product_embeddings.pkl')

# Recommendation Engine

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Function to get recommendations based on a user query
def recommend_products(query, top_k=5):
    """Return the ``top_k`` products most similar to a free-text query.

    The query is lower-cased, embedded with the module-level ``model`` and
    compared against every vector in ``df['embeddings']`` using cosine
    similarity.

    Args:
        query: Free-text search string.
        top_k: Number of rows to return.

    Returns:
        A DataFrame with the columns ``title``, ``brand``, ``category``,
        ``similarity`` and ``imgs``, sorted by descending similarity.
        The global ``df`` is not modified; scores are attached to a copy.
    """
    columns = ['title', 'brand', 'category', 'similarity', 'imgs']

    # Nothing to rank: return an empty frame with the expected columns.
    if len(df) == 0:
        return df.assign(similarity=np.empty(0))[columns]

    # Products were embedded from lower-cased text, so match that here.
    query_embedding = model.encode(query.lower())

    # One (n_products, dim) matrix allows a single vectorised similarity
    # computation instead of one cosine_similarity call per row.
    embedding_matrix = np.vstack(df['embeddings'].tolist())
    scores = cosine_similarity([query_embedding], embedding_matrix)[0]

    ranked = df.assign(similarity=scores).sort_values(by='similarity', ascending=False)
    return ranked.head(top_k)[columns]

# Example usage: a hardware-specification style query
query = "8GB RAM smartphone"
recommendations = recommend_products(query)
# Last expression of the cell, so the notebook displays the resulting table.
recommendations

Unnamed: 0,title,brand,category,similarity,imgs
524,Samsung Galaxy Tab S8 Ultra 12GB (RAM) + 256GB...,Samsung,Mobile,0.59769,['https://www.czone.com.pk/images/thumbnails-l...
516,"Samsung Galaxy Tab S8 11"" 128GB | Czone.com.pk",Samsung,Mobile,0.550169,['https://www.czone.com.pk/images/thumbnails-l...
131,Samsung Galaxy A23 6GB Ram 128GB Storage LTE P...,Samsung,Mobile,0.542552,['https://www.mega.pk/items_images/Samsung+Gal...
130,Samsung Galaxy A53 8GB Ram 128GB Storage 5G PT...,Samsung,Mobile,0.540563,['https://www.mega.pk/items_images/Samsung+Gal...
488,"Samsung Galaxy Tab A7 Lite 8.7"" - 32GB",Samsung,Mobile,0.538458,['https://www.czone.com.pk/images/thumbnails-l...


In [None]:
# Example usage: a brand-name style query
query = "Oppo Smart phone"
recommendations = recommend_products(query)
# Last expression of the cell, so the notebook displays the resulting table.
recommendations

Unnamed: 0,title,brand,category,similarity,imgs
1358,Oppo A16e (Activated),,Mobile,0.725794,['https://images.priceoye.pk/oppo-a16e-activat...
1457,Oppo A54 (Activated),,Mobile,0.720386,['https://images.priceoye.pk/oppo-a54-activate...
1347,Oppo A16e,,Mobile,0.713075,['https://images.priceoye.pk/oppo-a16e-pakista...
1359,Oppo A54,,Mobile,0.710593,['https://images.priceoye.pk/oppo-a54-pakistan...
1489,Oppo A16,,Mobile,0.709044,['https://images.priceoye.pk/oppo-a16-pakistan...


# Website

In [None]:
pip install streamlit

Collecting streamlit
  Downloading streamlit-1.46.1-py3-none-any.whl.metadata (9.0 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m44.3/44.3 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.46.1-py3-none-any.whl (10.1 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m10.1/10.1 MB[0m [31m67.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m 

In [None]:
import ast
import html
import io
import pickle

import numpy as np
import pandas as pd
import requests
import streamlit as st
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# -------------------- CSS Styling --------------------
# Inject a <style> block into the page. The .title, .recommendation, .info
# and .similarity classes are referenced by the HTML this script emits for
# the app title and for each recommendation card.
# unsafe_allow_html=True is passed so the markup is emitted as HTML.
st.markdown("""
    <style>
        .main {
            background-color: #f9f9f9;
            font-family: 'Segoe UI', sans-serif;
        }
        .title {
            color: #2c3e50;
            font-size: 3em;
            font-weight: 700;
            text-align: center;
            margin-bottom: 10px;
        }
        .recommendation {
            border: 1px solid #e0e0e0;
            border-radius: 15px;
            padding: 15px;
            background-color: white;
            margin: 10px 0;
            box-shadow: 0 2px 10px rgba(0,0,0,0.05);
        }
        .recommendation img {
            max-width: 100%;
            border-radius: 10px;
        }
        .info {
            font-size: 1.1em;
        }
        .similarity {
            color: #16a085;
            font-weight: bold;
        }
    </style>
""", unsafe_allow_html=True)

# -------------------- App Title --------------------
# The emoji had been stored as mis-decoded UTF-8 bytes and rendered as
# garbage characters; it is restored to the intended shopping-bags glyph.
st.markdown("<div class='title'>🛍️ E-Commerce Product Recommender</div>", unsafe_allow_html=True)

# -------------------- Load Precomputed Dataset (.pkl from same folder) --------------------
@st.cache_data(show_spinner=True)
def load_data():
    """Load the precomputed product table, including its 'embeddings' column.

    The file is read with ``pandas.read_pickle``, the counterpart of the
    ``DataFrame.to_pickle`` call that produces ``product_embeddings.pkl``.

    Returns:
        The unpickled product DataFrame.

    Raises:
        FileNotFoundError: If ``product_embeddings.pkl`` is not in the
            working directory.
    """
    # SECURITY: unpickling can execute arbitrary code. Only ship a
    # product_embeddings.pkl that was generated by a trusted run of the
    # embedding notebook.
    return pd.read_pickle("product_embeddings.pkl")

try:
    df = load_data()
except FileNotFoundError:
    # Show a readable message in the page instead of a raw traceback.
    st.error("product_embeddings.pkl was not found next to the app. "
             "Generate it with the embedding notebook first.")
    st.stop()

# -------------------- Load Sentence-BERT Model --------------------
@st.cache_resource
def load_model():
    """Create the 'all-MiniLM-L6-v2' Sentence-BERT encoder used for queries.

    Returns:
        A ``SentenceTransformer`` instance for the 'all-MiniLM-L6-v2' model.
    """
    encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return encoder

model = load_model()

# -------------------- Product Recommendation Function --------------------
def recommend_products(query, top_k=5):
    """Return the ``top_k`` rows of ``df`` most similar to a free-text query.

    The query is lower-cased, embedded with the module-level ``model`` and
    compared against every vector in ``df['embeddings']`` using cosine
    similarity.

    Args:
        query: Free-text search string.
        top_k: Number of rows to return.

    Returns:
        A DataFrame holding all columns of ``df`` plus a ``similarity``
        column, sorted by descending similarity. The module-level ``df`` is
        not modified; scores are attached to a copy.
    """
    # Nothing to rank: return an empty frame that still has 'similarity'.
    if len(df) == 0:
        return df.assign(similarity=np.empty(0))

    query_embedding = model.encode(query.lower())

    # One (n_products, dim) matrix allows a single vectorised similarity
    # computation instead of one cosine_similarity call per row.
    embedding_matrix = np.vstack(df['embeddings'].tolist())
    scores = cosine_similarity([query_embedding], embedding_matrix)[0]

    ranked = df.assign(similarity=scores).sort_values(by='similarity', ascending=False)
    return ranked.head(top_k)

# -------------------- Search Box --------------------
def _first_image_url(raw_imgs):
    """Return the first image URL from an 'imgs' cell, or "" if none is usable.

    The cell normally holds the string form of a list of URLs; an actual
    list/tuple or a bare URL string is accepted as well.
    """
    if isinstance(raw_imgs, str):
        try:
            raw_imgs = ast.literal_eval(raw_imgs)
        except (ValueError, SyntaxError):
            # Not a Python literal: treat the text itself as a single URL.
            return raw_imgs.strip()
        if isinstance(raw_imgs, str):
            return raw_imgs.strip()
    if isinstance(raw_imgs, (list, tuple)) and raw_imgs:
        return str(raw_imgs[0])
    return ""


def _display_text(value):
    """HTML-escape a cell value for display, showing 'N/A' when it is missing."""
    if value is None or (isinstance(value, float) and pd.isna(value)):
        return "N/A"
    return html.escape(str(value))


query = st.text_input("🔍 Search for products (e.g. '8GB RAM smartphone')", '')

# Ignore input that is empty or only whitespace.
if query.strip():
    results = recommend_products(query)

    if results.empty:
        st.warning("No products found.")
    else:
        st.markdown("### 🔎 Top Recommendations:")
        for _, row in results.iterrows():
            # The markup is rendered with unsafe_allow_html, so every value
            # that comes from the dataset is escaped before interpolation.
            image_url = html.escape(_first_image_url(row['imgs']), quote=True)
            image_tag = f'<img src="{image_url}" alt="Product Image">' if image_url else ""
            card_html = (
                "<div class='recommendation'>"
                f"{image_tag}"
                "<div class='info'>"
                f"<b>Title:</b> {_display_text(row['title'])}<br>"
                f"<b>Brand:</b> {_display_text(row['brand'])}<br>"
                f"<b>Category:</b> {_display_text(row['category'])}<br>"
                f"<b>Similarity Score:</b> <span class='similarity'>{row['similarity']:.2f}</span>"
                "</div>"
                "</div>"
            )
            st.markdown(card_html, unsafe_allow_html=True)


2025-07-11 17:26:32.905 No runtime found, using MemoryCacheStorageManager
2025-07-11 17:26:32.906 No runtime found, using MemoryCacheStorageManager
2025-07-11 17:26:33.516 Session state does not function when running a script without `streamlit run`
