In [1]:
from google.colab import drive
# Attach Google Drive to the Colab filesystem; the CSV loaded later lives under MyDrive.
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Mounted at /content/drive


In [2]:
import pandas as pd

# Read the BigBasket product catalogue from the mounted Drive folder
file_path = '/content/drive/MyDrive/bigBasketProducts.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as a quick sanity check of the columns
data.head(5)


Unnamed: 0,index,product,category,sub_category,brand,sale_price,market_price,type,rating,description
0,1,Garlic Oil - Vegetarian Capsule 500 mg,Beauty & Hygiene,Hair Care,Sri Sri Ayurveda,220.0,220.0,Hair Oil & Serum,4.1,This Product contains Garlic Oil that is known...
1,2,Water Bottle - Orange,"Kitchen, Garden & Pets",Storage & Accessories,Mastercook,180.0,180.0,Water & Fridge Bottles,2.3,"Each product is microwave safe (without lid), ..."
2,3,"Brass Angle Deep - Plain, No.2",Cleaning & Household,Pooja Needs,Trm,119.0,250.0,Lamp & Lamp Oil,3.4,"A perfect gift for all occasions, be it your m..."
3,4,Cereal Flip Lid Container/Storage Jar - Assort...,Cleaning & Household,Bins & Bathroom Ware,Nakoda,149.0,176.0,"Laundry, Storage Baskets",3.7,Multipurpose container with an attractive desi...
4,5,Creme Soft Soap - For Hands & Body,Beauty & Hygiene,Bath & Hand Wash,Nivea,162.0,162.0,Bathing Bars & Soaps,4.4,Nivea Creme Soft Soap gives your skin the best...


In [3]:
# Normalise the text fields, then profile the frame before any rows are altered

# Free-text columns are lower-cased and stripped of surrounding whitespace
text_columns = ['product', 'category', 'sub_category', 'brand', 'type', 'description']
data[text_columns] = data[text_columns].apply(
    lambda column: column.str.lower().str.strip()
)

# Per-column count of missing entries
missing_values = data.isna().sum()

# Number of rows that exactly repeat an earlier row
duplicate_rows = data.duplicated().sum()

# Column dtypes as inferred by pandas
data_types = data.dtypes

# Summary statistics of the numeric columns, useful for spotting outliers
numerical_stats = data.describe()

# Show all four findings together as the cell output
missing_values, duplicate_rows, data_types, numerical_stats



(index              0
 product            1
 category           0
 sub_category       0
 brand              1
 sale_price         0
 market_price       0
 type               0
 rating          8626
 description      115
 dtype: int64,
 0,
 index             int64
 product          object
 category         object
 sub_category     object
 brand            object
 sale_price      float64
 market_price    float64
 type             object
 rating          float64
 description      object
 dtype: object,
              index    sale_price  market_price        rating
 count  27555.00000  27555.000000  27555.000000  18929.000000
 mean   13778.00000    322.514808    382.056664      3.943410
 std     7954.58767    486.263116    581.730717      0.739063
 min        1.00000      2.450000      3.000000      1.000000
 25%     6889.50000     95.000000    100.000000      3.700000
 50%    13778.00000    190.000000    220.000000      4.100000
 75%    20666.50000    359.000000    425.000000      4.300000

In [4]:
# Handling missing values
# - 'product' / 'brand': drop rows where either is missing, as they are crucial for identification
# - 'rating': fill missing values with the median (to avoid the influence of extreme values)
# - 'description': fill missing values with a placeholder text

data.dropna(subset=['product', 'brand'], inplace=True)

# Assign the filled column back rather than calling fillna(..., inplace=True) on
# data[col]: the chained in-place form operates on an intermediate Series, which
# pandas warns about and which leaves `data` unchanged under Copy-on-Write.
data['rating'] = data['rating'].fillna(data['rating'].median())
data['description'] = data['description'].fillna('no description available')

# Text Data Preprocessing for 'description'
# Remove every character that is neither a word character nor whitespace.
# The pattern is a raw string because '\w' and '\s' are not valid Python string
# escapes; the regex itself is unchanged.
data['description'] = data['description'].str.replace(r'[^\w\s]', '', regex=True)

# Validate Categorical Data
# Collect the distinct values of each categorical column to spot inconsistencies
category_unique = data['category'].unique()
sub_category_unique = data['sub_category'].unique()
brand_unique = data['brand'].unique()
type_unique = data['type'].unique()

# Output the unique values for manual inspection
category_unique, sub_category_unique, brand_unique, type_unique



(array(['beauty & hygiene', 'kitchen, garden & pets',
        'cleaning & household', 'gourmet & world food',
        'foodgrains, oil & masala', 'snacks & branded foods', 'beverages',
        'bakery, cakes & dairy', 'baby care', 'fruits & vegetables',
        'eggs, meat & fish'], dtype=object),
 array(['hair care', 'storage & accessories', 'pooja needs',
        'bins & bathroom ware', 'bath & hand wash', 'all purpose cleaners',
        'skin care', 'mops, brushes & scrubs', 'cooking & baking needs',
        'chocolates & biscuits', 'fresheners & repellents',
        'snacks, dry fruits, nuts', 'dairy & cheese',
        'pasta, soup & noodles', 'dry fruits', 'drinks & beverages',
        'kitchen accessories', 'flask & casserole', 'breakfast cereals',
        'frozen veggies & snacks', 'fruit juices & drinks',
        'cookies, rusk & khari', 'fragrances & deos', 'tea',
        'masalas & spices', "men's grooming", 'chocolates & candies',
        'steel utensils', 'tinned & processe

In [None]:
from sentence_transformers import SentenceTransformer

# Text fields that are concatenated into the single sentence given to the encoder
columns_to_include = ['product', 'brand', 'category', 'sub_category',  'type', 'description']

# One space-separated string per row, with every value rendered through str()
data['combined_text'] = [
    ' '.join(map(str, row_values))
    for row_values in data[columns_to_include].to_numpy()
]

# Load the pretrained sentence encoder
model = SentenceTransformer('bert-base-nli-mean-tokens')

# Encode every combined string; one embedding is produced per row
embeddings = model.encode(list(data['combined_text']))


.gitattributes:   0%|          | 0.00/391 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.95k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/399 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

In [None]:
import numpy as np

# Persist the embeddings in NumPy's .npy format (path is relative to the working directory)
np.save(file='embeddings_combined.npy', arr=embeddings)

In [None]:
# Report the length of the first embedding vector
first_embedding = embeddings[0]
print(len(first_embedding))

768


In [5]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence-transformers-2.2.2.tar.gz (85 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting sentencepiece (from sentence_transformers)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: sentence_transformers
  Building wheel for sentence_transformers (setup.py) ... [?25l[?25hdone
  Created wheel for sentence_transformers: filename=sentence_transformers-2.2.2-py3-none-any.whl size=125923 sha256=1b6755943847aff521c96a9433a618c3d6371ca6f9d240f6fac9b713f77b374f
  Stored in directory: /root/.cache/pip/wheels/62/f2/10/1e606fd5f02395388f74e7462910fe851042f97238cbbd902f
Successfully built sentence_tr