# WordToVec

In [None]:
!pip install pandas==1.5.3 gensim==4.3.1 nltk==3.8.1 scipy==1.9.3 tensorflow==2.15.0 --force-reinstall

In [None]:
import pandas as pd
import re
import json
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, GlobalMaxPooling1D, Dense
from tensorflow.keras.optimizers import Adam
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

In [None]:
# Load the raw BigBasket product catalogue for a first look at the data.
# NOTE(review): this cell reads '../BigBasket Products.csv' (parent directory)
# while the training section reads 'BigBasket Products.csv' from the working
# directory -- confirm which location is intended.
data1 = pd.read_csv('../BigBasket Products.csv')

In [None]:
# Preview the first few rows to check the available columns.
data1.head()

In [None]:
# Count the missing values in each column.
data1.isna().sum()

# Model Training

In [None]:
# Reload the catalogue for training; later cells keep transforming this `df`.
df = pd.read_csv('BigBasket Products.csv')

# Separators found inside the taxonomy columns: "& ", ", ", "*" and newlines.
# Compiled once because the pattern is applied to every cell of three columns.
_LABEL_SEPARATORS = re.compile(r'& |, |\*|\n')


def rmv_spc(a):
    """Return `a` with leading and trailing whitespace removed."""
    return a.strip()


def get_list(a):
    """Split a delimited label string into a list of stripped labels.

    Non-string cells (for example NaN for a missing value) produce an empty
    list instead of raising ``TypeError`` inside the regex split.
    """
    if not isinstance(a, str):
        return []
    return [rmv_spc(part) for part in _LABEL_SEPARATORS.split(a)]


# Turn each taxonomy column from one delimited string into a list of labels.
for col in ['category', 'sub_category', 'type']:
    df[col] = df[col].apply(get_list)

In [None]:
def cleaner(x):
    """Normalise a label, or a list of labels, for use in the text soup.

    Strings are lower-cased and have every space character removed; a list is
    normalised element by element. Any other value (for example NaN coming
    from a missing cell) becomes an empty string.
    """
    def _normalise(text):
        return text.lower().replace(' ', '')

    if isinstance(x, str):
        return _normalise(x)
    if isinstance(x, list):
        return list(map(_normalise, x))
    return ''
# Normalise the taxonomy columns and the brand in place: cleaner lower-cases
# each label and removes its spaces, so a multi-word label becomes one token.
for col in ['category', 'sub_category', 'type', 'brand']:
    df[col] = df[col].apply(cleaner)

In [None]:
def create_soup(x):
    """Build the single text string ("soup") that represents one product.

    `x` is a row-like mapping providing 'category', 'sub_category' and 'type'
    (iterables of strings), 'brand' (a string) and 'description'. The pieces
    are joined with single spaces in the order category, sub_category, brand,
    type, description. A description that is neither a string nor a
    list/tuple of strings (for example NaN) contributes an empty string, which
    leaves a trailing space in the result.
    """
    raw_description = x['description']
    if isinstance(raw_description, str):
        description_text = raw_description
    elif isinstance(raw_description, (list, tuple)):
        description_text = ' '.join(raw_description)
    else:
        description_text = ''

    pieces = [
        ' '.join(x['category']),
        ' '.join(x['sub_category']),
        x['brand'],
        ' '.join(x['type']),
        description_text,
    ]
    return ' '.join(pieces)

# Build one soup string per product by applying create_soup to every row.
df['soup'] = df.apply(create_soup, axis=1)

In [None]:
# Fit a word-level tokenizer on the soups; words unseen at fit time are
# mapped to the '<OOV>' token.
tokenizer = Tokenizer(lower=True, oov_token='<OOV>')
tokenizer.fit_on_texts(df['soup'])
seqs = tokenizer.texts_to_sequences(df['soup'])
# Pad every sequence at the end ('post') up to the length of the longest soup.
maxlen = max(len(s) for s in seqs)
X = pad_sequences(seqs, maxlen=maxlen, padding='post')

# +1 because Keras word indices start at 1; index 0 is left for padding.
vocab_size = len(tokenizer.word_index) + 1
embed_dim = 128  # width of the token embeddings passed to the Embedding layer

In [None]:
# encoder-decoder Architecture
# Dropout and BatchNormalization are used below but are missing from the
# notebook's top-level Keras layer import, so bring them into scope here.
from tensorflow.keras.layers import BatchNormalization, Dropout

# Encoder input: one padded sequence of token ids per product.
inp = Input(shape=(maxlen,), name='encoder_input')
x = Embedding(vocab_size, embed_dim, input_length=maxlen, name='emb')(inp)

# First recurrent stage; return_sequences=True keeps one output per timestep
# so the next BiLSTM can consume the whole sequence.
x = Bidirectional(LSTM(128, return_sequences=True), name='bilstm_1')(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)

# Second, narrower recurrent stage.
x = Bidirectional(LSTM(64, return_sequences=True), name='bilstm_2')(x)
x = Dropout(0.3)(x)
x = BatchNormalization()(x)

# Collapse the time axis by taking the per-feature maximum over all timesteps.
x = GlobalMaxPooling1D(name='pool')(x)

# Bottleneck layer
encoded = Dense(256, activation='relu', name='bottleneck_1')(x)
encoded = Dropout(0.3)(encoded)
# 128-unit output of the encoder; this tensor is what later becomes the
# product embedding.
encoded = Dense(128, activation='relu', name='bottleneck_2')(encoded)

# Training head: a probability distribution over the whole vocabulary.
decoded = Dense(vocab_size, activation='softmax', name='decoder_output')(encoded)

In [None]:
# Wrap the encoder and the softmax head into one trainable model.
# Sparse categorical cross-entropy takes integer class ids as targets, so the
# labels do not have to be one-hot encoded.
autoencoder = Model(inp, decoded, name='autoencoder')
autoencoder.compile(
    optimizer=Adam(1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

In [None]:
# The target is the first token id of each padded sequence, so the network is
# trained to predict a single token rather than to reconstruct its input.
# NOTE(review): confirm this objective is intended for a model named
# 'autoencoder'.
y = X[:, 0]
# validation_split=0.1 holds out 10% of the rows for validation.
autoencoder.fit(X, y, epochs=10, batch_size=128, validation_split=0.1)

In [None]:
# Keep only the path from the input to the 128-unit bottleneck; it is built
# from the same layer objects that were just trained inside `autoencoder`.
encoder = Model(inp, encoded, name='encoder')
encoder.save('bb_encoder.h5')

# Persist the fitted tokenizer so inference can reuse the same word index.
with open('tokenizer.json', 'w') as f:
    f.write(tokenizer.to_json())

# Embedding

Compute an embedding for every product with the saved encoder and store it in a new `embedding` column.

In [None]:
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import tokenizer_from_json

In [None]:
# Restore the tokenizer and the encoder written by the training section.
with open('tokenizer.json') as f:
    tok_json = f.read()
tokenizer = tokenizer_from_json(tok_json)
encoder = load_model('bb_encoder.h5')

In [None]:
# Re-encode every soup with the restored tokenizer and pad to the sequence
# length the encoder was built for (second dimension of its input shape).
seqs = tokenizer.texts_to_sequences(df['soup'])
maxlen = encoder.input_shape[1]
X = pad_sequences(seqs, maxlen=maxlen, padding='post')
# One encoder output vector per row of X.
embeddings = encoder.predict(X, batch_size=128)

In [None]:
# Store each vector as a plain Python list so it is written to the CSV as a
# list literal that can be parsed back later.
df['embedding'] = [e.tolist() for e in embeddings]
df.to_csv('BigBasket_Products_emb.csv', index=False)

# Test (main.py)

In [None]:
import ast
import logging
from urllib.parse import unquote

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the catalogue that already carries the stored 'embedding' column.
df = pd.read_csv('BigBasket_Products_emb.csv')

In [None]:
# The CSV holds each embedding as text; parse it back into a Python object.
df['embedding'] = df['embedding'].apply(ast.literal_eval)

In [None]:
# Stack the per-row embeddings into one NumPy array.
# Assumes every row holds a list of the same length -- TODO confirm.
embeddings = np.array(df['embedding'].to_list())

In [None]:
# Pairwise cosine similarity of every product against every other product.
# NOTE(review): the result has one row and one column per product, so memory
# grows quadratically with the catalogue size.
cosine_sim = cosine_similarity(embeddings, embeddings)

In [None]:
# Make row labels equal to row positions so they can index cosine_sim.
df = df.reset_index(drop=True)
# Map product name -> row position. Series.drop_duplicates() compares the
# *values* (row positions, which are already unique), so it never removed
# anything; de-duplicate on the product-name index instead and keep the first
# occurrence, so a name lookup always yields a single position.
indices = pd.Series(df.index, index=df['product'])
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def parse_list_fields(row, list_fields):
    """Turn stringified list fields of `row` back into lists, in place.

    Args:
        row: Mutable mapping-like record (dict or pandas Series) indexed by
            field name.
        list_fields: Names of the fields to inspect.

    Returns:
        The same `row` object. Every listed field whose value is a string
        starting with "[" is replaced by its parsed literal, or by an empty
        list when the text is not a valid Python literal. Other values are
        left untouched.
    """
    for field in list_fields:
        value = row[field]
        if isinstance(value, str) and value.startswith("["):
            try:
                row[field] = ast.literal_eval(value)
            # Only the errors literal_eval raises for malformed input; a bare
            # except would also swallow KeyboardInterrupt and SystemExit.
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                row[field] = []
    return row

In [None]:
# The functions below log through `logger`, which the notebook never defined.
# getLogger returns the same object for the same name, so repeating this
# line in another cell is harmless.
logger = logging.getLogger(__name__)


def get_recommendations(product_name, topn=10):
    """Return the products most similar to `product_name`.

    Args:
        product_name: Product name, possibly URL-encoded; it is URL-decoded
            before being looked up in `indices`.
        topn: Maximum number of recommendations; values below zero are
            treated as zero.

    Returns:
        A list of dicts (one per recommended product, without the
        'embedding', 'rating', 'soup' and 'index' keys), ordered by
        decreasing cosine similarity, or None when the product is unknown.
    """
    decoded = unquote(product_name)
    logger.info(f"Getting recommendations for: {decoded}")
    try:
        idx = indices[decoded]
    except KeyError:
        logger.warning(f"Product not found: {product_name}")
        return None
    if isinstance(idx, pd.Series):
        # Several rows share this name; use the first one.
        idx = idx.iloc[0]

    scores = np.asarray(cosine_sim[idx])
    # Stable descending order: equal scores keep their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row explicitly. Dropping "the first result" is wrong
    # whenever another product ties with the query's self-similarity and
    # sorts ahead of it.
    ranked = ranked[ranked != idx]
    rec_idxs = ranked[:max(topn, 0)].tolist()

    exclude_keys = {'embedding', 'rating', 'soup', 'index'}
    list_fields = ['category', 'sub_category', 'type']

    recommendations = []
    for _, row in df.iloc[rec_idxs].copy().iterrows():
        # List columns come back from the CSV as text; restore real lists.
        row = parse_list_fields(row, list_fields)
        recommendations.append(
            {k: v for k, v in row.items() if k not in exclude_keys}
        )

    logger.info(f"Found {len(recommendations)} recommendations for {decoded}")
    return recommendations

In [None]:
# The function below logs through `logger`, which the notebook never defined.
# getLogger returns the same object for the same name, so repeating this
# line in another cell is harmless.
logger = logging.getLogger(__name__)


def get_random_products(n=15):
    """Return up to `n` randomly chosen products as a list of dicts.

    Args:
        n: Number of products requested. When it exceeds the number of rows
            in `df`, every row is returned (in random order) instead of
            raising.

    Returns:
        A list of row dicts with the 'category', 'sub_category' and 'type'
        fields parsed back into lists where they were stored as text.
    """
    logger.info(f"Fetching {n} random products")
    # DataFrame.sample (without replacement) raises ValueError when asked for
    # more rows than exist, so cap the request at what is available.
    sample_size = min(n, len(df))
    sample_df = df.sample(n=sample_size).copy()
    list_fields = ['category', 'sub_category', 'type']
    sample_df = sample_df.apply(lambda row: parse_list_fields(row, list_fields), axis=1)
    logger.info("Random products fetched successfully")
    return sample_df.to_dict('records')

In [None]:
# Smoke test: fetch the nearest neighbours of one product.
# The name has to match an entry of the 'product' column exactly; otherwise
# get_recommendations returns None.
get_recommendations('Whisky Glass - Elegan')