# Autoencoders

In [None]:
!pip install pandas==1.5.3 gensim==4.3.1 nltk==3.8.1 scipy==1.9.3 tensorflow==2.15.0 --force-reinstall

In [None]:
import pandas as pd
import re
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import tokenizer_from_json
from sklearn.metrics.pairwise import cosine_similarity
from urllib.parse import unquote
import json

In [None]:
# Load the raw product catalogue; `data1` is only inspected in the next cells.
# NOTE(review): this reads from the parent directory, while the later `df`
# load reads 'BigBasket Products.csv' from the current directory — confirm
# that both paths are intended.
data1 = pd.read_csv('../BigBasket Products.csv')

In [None]:
# Preview the first rows of the raw catalogue.
data1.head()

In [None]:
# Count missing values per column.
data1.isna().sum()

In [None]:
# Working copy of the catalogue; the cells below add and rewrite its columns.
# NOTE(review): `data1` above is read from '../BigBasket Products.csv' but
# this path has no '../' prefix — confirm which location is correct.
df = pd.read_csv('BigBasket Products.csv')

# Delimiters that separate individual labels inside one cell:
# "& ", ", ", "*" and a newline. Compiled once because it is used per row.
_SPLIT_PATTERN = re.compile(r'& |, |\*|\n')


def rmv_spc(a):
    """Return `a` with leading and trailing whitespace removed."""
    return a.strip()


def get_list(a):
    """Split a delimited label string into a list of stripped labels.

    Args:
        a: Cell value to split. Expected to be a string.

    Returns:
        A list with one stripped string per label. Non-string input (for
        example the float NaN pandas uses for a missing cell, or None)
        yields an empty list instead of raising TypeError in the regex split.
    """
    if not isinstance(a, str):
        return []
    return [rmv_spc(part) for part in _SPLIT_PATTERN.split(a)]

# Replace each delimited label column with a column of label lists.
for label_column in ('category', 'sub_category', 'type'):
    df[label_column] = df[label_column].apply(get_list)

In [None]:
def cleaner(x):
    """Normalise a label, or a list of labels, to lowercase without spaces.

    A string is cleaned directly and a list is cleaned element by element.
    Any other value (such as a missing-value float) is replaced by ''.
    """
    def _squash(text):
        # Lowercase first, then drop every space character.
        return text.lower().replace(' ', '')

    if isinstance(x, str):
        return _squash(x)
    if isinstance(x, list):
        return list(map(_squash, x))
    return ''
# Normalise the three label-list columns and the brand string column.
for text_column in ('category', 'sub_category', 'type', 'brand'):
    df[text_column] = df[text_column].apply(cleaner)

In [None]:
def create_soup(x):
    """Build the space-separated text "soup" for one product row.

    The soup is, in order: the joined 'category' labels, the joined
    'sub_category' labels, the 'brand' string, the joined 'type' labels and
    the description. A description that is a list or tuple is joined with
    spaces; one that is neither a sequence nor a string contributes ''.
    """
    raw_description = x['description']
    if isinstance(raw_description, (list, tuple)):
        description_text = ' '.join(raw_description)
    elif isinstance(raw_description, str):
        description_text = raw_description
    else:
        description_text = ''

    pieces = [
        ' '.join(x['category']),
        ' '.join(x['sub_category']),
        x['brand'],
        ' '.join(x['type']),
        description_text,
    ]
    return ' '.join(pieces)

# Build the text soup row by row (axis=1 passes each row to create_soup).
df['soup'] = df.apply(create_soup, axis=1)

In [None]:
# Restore the tokenizer from its JSON dump and load the saved encoder model.
# The encoding is given explicitly so the file is decoded the same way on
# every platform instead of depending on the locale default.
# NOTE(review): both artifacts are presumably produced by a separate training
# run — confirm they belong to the same run.
with open('tokenizer.json', encoding='utf-8') as f:
    tok_json = f.read()
tokenizer = tokenizer_from_json(tok_json)
encoder = load_model('bb_encoder.h5')

In [None]:
# Turn every soup string into a sequence of token ids with the loaded tokenizer.
seqs = tokenizer.texts_to_sequences(df['soup'])
maxlen = encoder.input_shape[1]  # same maxlen used during training
# Pad short sequences at the end so every row has exactly `maxlen` ids.
# NOTE(review): padding='post' (and the default truncation side) must match
# how the training data was prepared — confirm against the training code.
X = pad_sequences(seqs, maxlen=maxlen, padding='post')
# Run the encoder in batches of 128 rows to obtain one embedding per product.
embeddings = encoder.predict(X, batch_size=128)

In [None]:
# Store every embedding as a plain Python list in its own column, then write
# the enriched table to disk without the DataFrame index.
df['embedding'] = embeddings.tolist()
df.to_csv('BigBasket_Products_emb.csv', index=False)

# Test (main.py)

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from urllib.parse import unquote
import ast

In [None]:
# Load the catalogue together with the embeddings saved as a CSV column.
df = pd.read_csv('BigBasket_Products_emb.csv')

In [None]:
# The CSV stores each embedding as text; parse it back into a Python literal.
# ast.literal_eval only evaluates literals, so no arbitrary code is executed.
df['embedding'] = df['embedding'].apply(ast.literal_eval)

In [None]:
# Stack the per-row embedding lists into one NumPy array (one row per product).
embeddings = np.array(df['embedding'].to_list())

In [None]:
# Pairwise cosine similarity between every pair of products.
# NOTE(review): the result has one row and one column per product, so memory
# grows quadratically with the catalogue size — confirm it fits in RAM.
cosine_sim = cosine_similarity(embeddings, embeddings)

In [None]:
# Make row labels equal to row positions so they can index `cosine_sim`.
df = df.reset_index(drop=True)
# Map product name -> row position.
indices = pd.Series(df.index, index=df['product'])
# Keep only the first row for each product name. Series.drop_duplicates()
# compares the *values* (the row positions, which are always unique), so it
# never removed repeated names; de-duplicate on the index labels instead.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
def get_recommendations(product_name, topn=10):
    """Return the products most similar to `product_name`.

    Args:
        product_name: Product name to look up in `indices`. It is passed
            through `urllib.parse.unquote` first, so a URL-encoded name is
            accepted as well.
        topn: Maximum number of recommendations to return. Values less than
            or equal to zero yield an empty list.

    Returns:
        A list of dicts, one per recommended row of `df`, ordered by
        descending cosine similarity, with the 'embedding', 'rating', 'soup'
        and 'index' keys removed. Returns None when the decoded name is not
        present in `indices`.
    """
    decoded = unquote(product_name)
    try:
        idx = indices[decoded]
    except KeyError:
        return None
    # A name shared by several rows yields a Series; use its first row.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    if topn <= 0:
        return []

    scores = np.asarray(cosine_sim[idx])
    # A stable sort on the negated scores ranks rows by descending similarity
    # and keeps the lower row position first among ties.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the query row by position rather than assuming it sorts first:
    # duplicated or tied rows can outrank it, in which case dropping rank 0
    # would discard a real recommendation and return the query itself.
    rec_idxs = ranked[ranked != idx][:topn]

    exclude_keys = {'embedding', 'rating', 'soup', 'index'}

    return [
        {k: v for k, v in row.items() if k not in exclude_keys}
        for _, row in df.iloc[rec_idxs].iterrows()
    ]

In [None]:
def get_random_products(n=15):
    """Return up to `n` randomly sampled rows of `df` as a list of dicts.

    Args:
        n: Requested number of products. When it exceeds the number of rows
            in `df`, every row is returned (in random order) instead of
            raising ValueError.

    Returns:
        A list of row dicts as produced by `DataFrame.to_dict('records')`.
    """
    # NOTE(review): the records keep every column (including 'embedding' and
    # 'soup'), unlike get_recommendations, which strips them — confirm that
    # this is intended.
    sample_size = min(n, len(df))
    return df.sample(n=sample_size).to_dict('records')

In [None]:
# Smoke-test the recommender with a sample product name; the call returns
# None if this exact name is not present in `indices`.
get_recommendations('Whisky Glass - Elegan')