<a href="https://colab.research.google.com/github/Adithyan773/IKEA_recomendation_system/blob/main/IKEA_Text_Embedings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q faiss-cpu

In [None]:
import numpy as np
import pandas as pd
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input, Layer
from tensorflow.keras.models import Model
import faiss

# Load the product table that the embeddings and metadata are built from.
df = pd.read_csv('/content/ikea_data_img_fixed.csv')

# Define image directory
image_dir = '/content/drive/MyDrive/images/images'

# Define columns
text_columns = ['name', 'category', 'short_description', 'designer']
numeric_columns = ['depth', 'height', 'width', 'price', 'old_price']
all_columns = text_columns + numeric_columns

# Build the text that gets embedded. image_description is emphasized by
# repeating it three times, followed by name, category and short_description.
#
# Missing values are replaced with '' (and everything is cast to str) before
# joining: with plain `+`, a single NaN field turns the whole row's
# combined_text into NaN, which the tokenizer cannot consume.
_text_fields = {
    column: df[column].fillna('').astype(str)
    for column in ('image_description', 'name', 'category', 'short_description')
}
df['combined_text'] = (
    _text_fields['image_description'] + ' '
    + _text_fields['image_description'] + ' '
    + _text_fields['image_description'] + ' '
    + _text_fields['name'] + ' '
    + _text_fields['category'] + ' '
    + _text_fields['short_description']
)

# Load fine-tuned DistilBERT model and tokenizer from the same directory.
# from_pt=True asks transformers to convert a PyTorch checkpoint to TensorFlow.
model_path = '/content/drive/MyDrive/distilbert_v3'
tokenizer = DistilBertTokenizer.from_pretrained(model_path)
distilbert_base = TFDistilBertModel.from_pretrained(model_path, from_pt=True)

# Custom DistilBERT embedding layer
class DistilBertEmbeddingLayer(Layer):
    """Keras layer that wraps a DistilBERT model and emits one vector per sequence.

    The wrapped model is called with token ids and an attention mask, and the
    hidden state at sequence position 0 is returned (presumably the [CLS]
    token, given the DistilBERT tokenizer -- confirm if the tokenizer changes).
    """

    def __init__(self, distilbert_model, **kwargs):
        """Keep a reference to the model to wrap.

        Args:
            distilbert_model: Callable model accepting ``input_ids`` and
                ``attention_mask`` keyword arguments.
            **kwargs: Forwarded unchanged to the base ``Layer`` constructor.
        """
        super().__init__(**kwargs)
        self.distilbert_model = distilbert_model

    def call(self, inputs):
        """Run the wrapped model and return its position-0 hidden state.

        Args:
            inputs: Two-item sequence, token ids first and attention mask second.

        Returns:
            The ``[:, 0, :]`` slice of the model's ``last_hidden_state``.
        """
        token_ids, mask = inputs
        encoded = self.distilbert_model(input_ids=token_ids, attention_mask=mask)
        hidden_states = encoded.last_hidden_state
        return hidden_states[:, 0, :]

    def get_config(self):
        """Return the base layer configuration unchanged.

        NOTE(review): the wrapped model is not part of the returned config, so
        the config alone does not carry the ``distilbert_model`` argument that
        ``__init__`` requires.
        """
        return super().get_config()

# Embedding dimension. Used for both the Dense projection and the FAISS index
# so the two can never disagree.
d = 256

# Build text embedding model: DistilBERT position-0 vector -> Dense(d, relu).
# NOTE(review): nothing in this section trains the Dense head; its weights are
# whatever Keras initialised and are saved below so they can be reloaded later.
text_input_ids = Input(shape=(None,), dtype=tf.int32, name='input_ids')
text_attention_mask = Input(shape=(None,), dtype=tf.int32, name='attention_mask')
bert_layer = DistilBertEmbeddingLayer(distilbert_base)([text_input_ids, text_attention_mask])
text_embedding = Dense(d, activation='relu', name='text_embedding')(bert_layer)
text_model = Model(inputs=[text_input_ids, text_attention_mask], outputs=text_embedding)

# Tokenize the combined text (padded to the longest row, truncated at 128 tokens).
inputs = tokenizer(df['combined_text'].tolist(), return_tensors='tf', padding=True, truncation=True, max_length=128)

# Generate text embeddings
combined_embeddings = text_model.predict([inputs['input_ids'], inputs['attention_mask']])

# L2-normalize so that inner product equals cosine similarity.
# The ReLU head can emit an all-zero row, whose norm is 0; clamping the
# divisor keeps such rows as zeros instead of filling the index with NaN.
embedding_norms = np.linalg.norm(combined_embeddings, axis=1, keepdims=True)
combined_embeddings = combined_embeddings / np.maximum(embedding_norms, 1e-12)

# FAISS expects a C-contiguous float32 matrix; make that explicit.
combined_embeddings = np.ascontiguousarray(combined_embeddings, dtype=np.float32)

# Create and populate FAISS index (exact inner-product search).
text_index = faiss.IndexFlatIP(d)
text_index.add(combined_embeddings)

# Save FAISS index and metadata. Metadata rows are in the same order as the
# vectors added to the index, since both come from df in its original order.
faiss.write_index(text_index, '/content/text_embeddings_fixed.faiss')
metadata = df[all_columns + ['image_description']].to_numpy()  # Include image_description
np.save('/content/full_metadata_fixed.npy', metadata)

# Save model weights
filepath = '/content/text_model_weights_fixed.weights.h5'
text_model.save_weights(filepath)
print(f"Weights saved to: {filepath}")

print(f"Number of texts indexed: {text_index.ntotal}")
print(f"Metadata saved with shape: {metadata.shape}")