# NoSQL J Comp
- Amal G - 20MIA1121                    
- Shashank Singh - 21MIA1110                  
- A J Dazzle - 21MIA1119




# Installing required packages

In [None]:
pip install gradio

Collecting gradio
  Downloading gradio-5.5.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.5-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.4.2 (from gradio)
  Downloading gradio_client-1.4.2-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart==0.0.12 (from gradio)
  Downloading python_multipart-0.0.12-py3-none-any.whl.metadata (1.9 kB)
Collecting ruff>=0.2.2 (from gradio)
  Downloading ruff-0.7.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metad

In [None]:
pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/647.5 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m645.1/647.5 kB[0m [31m25.5 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m647.5/647.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py) ... [?25l[?25hdone
  Created wheel for annoy: filename=annoy-1.17.3-cp310-cp310-linux_x86_64.whl size=552449 sha256=6d25bb644b1c7d2c7627952b124ed647bfd8766e760fcbb53f794e7b1dca52db
  Stored in directory: /root/.cache/pip/wheels/64/8a/da/f714bcf46c5efdcfcac0559e63370c21abe961c48e3992465a
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


# Testing of ML Model

- The images were converted to Base64 format.
- Multiple datasets were merged.

- The final dataset was then converted from CSV to JSON.
---



In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import transformers
from annoy import AnnoyIndex
import json

# Load the dataset from JSON.
# The encoding is stated explicitly so the read does not depend on the locale default.
with open('/content/recommendation_dataset_with_images.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert JSON data into a DataFrame
df_merged = pd.json_normalize(data)

# Replace missing values in the 'user_id' and 'review_summary' columns with empty strings
df_merged['user_id'] = df_merged['user_id'].fillna('')
df_merged['review_summary'] = df_merged['review_summary'].fillna('')

# Collaborative Filtering Setup (assuming 'rating' and 'user_id' exist):
# one row per user, one column per book name, cell = rating.
# Pairs with no rating are filled with 0; generate_recommendations treats 0 as "not rated yet".
user_item_matrix = pd.pivot_table(df_merged, values='rating', index='user_id', columns='name').fillna(0)

# BERT Embeddings Setup
pretrained_weights = 'distilbert-base-uncased'
tokenizer = transformers.DistilBertTokenizer.from_pretrained(pretrained_weights)
bert_model = transformers.TFDistilBertModel.from_pretrained(pretrained_weights)

# Tokenize 'review_summary' for BERT embeddings
tokenized_descriptions = df_merged['review_summary'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
maxlen = 40  # Fixed sequence length fed to the model; adjust based on BERT's max input length

# Truncate or right-pad (with 0) every token list to exactly `maxlen` ids.
# The padded rows are built as a new list instead of being written back with
# `series[i] = ...`: that assignment is label-based and only lines up with
# enumerate() while the Series carries a default 0..n-1 index.
tokenized_descriptions = np.array(
    [
        tokens[:maxlen] + [0] * (maxlen - len(tokens))  # a negative repeat count yields []
        for tokens in tokenized_descriptions
    ]
)

# Function to generate recommendations for a specific user
def generate_recommendations(user_id, top_n):
    """Recommend up to `top_n` book names for `user_id`.

    Steps:
      1. Take the first `top_n` books (in `user_item_matrix` column order) whose
         rating for this user is 0, i.e. not rated yet.
      2. For each of those, look up its nearest neighbours among the
         review-summary embeddings and collect their names.
      3. Merge both lists, drop duplicates and keep `top_n` names.

    Returns a list of names, or an error string when `user_id` has no row in
    `user_item_matrix`.

    NOTE: the embeddings and the Annoy index are rebuilt on every call; compute
    them once outside this function if it is going to be called repeatedly.
    """
    # Collaborative Filtering Recommendations
    if user_id not in user_item_matrix.index:
        return f"No data for user_id {user_id}"

    user_interactions = user_item_matrix.loc[user_id]
    collaborative_recommendations = user_interactions[user_interactions == 0].index.tolist()[:top_n]

    # BERT Embeddings-based Recommendations
    product_names = df_merged['name'].tolist()
    # Row position of the first occurrence of every name: one pass here instead of
    # an `in` scan plus an `.index()` scan of the whole list for every candidate.
    first_row_of = {}
    for position, name in enumerate(product_names):
        first_row_of.setdefault(name, position)

    description_embeddings = get_bert_embeddings(tokenized_descriptions)
    annoy_index = build_annoy_index(description_embeddings)

    bert_embeddings_recommendations = []
    for item_name in collaborative_recommendations:
        idx = first_row_of.get(item_name)
        if idx is None:
            continue
        # Ask for one extra neighbour and drop the first hit, which is expected to be
        # the query row itself.
        similar_items = annoy_index.get_nns_by_vector(description_embeddings[idx], top_n + 1)[1:]
        bert_embeddings_recommendations.extend(product_names[i] for i in similar_items)

    # Combine Recommendations
    # NOTE(review): a set has no defined order, so which `top_n` names survive the
    # slice can differ between runs.
    combined_recommendations = collaborative_recommendations + bert_embeddings_recommendations
    combined_recommendations = list(set(combined_recommendations))[:top_n]

    return combined_recommendations

# Function to get BERT embeddings
def get_bert_embeddings(input_ids):
    """Return one mean-pooled DistilBERT vector per row of `input_ids`.

    `input_ids` is the 2-D array of token ids that was right-padded with 0.
    Positions holding 0 are treated as padding: they are hidden from the model
    through the attention mask and left out of the average, so the amount of
    padding does not shift a summary's embedding. The first position of every
    row is skipped when averaging.

    Returns a NumPy array with one row per input row.
    """
    input_ids = np.asarray(input_ids)
    attention_mask = (input_ids != 0).astype('int32')
    last_hidden_states = bert_model(input_ids, attention_mask=attention_mask)[0].numpy()

    token_states = last_hidden_states[:, 1:maxlen + 1, :]
    token_mask = attention_mask[:, 1:maxlen + 1, np.newaxis].astype(token_states.dtype)
    # Guard against a row with no real token left after the first position.
    token_counts = np.maximum(token_mask.sum(axis=1), 1)
    return (token_states * token_mask).sum(axis=1) / token_counts

# Function to build Annoy index for embeddings
def build_annoy_index(embeddings, n_trees=100, metric='euclidean'):
    """Build an Annoy nearest-neighbour index over the rows of `embeddings`.

    Args:
        embeddings: 2-D array; row `i` is stored in the index under item id `i`.
        n_trees: number of trees passed to `AnnoyIndex.build` (default 100).
        metric: distance metric name passed to `AnnoyIndex` (default 'euclidean').

    Returns:
        The built `AnnoyIndex`.
    """
    embedding_size = embeddings.shape[1]
    index = AnnoyIndex(embedding_size, metric)
    for item_id, vector in enumerate(embeddings):
        index.add_item(item_id, vector)
    index.build(n_trees)
    return index

# Example Usage
user_id = "AVCGYZL8FQQTD"  # Replace with the actual user ID (must be in string format if necessary)
top_n = 10  # Number of recommendations
recommended_items = generate_recommendations(user_id, top_n)
if isinstance(recommended_items, str):
    # An unknown user comes back as an error message, not as a list of titles,
    # so it is printed on its own instead of under the recommendations label.
    print(recommended_items)
else:
    print("Combined Recommendations for user:", recommended_items)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Combined Recommendations for user: ['A Dose of Sanity: Mind, Medicine, and Misdiagnosis', 'By Honor Bound (The Lassiter Law) (Silhouette Intimate Moments)', 'History of Magic and the Occult', 'Open marriage;: A new life style for couples,', 'Audi Quattro: The Complete Story', 'Whispers of the Wicked Saints', "Dealing With Disappointment: Helping Kids Cope When Things Don't Go Their Way", 'Academic Freedom after September 11', 'The Complete Illustrated Guide to Chinese Medicine: A Comprehensive System for Health and Fitness', 'Alaska Sourdough']


# Removal of null values and storing as new dataset

In [None]:
import json

# Load JSON data from a file.
# The encoding is stated explicitly so the read does not depend on the locale default.
with open('/content/recommendation_dataset_with_images.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Function to recursively check for null values
def find_nulls(obj, path=""):
    """Return the paths of every None found inside `obj`.

    Dicts and lists are walked recursively. A dict key is appended as `.key`
    (or as the bare key when the path so far is empty) and a list position as
    `[i]`. Paths are returned in traversal order; `path` is the prefix used for
    `obj` itself.
    """
    def _walk(node, where):
        if node is None:
            yield where
        elif isinstance(node, dict):
            for key, child in node.items():
                yield from _walk(child, f"{where}.{key}" if where else key)
        elif isinstance(node, list):
            for position, child in enumerate(node):
                yield from _walk(child, f"{where}[{position}]")

    return list(_walk(obj, path))

# Collect the location of every null in the loaded data
null_paths = find_nulls(data)

# Report: either the "nothing found" message, or a header followed by one path per line
if not null_paths:
    print("No null values found.")
else:
    print("Null values found at the following paths:")
    print(*null_paths, sep="\n")


Null values found at the following paths:
[46].image
[51].image
[52].image
[53].image
[54].image
[57].image
[58].image
[59].image
[80].image
[173].image
[174].image
[192].image
[193].image
[194].image
[207].image
[303].image
[329].image
[330].image
[331].image
[338].image
[339].image
[340].image
[341].image
[342].image
[343].image
[344].image
[345].image
[346].image
[347].image
[348].image
[349].image
[370].image
[371].image
[372].image
[384].image


In [None]:
import json

# Load JSON data from a file.
# The encoding is stated explicitly so the read does not depend on the locale default.
with open('/content/recommendation_dataset_with_images.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Function to filter out rows with null values
def remove_null_rows(data):
    """Return the dict entries of `data` that have no None among their values.

    Only the immediate values of each entry are inspected; nested containers
    are not searched. Entries that are not dicts are dropped as well.
    """
    def _is_complete(record):
        return isinstance(record, dict) and not any(field is None for field in record.values())

    return [record for record in data if _is_complete(record)]

# Filter out rows with null values
cleaned_data = remove_null_rows(data)

# Save the cleaned data to a new JSON file.
# Written to the same absolute location the final model cell loads it from
# ('/content/cleaned_file.json'), so the hand-off does not depend on the kernel's
# current working directory.
with open('/content/cleaned_file.json', 'w') as f:
    json.dump(cleaned_data, f, indent=4)

print("Rows with null values removed. New JSON file saved as 'cleaned_file.json'.")


Rows with null values removed. New JSON file saved as 'cleaned_file.json'.


# Final ML Model implementation integrated with GRADIO UI after pre-processing.

In [None]:
import base64
import html
import json

import gradio as gr
import numpy as np
import pandas as pd
import transformers
from annoy import AnnoyIndex

# Load the dataset from JSON.
# The encoding is stated explicitly so the read does not depend on the locale default.
with open('/content/cleaned_file.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Convert JSON data into a DataFrame
df_merged = pd.json_normalize(data)

# Replace missing values in the 'user_id' and 'review_summary' columns with empty strings
df_merged['user_id'] = df_merged['user_id'].fillna('')
df_merged['review_summary'] = df_merged['review_summary'].fillna('')

# Collaborative Filtering Setup (assuming 'rating' and 'user_id' exist):
# one row per user, one column per book name, cell = rating.
# Pairs with no rating are filled with 0; generate_recommendations treats 0 as "not rated yet".
user_item_matrix = pd.pivot_table(df_merged, values='rating', index='user_id', columns='name').fillna(0)

# BERT Embeddings Setup
pretrained_weights = 'distilbert-base-uncased'
tokenizer = transformers.DistilBertTokenizer.from_pretrained(pretrained_weights)
bert_model = transformers.TFDistilBertModel.from_pretrained(pretrained_weights)

# Tokenize 'review_summary' for BERT embeddings
tokenized_descriptions = df_merged['review_summary'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
maxlen = 40  # Fixed sequence length fed to the model; adjust based on BERT's max input length

# Truncate or right-pad (with 0) every token list to exactly `maxlen` ids.
# The padded rows are built as a new list instead of being written back with
# `series[i] = ...`: that assignment is label-based and only lines up with
# enumerate() while the Series carries a default 0..n-1 index.
tokenized_descriptions = np.array(
    [
        tokens[:maxlen] + [0] * (maxlen - len(tokens))  # a negative repeat count yields []
        for tokens in tokenized_descriptions
    ]
)

# Placeholder image (transparent 1x1 pixel image in base64)
# generate_recommendations falls back to this data URI when a recommended book has no usable image.
placeholder_img = "data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAQAAAC1HAwCAAAAC0lEQVR42mP8/wcAAwAB/6EnnTkAAAAASUVORK5CYII="

# Function to get BERT embeddings
def get_bert_embeddings(input_ids):
    """Return one mean-pooled DistilBERT vector per row of `input_ids`.

    `input_ids` is the 2-D array of token ids that was right-padded with 0.
    Positions holding 0 are treated as padding: they are hidden from the model
    through the attention mask and left out of the average, so the amount of
    padding does not shift a summary's embedding.

    Returns a NumPy array with one row per input row.
    """
    input_ids = np.asarray(input_ids)
    attention_mask = (input_ids != 0).astype('int32')
    last_hidden_states = np.asarray(bert_model(input_ids, attention_mask=attention_mask)[0])

    token_mask = attention_mask[:, :, np.newaxis].astype(last_hidden_states.dtype)
    # Guard against a row that consists of padding only.
    token_counts = np.maximum(token_mask.sum(axis=1), 1)
    sentence_embeddings = (last_hidden_states * token_mask).sum(axis=1) / token_counts  # Masked mean pooling
    return sentence_embeddings

# Function to build Annoy index for embeddings
def build_annoy_index(embeddings, n_trees=10, metric='euclidean'):
    """Build an Annoy nearest-neighbour index over the rows of `embeddings`.

    Args:
        embeddings: 2-D array; row `i` is stored in the index under item id `i`.
        n_trees: number of trees passed to `AnnoyIndex.build` (default 10, kept
            small for quicker index building).
        metric: distance metric name passed to `AnnoyIndex` (default 'euclidean').

    Returns:
        The built `AnnoyIndex`.
    """
    embedding_size = embeddings.shape[1]
    index = AnnoyIndex(embedding_size, metric)
    for item_id, vector in enumerate(embeddings):
        index.add_item(item_id, vector.flatten())  # Flatten each embedding
    index.build(n_trees)
    return index

# Pre-compute embeddings and index
# Both are built once when this cell runs; generate_recommendations reads these
# two globals on every call instead of recomputing them.
description_embeddings = get_bert_embeddings(tokenized_descriptions)
annoy_index = build_annoy_index(description_embeddings)

# Function to generate recommendations for a specific user
def generate_recommendations(user_id, top_n):
    """Recommend up to `top_n` books, each with an image, for `user_id`.

    Steps:
      1. Take the first `top_n` books (in `user_item_matrix` column order) whose
         rating for this user is 0, i.e. not rated yet.
      2. For each of those, look up its nearest neighbours in the pre-built
         `annoy_index` and collect their names.
      3. Merge both lists, drop duplicates and keep `top_n` names.
      4. Pair every name with the image of its first row in `df_merged`, or with
         `placeholder_img` when no usable image is stored.

    Args:
        user_id: id to look up in `user_item_matrix`.
        top_n: number of recommendations; converted to a non-negative int.

    Returns:
        A list of `(name, image_url)` tuples, or an error string when `user_id`
        has no row in `user_item_matrix`.
    """
    # Check if the user exists in the dataset
    if user_id not in user_item_matrix.index:
        return f"No data found for user ID: {user_id}. Please check the ID."

    # The UI may deliver a float (e.g. 5.0); slicing and Annoy both need an int.
    top_n = max(int(top_n), 0)

    # Collaborative Filtering Recommendations
    user_interactions = user_item_matrix.loc[user_id]
    collaborative_recommendations = user_interactions[user_interactions == 0].index.tolist()[:top_n]

    # BERT Embeddings-based Recommendations
    product_names = df_merged['name'].tolist()
    # Row position of the first occurrence of every name: one pass here instead of
    # an `in` scan plus an `.index()` scan of the whole list for every candidate.
    first_row_of = {}
    for position, name in enumerate(product_names):
        first_row_of.setdefault(name, position)

    bert_embeddings_recommendations = []
    for item_name in collaborative_recommendations:
        idx = first_row_of.get(item_name)
        if idx is None:
            continue
        # Ask for one extra neighbour and drop the first hit, which is expected to be
        # the query row itself.
        similar_items = annoy_index.get_nns_by_vector(description_embeddings[idx].flatten(), top_n + 1)[1:]
        bert_embeddings_recommendations.extend(product_names[i] for i in similar_items)

    # Combine Recommendations
    # NOTE(review): a set has no defined order, so which `top_n` names survive the
    # slice can differ between runs.
    combined_recommendations = collaborative_recommendations + bert_embeddings_recommendations
    combined_recommendations = list(set(combined_recommendations))[:top_n]

    # Retrieve images for recommendations or use placeholder if image is missing
    recommendations_with_images = []
    for item in combined_recommendations:
        image_url = placeholder_img
        row = df_merged[df_merged['name'] == item]
        if not row.empty and 'image' in row.columns:
            candidate = row['image'].values[0]
            # Only a non-empty string is usable; None, '' and NaN fall back to the placeholder.
            if isinstance(candidate, str) and candidate:
                image_url = candidate
        recommendations_with_images.append((item, image_url))

    return recommendations_with_images

# Gradio Interface
def recommend_books(user_id, top_n):
    """Gradio callback: render the recommendations for `user_id` as HTML.

    Args:
        user_id: text typed by the user; surrounding whitespace is ignored.
        top_n: requested number of recommendations (may arrive as a float or None).

    Returns:
        An HTML string: one `<h2>` title plus `<img>` per recommendation, or a
        plain message when the input is invalid or the user is unknown.
    """
    user_id = (user_id or "").strip()
    if top_n is None or top_n < 1:
        return "Please enter a positive number of recommendations."

    recommendations = generate_recommendations(user_id, int(top_n))
    if isinstance(recommendations, str):  # If error message is returned
        # The message echoes the typed user id, so escape it before it is rendered as HTML.
        return html.escape(recommendations)

    # Create HTML output with title and image. Titles and image sources come from the
    # dataset and are escaped so stray markup characters cannot break or inject HTML.
    return ''.join(
        f'<h2>{html.escape(str(title))}</h2>'
        f'<img src="{html.escape(str(image_url), quote=True)}" width="150"><br>'
        for title, image_url in recommendations
    )

# Define Gradio interface
interface = gr.Interface(
    fn=recommend_books,
    inputs=[
        gr.Textbox(label="User ID", placeholder="Enter user ID"),
        # precision=0 makes the component hand the callback an int, which is what the
        # list slicing and the Annoy query downstream require; minimum=1 rejects
        # zero and negative counts in the UI.
        gr.Number(label="Top N Recommendations", value=5, precision=0, minimum=1)
    ],
    outputs="html"  # HTML output so each title can be shown together with its image
)

# Launch the Gradio app
interface.launch()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['vocab_projector.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://37601a34b80cafef44.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


