In [1]:
import numpy as np
from preprocessing.text import tokenize
from models.llm import load_llm
from models.classifier import load_classifier_distilbert, load_classifier_bert, device
from tqdm import tqdm
from preprocessing.image import preprocess_and_check_image
from database.connect import conn
from preprocessing.document import extract_text_from_pdf
from tqdm import tqdm
import os
import wandb 

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Start a Weights & Biases run so this notebook session is tracked.
wandb.init(
    # set the wandb project where this run will be logged
    project="impossible-querry-pipeline",
    # name given to this particular run
    name="gemini-bert"
)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmehdinejjar[0m ([33mmehdinejjar-al-akhawayn-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# LLM client; its `generate_content` method is what the *_llm helpers call.
llm = load_llm()
# Classifier wrapper and its tokenizer; `trainer.predict` is used in the main
# loop to turn each LLM answer into a class index.
trainer, tokenizer = load_classifier_bert()

In [4]:
# Pull the whole product table into memory; `cursor.description` is read
# afterwards to recover the column names for these rows.
cursor = conn.cursor()
cursor.execute("SELECT * FROM Baby_Bath_Skin_and_Grooming;") 
rows = cursor.fetchall()

In [5]:
# Position of the image column inside each fetched row tuple.
IMAGE_INDEX = 4
# Position of the document (PDF) column; None because no such column is used here.
DOC_INDEX = None
# Column name(s) that identify a row; used to build the final WHERE clause.
PRIMARY_KEY = ['id']
# The question asked about every row, plus the instruction fragments that are
# combined with it depending on which modalities are sent to the LLM.
user_prompt = "Is this item a shampoo with a dispenser pump head?"
picture_prompt = "analyze the picture"
row_prompt = "analyze the row data"
document_prompt = "analyze the document"

# Modality switches read by the main loop (truthy = include that modality).
image_flag = 1
document_flag = 0

# Column names in row order, and the row positions of the primary-key columns.
columns = [col[0] for col in cursor.description]
pk_index = [i for i in range(len(columns)) if columns[i] in PRIMARY_KEY]

# Displayed as the cell output for a quick sanity check.
columns, pk_index

(['id',
  'name',
  'main_category',
  'sub_category',
  'image',
  'ratings',
  'no_of_ratings',
  'discount_price',
  'actual_price'],
 [0])

In [6]:
# Accumulators filled by the main loop, one entry appended per processed row.
results_text = []  # "Question: ... Answer: ..." strings fed to the classifier
PK = []  # primary-key values of each processed row
indices = []  # NOTE(review): never appended to in the visible cells
predicted_classes = []  # classifier argmax for each row

def text_llm(row):
    """Ask the LLM the user question using only the row's column values.

    The column at ``IMAGE_INDEX`` is left out of the textual row description.

    Args:
        row: One fetched table row, indexable by column position.

    Returns:
        The ``.text`` attribute of the LLM response.
    """
    instruction = f"{row_prompt}.\n{user_prompt}"

    # Build "name: value" pairs for every column except the image column.
    described_fields = []
    for position, column_name in enumerate(columns):
        if position != IMAGE_INDEX:
            described_fields.append(f"{column_name}: {row[position]}")
    row_summary = " | ".join(described_fields)

    parts = [f"Row data: {row_summary}\n\n", instruction]
    return llm.generate_content(parts).text

def image_llm(image, row):
    """Ask the LLM the user question about a row together with its image.

    The column at ``IMAGE_INDEX`` is left out of the textual row description
    because the image itself is passed as the first content part.

    Args:
        image: The image object handed to the LLM as-is.
        row: One fetched table row, indexable by column position.

    Returns:
        The ``.text`` attribute of the LLM response.
    """
    instruction = f"{picture_prompt}, and {row_prompt}.\n{user_prompt}"

    # Build "name: value" pairs for every column except the image column.
    described_fields = []
    for position, column_name in enumerate(columns):
        if position != IMAGE_INDEX:
            described_fields.append(f"{column_name}: {row[position]}")
    row_summary = " | ".join(described_fields)

    parts = [image, "\n\n", f"Row data: {row_summary}\n\n", instruction]
    return llm.generate_content(parts).text

def document_llm(document, row):
    """Ask the LLM the user question about a row together with its document.

    The column at ``DOC_INDEX`` is left out of the textual row description
    because the document content is passed as the first content part.

    Args:
        document: The document content handed to the LLM as-is.
        row: One fetched table row, indexable by column position.

    Returns:
        The ``.text`` attribute of the LLM response.
    """
    instruction = f"{document_prompt}, and {row_prompt}.\n{user_prompt}"

    # Build "name: value" pairs for every column except the document column.
    described_fields = []
    for position, column_name in enumerate(columns):
        if position != DOC_INDEX:
            described_fields.append(f"{column_name}: {row[position]}")
    row_summary = " | ".join(described_fields)

    parts = [document, "\n\n", f"Row data: {row_summary}\n\n", instruction]
    return llm.generate_content(parts).text
def image_document_llm(image, document, row):
    """Ask the LLM the user question about a row with both image and document.

    The columns at ``IMAGE_INDEX`` and ``DOC_INDEX`` are left out of the
    textual row description because both are passed as separate content parts.

    Args:
        image: The image object handed to the LLM as-is.
        document: The document content handed to the LLM as-is.
        row: One fetched table row, indexable by column position.

    Returns:
        The ``.text`` attribute of the LLM response.
    """
    prompt = f"{picture_prompt}, {document_prompt}, and {row_prompt}.\n{user_prompt}"
    # Media columns are excluded from the text; everything else becomes "name: value".
    text_data = " | ".join(
        f"{name}: {row[j]}"
        for j, name in enumerate(columns)
        if j != IMAGE_INDEX and j != DOC_INDEX
    )
    result = llm.generate_content(
        [image, document, "\n\n", f"Row data: {text_data}\n\n", prompt]
    )
    return result.text


# Per-row pipeline: ask the LLM the user question with whichever modalities are
# enabled and available, then classify the answer and record the row's key.
for i in tqdm(range(len(rows))):
    row = rows[i]

    # Load only the modalities that are switched on. The loaders presumably
    # return a falsy value on failure (the flow below relies on truthiness),
    # in which case that modality is simply dropped for this row.
    document = extract_text_from_pdf(row[DOC_INDEX]) if document_flag else None
    image = preprocess_and_check_image(row[IMAGE_INDEX]) if image_flag else None

    # Dispatch on what is actually available. The document-only case used to be
    # guarded by a repeated `document and image` test and could never run, which
    # left `result_text` unassigned (or stale from the previous row).
    if document and image:
        result_text = image_document_llm(image, document, row)
    elif document:
        result_text = document_llm(document, row)
    elif image:
        result_text = image_llm(image, row)
    else:
        result_text = text_llm(row)

    # Flatten the answer: drop newlines and markdown asterisks before classifying.
    strip_text = result_text.replace('\n', '').replace('*', '')

    classifier_text = f"Question: {user_prompt} Answer: {strip_text}"
    results_text.append(classifier_text)

    dataset_subset = tokenize(classifier_text, tokenizer)
    predictions = trainer.predict(dataset_subset)

    # A single example is scored per call, so take the first (only) argmax.
    predicted_class = np.argmax(predictions.predictions, axis=1)[0]
    predicted_classes.append(predicted_class)

    PK.append([row[j] for j in pk_index])

 74%|██████████████████████████████████████████████████████████▌                    | 224/302 [22:48<12:33,  9.66s/it]

Failed to process image from https://m.media-amazon.com/images/I/61qZEV9j2cS._AC_UL320_.jpg: HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


 82%|████████████████████████████████████████████████████████████████▊              | 248/302 [26:09<05:44,  6.37s/it]

Failed to process image from https://m.media-amazon.com/images/I/51L6xh0yVpL._AC_UL320_.jpg: HTTPSConnectionPool(host='m.media-amazon.com', port=443): Read timed out. (read timeout=10)


100%|███████████████████████████████████████████████████████████████████████████████| 302/302 [32:07<00:00,  6.38s/it]


In [7]:
# Display every classifier input string built by the main loop.
results_text

['Question: Is this item a shampoo with a dispenser pump head? Answer: No, the item is a liquid cleanser for baby bottles, accessories, and vegetables.  While it has a similar pump head to some shampoo bottles, the label clearly states its purpose.  The product information also confirms this.',
 'Question: Is this item a shampoo with a dispenser pump head? Answer: No, the item is a baby bath seat/bather.  The image clearly shows a fabric and plastic seat designed to support a baby while bathing. The product description in the data confirms this. Shampoo and dispenser pumps are not mentioned or depicted.',
 'Question: Is this item a shampoo with a dispenser pump head? Answer: No, the image and product description clearly indicate it is a massage oil.  It comes in a bottle with a flip-top cap, not a pump dispenser. Shampoo is for hair cleansing, while this product is designed for baby massage.',
 'Question: Is this item a shampoo with a dispenser pump head? Answer: The image shows a pump

In [8]:
# Class index to keep.
# NOTE(review): the meaning of class 0 is not defined in this notebook; the rows
# it selects are shampoo products, so it presumably means "yes" — confirm
# against the classifier's training labels.
label = 0
# Show, side by side, the rows predicted as `label` and the text that was classified.
[rows[i] for i in range(len(results_text)) if predicted_classes[i] == label], [results_text[i] for i in range(len(results_text)) if predicted_classes[i] == label]

([(41,
   'Aveeno Baby Gentle Wash & Shampoo with Natural Oat Extract, Tear-Free &, Lightly Scented, 18 fl',
   'toys & baby products',
   'Baby Bath, Skin & Grooming',
   'https://m.media-amazon.com/images/I/51Gpjipr8SL._AC_UL320_.jpg',
   Decimal('4.40'),
   '211',
   '₹1,589',
   '₹1,999'),
  (106,
   'Pigeon Baby Conditioning Shampoo, For Newborns, Strengthens and Nourishes Hair, Enriched with Chamomile, Rosehip and Olive...',
   'toys & baby products',
   'Baby Bath, Skin & Grooming',
   'https://m.media-amazon.com/images/I/51D3ndPw0FL._AC_UL320_.jpg',
   Decimal('4.20'),
   '581',
   '₹317',
   '₹435'),
  (110,
   'Mamaearth Daily Moisturizing Natural Baby Lotion (400 ml) & Mamaearth Deeply Nourishing Natural Baby wash (400 ml, 0-5 Yrs)',
   'toys & baby products',
   'Baby Bath, Skin & Grooming',
   'https://m.media-amazon.com/images/I/71ECf9SFIPL._AC_UL320_.jpg',
   Decimal('4.30'),
   '12,057',
   '₹578',
   '₹798'),
  (159,
   "Mother Sparsh Plant Powered Natural Baby Shampoo

In [9]:
# Primary-key values (one list per row) of the rows predicted as `label`.
# Relies on rows, results_text and predicted_classes being index-aligned.
pk = [[rows[i][j] for j in pk_index] for i in range(len(results_text)) if  predicted_classes[i] == label]
pk

[[41], [106], [110], [159], [211], [243], [250], [264], [269], [293], [304]]

In [10]:
def pk_clause(pk, PRIMARY_KEY):
    """Build the SQL predicate that matches one row by its primary key.

    Args:
        pk: Key values of a single row, ordered like ``PRIMARY_KEY``.
        PRIMARY_KEY: Column names that make up the key.

    Returns:
        ``"col = value"`` terms joined by ``" AND "``. String values are wrapped
        in single quotes with embedded quotes doubled; other values are
        formatted as-is, so numeric keys render exactly as before.

    Raises:
        IndexError: If ``pk`` has more values than ``PRIMARY_KEY`` has columns.
    """
    def _sql_literal(value):
        # Unquoted text would be invalid SQL (or injectable); quote and escape it.
        # Prefer driver-side parameter binding when the caller can use it.
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"
        return value

    return " AND ".join(
        f"{PRIMARY_KEY[i]} = {_sql_literal(value)}" for i, value in enumerate(pk)
    )

def where_clause(PK, PRIMARY_KEY):
    """Build a WHERE-clause body that matches any of the given primary keys.

    Args:
        PK: Iterable of per-row key value lists, each ordered like ``PRIMARY_KEY``.
        PRIMARY_KEY: Column names that make up the key.

    Returns:
        The per-row predicates from ``pk_clause`` joined by ``" OR "``. When
        ``PK`` is empty, returns the always-false predicate ``"1 = 0"`` so that
        ``... WHERE {clause}`` stays valid SQL and selects no rows instead of
        failing with a syntax error.
    """
    clauses = [pk_clause(single, PRIMARY_KEY) for single in PK]
    if not clauses:
        return "1 = 0"
    # AND binds tighter than OR in SQL, so composite keys need no parentheses.
    return " OR ".join(clauses)

In [11]:
# Final query: the same table, restricted to the rows predicted as `label`.
user_query = f"SELECT * FROM Baby_Bath_Skin_and_Grooming WHERE {where_clause(pk, PRIMARY_KEY)}"
user_query

'SELECT * FROM Baby_Bath_Skin_and_Grooming WHERE id = 41 OR id = 106 OR id = 110 OR id = 159 OR id = 211 OR id = 243 OR id = 250 OR id = 264 OR id = 269 OR id = 293 OR id = 304'

In [12]:
# Run the generated query on the same cursor and keep the matching rows.
cursor.execute(user_query) 
rows_filtered = cursor.fetchall()

In [13]:
# Display the rows returned by the generated query.
rows_filtered

[(41,
  'Aveeno Baby Gentle Wash & Shampoo with Natural Oat Extract, Tear-Free &, Lightly Scented, 18 fl',
  'toys & baby products',
  'Baby Bath, Skin & Grooming',
  'https://m.media-amazon.com/images/I/51Gpjipr8SL._AC_UL320_.jpg',
  Decimal('4.40'),
  '211',
  '₹1,589',
  '₹1,999'),
 (106,
  'Pigeon Baby Conditioning Shampoo, For Newborns, Strengthens and Nourishes Hair, Enriched with Chamomile, Rosehip and Olive...',
  'toys & baby products',
  'Baby Bath, Skin & Grooming',
  'https://m.media-amazon.com/images/I/51D3ndPw0FL._AC_UL320_.jpg',
  Decimal('4.20'),
  '581',
  '₹317',
  '₹435'),
 (110,
  'Mamaearth Daily Moisturizing Natural Baby Lotion (400 ml) & Mamaearth Deeply Nourishing Natural Baby wash (400 ml, 0-5 Yrs)',
  'toys & baby products',
  'Baby Bath, Skin & Grooming',
  'https://m.media-amazon.com/images/I/71ECf9SFIPL._AC_UL320_.jpg',
  Decimal('4.30'),
  '12,057',
  '₹578',
  '₹798'),
 (159,
  "Mother Sparsh Plant Powered Natural Baby Shampoo With Vanilla Oil, Vitamin E 