In [1]:
import numpy as np
from preprocessing.text import tokenize
from models.llm import load_llm
from models.classifier import load_classifier_distilbert, load_classifier_bert, device
from tqdm import tqdm
from preprocessing.image import preprocess_and_check_image
from database.connect import conn
from preprocessing.document import extract_text_from_pdf
from tqdm import tqdm
import os
import wandb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Start a Weights & Biases run for this notebook session.
wandb.init(
    # W&B project the run is logged under.
    # NOTE(review): "querry" is spelled this way in the project name; it is a runtime
    # identifier, so confirm with the project owner before renaming it.
    project="generating-impossible-querry-pipeline",
    # Display name of this run inside the project.
    name="gemini-bert-impossible-query-pipeline",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmehdinejjar[0m ([33mmehdinejjar-al-akhawayn-university[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# LLM client; the *_llm prompt helpers in this notebook call llm.generate_content on it.
llm = load_llm()
# load_classifier_bert yields a (trainer, tokenizer) pair: the tokenizer is passed to
# tokenize(...) and trainer.predict(...) classifies the LLM answers in the processing loop.
trainer, tokenizer = load_classifier_bert()

In [4]:
# Fetch every row of the product table into memory. The cursor is kept around because
# later cells read cursor.description and reuse it to run the filtered query.
cursor = conn.cursor()
cursor.execute("SELECT * FROM Baby_Bath_Skin_and_Grooming;") 
rows = cursor.fetchall()

In [5]:
# --- Run configuration -------------------------------------------------------
# Position of the image column inside each fetched row (index 4 is 'image' in the
# column list printed by this cell).
IMAGE_INDEX = 4
# Position of the document column; None here, and the document branch is disabled below.
DOC_INDEX = None
# Names of the primary-key columns; used to locate their positions in `columns`.
PRIMARY_KEY = ['id']
# Question asked about every row; also embedded in the text handed to the classifier.
user_prompt = "Is this item have a real baby picture?"
# Scoring instructions that are appended to the question to form the LLM prompt.
guidelines = """IMPORTANT GUIDELINES:
1. Provide "Match Score" (0–100).
2. EXAMPLE: If real baby photo: match is 100. If it’s a cartoon or symbolize a baby match is [50-80]. 
3. If totally unrelated, near 0."""

# Short task instructions that the prompt helpers combine depending on the available media.
picture_prompt = "analyze the picture"
row_prompt = "analyze the row data"
document_prompt = "analyze the document"

# Switches read by the processing loop: truthy enables the image / document branch.
image_flag = 1
document_flag = 0

# Column names come from the first item of each cursor.description entry.
columns = [description[0] for description in cursor.description]
# Positions, within `columns`, of the columns that make up the primary key.
pk_index = [position for position, column_name in enumerate(columns) if column_name in PRIMARY_KEY]

columns, pk_index

(['id',
  'name',
  'main_category',
  'sub_category',
  'image',
  'ratings',
  'no_of_ratings',
  'discount_price',
  'actual_price'],
 [0])

In [9]:
# Accumulators filled by the processing loop further down in this cell.
results_text = []       # "Question: ... Answer: ..." strings handed to the classifier
PK = []                 # primary-key values of each processed row
indices = []            # NOTE(review): never written or read in the visible cells - confirm it is still needed
predicted_classes = []  # class index predicted for each processed row

# The question and the guidelines are concatenated directly, with no separator in between.
llm_prompt = user_prompt + guidelines

def text_llm(row):
    """Query the LLM about a row using only its column values.

    The column at ``IMAGE_INDEX`` is left out of the serialized row. The
    prompt is ``row_prompt`` followed by the shared ``llm_prompt``.

    Args:
        row: Sequence of column values, indexed in the same order as ``columns``.

    Returns:
        The ``text`` attribute of the LLM response.
    """
    instruction = f"{row_prompt}.\n{llm_prompt}"
    fields = [
        f"{name}: {row[position]}"
        for position, name in enumerate(columns)
        if position != IMAGE_INDEX
    ]
    serialized_row = " | ".join(fields)
    response = llm.generate_content([f"Row data: {serialized_row}\n\n", instruction])
    return response.text

def image_llm(image, row):
    """Query the LLM about a row together with its image.

    The column at ``IMAGE_INDEX`` is left out of the serialized row; the image
    object itself is sent as the first part of the request.

    Args:
        image: Image object forwarded unchanged to ``llm.generate_content``.
        row: Sequence of column values, indexed in the same order as ``columns``.

    Returns:
        The ``text`` attribute of the LLM response.
    """
    instruction = f"{picture_prompt}, and {row_prompt}.\n{llm_prompt}"
    fields = [
        f"{name}: {row[position]}"
        for position, name in enumerate(columns)
        if position != IMAGE_INDEX
    ]
    serialized_row = " | ".join(fields)
    request_parts = [image, "\n\n", f"Row data: {serialized_row}\n\n", instruction]
    response = llm.generate_content(request_parts)
    return response.text

def document_llm(document, row):
    """Query the LLM about a row together with its document.

    The column at ``DOC_INDEX`` is left out of the serialized row; the document
    is sent as the first part of the request.

    Args:
        document: Document content forwarded unchanged to ``llm.generate_content``.
        row: Sequence of column values, indexed in the same order as ``columns``.

    Returns:
        The ``text`` attribute of the LLM response.
    """
    instruction = f"{document_prompt}, and {row_prompt}.\n{llm_prompt}"
    fields = [
        f"{name}: {row[position]}"
        for position, name in enumerate(columns)
        if position != DOC_INDEX
    ]
    serialized_row = " | ".join(fields)
    request_parts = [document, "\n\n", f"Row data: {serialized_row}\n\n", instruction]
    response = llm.generate_content(request_parts)
    return response.text
def image_document_llm(image, document, row):
    """Query the LLM about a row together with both its image and its document.

    The columns at ``IMAGE_INDEX`` and ``DOC_INDEX`` are left out of the
    serialized row; the image and the document are sent as the first two
    parts of the request.

    Args:
        image: Image object forwarded unchanged to ``llm.generate_content``.
        document: Document content forwarded unchanged to ``llm.generate_content``.
        row: Sequence of column values, indexed in the same order as ``columns``.

    Returns:
        The ``text`` attribute of the LLM response.
    """
    instruction = f"{picture_prompt}, {document_prompt}, and {row_prompt}.\n{llm_prompt}"
    fields = [
        f"{name}: {row[position]}"
        for position, name in enumerate(columns)
        if position not in (IMAGE_INDEX, DOC_INDEX)
    ]
    serialized_row = " | ".join(fields)
    request_parts = [image, document, "\n\n", f"Row data: {serialized_row}\n\n", instruction]
    response = llm.generate_content(request_parts)
    return response.text


# For every fetched row: ask the LLM, then classify the LLM's answer.
for i in tqdm(range(len(rows))):
    row = rows[i]

    # Load only the media whose flag is enabled (document first, then image);
    # a disabled or unusable medium is falsy and is skipped by the dispatch below.
    document = extract_text_from_pdf(row[DOC_INDEX]) if document_flag else None
    image = preprocess_and_check_image(row[IMAGE_INDEX]) if image_flag else None

    # Use the richest prompt the available media allow; fall back to row text only.
    if document and image:
        result_text = image_document_llm(image, document, row)
    elif document:
        result_text = document_llm(document, row)
    elif image:
        result_text = image_llm(image, row)
    else:
        result_text = text_llm(row)

    # Turn newlines into spaces and drop asterisks before building the classifier input.
    strip_text = result_text.replace('\n', ' ').replace('*', '')
    classifier_text = f"Question: {user_prompt} Answer: {strip_text}"
    results_text.append(classifier_text)

    # Classify the single question/answer text and keep the arg-max class.
    dataset_subset = tokenize(classifier_text, tokenizer)
    predictions = trainer.predict(dataset_subset)
    predicted_class = np.argmax(predictions.predictions, axis=1)[0]
    predicted_classes.append(predicted_class)

    # Remember the row's primary-key values so matching rows can be re-queried later.
    PK.append([row[j] for j in pk_index])

100%|██████████| 302/302 [10:51<00:00,  2.16s/it]


In [7]:
# Inspect the "Question: ... Answer: ..." texts that were handed to the classifier.
results_text

['Question: Is this item have a real baby picture? Answer: Match Score: 100  The image clearly depicts a real baby interacting with a toy.  This aligns with the product description of a cleanser for baby bottles, accessories, and vegetables. ',
 'Question: Is this item have a real baby picture? Answer: Match Score: 0  There is no real baby in this picture. The image depicts a baby bath support seat with cartoon illustrations of sea creatures.  A plush narwhal headrest is also visible. ',
 "Question: Is this item have a real baby picture? Answer: The image on the bottle and box is a cartoon drawing of a baby's head. The image of massaging a baby's foot is also a cartoon. There are no real baby pictures. I would rate the match score between 50-80, given it depicts a baby, but is clearly not a photograph.  I'd lean toward the lower end of that range (closer to 50) since the overall image is simple and cartoonish. ",
 'Question: Is this item have a real baby picture? Answer: The image does

In [8]:
# Inspect the predicted class index recorded for each processed row.
predicted_classes

[0, 2, 1, 2, 1, 2, 2, 0, 1, 2]

In [32]:
# Preview the first ten fetched rows.
rows[:10]

[(11,
  'Luv Lap Liquid Cleanser, Anti-Bacterial, Food Grade, For Baby Bottles, Accessories and Vegetables, 1.5ltr',
  'toys & baby products',
  'Baby Bath, Skin & Grooming',
  'https://m.media-amazon.com/images/I/71p5fCtiCnL._AC_UL320_.jpg',
  Decimal('4.40'),
  '5,883',
  '₹509',
  '₹669'),
 (12,
  'Luv Lap LuvLap Aqua Tales Baby Bather for Baby 0-12 Months, New Born Baby Bath Chair, 3 Position Adjustable, Washable Soft...',
  'toys & baby products',
  'Baby Bath, Skin & Grooming',
  'https://m.media-amazon.com/images/I/91JNkTWv4TL._AC_UL320_.jpg',
  Decimal('4.10'),
  '3,200',
  '₹899',
  '₹1,599'),
 (13,
  'The Moms Co. Natural Baby Massage Oil with 10 Oils - Sesame Oil, Avocado, Organic Almond, Organic Jojoba, Organic Chamomil...',
  'toys & baby products',
  'Baby Bath, Skin & Grooming',
  'https://m.media-amazon.com/images/I/71EQE76D+hL._AC_UL320_.jpg',
  Decimal('4.40'),
  '2,265',
  '₹331',
  '₹474'),
 (14,
  'Chicco Baby Moments Mild Body Wash Refresh, New Advanced Formula Wi

In [24]:
# Class index to inspect.
label = 1
# Pair each row with its classifier text, keeping only rows predicted as `label`.
elements = [
    [rows[position], answer_text]
    for position, answer_text in enumerate(results_text)
    if predicted_classes[position] == label
]
elements

[[(13,
   'The Moms Co. Natural Baby Massage Oil with 10 Oils - Sesame Oil, Avocado, Organic Almond, Organic Jojoba, Organic Chamomil...',
   'toys & baby products',
   'Baby Bath, Skin & Grooming',
   'https://m.media-amazon.com/images/I/71EQE76D+hL._AC_UL320_.jpg',
   Decimal('4.40'),
   '2,265',
   '₹331',
   '₹474'),
  "Question: Is this item have a real baby picture? Answer: The image on the bottle and box is a cartoon drawing of a baby's head and a stylized image of a hand massaging a baby's foot. There are no real baby photos present.  Therefore, the match score is 60. "],
 [(15,
   'Cetaphil Baby Mild Bar 75gm, Kids Soap for Bath, White, Medium',
   'toys & baby products',
   'Baby Bath, Skin & Grooming',
   'https://m.media-amazon.com/images/I/61LHzpekqBL._AC_UL320_.jpg',
   Decimal('4.50'),
   '2,544',
   '₹210',
   '₹218'),
  "Question: Is this item have a real baby picture? Answer: The image depicts a stylized illustration of a baby's bottom being cleaned with a washcloth/p

In [23]:
# NOTE(review): the cell that builds `elements` stores two-item [row, text] lists, so
# under that definition elements[1] has no index 39 and this lookup would raise IndexError.
# The output recorded for this cell presumably came from an earlier shape of `elements`
# (the cell ran before the current definition) - confirm which index is intended.
elements[1][39]

'Question: Is this item have a real baby picture? Answer: The image depicts a pink hooded towel designed to look like a bear.  There is no actual baby pictured. The hood has bear ears and a simple embroidered face.  The item is clearly for babies, but only represents a baby wrapped in a towel by its shape and intended use.  Match Score: 70 '

In [9]:
# Primary-key values of every processed row whose predicted class equals `label`.
pk = [
    [rows[position][key_column] for key_column in pk_index]
    for position in range(len(results_text))
    if predicted_classes[position] == label
]
pk

[[41], [106], [110], [159], [211], [243], [250], [264], [269], [293], [304]]

In [10]:
def pk_clause(pk, PRIMARY_KEY):
    """Build the SQL equality test that identifies one row by its primary key.

    Args:
        pk: Key values of a single row, ordered like ``PRIMARY_KEY``.
        PRIMARY_KEY: Names of the primary-key columns.

    Returns:
        ``"column = value"`` terms joined with ``" AND "`` (an empty string
        when ``pk`` is empty).

    Raises:
        IndexError: If ``pk`` holds more values than ``PRIMARY_KEY`` has names.
    """
    def sql_literal(value):
        # Text keys must be quoted, with embedded single quotes doubled; otherwise
        # the value is read as an identifier or breaks the statement. Other values
        # are formatted exactly as before. Binding parameters through the database
        # driver remains the safer option when the caller can use it.
        if isinstance(value, str):
            return "'" + value.replace("'", "''") + "'"
        return value

    return " AND ".join(
        f"{PRIMARY_KEY[position]} = {sql_literal(value)}"
        for position, value in enumerate(pk)
    )

def where_clause(PK, PRIMARY_KEY):
    """Build a WHERE predicate matching any of the given primary keys.

    Args:
        PK: Iterable of per-row key value lists, each ordered like ``PRIMARY_KEY``.
        PRIMARY_KEY: Names of the primary-key columns.

    Returns:
        The per-row clauses from ``pk_clause`` joined with ``" OR "``. When
        ``PK`` is empty, returns ``"1 = 0"`` so that ``... WHERE <predicate>``
        stays valid SQL and selects no rows.
    """
    row_clauses = [pk_clause(single, PRIMARY_KEY) for single in PK]
    if not row_clauses:
        # Nothing was selected: an empty predicate would leave a dangling WHERE.
        return "1 = 0"
    return " OR ".join(row_clauses)

In [11]:
# SELECT restricted to the primary keys collected in `pk`.
user_query = "SELECT * FROM Baby_Bath_Skin_and_Grooming WHERE " + where_clause(pk, PRIMARY_KEY)
user_query

'SELECT * FROM Baby_Bath_Skin_and_Grooming WHERE id = 41 OR id = 106 OR id = 110 OR id = 159 OR id = 211 OR id = 243 OR id = 250 OR id = 264 OR id = 269 OR id = 293 OR id = 304'

In [12]:
# Run the primary-key filter query on the same cursor and fetch all matching rows.
cursor.execute(user_query) 
rows_filtered = cursor.fetchall()

In [13]:
# Display the rows returned by the primary-key filter query.
rows_filtered

[(41,
  'Aveeno Baby Gentle Wash & Shampoo with Natural Oat Extract, Tear-Free &, Lightly Scented, 18 fl',
  'toys & baby products',
  'Baby Bath, Skin & Grooming',
  'https://m.media-amazon.com/images/I/51Gpjipr8SL._AC_UL320_.jpg',
  Decimal('4.40'),
  '211',
  '₹1,589',
  '₹1,999'),
 (106,
  'Pigeon Baby Conditioning Shampoo, For Newborns, Strengthens and Nourishes Hair, Enriched with Chamomile, Rosehip and Olive...',
  'toys & baby products',
  'Baby Bath, Skin & Grooming',
  'https://m.media-amazon.com/images/I/51D3ndPw0FL._AC_UL320_.jpg',
  Decimal('4.20'),
  '581',
  '₹317',
  '₹435'),
 (110,
  'Mamaearth Daily Moisturizing Natural Baby Lotion (400 ml) & Mamaearth Deeply Nourishing Natural Baby wash (400 ml, 0-5 Yrs)',
  'toys & baby products',
  'Baby Bath, Skin & Grooming',
  'https://m.media-amazon.com/images/I/71ECf9SFIPL._AC_UL320_.jpg',
  Decimal('4.30'),
  '12,057',
  '₹578',
  '₹798'),
 (159,
  "Mother Sparsh Plant Powered Natural Baby Shampoo With Vanilla Oil, Vitamin E 