In [None]:
import os
from getpass import getpass


def load_secret(name):
    """Return the secret called `name` without storing it in the notebook.

    Lookup order:
      1. an environment variable that is already set,
      2. Colab Secrets (only when `google.colab` is importable),
      3. an interactive hidden prompt.

    Args:
        name: Name of the environment variable / Colab secret.

    Returns:
        The secret as a string, or "" when no source yields a value.
    """
    value = os.environ.get(name)
    if value:
        return value

    try:
        # Only importable on Google Colab; elsewhere this raises ImportError.
        from google.colab import userdata
        value = userdata.get(name)
    except Exception:
        # Not on Colab, or the secret is missing / notebook access not granted.
        value = None

    if not value:
        try:
            value = getpass(f"Enter {name}: ")
        except Exception:
            # No interactive stdin available (e.g. headless execution).
            # Leave the secret unset; later cells check for missing keys.
            value = ""

    return value or ""


# SECURITY: credentials must never be written into the notebook source.
# The keys that used to be hardcoded in this cell were exposed to anyone with
# access to the file and should be revoked and regenerated.
for _secret_name in ("PINECONE_API_KEY", "HUGGINGFACEHUB_API_TOKEN"):
    _secret_value = load_secret(_secret_name)
    if _secret_value:
        os.environ[_secret_name] = _secret_value


In [None]:
!pip install langchain langchain-community langchain-core pinecone
!pip install -U langchain-huggingface



In [None]:
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint

In [None]:
# --- GenAI (LangChain) ---
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
# from langchain.chains import LLMChain


In [None]:
# --- 0. Imports and Setup ---

# Install necessary libraries


# --- Core Libraries ---
import pandas as pd
import numpy as np
import os
import re
import requests
from io import BytesIO
from PIL import Image
import torch
from tqdm.auto import tqdm # For progress bars

# --- ML / NLP / CV (HuggingFace Transformers) ---
from transformers import AutoProcessor, AutoModel, pipeline
from sentence_transformers import SentenceTransformer

# --- Vector Database (Pinecone) ---
from pinecone import Pinecone, ServerlessSpec

# --- GenAI (LangChain) ---
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.llms import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
# from langchain.chains import LLMChain   # ✅ new location


# --- Evaluation ---
from sklearn.metrics import classification_report
from sklearn.preprocessing import MultiLabelBinarizer

# --- Environment Setup ---
# Reasoning: API keys are read from environment variables so that secrets
# do not have to be written into the notebook.
# You MUST set these in your environment (e.g., in a .env file or your OS).
PINECONE_API_KEY = os.environ.get("PINECONE_API_KEY")
HUGGINGFACEHUB_API_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

# Warn (but do not stop) when either key is missing or empty.
if not (PINECONE_API_KEY and HUGGINGFACEHUB_API_TOKEN):
    print("\n".join([
        "="*50,
        "WARNING: API keys not found in environment variables.",
        "Please set PINECONE_API_KEY and HUGGINGFACEHUB_API_TOKEN.",
        "="*50,
    ]))

# --- Device Setup (for PyTorch) ---
# Pick the first available backend in this order: CUDA, Apple MPS, CPU.
if torch.cuda.is_available():
    _device_name = "cuda"
elif torch.backends.mps.is_available():
    _device_name = "mps"
else:
    _device_name = "cpu"

DEVICE = torch.device(_device_name)

print(f"Using device: {DEVICE}")

Using device: cuda


In [None]:
# --- 1. Load and Preprocess Data ---

# Reasoning: We need a clean, consistent dataset. These steps ensure
# data types are correct and missing values are handled before
# feeding data into our models.

DATA_FILE_PATH = 'data/intern_data_ikarus.csv' # Adjust if your path is different

try:
    df = pd.read_csv(DATA_FILE_PATH)
except FileNotFoundError:
    print(f"Error: Dataset file not found at {DATA_FILE_PATH}.")
    print("Please ensure the file is in the 'data' directory.")
    # Fall back to a small mock dataset so the rest of the notebook can run.
    mock_columns = {
        'uniq_id': list(map(str, range(10))),
        'title': [
            'Comfy Chair', 'Large Sofa', 'Modern Table', 'Bright Lamp', 'Wooden Chair',
            'Patio Set', 'Bookshelf', 'Office Desk', 'King Bed', 'Nightstand',
        ],
        'description': [
            'A very comfortable chair for your living room.',
            'A large sofa, perfect for the whole family.',
            'A sleek modern table.',
            'A bright lamp to light up your desk.',
            'A sturdy wooden chair.',
            'Outdoor patio set with 4 chairs.',
            'Tall bookshelf for all your books.',
            'A professional office desk.',
            'A large king-sized bed frame.',
            'A small nightstand.',
        ],
        'price': [199.99, 799.00, 350.50, 80.00, 150.00, 450.00, 250.00, 300.00, 899.00, 120.00],
        'categories': [
            "['Home', 'Furniture', 'Chair']",
            "['Home', 'Furniture', 'Sofa']",
            "['Home', 'Furniture', 'Table']",
            "['Home', 'Decor', 'Lamp']",
            "['Home', 'Furniture', 'Chair']",
            "['Outdoor', 'Furniture', 'Patio Set']",
            "['Home', 'Furniture', 'Bookshelf']",
            "['Home', 'Furniture', 'Desk']",
            "['Home', 'Furniture', 'Bed']",
            "['Home', 'Furniture', 'Nightstand']",
        ],
        'brand': [
            'BrandA', 'BrandB', 'BrandA', 'BrandC', 'BrandA',
            'BrandD', 'BrandB', 'BrandA', 'BrandE', 'BrandE',
        ],
        'material': [
            'Wood', 'Fabric', 'Glass', 'Metal', 'Wood',
            'Metal', 'Wood', 'Wood', 'Fabric', 'Wood',
        ],
        'color': [
            'Brown', 'Gray', 'Black', 'Silver', 'Brown',
            'Black', 'White', 'Brown', 'Gray', 'White',
        ],
        'images': ["['https_www.example.com/img1.jpg']"] * 10 # Mock images
    }
    df = pd.DataFrame(mock_columns)
    print("Loaded mock data to proceed.")
else:
    print(f"Successfully loaded data. Shape: {df.shape}")


# --- Data Cleaning Steps ---
# 1. Keep only the first row for each 'uniq_id'.
df.drop_duplicates(subset='uniq_id', inplace=True)

# 2. A missing 'description' becomes an empty string.
df['description'] = df['description'].fillna('')

# 3. Rows missing any of these columns are removed.
required_columns = ['price', 'title', 'categories', 'images', 'uniq_id']
df.dropna(subset=required_columns, inplace=True)

# 4. Parse 'categories'
def parse_string_list(x):
    """Parse a stringified Python list such as "['a', 'b']" into a list of str.

    The value is first parsed as a Python literal, which handles items that
    are written with double quotes because they contain an apostrophe
    (e.g. "[\"Kids' Furniture\", 'Chair']"). If the text is not a valid
    list/tuple literal, the function falls back to extracting every
    single-quoted substring.

    Args:
        x: Any value; it is converted with str() before parsing, so NaN and
           None are accepted.

    Returns:
        A list of strings; empty when nothing could be extracted.
    """
    import ast  # local import keeps this definition self-contained

    text = str(x)
    try:
        parsed = ast.literal_eval(text)
    except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, (list, tuple)):
        # Only string items are kept, matching what the quote-based
        # extraction below would have returned.
        return [item for item in parsed if isinstance(item, str)]

    # Fallback for malformed input: grab every single-quoted substring.
    return re.findall(r"'(.*?)'", text)

# Categories: parsed list plus its last entry ('Unknown' when the list is empty).
df['categories_list'] = df['categories'].apply(parse_string_list)
df['primary_category'] = df['categories_list'].apply(
    lambda cats: cats[-1] if len(cats) > 0 else 'Unknown'
)

# 5. Parse 'images'
# Only the first URL of each list is kept; rows without any URL are removed.
df['image_url_list'] = df['images'].apply(parse_string_list)
df['first_image_url'] = df['image_url_list'].apply(
    lambda urls: urls[0] if len(urls) > 0 else None
)
df.dropna(subset=['first_image_url'], inplace=True) # Drop items with no images

# 6. Create a combined text field for embedding
# Title and description are joined with " | " into a single string.
df['text_to_embed'] = df['title'] + " | " + df['description']

# --- Sample the data for this assignment ---
# At most 1000 rows are sampled, with a fixed seed so the draw is repeatable.
df_sample = df.sample(n=min(1000, len(df)), random_state=42)

print(f"Data cleaned. Using a sample of {len(df_sample)} items for modeling.")
preview_columns = ['uniq_id', 'text_to_embed', 'first_image_url', 'primary_category']
print(df_sample[preview_columns].head())

Successfully loaded data. Shape: (312, 12)
Data cleaned. Using a sample of 210 items for modeling.
                                  uniq_id  \
53   9762324d-bb95-5daa-a613-7c5f77eb01f6   
262  6aaac02f-cfdb-5330-88e5-fd88a1857f51   
139  e558c4f4-12f2-5cfd-a314-d69c22675547   
298  c88138fb-5c2c-5f19-939a-a18c479ce897   
107  54e4f202-a43e-5859-b47e-3c81ef395b31   

                                         text_to_embed  \
53   Franklin Sports NFL Storage Ottoman + Containe...   
262  SLLFLY Water Bottle Organizer,Stackable Water ...   
139  Homevany Bamboo Wine Rack,4 Tier, Wine Bottle ...   
298  KINGYES Folding Adjustable Backrest Adirondack...   
107  Xchouxer Side Tables Natural Bamboo Sofa Armre...   

                                       first_image_url  \
53   https://m.media-amazon.com/images/I/31ptZB+wS-...   
262  https://m.media-amazon.com/images/I/51EAJVwOuL...   
139  https://m.media-amazon.com/images/I/51DO5hfgdK...   
298  https://m.media-amazon.com/images/I/41RnRNOg

In [None]:
# --- 2. NLP: Generate Text Embeddings ---

# Each product's combined title/description text is turned into a vector
# with the pre-trained 'all-MiniLM-L6-v2' sentence-embedding model, loaded
# through LangChain's HuggingFaceEmbeddings wrapper.
# This fulfills the NLP requirement.

print("Initializing text embedding model (all-MiniLM-L6-v2)...")
text_embedder = HuggingFaceEmbeddings(
    model_kwargs={'device': DEVICE},
    model_name="sentence-transformers/all-MiniLM-L6-v2",
)

print("Generating text embeddings for all products in the sample...")
texts = df_sample['text_to_embed'].tolist()

# embed_documents() embeds the whole list in one call.
text_embeddings = text_embedder.embed_documents(texts)

# Store one vector per row.
df_sample['text_embedding'] = list(text_embeddings)

print("Text embeddings generated successfully.")
first_text_vector = df_sample['text_embedding'].iloc[0]
print(f"Vector dimension: {len(first_text_vector)}")

Initializing text embedding model (all-MiniLM-L6-v2)...
Generating text embeddings for all products in the sample...
Text embeddings generated successfully.
Vector dimension: 384


In [None]:
# --- 3. CV: Generate Image Embeddings ---

# Reasoning: Each product image is represented as a vector produced by the
# CLIP model "openai/clip-vit-base-patch32".
# This fulfills the CV requirement.

# --- Read the Hugging Face token from the environment ---
HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

if not HF_TOKEN:
    print("="*50)
    print("ERROR: HUGGINGFACEHUB_API_TOKEN not found.")
    print("Please add it to Colab Secrets and restart the runtime.")
    print("="*50)
else:
    print("Token found. Initializing CV embedding model (CLIP)...")

    model_name = "openai/clip-vit-base-patch32"

    # Load the CLIP model and processor, passing the token explicitly.
    clip_model = AutoModel.from_pretrained(
        model_name,
        token=HF_TOKEN
    ).to(DEVICE)

    clip_processor = AutoProcessor.from_pretrained(
        model_name,
        token=HF_TOKEN
    )

    def get_image_embedding(image_url, errors=None):
        """Download an image and return its CLIP embedding.

        Args:
            image_url: URL of the image. Surrounding whitespace is stripped
                before the request is made.
            errors: Optional list. When given, a `(image_url, reason)` tuple
                is appended for every failure so the caller can report why
                items were dropped.

        Returns:
            The embedding as a flat list of floats, or None if the download,
            decoding or model call failed.
        """
        try:
            # 1. Download image (stray whitespace in the URL would break it)
            response = requests.get(str(image_url).strip(), timeout=5)
            response.raise_for_status() # Raise error for bad responses
            image = Image.open(BytesIO(response.content)).convert("RGB")

            # 2./3. Preprocess and embed without tracking gradients
            with torch.no_grad():
                inputs = clip_processor(images=image, return_tensors="pt", padding=True).to(DEVICE)
                image_features = clip_model.get_image_features(**inputs)

            return image_features.cpu().numpy().flatten().tolist()

        except Exception as e:
            # Best effort: one bad image must not stop the run, but the
            # reason is recorded instead of being discarded.
            if errors is not None:
                errors.append((image_url, f"{type(e).__name__}: {e}"))
            return None

    # --- Generate Embeddings (with a progress bar) ---
    print(f"Generating image embeddings for {df_sample.shape[0]} items...")

    image_errors = []
    image_embeddings = []
    for url in tqdm(df_sample['first_image_url']):
        embedding = get_image_embedding(url, errors=image_errors)
        image_embeddings.append(embedding)

    df_sample['image_embedding'] = image_embeddings

    # --- Clean up failed image downloads ---
    failed_count = df_sample['image_embedding'].isna().sum()
    if failed_count > 0:
        print(f"Warning: Failed to process {failed_count} images. They will be dropped.")
        # Show a few reasons so a systematic failure is visible immediately.
        for failed_url, reason in image_errors[:5]:
            print(f"  {failed_url} -> {reason}")
        if len(image_errors) > 5:
            print(f"  ... and {len(image_errors) - 5} more")
        # .copy() so later cells can add columns to an independent frame.
        df_sample = df_sample.dropna(subset=['image_embedding']).copy()

    print(f"Image embeddings generated. New sample size: {df_sample.shape[0]}")
    if df_sample.empty:
        print("ERROR: No image could be embedded; later cells have no data to work with.")
    else:
        print(f"Vector dimension: {len(df_sample['image_embedding'].iloc[0])}")

Token found. Initializing CV embedding model (CLIP)...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Generating image embeddings for 210 items...


  0%|          | 0/210 [00:00<?, ?it/s]

Image embeddings generated. New sample size: 3
Vector dimension: 512


In [None]:
# --- 4. Create Hybrid Embeddings ---

# Reasoning: By concatenating text and image vectors, we create a
# single "hybrid" vector that represents the product using *both* its
# visual and textual features. A search query can then be vectorized
# and compared against this rich vector for superior recommendations.

# Text embedding dim: 384
# Image embedding dim: 512
# Hybrid embedding dim: 384 + 512 = 896

def create_hybrid_embedding(row, normalize=False):
    """Concatenate a row's text and image embeddings into one vector.

    Args:
        row: Mapping or pandas Series with 'text_embedding' and
            'image_embedding' entries, each a 1-D sequence of numbers.
        normalize: When True, each part is scaled to unit L2 norm before
            concatenation so that neither part dominates a cosine
            comparison purely because of its magnitude. A part with zero
            norm is left as is. Defaults to False, which returns the plain
            concatenation.

    Returns:
        The concatenated vector as a Python list (text part first).
    """
    parts = [np.array(row['text_embedding']), np.array(row['image_embedding'])]

    if normalize:
        scaled_parts = []
        for vec in parts:
            norm = np.linalg.norm(vec)
            # Guard against division by zero for an all-zero vector.
            scaled_parts.append(vec / norm if norm > 0 else vec)
        parts = scaled_parts

    return np.concatenate(parts).tolist()

print("Creating hybrid embeddings...")
# Build one hybrid vector per row, then attach the result as a new column.
hybrid_vectors = df_sample.apply(create_hybrid_embedding, axis=1)
df_sample['hybrid_embedding'] = hybrid_vectors

# The dimension is read from the first row's vector.
first_hybrid_vector = df_sample['hybrid_embedding'].iloc[0]
HYBRID_VECTOR_DIMENSION = len(first_hybrid_vector)
print(f"Hybrid embeddings created with dimension: {HYBRID_VECTOR_DIMENSION}")

Creating hybrid embeddings...
Hybrid embeddings created with dimension: 896


In [None]:
# --- 5. Store Embeddings in Vector DB (Pinecone) ---

# Reasoning: The hybrid vectors are stored in a Pinecone index together
# with product metadata, so a similarity query can return the matching
# product details directly.

if not PINECONE_API_KEY:
    print("Skipping Pinecone: API key not found.")
else:
    import time  # only needed to poll for index readiness

    print("Initializing Pinecone...")
    pc = Pinecone(api_key=PINECONE_API_KEY)

    INDEX_NAME = "product-recommendation"

    # 1. Create the index (if it doesn't exist)
    if INDEX_NAME not in pc.list_indexes().names():
        print(f"Creating new serverless index: {INDEX_NAME}")
        pc.create_index(
            name=INDEX_NAME,
            dimension=HYBRID_VECTOR_DIMENSION,
            metric="cosine",
            spec=ServerlessSpec(
                cloud="aws",
                region="us-east-1"
            )
        )
        # Poll until the new index reports ready before connecting to it,
        # as recommended in the Pinecone documentation.
        while not pc.describe_index(INDEX_NAME).status["ready"]:
            time.sleep(1)
        print("Index created.")
    else:
        print(f"Index '{INDEX_NAME}' already exists.")

    # 2. Connect to the index
    index = pc.Index(INDEX_NAME)
    print(index.describe_index_stats())

    # 3. Format data for upsert
    # Metadata (product details) is stored with each vector so that a match
    # can be returned with its title, price, etc.
    vectors_to_upsert = []
    skipped_ids = []
    for _, row in df_sample.iterrows():
        # A single unparsable price skips that item instead of aborting
        # the whole upload.
        try:
            price = float(str(row['price']).replace('$', '').replace(',', '').strip())
        except ValueError:
            skipped_ids.append(str(row['uniq_id']))
            continue

        # A missing brand (absent column or NaN) becomes 'Unknown' rather
        # than the literal string 'nan'.
        brand = row.get('brand', 'Unknown')
        if pd.isna(brand):
            brand = 'Unknown'

        metadata = {
            'title': str(row['title']),
            'price': price,
            'brand': str(brand),
            'category': str(row['primary_category']),
            'image_url': str(row['first_image_url']),
            'text': str(row['text_to_embed'])
        }

        vectors_to_upsert.append({
            'id': str(row['uniq_id']),
            'values': row['hybrid_embedding'],
            'metadata': metadata
        })

    if skipped_ids:
        print(f"Warning: skipped {len(skipped_ids)} items with an unparsable price: {skipped_ids[:5]}")

    # 4. Upsert data in batches
    BATCH_SIZE = 100
    print(f"Upserting {len(vectors_to_upsert)} vectors in batches of {BATCH_SIZE}...")

    for i in tqdm(range(0, len(vectors_to_upsert), BATCH_SIZE)):
        batch = vectors_to_upsert[i : i + BATCH_SIZE]
        index.upsert(vectors=batch)

    print("--- Pinecone Upsert Complete ---")
    # NOTE(review): the counts printed here may lag shortly after an upsert.
    print(index.describe_index_stats())

Initializing Pinecone...
Index 'product-recommendation' already exists.
{'dimension': 896,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 3}},
 'total_vector_count': 3,
 'vector_type': 'dense'}
Upserting 3 vectors in batches of 100...


  0%|          | 0/1 [00:00<?, ?it/s]

--- Pinecone Upsert Complete ---
{'dimension': 896,
 'index_fullness': 0.0,
 'metric': 'cosine',
 'namespaces': {'': {'vector_count': 3}},
 'total_vector_count': 3,
 'vector_type': 'dense'}


In [None]:
!pip install -U langchain-google-genai




In [None]:
import os
from getpass import getpass

# SECURITY: the Google API key must never be written into the notebook
# source. The key that used to be hardcoded in this cell was exposed to
# anyone with access to the file and should be revoked and regenerated.
if not os.environ.get("GOOGLE_API_KEY"):
    try:
        # Only importable on Google Colab; elsewhere this raises ImportError.
        from google.colab import userdata
        _google_key = userdata.get("GOOGLE_API_KEY")
    except Exception:
        # Not on Colab, or the secret is missing / notebook access not granted.
        _google_key = None

    if not _google_key:
        try:
            _google_key = getpass("Enter GOOGLE_API_KEY: ")
        except Exception:
            # No interactive stdin available (e.g. headless execution).
            _google_key = ""

    if _google_key:
        os.environ["GOOGLE_API_KEY"] = _google_key

In [None]:
!pip install -U langchain langchain-core langchain-google-genai



In [None]:
# --- 6. GenAI: Product Description Generator (Google Gemini) ---

# Import the required libraries
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain_core.prompts import PromptTemplate
import os
import pandas as pd

# --- Fall back to Colab Secrets when the key is not already set ---
if not os.environ.get('GOOGLE_API_KEY'):
    try:
        from google.colab import userdata
        _colab_key = userdata.get('GOOGLE_API_KEY')
        # Only assign a real value; os.environ rejects None.
        if _colab_key:
            os.environ['GOOGLE_API_KEY'] = _colab_key
    except ImportError:
        # Not running on Colab; the check below reports the missing key.
        pass
    except Exception as e:
        print(f"Could not read GOOGLE_API_KEY from Colab Secrets: {e}")

if not os.environ.get("GOOGLE_API_KEY"):
    print("="*50)
    print("ERROR: GOOGLE_API_KEY not found in environment.")
    print("Please add your key to Colab Secrets (key icon) and restart.")
    print("="*50)
else:
    print("Google API Key found! Initializing GenAI (Gemini) via LangChain...")

    try:
        # 1. Create the chat model ("gemini-2.5-flash")
        llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0.7)

        # 2. Create a Prompt Template
        template = """
        You are a creative marketing assistant. Your task is to write a short,
        engaging, and creative product description for a furniture website.
        Do NOT just list the features. Make it sound appealing.

        Product Details:
        - Title: {title}
        - Category: {category}
        - Material: {material}

        Your Creative Description:
        """
        prompt = PromptTemplate.from_template(template)

        # 3. Create the LangChain chain (prompt piped into the model)
        llm_chain = prompt | llm

        # --- Test the GenAI Pipeline ---
        print("\n--- Testing GenAI Pipeline ---")

        # The first row of the sample is used as the test product.
        test_product = df_sample.iloc[0]

        # A missing material is replaced by a generic phrase in the prompt.
        material = test_product.get('material')
        if pd.isna(material):
            material_for_prompt = "various high-quality materials"
        else:
            material_for_prompt = str(material)

        test_input = {
            'title': test_product['title'],
            'category': test_product['primary_category'],
            'material': material_for_prompt
        }

        print(f"Input for GenAI:\n{test_input}")

        # 4. Run the chain; the generated text is in the response's .content
        response = llm_chain.invoke(test_input)
        generated_description = response.content

        print("\nGenerated Description:")
        print(generated_description)

    except Exception as e:
        print(f"\n--- ERROR ---")
        print("The API call failed. This might be an issue with your API key or permissions.")
        print(f"Details: {e}")

Google API Key found! Initializing GenAI (Gemini) via LangChain...

--- Testing GenAI Pipeline ---
Input for GenAI:
{'title': '#4203 Adjustable 1/4" Threaded Non-Skid Leveling Glides Black Pad 4-Pack', 'category': 'Chairs', 'material': 'various high-quality materials'}

Generated Description:
Give your beloved chairs the unwavering support they deserve. These ingenious glides are the secret to perfectly stable seating and pristine floors. No more wobbles, no more scratches – just a smooth, silent glide and a rock-solid foundation for every moment. Elevate your comfort, protect your home.


In [None]:
# --- 7. CV: Zero-Shot Classification and Evaluation ---

# Reasoning: Instead of training a classifier, a zero-shot image
# classification pipeline based on CLIP scores each image against the
# candidate category names and picks the best-scoring label. The
# predictions are then compared with the products' real categories.

print("\n--- Starting CV Model Evaluation (Zero-Shot) ---")

# --- Read the Hugging Face token from the environment ---
HF_TOKEN = os.environ.get("HUGGINGFACEHUB_API_TOKEN")

if not HF_TOKEN:
    print("="*50)
    print("ERROR: HUGGINGFACEHUB_API_TOKEN not found.")
    print("This cell needs the token to download the CLIP model.")
    print("Please add it to Colab Secrets and restart the runtime.")
    print("="*50)
else:
    print("Token found. Initializing zero-shot classification pipeline...")

    # 1. Load the zero-shot classification pipeline, passing the token
    classifier = pipeline(
        "zero-shot-image-classification",
        model="openai/clip-vit-base-patch32",
        device=DEVICE,
        token=HF_TOKEN
    )

    # 2. Candidate labels are the categories present in the sample
    candidate_labels = df_sample['primary_category'].unique().tolist()
    print(f"Will classify images into {len(candidate_labels)} categories.")

    # 3. Run evaluation on a small sample (e.g., 50 items)
    # A fixed seed makes the evaluated subset, and so the report, repeatable.
    eval_sample_size = 50
    eval_df = df_sample.sample(
        n=min(eval_sample_size, df_sample.shape[0]),
        random_state=42
    )

    y_true = []
    y_pred = []
    eval_errors = []

    print(f"Running zero-shot classification on {eval_df.shape[0]} images...")
    for _, row in tqdm(eval_df.iterrows(), total=eval_df.shape[0]):
        try:
            # Get image from URL; fail on HTTP errors rather than trying
            # to decode an error page as an image.
            response = requests.get(str(row['first_image_url']).strip(), timeout=5)
            response.raise_for_status()
            image = Image.open(BytesIO(response.content)).convert("RGB")

            # Classify
            preds = classifier(image, candidate_labels=candidate_labels)

            # Store true label and predicted label (first entry of the result)
            y_true.append(row['primary_category'])
            y_pred.append(preds[0]['label'])

        except Exception as e:
            # Best effort: skip the image but remember why it was skipped.
            eval_errors.append((row['first_image_url'], f"{type(e).__name__}: {e}"))
            continue

    if eval_errors:
        print(f"Warning: {len(eval_errors)} images could not be classified and were skipped.")
        for failed_url, reason in eval_errors[:5]:
            print(f"  {failed_url} -> {reason}")

    # 4. Show Performance Evaluation
    # The report lists precision, recall, F1-score and accuracy.
    print("\n--- Model Performance Evaluation: CV (Zero-Shot) ---")

    if not y_true:
        print("No images were classified, so no report can be produced.")
    else:
        # Ensure we have a consistent set of labels for the report
        all_labels = sorted(list(set(y_true + y_pred)))

        report = classification_report(y_true, y_pred, labels=all_labels, zero_division=0)
        print(report)

    print("\n--- Model Training Notebook Complete ---")
    print("All models are built, and vectors are in Pinecone (if enabled).")
    print("Next step: Build the FastAPI backend.")


--- Starting CV Model Evaluation (Zero-Shot) ---
Token found. Initializing zero-shot classification pipeline...


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda


Will classify images into 3 categories.
Running zero-shot classification on 3 images...


  0%|          | 0/3 [00:00<?, ?it/s]


--- Model Performance Evaluation: CV (Zero-Shot) ---
                          precision    recall  f1-score   support

                  Chairs       0.00      0.00      0.00         1
Free Standing Shoe Racks       0.50      1.00      0.67         1
    Wall-Mounted Mirrors       1.00      1.00      1.00         1

                accuracy                           0.67         3
               macro avg       0.50      0.67      0.56         3
            weighted avg       0.50      0.67      0.56         3


--- Model Training Notebook Complete ---
All models are built, and vectors are in Pinecone (if enabled).
Next step: Build the FastAPI backend.
