In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import faiss
import os
import pickle
import torch
import openai
from tqdm import tqdm
from transformers import BertForSequenceClassification, BertTokenizer
from sentence_transformers import SentenceTransformer

# Set up OpenAI API key.
# The key is read from the environment so that no credential is ever stored in
# (or committed with) the notebook source.
openai.api_key = os.environ.get('OPENAI_API_KEY')
if not openai.api_key:
    # Fail fast here instead of with an authentication error on the first API call.
    raise RuntimeError(
        "OPENAI_API_KEY is not set. Export it in the environment before starting the kernel."
    )

# Set up device (use GPU if available)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


In [2]:
# Load the tokenizer and the BERT sequence classifier from 'sentiment_model'
# (presumably a local checkpoint directory -- confirm), placing the classifier
# on the selected device as part of the same expression.
sentiment_tokenizer = BertTokenizer.from_pretrained('sentiment_model')
sentiment_model = BertForSequenceClassification.from_pretrained('sentiment_model').to(device)

# Switch to evaluation mode for inference. eval() returns the module itself,
# so as the last expression it also makes the cell display the architecture.
sentiment_model.eval()


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

In [3]:
# Initialize the embedding model for RAG.
# The same model encodes the product texts when the index is built and the
# query title at retrieval time, so if a different model is chosen here the
# cached embeddings/index files on disk must be deleted and rebuilt.
embedding_model = SentenceTransformer('all-mpnet-base-v2')  # Or your chosen embedding model

In [None]:
# Paths of the cached retrieval artifacts (relative to the working directory).
embeddings_file = 'product_embeddings.npy'
metadata_file = 'product_metadata.pkl'
index_file = 'faiss_index.index'

if os.path.exists(embeddings_file) and os.path.exists(metadata_file) and os.path.exists(index_file):
    # Load embeddings, metadata, and index
    embeddings = np.ascontiguousarray(np.load(embeddings_file), dtype='float32')
    # A cache file written before normalisation was applied holds raw vectors.
    # Normalising is idempotent, so this makes `embeddings` unit-length no
    # matter which branch of this cell produced it.
    faiss.normalize_L2(embeddings)
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # metadata file that this notebook itself wrote.
    with open(metadata_file, 'rb') as f:
        metadata = pickle.load(f)
    index = faiss.read_index(index_file)
    # Search results are positions into `metadata`; a mismatch means the cache
    # files come from different runs and lookups would be wrong or out of range.
    if index.ntotal != len(metadata):
        raise RuntimeError(
            f"Cached index has {index.ntotal} vectors but metadata has {len(metadata)} entries; "
            f"delete {embeddings_file}, {metadata_file} and {index_file} to rebuild."
        )
    print("Loaded embeddings and index from disk.")
else:
    # Load the dataset
    df = pd.read_csv('train.csv')

    # Clean the dataset: keep only the text columns and drop incomplete rows.
    df_cleaned = df[['TITLE', 'BULLET_POINTS', 'DESCRIPTION']].dropna().reset_index(drop=True)

    # Display the number of products
    print(f"Number of products in the dataset: {len(df_cleaned)}")
    if df_cleaned.empty:
        raise ValueError("train.csv has no rows with TITLE, BULLET_POINTS and DESCRIPTION all present.")

    # Materialise the rows once; both the texts and the metadata are derived
    # from these records (avoids two slow iterrows() passes).
    records = df_cleaned.to_dict('records')

    # Compute embeddings
    print("Computing embeddings...")
    texts = [
        f"{record['TITLE']} {record['BULLET_POINTS']} {record['DESCRIPTION']}"
        for record in tqdm(records, total=len(records))
    ]
    embeddings = embedding_model.encode(texts, batch_size=64, show_progress_bar=True)
    embeddings = np.ascontiguousarray(embeddings, dtype='float32')
    # Normalise BEFORE saving so the cached array matches the vectors that are
    # added to the index (normalize_L2 works in place).
    faiss.normalize_L2(embeddings)

    # Store metadata; list position i corresponds to index vector i.
    metadata = [
        {
            'title': record['TITLE'],
            'bullet_points': record['BULLET_POINTS'],
            'description': record['DESCRIPTION']
        }
        for record in records
    ]

    # Save embeddings and metadata
    np.save(embeddings_file, embeddings)
    with open(metadata_file, 'wb') as f:
        pickle.dump(metadata, f)

    # Build FAISS index (HNSW graph with 32 neighbours per node).
    print("Building FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexHNSWFlat(dimension, 32)
    index.add(embeddings)
    faiss.write_index(index, index_file)
    print("FAISS index built and saved to disk.")


Loaded embeddings and index from disk.


In [5]:
# Text normalisation helper
def preprocess_text(text):
    """Return `text` lower-cased with leading and trailing whitespace removed."""
    lowered = text.lower()
    return lowered.strip()

# Sentiment classification helper
def analyze_sentiment(text):
    """Classify `text` with the loaded BERT model and return its sentiment label.

    The text is tokenized (truncated to 512 tokens), moved to `device`, and run
    through `sentiment_model` without gradient tracking. The arg-max class id is
    translated to one of "negative", "neutral" or "positive".
    """
    # NOTE(review): assumes the checkpoint's class ids are ordered
    # 0=negative, 1=neutral, 2=positive -- confirm against the training config.
    id_to_sentiment = {0: "negative", 1: "neutral", 2: "positive"}

    encoded = sentiment_tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    encoded = {name: tensor.to(device) for name, tensor in encoded.items()}

    with torch.no_grad():
        logits = sentiment_model(**encoded).logits

    class_id = torch.argmax(logits, dim=-1).item()
    return id_to_sentiment[class_id]

# Key-information extraction via the chat model
def extract_key_info(review):
    """Ask the chat model for the product title and key features of `review`.

    Returns the model's reply as a stripped string. The request ends with the
    "1. Product Title:" / "2. Key Features:" template that the caller's parsing
    looks for in the reply.
    """
    system_message = {
        "role": "system",
        "content": "You are a helpful assistant that extracts key information from product reviews."
    }
    user_content = (
        f"Please extract the product title and key features mentioned in the following review:\n\n"
        f"Review: \"{review}\"\n\n"
        f"1. Product Title:\n2. Key Features:"
    )
    user_message = {"role": "user", "content": user_content}

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[system_message, user_message]
    )
    reply = response['choices'][0]['message']['content']
    return reply.strip()

# Function to retrieve product information using RAG
def retrieve_product_info(query_title, index, metadata, top_k=3):
    """Return up to `top_k` metadata entries that best match `query_title`.

    Retrieval runs in two stages:
      1. The normalised query is embedded and `top_k * 5` nearest neighbours
         are fetched from the FAISS `index`.
      2. Those candidates are re-ranked by how many whitespace-separated tokens
         their title shares with the query; ties keep the FAISS order.

    Args:
        query_title: Free-text product title to search for.
        index: FAISS index whose vector positions line up with `metadata`.
        metadata: Sequence of dicts with at least a 'title' key.
        top_k: Maximum number of results to return.

    Returns:
        A list of metadata dicts, best match first. The list is empty when the
        query is blank or `top_k` is not positive, and may be shorter than
        `top_k` when the index returns fewer valid neighbours.
    """
    query_title_normalized = preprocess_text(query_title)
    # A blank query carries no signal; its embedding would match arbitrary products.
    if not query_title_normalized or top_k <= 0:
        return []

    query_embedding = embedding_model.encode([query_title_normalized], convert_to_numpy=True)
    # normalize_L2 works in place and needs a contiguous float32 array.
    query_embedding = np.ascontiguousarray(query_embedding, dtype='float32')
    faiss.normalize_L2(query_embedding)
    _, indices = index.search(query_embedding, top_k * 5)

    # FAISS pads missing neighbours with -1; indexing metadata with -1 would
    # silently return the LAST product, so drop anything out of range.
    candidates = [metadata[idx] for idx in indices[0] if 0 <= idx < len(metadata)]

    query_tokens = set(query_title_normalized.split())
    scored_results = [
        (len(query_tokens & set(preprocess_text(candidate['title']).split())), candidate)
        for candidate in candidates
    ]
    # Sort on the score only: the sort is stable, so equal scores keep FAISS order.
    scored_results.sort(key=lambda scored: scored[0], reverse=True)
    return [candidate for _, candidate in scored_results[:top_k]]

# Prompt builder: picks an advertising brief based on the review's sentiment
def create_ad_prompt(sentiment, features):
    """Build the advertisement brief for the given `sentiment` and `features`.

    "positive", "negative" and "neutral" each get a dedicated brief; any other
    sentiment value falls back to a generic one. `features` is interpolated
    into the text as-is.
    """
    if sentiment == "positive":
        return (
            f"Write an enthusiastic advertisement that highlights these exceptional features: {features}. "
            f"Emphasize how these features make the product unique and beloved by customers."
        )

    if sentiment == "negative":
        return (
            f"Customers have expressed concerns about: {features}. "
            f"Create an advertisement for an upgraded product that addresses these issues. "
            f"Highlight the improvements and reassure customers by emphasizing the new benefits."
        )

    if sentiment == "neutral":
        return (
            f"Based on customer feedback regarding: {features}, "
            f"introduce a new and improved product that addresses these points. "
            f"Develop an informative advertisement that highlights the product's strengths and the enhancements made. "
            f"Provide compelling information to educate potential customers about the product's benefits."
        )

    # Unknown sentiment label: generic brief.
    return f"Generate an advertisement highlighting the key features: {features}."

# System message sent with every advertisement-generation request.
system_prompt = " ".join([
    "You are an expert marketing copywriter with a talent for crafting compelling and persuasive advertisements.",
    "Your ads should be engaging, informative, and encourage potential customers to consider purchasing the product.",
])

# Final advertisement generation via the chat model
def generate_ad(prompt, key_features, product_info):
    """Generate the advertisement text for `prompt` and the retrieved products.

    Each entry of `product_info` (dicts with 'title', 'bullet_points' and
    'description') is rendered as a text block and appended to `prompt` under a
    "Product Information:" heading. The request uses `system_prompt` as the
    system message, at most 350 completion tokens and temperature 0.7.

    `key_features` is accepted but not read by this function.

    Returns the model's reply as a stripped string.
    """
    product_details = "".join(
        f"Title: {info['title']}\n"
        f"Bullet Points: {info['bullet_points']}\n"
        f"Description: {info['description']}\n\n"
        for info in product_info
    )
    user_content = (
        f"{prompt}\n\n"
        f"Product Information:\n{product_details}"
    )

    response = openai.ChatCompletion.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_content},
        ],
        max_tokens=350,
        temperature=0.7,
        n=1,
        stop=None
    )
    reply = response['choices'][0]['message']['content']
    return reply.strip()


In [6]:
import re

# Example reviews for testing (you can replace these with other reviews).
# Each string is one free-text customer review; the loop below runs the whole
# pipeline once per entry, so every review added here costs two chat-model calls.
user_reviews = [
    "This tablet is incredibly slow, and the battery drains faster than I can charge it. It’s almost unusable for anything productive.",
    "I bought these sunglasses thinking they’d be durable, but the frame broke after just one drop. Not impressed with the quality.",
    "This hair dryer cuts my drying time in half, and my hair feels smoother. The heat settings are just right, and it’s lightweight too!",
    "The electric toothbrush is too aggressive on my gums, even on the gentlest setting. My gums started bleeding, so I stopped using it.",
    "The fitness tracker’s step count is way off. It’s stylish, but the tracking features are inaccurate, which defeats the purpose.",
    "These Nike shoes are super comfortable and lightweight, ideal for long runs.",
    "This stroller is impossible to maneuver. It’s heavy, clunky, and folding it up is a nightmare. I regret this purchase.",
    "The yoga mat has a horrible chemical smell that hasn’t gone away after several uses. I can’t use it without feeling nauseated.",
    "The headphones started crackling within a week. Definitely not worth the price – really disappointed with the sound quality.",
    "The mini projector is fun to use, but the image quality is quite poor. It’s okay for occasional use, but don’t expect cinema quality."
]

# Iterate over each review and run the process:
# sentiment -> key-info extraction -> product retrieval -> prompt -> advertisement.
for i, user_review in enumerate(user_reviews, start=1):
    print(f"Test Case {i}:")

    # Step 1: Sentiment Analysis
    sentiment = analyze_sentiment(user_review)
    print(f"\nPredicted Sentiment: {sentiment}")

    # Step 2: Extract Key Information
    extracted_info = extract_key_info(user_review)
    print(f"\nExtracted Information:\n{extracted_info}")

    # Parse the extracted information to get the product title and key features.
    # The title capture stops at the "2. Key Features:" marker (or end of text),
    # so a blank title can no longer swallow the features line as its value.
    title_match = re.search(
        r"1\. Product Title:(.*?)(?=2\. Key Features:|\Z)", extracted_info, re.DOTALL
    )
    features_match = re.search(r"2\. Key Features:\s*(.*)", extracted_info, re.DOTALL)

    # Keep only the first non-empty line of the title section.
    title_lines = title_match.group(1).strip().splitlines() if title_match else []
    product_title = title_lines[0].strip() if title_lines else ""
    key_features = features_match.group(1).strip() if features_match else ""
    print(f"\nProduct Title: {product_title}")
    print(f"Key Features: {key_features}")

    # Step 3: Retrieve Product Information
    # If no title could be parsed, search with the review text itself rather
    # than with an empty query, which would match unrelated products.
    retrieval_query = product_title or user_review
    retrieved_info = retrieve_product_info(retrieval_query, index, metadata, top_k=3)
    print("\nRetrieved Product Information:")
    for j, info in enumerate(retrieved_info):
        print(f"\nResult {j+1}:")
        print(f"Title: {info['title']}")
        print(f"Bullet Points: {info['bullet_points']}")
        print(f"Description: {info['description']}")

    # Step 4: Create Initial Prompt
    initial_prompt = create_ad_prompt(sentiment, key_features)
    print(f"\nInitial Prompt:\n{initial_prompt}")

    # Step 5: Generate Final Advertisement
    final_ad = generate_ad(initial_prompt, key_features, retrieved_info)
    print("\nGenerated Advertisement:\n")
    print(final_ad)
    print("==================================================\n")


Test Case 1:


  attn_output = torch.nn.functional.scaled_dot_product_attention(



Predicted Sentiment: negative

Extracted Information:
1. Product Title: Tablet
2. Key Features: Slow performance, fast battery drain, unusable for productivity.

Product Title: Tablet
Key Features: Slow performance, fast battery drain, unusable for productivity.

Retrieved Product Information:

Result 1:
Title: TS X1 Tablet i5 8G 256G W10P
Bullet Points: Lenovo ThinkPad X1 Tablet 3rd Gen 20KJ0019US 13 Touchscreen LCD 2 in 1 Notebook - Intel Core i5 (8th Gen) i5-8350U Quad-core (4 Core) 1.70 GHz - 8 GB DDR4 SDRAM - 256 GB SSD - Windows 10 Pro 64-bit (English) - 3000 x 2000 - In-plane Switching (IPS) Technology - Black - Intel UHD Graphics 620 DDR4 SDRAM - Bluetooth - English (US) Keyboard - Near Field Communication - IEEE 802.11a/b/g/n/ac - Ethernet - Network (RJ-45) - USB Type-C
Description: Tablet. Laptop. Canvas. You decide.Redesigned from the ground up, the ThinkPad X1 Tablet gives you power, productivity, and portability in an ultrapremium detachable 2-in-1 that means business. Fr