In [None]:
# Mount Google Drive at /content/drive so the project folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Project root on the mounted Drive. After the chdir, the relative paths used
# later in this notebook (the CSV, the pickle cache, the hash file) resolve here.
projectPath = '/content/drive/MyDrive/RAG_Chatbot_System'
os.chdir(projectPath)

In [None]:
# Required packages
!pip install -U sentence-transformers chromadb

In [None]:
!pip install transformers torch huggingface-hub -qqq

In [None]:
!pip install -q accelerate bitsandbytes

In [None]:
# Import Required packages
# import config  # Token Key
import pandas as pd
import numpy as np
import chromadb
import re
import logging
import torch
import textwrap
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, PreTrainedTokenizerBase
from typing import Dict, Any, List, Union, Optional
from huggingface_hub import login
import hashlib
import pickle

# Authenticate with the Hugging Face Hub.
# The token lives in a local `config` module exposing `huggingFaceToken`. The
# top-of-cell `import config` is commented out, so the module is imported here;
# otherwise a fresh kernel fails with NameError on `config`.
try:
  import config  # Token Key
  _hfToken = config.huggingFaceToken
except (ImportError, AttributeError):
  # No usable config module: fall back to the HF_TOKEN environment variable.
  # If that is unset too, `login(token=None)` asks for the token interactively.
  import os
  _hfToken = os.environ.get("HF_TOKEN")

login(token = _hfToken) # Your Hugging Face Login Token

# 4-bit NF4 quantization with double quantization; compute is done in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16
)

# Notebook-wide logger used by the helper functions below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

In [None]:
import spacy
from spacy.matcher import PhraseMatcher

In [None]:
# Load the CSV file
def load_csv(fileName: str, encoding: Optional[str] = 'utf-8') -> pd.DataFrame:
  """Read a CSV file from disk into a pandas DataFrame.

  Args:
      fileName (str): Path to the CSV file to read.
      encoding (Optional[str]): Encoding handed to ``pd.read_csv``.
          Defaults to 'utf-8'.

  Returns:
      pd.DataFrame: The parsed contents of the file.

  Raises:
      FileNotFoundError: If ``fileName`` does not exist.
      pd.errors.ParserError: If the CSV content cannot be parsed.
      UnicodeDecodeError: If the bytes cannot be decoded with ``encoding``.
      Exception: Any other error raised while reading.

  Every failure is logged through the module logger and then re-raised
  unchanged.
  """
  try:
    frame = pd.read_csv(fileName, encoding=encoding)
  except Exception as e:
    # Pick the message by exception type, most specific first, then re-raise.
    if isinstance(e, FileNotFoundError):
      message = f"Error: File '{fileName}' not found. Please ensure the file exists in the working directory."
    elif isinstance(e, pd.errors.ParserError):
      message = f"Error: Failed to parse CSV file '{fileName}'. The file may be malformed: {e}"
    elif isinstance(e, UnicodeDecodeError):
      message = f"Error: Unable to decode file '{fileName}' with encoding '{encoding}': {e}"
    else:
      message = f"Error: An unexpected error occurred while loading '{fileName}': {e}"
    logger.error(message)
    raise

  # Only reached on success: the handler above always re-raises.
  logger.info(f"Successfully loaded the file: {fileName}")
  return frame

In [None]:
def generate_embeddings(texts:
                        Union[str, List[str]],
                        model: SentenceTransformer
                        ) -> List[List[float]]:
    """Encode one text or a list of texts into embedding vectors.

    Args:
        texts (Union[str, List[str]]): A single string or a list of strings.
            A falsy value (empty string or empty list) yields an empty result
            without calling the model.
        model (SentenceTransformer): Model whose ``encode`` method produces
            the embeddings.

    Returns:
        List[List[float]]: One embedding per input text, converted from the
        NumPy result with ``tolist()`` so it can be handed to ChromaDB.

    Raises:
        Exception: Whatever ``encode`` or the conversion raises; the error is
        printed and re-raised.
    """
    # Nothing to encode.
    if not texts:
        return []

    # A bare string is treated as a one-element batch.
    batch = [texts] if isinstance(texts, str) else texts

    try:
        vectors: np.ndarray = model.encode(
            batch,
            show_progress_bar=True,
            convert_to_numpy=True,
        )
        return vectors.tolist()
    except Exception as e:
        print(f"Error generating embeddings: {e}")
        raise

In [None]:
def generate_response(query: str,
                      tokenizer: AutoTokenizer,
                      model: AutoModelForCausalLM,
                      context: str = "",
                      productMatched: Union[List[str], str] = "",
                      max_new_tokens: int = 200,
                      temperature: float = 0.3,
                      top_p: float = 0.8
                      ) -> str:
  """ Generate a response using the language model based on the query and context.

  One of three prompts is built:
    * ``productMatched`` is truthy: the prompt names the matched product(s) and
      includes ``context`` with availability instructions.
    * otherwise, ``context`` is non-blank: the prompt asks the model to suggest
      matching catalog products.
    * otherwise: a catalog-free prompt for greetings, unavailable items and
      off-topic questions.

  Args:
      query: The user's message.
      tokenizer: Tokenizer used to encode the prompt and decode the output.
      model: Causal language model used for sampling.
      context: Formatted catalog entries to ground the answer (may be empty).
      productMatched: Product name(s) found in the query; any falsy value
          means "no specific product".
      max_new_tokens: Upper bound on the number of generated tokens.
      temperature: Sampling temperature.
      top_p: Nucleus sampling threshold.

  Returns:
      The generated answer with surrounding whitespace removed.
  """

  if productMatched:
    prompt = (
      f"You are a helpful and concise shopping assistant for an online clothing store.\n"
      f"The user asked: '{query}'\n"
      f"The user is asking about this specific product: '{productMatched}'\n\n"
      f"Here are relevant products from the catalog (including availability status):\n{context}\n\n"
      f"Reply briefly, naturally, and in a friendly tone.\n"
      f"- If the user asks a yes/no question, respond with a short and polite yes or no.\n"
      f"- If the user asks for product details, answer based on the product and the user’s question.\n"
      f"- If the user asks about a product and its availability is anything other than 'in_stock' (e.g., 'out_of_stock', 'preorder', etc.), politely inform them it's not currently available and ask if the user would like to provide their email to be notified when it’s available again.\n"
      f"Keep the response friendly, accurate, and to the point.\n\n"
      f"Response:"
    )

  elif context.strip():
    prompt = (
        f"You are a helpful and concise shopping assistant for an online clothing store.\n"
        f"The user asked: '{query}'\n\n"
        f"Here are relevant products from the catalog:\n{context}\n\n"
        f"Reply briefly, naturally, and in a friendly tone.\n"
        f"- If the user asks about a clothing product in general (e.g., 'T-shirt'), but not a specific product name, suggest the most relevant matching product(s) from the catalog. "
        f"For example, if they ask for 'T-shirt', suggest something like 'Summer Men T-shirt' from the product list.\n"
        f"- If the user's question does not clearly relate to any product in the catalog, or is unclear or contains errors, politely respond with: "
        f"\"I'm sorry, I couldn't quite understand your request. Could you please rephrase it?\"\n"
        f"Keep your response short and helpful.\n\n"
        f"Response:"
    )

  else:
    prompt = (
        f"You are a helpful assistant for a clothing store.\n"
        f"The user said: {query}\n\n"
        f"Respond only based on the user's input, following these rules:\n"
        f"- If the user says 'Hi', 'Hello', or gives a greeting only, reply with a warm welcome and ask what type of clothing they're looking for. Do NOT mention products, availability, or ask for their email.\n"
        f"- If the user asks for a specific clothing item, politely apologize for not having it and ask for their email to notify them if it becomes available.\n"
        f"- If the user asks about something unrelated to clothing, kindly explain that your store only sells clothing and encourage them to ask about clothing instead.\n"
        f"- Do not mention product availability or email collection unless the user clearly requested a specific item.\n"
        f"- If the user input is unclear or doesn't match any condition, ask them politely to clarify.\n"
        f"- Always keep your tone friendly, natural, and your response brief.\n"
        f"- Remind the user they can type 'exit' to end the conversation.\n\n"
        f"Response:"
    )

  inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True, return_attention_mask=True)

  # Move every input tensor to the device the model lives on.
  inputs = {key: value.to(model.device) for key, value in inputs.items()}

  with torch.inference_mode():
    outputs = model.generate(
        input_ids=inputs["input_ids"],
        attention_mask=inputs["attention_mask"],
        max_new_tokens=max_new_tokens,
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id,
        num_return_sequences=1
        )

  # A causal LM returns the prompt tokens followed by the new tokens. Decode
  # only the new tokens instead of searching the decoded text for "Response:":
  # that marker can also appear in the user's query or in the catalog context,
  # and a failed search would silently cut off the first characters.
  prompt_length = inputs["input_ids"].shape[1]
  generated_ids = outputs[0][prompt_length:]
  return tokenizer.decode(generated_ids, skip_special_tokens=True).strip()

In [None]:
def create_document(row: pd.Series) -> str:
  """Create the text document that is embedded and stored in ChromaDB.

  The descriptive columns of a product row are joined with single spaces, in a
  fixed order, followed by one trailing space.

  Args:
      row: A product row containing the columns 'product_name', 'description',
          'color', 'material', 'category', 'brand', 'audience' and 'tags'.

  Returns:
      The concatenated text. Missing (NaN/None) cells are skipped and
      non-string cells are converted with ``str`` so a partially filled row no
      longer raises ``TypeError``.

  Raises:
      KeyError: If one of the expected columns is absent from ``row``.
  """
  text_columns = (
      'product_name', 'description', 'color', 'material',
      'category', 'brand', 'audience', 'tags',
  )
  parts = [str(row[column]) for column in text_columns if pd.notna(row[column])]
  # The trailing space keeps fully populated rows identical to earlier output.
  return ' '.join(parts) + ' '

# Create metadata for ChromaDB
def create_metadata(row: pd.Series) -> Dict[str, Any]:
    """Create JSON-serializable metadata dictionary.

    Each metadata key is read from one column of ``row``. A missing (NaN/None)
    cell is replaced by that key's default, and NumPy scalars are converted to
    native Python values so the result can be passed to ``json.dumps``.

    Args:
        row: A product row containing the columns 'product_id', 'size',
            'image_url', 'price', 'discount_price', 'discount_percent',
            'stock_quantity' and 'created_at'.

    Returns:
        Dict with keys 'product_id', 'sizes', 'image_url', 'price',
        'discount_price', 'discount_percent', 'stock_quantity', 'created_at'.
        Note that the 'sizes' key is filled from the 'size' column.

    Raises:
        KeyError: If one of the expected columns is absent from ``row``.
    """
    # (metadata key, source column, default used when the cell is missing)
    # NOTE(review): some defaults are None - confirm the installed ChromaDB
    # version accepts None as a metadata value.
    field_specs = (
        ('product_id', 'product_id', None),
        ('sizes', 'size', ''),
        ('image_url', 'image_url', ''),
        ('price', 'price', None),
        ('discount_price', 'discount_price', None),
        ('discount_percent', 'discount_percent', 0.0),
        ('stock_quantity', 'stock_quantity', 0),
        ('created_at', 'created_at', None),
    )

    metadata: Dict[str, Any] = {}
    for key, column, default in field_specs:
        value = row[column]
        if not pd.notna(value):
            value = default
        elif isinstance(value, np.generic):
            # e.g. np.int64 / np.float64 -> int / float
            value = value.item()
        metadata[key] = value
    return metadata

In [None]:
def create_and_populate_chromadb(df,
                                 db_path="/content/drive/MyDrive/RAG_Chatbot_System/chroma_db",
                                 collection_name="TNT_Store"):
    """Create (or reopen) a persistent ChromaDB collection and sync it with ``df``.

    Records are stored under the ids "0" .. str(len(df) - 1), i.e. the row
    position in ``df``.

    Args:
        df: DataFrame with the columns 'document', 'embeddings' and 'metadata'.
        db_path: Directory of the persistent ChromaDB store.
        collection_name: Name of the collection to create or reuse.

    Returns:
        The populated collection, or None if creating or populating it failed
        (the error is printed).
    """
    client = chromadb.PersistentClient(path=db_path)
    try:
        collection = client.get_or_create_collection(collection_name)
        print("Collection created successfully...")

        ids = [str(i) for i in range(len(df))]

        # The store is persistent, so it may hold records from an earlier run.
        # Remove ids that no longer correspond to a row of the current data.
        stale_ids = sorted(set(collection.get(include=[])["ids"]) - set(ids))
        if stale_ids:
            collection.delete(ids=stale_ids)

        # upsert (rather than add) so records whose ids already exist are
        # overwritten with the current documents/embeddings/metadata.
        if ids:
            collection.upsert(
                ids=ids,
                documents=df['document'].tolist(),
                embeddings=df['embeddings'].tolist(),
                metadatas=df['metadata'].tolist()
            )
        print("Data added to collection successfully...")
        return collection
    except Exception as e:
        print(f"Error creating or populating collection: {e}")
        return None

In [None]:
def format_context(results, df):
  """Render the products returned by a ChromaDB query as a text block.

  Each id in ``results['ids'][0]`` is converted to ``int`` and looked up as a
  row label in ``df``. When the label is not in ``df.index`` every field of
  that entry is rendered as 'N/A'.

  Args:
      results: Query result with 'ids' and 'distances', each holding one
          inner list (the hits for a single query).
      df: Product DataFrame providing the displayed columns.

  Returns:
      One "Details of product" section per hit, joined by "\\n---\\n".
      An empty string when there are no hits.
  """
  columns = (
      'product_id', 'product_name', 'description', 'color', 'size', 'material',
      'image_url', 'price', 'discount_price', 'discount_percent', 'category',
      'brand', 'stock_quantity', 'audience', 'tags', 'created_at',
      'availability_status',
  )
  sections = []

  # Distances are paired with the ids but not used in the rendered text.
  for raw_id, _distance in zip(results['ids'][0], results['distances'][0]):
    row_label = int(raw_id)
    if row_label in df.index:
      p = {column: df.loc[row_label, column] for column in columns}
    else:
      p = dict.fromkeys(columns, 'N/A')

    sections.append(
        f"Details of product:\n"
        f"ID: {p['product_id']} \n"
        f"Name: {p['product_name']} \n"
        f"Description: {p['description']} \n"
        f"Color: {p['color']} \n"
        f"Size: {p['size']} \n"
        f"Material: {p['material']} \n"
        f"Image: {p['image_url']} \n"
        f"Price: {p['price']} \n"
        f"Discounted Price: {p['discount_price']} ({p['discount_percent']}% off) \n"
        f"Category: {p['category']} \n"
        f"Brand: {p['brand']} \n"
        f"In Stock: {p['stock_quantity']} items\n"
        f"For: {p['audience']} \n"
        f"Tags: {p['tags']} \n"
        f"Added on: {p['created_at']} \n"
        f"Availability: {p['availability_status']} \n"
    )
  return "\n---\n".join(sections)

In [None]:
# Loaded spaCy pipelines, keyed by model name, so the model is read from disk
# once per kernel instead of once per query.
_SPACY_PIPELINES = {}


def _get_spacy_pipeline(name="en_core_web_sm"):
  """Return the spaCy pipeline ``name``, loading it on first use."""
  if name not in _SPACY_PIPELINES:
    _SPACY_PIPELINES[name] = spacy.load(name)
  return _SPACY_PIPELINES[name]


def dict_based_matcher(df, text):
  """Find catalog product names that occur in ``text`` (case-insensitive).

  The non-missing values of ``df['product_name']`` are used as phrase patterns
  and matched against the lower-cased tokens of ``text``.

  Args:
      df: DataFrame with a 'product_name' column.
      text: The user's message.

  Returns:
      A list with the matched spans as they appear in ``text``, or an empty
      string when nothing matches (callers rely on the falsy value).
  """
  if not text:
    return ''

  nlp = _get_spacy_pipeline()
  names = [str(name) for name in df['product_name'].dropna()]
  patterns = [nlp.make_doc(name) for name in names if name.strip()]
  # Nothing to match against; avoids registering an empty pattern list.
  if not patterns:
    return ''

  matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
  matcher.add("PRODUCT", patterns)

  # Matching on LOWER only needs tokens, so tokenize without running the
  # tagger/parser/NER components of the pipeline.
  doc = nlp.make_doc(text)
  found = [doc[start:end].text for _, start, end in matcher(doc)]
  return found if found else ''

In [None]:
def truncate_context(context: str, tokenizer: PreTrainedTokenizerBase, max_tokens: int = 1024) -> str:
  """Shorten ``context`` by encoding it with truncation and decoding it again.

  Args:
      context: Text to shorten.
      tokenizer: Tokenizer providing ``encode`` and ``decode``.
      max_tokens: Passed to ``encode`` as ``max_length``.

  Returns:
      The decoded text of the kept token ids, with special tokens skipped.
  """
  kept_ids = tokenizer.encode(context, max_length=max_tokens, truncation=True)
  shortened = tokenizer.decode(kept_ids, skip_special_tokens=True)
  return shortened

In [None]:
def hash_file(filepath: str, chunk_size: int = 1 << 20) -> str:
  """Generate SHA256 hash of a file.

  The file is read in chunks so large files are not loaded into memory at once.

  Args:
      filepath: Path of the file to hash.
      chunk_size: Number of bytes read per iteration (default 1 MiB).

  Returns:
      The hexadecimal SHA256 digest of the file's bytes.

  Raises:
      OSError: If the file cannot be opened or read.
  """
  digest = hashlib.sha256()
  with open(filepath, "rb") as f:
    # iter() with a sentinel stops at the empty bytes object returned at EOF.
    for chunk in iter(lambda: f.read(chunk_size), b""):
      digest.update(chunk)
  return digest.hexdigest()

In [None]:
def has_csv_changed(csv_path: str, hash_path: str = "data_hash.txt") -> bool:
  """Report whether the CSV differs from the hash recorded in ``hash_path``.

  Args:
      csv_path: Path of the CSV file to check.
      hash_path: Text file holding the hash recorded by a previous call.

  Returns:
      True when no hash was recorded yet or the recorded hash differs from the
      file's current hash; False when they are equal.

  Side effect: whenever True is returned, ``hash_path`` is (over)written with
  the current hash, so the next call with an unchanged CSV returns False.
  """
  current_hash = hash_file(csv_path)

  # None stands for "nothing recorded yet" and never equals a real hash.
  recorded_hash = None
  if os.path.exists(hash_path):
    with open(hash_path, "r") as f:
      recorded_hash = f.read().strip()

  if recorded_hash == current_hash:
    return False

  with open(hash_path, "w") as f:
    f.write(current_hash)
  return True

In [None]:
def extract_email(text: str) -> Optional[str]:
  """Return the first email-like substring found in ``text``.

  Args:
      text: Arbitrary text, e.g. a chat message.

  Returns:
      The first substring matching the email pattern, or None if there is no
      match.
  """
  found = re.search(r"[\w\.-]+@[\w\.-]+\.\w+", text)
  return found.group(0) if found else None


def is_valid_email(email: str) -> bool:
  """Check that ``email`` as a whole has the shape ``local@domain.tld``.

  The entire string must match: exactly one '@', at least one '.' after it,
  and no whitespace anywhere. This is a shape check only; it does not verify
  that the address exists.

  Args:
      email: Candidate address.

  Returns:
      True if the whole string matches the pattern, otherwise False.
  """
  # fullmatch (not match) so trailing text such as a second "@..." is rejected.
  return re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", email) is not None


def save_email_to_csv(email: str, query: str, file_path: str = "interested_customers.csv"):
  """Record that ``email`` is interested in ``query`` in a CSV file.

  The file has the columns "Email" and "Query". Both values are stripped and
  lower-cased. A new email gets a new row; for a known email the query is
  appended to that email's comma-separated "Query" cell unless it is already
  listed.

  Args:
      email: Customer email address.
      query: The request the customer wants to be notified about. None or a
          blank string records the email without adding a query.
      file_path: CSV file to create or update.
  """
  query = (query or "").strip().lower()
  email = email.strip().lower()
  new_entry = pd.DataFrame([[email, query]], columns = ["Email", "Query"])

  # A zero-byte file cannot be parsed by read_csv; treat it like a missing file.
  if os.path.exists(file_path) and os.path.getsize(file_path) > 0:
    # Read everything as text and keep empty cells as "" so an empty stored
    # query is not turned into NaN (which has no .split) and words like
    # "null" or "n/a" are kept literally.
    records = pd.read_csv(file_path, dtype=str, keep_default_na=False)
    same_email = records["Email"] == email
    if same_email.any():
      existing_queries = records.loc[same_email, "Query"].iloc[0]
      existing_list = [q.strip().lower() for q in existing_queries.split(',') if q.strip()]
      if query and query not in existing_list:
        # Avoid a leading ", " when nothing was stored for this email yet.
        records.loc[same_email, "Query"] = existing_queries + ', ' + query if existing_list else query
    else:
      records = pd.concat([records, new_entry], ignore_index=True)
  else:
    records = new_entry

  records.to_csv(file_path, index=False)


# !pip install email-validator

# from email_validator import validate_email, EmailNotValidError

# def is_valid_email(email: str) -> bool:
#   try:
#     validate_email(email)
#     return True
#   except EmailNotValidError:
#     return False

In [None]:
# Initialize the sentence transformer model
# Embedding model used for both the catalog documents and the user queries.
try:
  embeddingModel = SentenceTransformer('sentence-transformers/paraphrase-multilingual-mpnet-base-v2')
except Exception as e:
  # Report the failure, then re-raise so the notebook stops here.
  print(f"Error loading model: {e}")
  raise

# Initialize the generation model
model_id = "mistralai/Mistral-7B-Instruct-v0.3"
generationTokenizer = AutoTokenizer.from_pretrained(model_id)

# generate_response() tokenizes with padding=True, which needs a pad token;
# reuse the EOS token when the tokenizer does not define one.
if generationTokenizer.pad_token is None:
  generationTokenizer.pad_token = generationTokenizer.eos_token

# Load the model with the 4-bit quantization settings from `bnb_config`.
generationModel = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",  # put model on GPU automatically
    quantization_config=bnb_config
    )

In [None]:
csv_file = 'sample_data.csv'

pkl_path = "embeddings.pkl"

# Regenerate when the CSV content changed or the cache file is missing/empty.
# Note: has_csv_changed() also records the new hash as a side effect.
need_to_generate = (
    has_csv_changed(csv_file)
    or not os.path.exists(pkl_path)
    or os.path.getsize(pkl_path) == 0
)

if need_to_generate:
  print("Generating embeddings and saving...")

  #Load the CSV file
  df = load_csv(csv_file)

  # Create document
  df['document'] = df.apply(create_document, axis=1)

  # Generate embeddings
  try:
    df['embeddings'] = generate_embeddings(df['document'].tolist(), embeddingModel)
  except Exception as e:
    print(f"Error generating embedding: {e}")
    raise

  # Create metadata
  df['metadata'] = df.apply(create_metadata, axis=1)

  # Save embeddings and metadata (same path the cache check above looks at)
  with open(pkl_path, "wb") as f:
    pickle.dump(df[['embeddings', 'metadata', 'document']], f)

  print("Embeddings saved.")

else:
  print("No CSV change: loading cached embeddings...")
  df = load_csv(csv_file)
  # SECURITY NOTE: pickle.load can execute arbitrary code. This is acceptable
  # only because the file is written by this notebook; never point pkl_path at
  # a file from an untrusted source.
  with open(pkl_path, "rb") as f:
    loaded = pickle.load(f)

  # The cached columns are attached by row label; a cache built from a
  # different CSV would silently misalign, so fail loudly instead.
  if len(loaded) != len(df):
    raise ValueError(
        f"Cached embeddings in '{pkl_path}' have {len(loaded)} rows but '{csv_file}' has {len(df)}; "
        f"delete the cache file and re-run this cell."
    )

  df['embeddings'] = loaded['embeddings']
  df['metadata'] = loaded['metadata']
  df['document'] = loaded['document']


In [None]:
collection = create_and_populate_chromadb(df)

# create_and_populate_chromadb returns None when it fails. Stop here with a
# clear message instead of failing later on `collection.query` in the chat loop.
if collection is None:
  raise RuntimeError("ChromaDB collection could not be created or populated; see the error printed above.")

In [None]:
# Interactive chat loop.
# Each turn either (a) stores an email address sent as a follow-up to the
# previous question, or (b) retrieves catalog context for the query and asks
# the language model for an answer.
print("Hello, How can I help you:\n")
print("Type 'exit' to end the chat.\n")
chatHistory = []
numberOfChat = 0
lastQuery = None
thresholdDistance = 10.0 # Threshold for semantic similarity; lower is more similar

while(True):
  try:
    query = input().strip()
  except EOFError:
    # Input stream is closed: retrying would raise again forever, so stop.
    break
  except Exception as e:
    print(f"Input Error: {e}")
    continue

  # Nothing typed: wait for the next message instead of querying with "".
  if not query:
    continue

  if query.lower() == 'exit':
    break

  numberOfChat += 1

  # An email only makes sense as a follow-up to an earlier question.
  if lastQuery is not None:
    email = extract_email(query)
    if email is not None and is_valid_email(email):
      save_email_to_csv(email, lastQuery)
      print(f"Thank you! We'll notify you at {email} as soon as relevant products are available.")
      continue

  productNameMatcher = dict_based_matcher(df, query)
  if productNameMatcher:
    # The query names a catalog product: retrieve by the (first) matched name.
    productNameEmbedding = generate_embeddings(productNameMatcher, embeddingModel)[0]
    context = collection.query(query_embeddings=[productNameEmbedding], n_results=5)
    retrievedContext = format_context(context, df)
    truncatedContext = truncate_context(retrievedContext, generationTokenizer, max_tokens=1024)
    response = generate_response(query, generationTokenizer, generationModel, context=truncatedContext, productMatched=productNameMatcher)

  else:
    queryEmbedding = generate_embeddings([query], embeddingModel)[0]
    context = collection.query(query_embeddings=[queryEmbedding], n_results=10)

    distances = context['distances'][0]

    # No hits at all, or the best hit is too far away: answer without catalog context.
    if not distances or distances[0] > thresholdDistance:
      response = generate_response(query, generationTokenizer, generationModel)

    else:
      retrievedContext = format_context(context, df)
      truncatedContext = truncate_context(retrievedContext, generationTokenizer, max_tokens=1024)
      response = generate_response(query, generationTokenizer, generationModel, context=truncatedContext)

  lastQuery = query
  chat = {f"Query {numberOfChat}": query, f"Response {numberOfChat}": response}
  chatHistory.append(chat)
  print("Answer: \n")
  wrapped = textwrap.fill(response, width=100)
  print(wrapped)

In [None]:
# # Debugging
# while(True):
#   query = input("What's up:")
#   if query.lower() == 'exit':
#     break
#   queryEmbedding = generate_embeddings([query], embeddingModel)[0]
#   context = collection.query(query_embeddings=[queryEmbedding], n_results = 5)
#   results = context["documents"]
#   distance = context["distances"]
#   print(context)
#   print(results)
#   print("\n",distance)