### Download the FAQ data

In [27]:
import json
import uuid

import pandas as pd

# Source CSV with the e-commerce FAQ (one prompt/response pair per row).
url = "https://raw.githubusercontent.com/imsoumya18/E-commerce_FAQ/main/Ecommerce_FAQs.csv"

# reset_index() turns the row position into a column, which is then renamed
# to `id` so every FAQ entry carries a stable identifier.
df_faq = (
    pd.read_csv(url)
    .reset_index()
    .rename(columns={'index': 'id'})
)
df_faq

Unnamed: 0,id,prompt,response
0,0,How can I create an account?,"To create an account, click on the 'Sign Up' b..."
1,1,What payment methods do you accept?,"We accept major credit cards, debit cards, and..."
2,2,How can I track my order?,You can track your order by logging into your ...
3,3,What is your return policy?,Our return policy allows you to return product...
4,4,Can I cancel my order?,You can cancel your order if it has not been s...
...,...,...,...
74,74,Can I order a product if it is listed as 'sold...,If a product is listed as 'sold out' but avail...
75,75,Can I return a product if it was purchased wit...,"Yes, you can return a product purchased with a..."
76,76,Can I request a product if it is not currently...,If a product is not available in your preferre...
77,77,Can I order a product if it is listed as 'comi...,If a product is listed as 'coming soon' but no...


### OPENAI 
* All RAG pipelines in this notebook use OpenAI as the LLM provider.

In [28]:
from openai import OpenAI
# Shared OpenAI client used by the LLM helper functions below. No credentials
# are passed here — NOTE(review): presumably picked up from the environment
# (e.g. OPENAI_API_KEY); confirm before running.
client_openai = OpenAI()

In [5]:
# Sample user question used to smoke-test the search functions.
q = "What are the policy for your clients?"

### MINSEARCH POWERED SEARCH

In [45]:
import minsearch

In [6]:
# One dict per FAQ row, keyed by the DataFrame's column names; this is the
# record format consumed by the indexing and question-generation cells.
documents = df_faq.to_dict(orient='records')

In [85]:
# In-memory text index over both FAQ columns; no keyword (exact-match) fields.
min_index = minsearch.Index(text_fields=["prompt", "response"], keyword_fields=[])

# Kept as the cell's last expression so the fitted index is echoed as output.
min_index.fit(documents)

<minsearch.minsearch.Index at 0x70852a449a90>

In [86]:
def min_search(query):
    """Text search over the FAQ using the in-memory minsearch index.

    Args:
        query: Free-text user question.

    Returns:
        The top 3 matches, exactly as returned by ``min_index.search``.
    """
    # Boost keys must name text fields the index was built with
    # ("prompt" / "response"); keys for other fields are never looked up.
    # Matches on the FAQ question are weighted above matches in the answer.
    boost = {'prompt': 3.0}

    return min_index.search(
        query=query,
        boost_dict=boost,
        num_results=3,
    )

### QDRANT POWERED SEARCH
#### SEMANTIC SEARCH

* Remember: the Qdrant server instance persists its data (see the mounted storage volume below), so collections created in an earlier run are still there on the next one.

  ```bash
   docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant
  ```



In [7]:
from qdrant_client import QdrantClient, models

In [12]:
# Client for the local Qdrant server (port 6333, as published by the docker
# command above).
qdrant_client = QdrantClient("http://localhost:6333")

In [274]:
from fastembed import TextEmbedding
# Identifier of the dense embedding model; passed as `model=` to
# models.Document when embedding documents and queries.
model_handle = "jinaai/jina-embeddings-v2-small-en"

```python
    collection_name = "project"
    EMBEDDING_DIMENSIONALITY = 512
    
    # Create the collection with specified vector parameters
    qdrant_client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
            distance=models.Distance.COSINE  # Distance metric for similarity search
        )
    )
    
    points = []
    id = 0
    
    for doc in documents:
    
        point = models.PointStruct(
                id=id,
                vector=models.Document(text=doc['response'], model=model_handle), #embed text locally with "jinaai/jina-embeddings-v2-small-en" from FastEmbed
                payload={
                    "prompt": doc['prompt'],
                    "response": doc['response'],
                    "id": doc["id"]
                } #save all needed metadata fields
            )
        points.append(point)
    
        id += 1
    
    qdrant_client.upsert(
        collection_name=collection_name,
        points=points
    )
```

* output

```output
    UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)
```

In [130]:
def qdrant_search(query, limit=3, collection_name="project"):
    """Semantic (dense-vector) search over the FAQ collection in Qdrant.

    Args:
        query: Free-text user question. It is wrapped in ``models.Document``
            with ``model_handle`` so it is embedded with the same model as
            the stored documents.
        limit: Maximum number of closest points to return.
        collection_name: Qdrant collection to query. Defaults to "project",
            the name the dense collection was created under, so the function
            does not depend on a notebook-level variable being defined.

    Returns:
        A list with the payload dict of each matching point.
    """
    result_points = qdrant_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=query, model=model_handle),
        limit=limit,  # top closest matches
        with_payload=True,  # payload carries the FAQ prompt/response
    )

    return [point.payload for point in result_points.points]

In [120]:
# Smoke-test the semantic search with the sample question; the function is
# defined as `qdrant_search`, so it must be called under that name.
qdrant_search(q)

[{'prompt': 'Can I return a product if it was damaged due to improper use?',
  'response': 'Our return policy generally covers products that are defective or damaged upon arrival. Damage due to improper use may not be eligible for a return. Please contact our customer support team for assistance.'},
 {'prompt': 'Can I return a product if it was purchased as part of a bundle or set?',
  'response': 'If a product was purchased as part of a bundle or set, the return policy may vary. Please refer to the specific terms and conditions or contact our customer support team for further guidance.'},
 {'prompt': 'What is your price matching policy?',
  'response': "We have a price matching policy where we will match the price of an identical product found on a competitor's website. Please contact our customer support team with the details of the product and the competitor's offer."}]

#### HYBRID SEARCH VECTOR BASE

In [13]:
# Create the collection with both vector types
qdrant_client.create_collection(
    collection_name="project-sparse-and-dense",
    vectors_config={
        # Named dense vector for jinaai/jina-embeddings-v2-small-en
        "jina-small": models.VectorParams(
            size=512,
            distance=models.Distance.COSINE,
        ),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

True

In [16]:
# Index every FAQ entry with both a dense and a sparse (BM25) vector.
# Point ids are derived deterministically from the FAQ id (uuid5), so
# re-running this cell overwrites the same points instead of adding a
# duplicate copy of every entry under fresh random ids.
qdrant_client.upsert(
    collection_name="project-sparse-and-dense",
    points=[
        models.PointStruct(
            id=uuid.uuid5(uuid.NAMESPACE_URL, f"ecommerce-faq/{doc['id']}").hex,
            vector={
                "jina-small": models.Document(
                    text=doc['response'],
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                "bm25": models.Document(
                    text=doc['response'],
                    model="Qdrant/bm25",
                ),
            },
            # Metadata returned with search hits
            payload={
                "prompt": doc['prompt'],
                "response": doc['response'],
                "id": doc['id']
            }
        )
        for doc in documents
    ]
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [19]:
def rrf_search(query: str, limit: int = 1) -> list[dict]:
    """Hybrid search: dense and BM25 candidates fused with Reciprocal Rank Fusion.

    Args:
        query: Free-text user question; embedded once with the dense model
            and once with the BM25 model.
        limit: Scales how many candidates each prefetch branch retrieves
            (``3 * limit`` per branch).

    Returns:
        The payload dict of each fused result point (not ScoredPoint objects).
    """
    # NOTE(review): `limit` is not forwarded to the outer fusion query, so the
    # number of fused results is governed by query_points' own default rather
    # than by `limit` — confirm whether that is intended.
    result_points = qdrant_client.query_points(
        collection_name="project-sparse-and-dense",
        prefetch=[
            # Dense (semantic) candidates
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=(3 * limit),
            ),
            # Sparse (BM25) candidates
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=(3 * limit),
            ),
        ],
        # Fusion query enables fusion on the prefetched results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        with_payload=True,
    )

    return [point.payload for point in result_points.points]

##### Hybrid search test

In [26]:
# Smoke-test the hybrid search with a short policy question.
# NOTE(review): the output saved with this cell is a 500 error returned by
# the Qdrant server, not a result list.
rrf_search('which are the policies?')

UnexpectedResponse: Unexpected Response: 500 (Internal Server Error)
Raw response content:
b'{"status":{"error":"Service internal error: 1 of 1 read operations failed:\\n  Service internal error: task 175 panicked with message \\"called `Result::unwrap()` on an `Err` value: OutputTooSmall {  ...'

### PROMPT LLM AND RAG

In [21]:
def build_prompt(query, search_results):
    """Render the customer-service prompt for one question.

    Args:
        query: The user's question, substituted for ``{question}``.
        search_results: Iterable of dicts with 'prompt' and 'response' keys;
            each becomes one "prompt: ... / response: ..." entry in the
            CONTEXT section.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    # The indentation inside the literal is part of the string that is sent.
    prompt_template = """
            You're are the customer service chatbot of an e-commerce platform. Answer the QUESTION based on the CONTEXT from the FAQ database.
            Use only the facts from the CONTEXT when answering the QUESTION.
            
            QUESTION: {question}
            
            CONTEXT: 
            {context}
            """.strip()

    context = "".join(
        f"prompt: {hit['prompt']}\nresponse: {hit['response']}\n\n"
        for hit in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [22]:
def llm(prompt, model='gpt-4o'):
    """Send a single-turn prompt to the OpenAI chat API and return the reply text.

    Args:
        prompt: Full prompt text, sent as one user message.
        model: Chat model to use. Defaults to 'gpt-4o', so existing
            ``llm(prompt)`` calls behave as before.

    Returns:
        The message content of the first choice in the response.
    """
    response = client_openai.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

def rag(query, search):
    """Answer a question with retrieval-augmented generation.

    Args:
        query: The user's question.
        search: Callable taking the query and returning the retrieved FAQ
            entries (e.g. ``rrf_search``).

    Returns:
        The LLM's answer text.
    """
    search_results = search(query)
    # Build the prompt from the `query` argument itself, not from a
    # notebook-level variable, so the answer matches the question searched.
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [23]:
# End-to-end check: retrieve with the hybrid search, then let the LLM answer
# from the retrieved FAQ entries.
print(rag("Which are the policies?",rrf_search))

Our policies include offering a price matching policy where we will match the price of an identical product found on a competitor's website. We also offer the option to return a product if you changed your mind, provided the product is in its original condition and packaging. A receipt or proof of purchase is usually required for returns. Additionally, we offer live chat support during business hours for customer assistance.


### EVALUATION DATA GENERATION

#### CANDIDATE PROMPT

In [181]:
# Prompt used to generate evaluation questions from one FAQ record.
# `{prompt}` and `{response}` are filled from the record; the doubled braces
# render as literal braces. The example output must itself be well-formed
# (note the opening `[` of the list), since the reply is parsed as JSON.
prompt_template_eval = """
You emulate a user of our e-commerce platform.
Formulate 5 questions this user might ask based on a provided response of the FAQ. Make the question specific to the response.
The record should contain the answer to the question, and the questions should be complete and not too short. Use as fewer words as possible from the record

The record:

prompt: {prompt}
response: {response}

provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", ..., "question5"]}}

""".strip()

In [182]:
# Render the evaluation prompt for the first FAQ record to inspect it;
# keys of the record that the template does not use are ignored by format().
prompt = prompt_template_eval.format(**documents[0])

In [184]:
# print() rather than a bare expression so newlines render as line breaks.
print(prompt)

You emulate a user of our e-commerce platform.
Formulate 5 questions this user might ask based on a provided response of the FAQ. Make the question specific to the response.
The record should contain the answer to the question, and the questions should be complete and not too short. Use as fewer words as possible from the record

The record:

prompt: How can I create an account?
response: To create an account, click on the 'Sign Up' button on the top right corner of our website and follow the instructions to complete the registration process.

provide the output in parsable JSON without using code blocks:

{"questions": "question1", "question2", ..., "question5"]}


In [187]:
# NOTE: this rebinds the name `llm` defined in the "PROMPT LLM AND RAG"
# section; once this cell runs, every later call to `llm` uses 'gpt-4o-mini'.
def llm(prompt):
    """Send `prompt` as a single user message to 'gpt-4o-mini' and return the reply text."""
    user_message = {"role": "user", "content": prompt}
    completion = client_openai.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [195]:
def generate_questions(doc):
    """Ask 'gpt-4o-mini' for evaluation questions about one FAQ record.

    Args:
        doc: Mapping whose items are passed as keyword arguments to
            ``prompt_template_eval.format``.

    Returns:
        The raw reply text of the first choice; it is not parsed here.
    """
    user_message = {"role": "user", "content": prompt_template_eval.format(**doc)}
    completion = client_openai.chat.completions.create(
        model='gpt-4o-mini',
        messages=[user_message],
    )
    return completion.choices[0].message.content

In [207]:
# Sanity check on the first record: the reply must parse as JSON and expose
# a 'questions' entry.
json.loads(generate_questions(documents[0]))['questions']

["What steps do I need to follow after clicking the 'Sign Up' button to create my account?",
 "Where can I find the 'Sign Up' button on your website?",
 'Is there any information I need to prepare before starting the registration process?',
 "What happens if I encounter issues while creating my account after clicking 'Sign Up'?",
 'Can I create an account using my mobile device, and will the process be the same?']

In [198]:
from tqdm.auto import tqdm

### FETCH GENERATED QUESTIONS


In [260]:
# Maps FAQ id -> generated questions. Kept in its own cell so the generation
# loop can be re-run and resume from the ids already stored here.
results = {}

In [261]:
# Generate questions for every FAQ record. Ids already present in `results`
# are skipped, so an interrupted run can be resumed by re-running this cell.
for doc in tqdm(documents):
    faq_id = doc['id']
    if faq_id not in results:
        parsed = json.loads(generate_questions(doc))
        results[faq_id] = parsed['questions']



  0%|          | 0/79 [00:00<?, ?it/s]

In [263]:
# Flatten {faq_id: [questions]} into (faq_id, question) rows. A comprehension
# keeps its loop variables local, so the notebook-level sample query `q` is
# not overwritten by the last generated question.
final_results = [
    (doc_id, question)
    for doc_id, questions in results.items()
    for question in questions
]

In [265]:
# Persist the (id, question) pairs as the retrieval ground truth.
# NOTE(review): the destination is an absolute, machine-specific path;
# adjust it before running on another machine.
ground_truth = pd.DataFrame(final_results, columns=['id', 'question'])
ground_truth.to_csv(
    '/mnt/c/Users/Carlos/Desktop/projects/llm-zoomcamp/project/data/ground-truth-retrieval.csv',
    index=False,
)