In [7]:
import csv
import json

# Load the CSV file
with open('data/Econet_data.csv', 'r') as f:
    reader = csv.DictReader(f)
    data = list(reader)

# Convert the data to JSON format
with open('data/Econet_data.json', 'w') as f:
    json.dump(data, f, indent=4)

In [10]:
import json
from sentence_transformers import SentenceTransformer

# Load the JSON file
with open('data/Econet_data.json', 'r') as f:
    docs_raw = json.load(f)

# Create the necessary data structure
documents = []
for doc in docs_raw:
    document = {
        'question': doc['Question'],
        'answer': doc['Answer'],
        'question_vector': None,
        'answer_vector': None
    }
    documents.append(document)

In [11]:
documents[1]

{'question': 'Hi, I just purchased my daily bundles and they have exhausted though I haven’t used them much.',
 'answer': 'We would like to inform you that all our bundles are usage based and you can now track your data, airtime or SMS usage via My Web self-care. You just need to follow this link: https://selfcare.econet.co.zw/ and register.',
 'question_vector': None,
 'answer_vector': None}

In [13]:
from tqdm import tqdm
from sentence_transformers import SentenceTransformer

# Load the pre-trained sentence embedding model
model = SentenceTransformer('all-mpnet-base-v2')

# Create sentence embeddings for the question and answer with progress tracking
for doc in tqdm(documents, desc="Processing documents"):
    doc['question_vector'] = model.encode(doc['question']).tolist()
    doc['answer_vector'] = model.encode(doc['answer']).tolist()


Processing documents: 100%|████████████████████████████████████████████████████████████████████████████| 194/194 [00:36<00:00,  5.26it/s]


In [14]:
from elasticsearch import Elasticsearch

# Set up the Elasticsearch connection
es_client = Elasticsearch('http://localhost:9200')

# Define the index settings
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "question_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"},
            "answer_vector": {"type": "dense_vector", "dims": 768, "index": True, "similarity": "cosine"}
        }
    }
}

# Create the index
index_name = "customer_support"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'customer_support'})

In [16]:
from tqdm import tqdm
from elasticsearch import Elasticsearch

# Initialize the Elasticsearch client
es_client = Elasticsearch('http://localhost:9200')

# Index the documents in Elasticsearch with progress tracking
for doc in tqdm(documents, desc="Indexing documents"):
    try:
        es_client.index(index=index_name, document=doc)
    except Exception as e:
        print(f"Error indexing document: {e}")


Indexing documents: 100%|██████████████████████████████████████████████████████████████████████████████| 194/194 [00:07<00:00, 25.79it/s]


### vector search in Elasticsearch

In [18]:
# Perform a vector search in Elasticsearch
search_term = "My bundle is not working?"
vector_search_term = model.encode(search_term)

query = {
    "query": {
        "knn": {
            "question_vector": {
                "vector": vector_search_term,
                "k": 5
            }
        }
    }
}

results = es_client.search(index=index_name, body=query)
for hit in results['hits']['hits']:
    print(f"Question: {hit['_source']['question']}")
    print(f"Answer: {hit['_source']['answer']}")
    print()

Question: What is the validity of the bundles?
Answer: The bundles have a daily and weekly validity period.
Score: 0.7348753

Question: I have WhatsApp bundle but am failing to make an App call?
Answer: Please note that you cannot make calls using the WhatsApp bundle. In order to make calls, you will need Data bundles.
Score: 0.72886837

Question: How do I check my USD bundle balance?
Answer: Dial *143# and go to option 8
Score: 0.7057455

Question: How can I purchase the bundles?
Answer: You can purchase USD airtime directly from all Econet shops and dealers and convert it to bundles via *143#
Score: 0.7045275

Question: What is the difference between the old bundles and the new bundles?
Answer: The difference is that there are changes that have been made to allow for more and better benefits across all the bundles and more options have been added. The changes are tabulated below
Score: 0.69786763



**The highest score for vector Search is  0.7348753**

### Perform Keyword search with Semantic Search (Hybrid/Advanced Search)

In [26]:
import json
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from elasticsearch import Elasticsearch

# Load your JSON data
with open('data/Econet_data.json', 'rt') as f_in:
    documents = json.load(f_in)

# Initialize the sentence transformer model
model_name = 'multi-qa-MiniLM-L6-cos-v1'
model = SentenceTransformer(model_name)

# Create embeddings for each document
for doc in tqdm(documents, desc="Creating embeddings"):
    question = doc['Question']
    answer = doc['Answer']
    qa = question + ' ' + answer

    doc['question_vector'] = model.encode(question).tolist()
    doc['answer_vector'] = model.encode(answer).tolist()
    doc['question_answer_vector'] = model.encode(qa).tolist()

Creating embeddings: 100%|█████████████████████████████████████████████████████████████████████████████| 194/194 [00:11<00:00, 16.44it/s]


In [29]:
# Initialize Elasticsearch client
es_client = Elasticsearch('http://localhost:9200')

# Define index settings and mappings
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "question_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
            "question_answer_vector": {
                "type": "dense_vector",
                "dims": 384,
                "index": True,
                "similarity": "cosine"
            },
        }
    }
}

# Create the index
index_name = "customer-support"
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

# Index the documents
for doc in tqdm(documents, desc="Indexing documents"):
    es_client.index(index=index_name, document=doc)


Indexing documents: 100%|██████████████████████████████████████████████████████████████████████████████| 194/194 [00:06<00:00, 30.12it/s]


In [30]:
# Function to perform hybrid search
def hybrid_search(query, size=5):
    # Encode the query
    query_vector = model.encode(query).tolist()

    # KNN query
    knn_query = {
        "field": "question_answer_vector",
        "query_vector": query_vector,
        "k": size,
        "num_candidates": 100
    }

    # Keyword query
    keyword_query = {
        "multi_match": {
            "query": query,
            "fields": ["question^2", "answer"],
            "type": "best_fields",
            "tie_breaker": 0.3,
            "minimum_should_match": "30%"
        }
    }

    # Combine queries
    combined_query = {
     "bool": {
            "must": [keyword_query],
            "should": [
                {
                    "script_score": {
                        "query": {"match_all": {}},
                        "script": {
                            "source": "cosineSimilarity(params.query_vector, 'question_answer_vector') + 1.0",
                            "params": {"query_vector": query_vector}
                        }
                    }
                }
            ]
        }
    }

    # Perform the search
    response = es_client.search(
        index=index_name,
        query=combined_query,
        knn=knn_query,
        size=size
    )

    return response

# Example usage
query = "My bundle is not working?"
results = hybrid_search(query)

# Print results
for hit in results['hits']['hits']:
    print(f"Question: {hit['_source']['Question']}")
    print(f"Answer: {hit['_source']['Answer']}")
    print(f"Score: {hit['_score']}")
    print()




Question: What is the validity of the bundles?
Answer: The bundles have a daily and weekly validity period.
Score: 0.74401045

Question: Can I carry over my bundle or ask for a bundle extension in case I didn’t finish my bundle allocation before expiry?
Answer: No, you cannot carry over the bundle. You will have to use it within stipulated times.
Score: 0.7293763

Question: How do I buy or check my SMS bundles?
Answer: No problem! You just dial *140#, select option 1 and choose the option which serves you.
Score: 0.72461826

Question: How do I check my USD bundle balance?
Answer: Dial *143# and go to option 8
Score: 0.72084755

Question: What is the difference between the old bundles and the new bundles?
Answer: The difference is that there are changes that have been made to allow for more and better benefits across all the bundles and more options have been added. The changes are tabulated below
Score: 0.7169128



**The highest score for vector Search is 0.74401045 therefore Hybrid search has got a better accuracy**

Evaluating search: 100%|███████████████████████████████████████████████████████████████████████████████| 840/840 [00:26<00:00, 31.63it/s]

Mean Precision: 0.0000
Mean Recall: 0.0000
Mean F1 Score: 0.0000
Mean Reciprocal Rank: -1.0000
Detailed evaluation results saved to 'evaluation_results.csv'



