In [9]:
from tqdm import tqdm 
import pandas as pd

## Generate Ground Truth Table

### Add IDs to the data

#### Create IDs

In [552]:
# Load the Q&A table and drop every row that contains a missing value.
# NOTE(review): 'data/topic_data.csv' is also the file written by the
# `topic_df.to_csv('data/topic_data.csv', ...)` cell further down, so this read
# only succeeds if that file already exists on disk — confirm which file is the
# true raw input for a fresh run.
df = pd.read_csv("data/topic_data.csv").dropna()

# One dict per row, keyed by column name.
data_dict = df.to_dict(orient="records")

In [464]:
import hashlib

def generate_document_id(doc):
    """Derive a short, deterministic id for a Q&A record.

    The id is the first 8 hex characters of the MD5 digest of the string
    "<answer>-<question>", so two records with the same answer and question
    receive the same id.
    """
    fingerprint = "{}-{}".format(doc['answer'], doc['question'])
    return hashlib.md5(fingerprint.encode()).hexdigest()[:8]

In [465]:
# Stamp every record with its content-derived id (the dicts are updated in place).
for doc in data_dict:
    doc.update(id=generate_document_id(doc))

Check unique document ids

In [466]:
from collections import defaultdict

# Group the records by id so that ids shared by several records can be spotted.
# NOTE: `hash` shadows the built-in of the same name; the name is kept because
# the following cell reads it.
hash = defaultdict(list)
for doc in data_dict:
    same_id_docs = hash[doc['id']]
    same_id_docs.append(doc)

In [467]:
# Total records vs. distinct ids; a gap means some ids are shared.
print(len(data_dict), len(hash))

# Ids attached to more than one record.
duplicate_ids = [doc_id for doc_id, docs in hash.items() if len(docs) > 1]

639 598


In [468]:
print(len(duplicate_ids))

41


In [553]:
# Load only the first two columns of the ground-truth file.
gt_df = pd.read_csv('data/ground_truth.csv',usecols=[0, 1])

In [470]:
gt_df.head(3)

Unnamed: 0,question,document_id
0,What are the main considerations in balancing ...,08aa7d88
1,How does the bias-variance trade-off affect mo...,08aa7d88
2,What strategies can be employed to manage bias...,08aa7d88


In [471]:
# Ground-truth rows whose document_id is one of the ids shared by several records.
points_to_duplicate = gt_df['document_id'].isin(duplicate_ids)
gt_df_duplicate = gt_df[points_to_duplicate]

In [472]:
gt_df_duplicate

Unnamed: 0,question,document_id


In [474]:

# NOTE(review): unfinished, commented-out draft of a topic-tagging helper. Topic
# assignment is done by the `topic_mapping` cell under "Add topic column";
# consider deleting this cell.
# def generate_topic(doc):
#     topic_mapping = {'Supervised Learning': 'Supervised Learning',
#                      'RAG': 'RAG',
#                      'Unsupervised Learning KNN': 'Unsupervised Learning',
#                      'LLM': 'LLM',
#                      'Evaluation metrics': 'Model Evaluation',
#                 'Recommend matrix factoqrization': 'Recommender Systems',}
#     if doc['answer'] 

In [475]:
data_dict[4]

{'id': '94727dd0',
 'question': 'How would you evaluate a logistic regression model?',
 'answer': "To evaluate a logistic regression model, use the confusion matrix to analyze true positives, true negatives, false positives, and false negatives. Accuracy measures the proportion of correct predictions, while precision and recall are critical when false positives or false negatives have significant implications. Additionally, the ROC curve and AUC (Area Under the Curve) assess the model's ability to distinguish between classes across various thresholds. Logistic Regression is often used for binary classification and these metrics help in understanding its performance comprehensively.",
 'topic': 'Model Evaluation'}

In [476]:
# Tabulate the records with the generated 'id' as the row index.
new_df = pd.DataFrame(data_dict).set_index('id')

In [477]:
new_df.head(3)

Unnamed: 0_level_0,question,answer,topic
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
34a96b07,When should you use classification over regres...,Use classification when you need to categorize...,Data Science
2075cd3f,Name an example where ensemble techniques migh...,"Ensemble techniques, like bagging and boosting...",Data Science
10e5ca34,How do you ensure you’re not overfitting with ...,"To avoid overfitting, you can: 1) Simplify the...",Data Science


#### Write to new .csv file

Commenting the line below to prevent overwriting of data

In [478]:
# new_df.to_csv('data/ml_indexed.csv', index=True)

In [479]:
data_dict[1]

{'id': '2075cd3f',
 'question': 'Name an example where ensemble techniques might be useful.',
 'answer': 'Ensemble techniques, like bagging and boosting, combine multiple models to improve predictive performance and robustness. For example, in a classification task, using a Random Forest (which combines multiple decision trees) can reduce overfitting and enhance accuracy compared to a single decision tree.',
 'topic': 'Data Science'}

### Add topic column

In [480]:

# Load the id-indexed table and drop every row that contains a missing value.
# NOTE(review): the only line in this notebook that writes
# 'data/ml_indexed.csv' (`new_df.to_csv(...)`) is commented out, so this cell
# relies on a file produced by an earlier run.
df = pd.read_csv("data/ml_indexed.csv").dropna()

# One dict per row, keyed by column name.
data_dict = df.to_dict(orient="records")

In [489]:
# Keyword groups -> topic label. Each key is split on whitespace by the loop
# below, and a row receives the label of the FIRST entry (in insertion order)
# for which any of the key's words occurs, case-insensitively, as a substring
# of the row's question or answer.
# NOTE(review): because matching is by substring and 'Supervised Learning' is
# checked first, texts that mention "unsupervised" or merely "learning" also end
# up in 'Supervised Learning'; likewise 'RAG' matches inside words such as
# "average". Confirm whether that is intended.
topic_mapping = {
    'Supervised Learning': 'Supervised Learning',
    'RAG retrieval': 'RAG',
    'Unsupervised  KNN kNN clustering': 'Unsupervised Learning',
    'LLM language': 'LLM',
    'PCA feature engineering': 'Feature Engineering',
    'metrics evaluation evaluate ROC AUC': 'Model Evaluation',
    'tuning hyperparameter MLFlow': 'Model Tuning',
    'deploy deployment production': 'Model Deployment',
    # Spelling fixed: the previous key 'Adverserial' is not a substring of
    # "adversarial", so correctly spelled text could never be tagged GAN.
    'Adversarial': 'GAN',
    'Recommend matrix factorization': 'Recommender Systems',
}

def contains_any_word(text, words):
    """Return True if any of `words` occurs in `text`, ignoring case.

    Matching is plain substring containment, not whole-word matching.
    """
    haystack = text.lower()
    return any(word.lower() in haystack for word in words)

for row in data_dict:
    # Fallback label; overwritten below by the first keyword group that matches.
    row['topic'] = 'Data Science'
    texts = (row.get('question', ''), row.get('answer', ''))

    # Walk the groups in insertion order and stop at the first one whose words
    # appear in the question or in the answer.
    for key, value in topic_mapping.items():
        key_words = key.split()
        if any(contains_any_word(text, key_words) for text in texts):
            row['topic'] = value
            break


In [490]:
# Rebuild the table from the tagged records and show how many rows each topic received.
topic_df = pd.DataFrame(data_dict)
topic_df['topic'].value_counts()

topic
Supervised Learning      222
Model Evaluation         110
Feature Engineering       94
RAG                       73
Data Science              62
Unsupervised Learning     30
LLM                       23
Recommender Systems       13
Model Tuning               8
Model Deployment           4
Name: count, dtype: int64

In [491]:
# Persist the topic-tagged table (without the row index). The load cells that
# read 'data/topic_data.csv' in this notebook consume this file.
topic_df.to_csv('data/topic_data.csv', index=False)

## Evaluate Retrieval

### Text Evaluation

#### Create index on documents

In [11]:
# Load the topic-tagged Q&A table and drop every row that contains a missing value.
df = pd.read_csv("data/topic_data.csv").dropna()

# One dict per row, keyed by column name; these are the documents to index.
data_dict = df.to_dict(orient="records")

In [12]:
# Dimensionality used for the dense_vector fields declared in the index mappings further down.
embedding_size = 384  # 128 looks like a previously tried value — TODO confirm

In [13]:
from elasticsearch import Elasticsearch

# Client for the Elasticsearch instance at localhost:9200.
es = Elasticsearch('http://localhost:9200')

# Text-only index: `text` fields for question/answer, `keyword` fields for topic and id.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "topic": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    },
}
index_name = 'python-qa-index'

In [14]:
# Start from a clean slate: delete the index (if present), then create it with
# the settings and mappings defined above.
es.indices.delete(index=index_name,ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'python-qa-index'})

In [15]:
# Index each record individually, using its position in data_dict as the document id.
for i, row in enumerate(tqdm(data_dict)):
    es.index(index=index_name, id=i, document=row)

  0%|          | 0/639 [00:00<?, ?it/s]

100%|██████████| 639/639 [00:39<00:00, 16.17it/s]


#### Define elastic search function

In [16]:
def elastic_search(query):
    """Run a keyword search against the index and return up to 5 documents.

    The query text is matched with a `best_fields` multi_match over
    `question` (boosted ^3), `answer` and `topic`.

    Returns:
        The `_source` dict of every hit, in the order Elasticsearch returned them.
    """
    multi_match = {
        "query": query,
        "fields": ["question^3", "answer", "topic"],
        "type": "best_fields",
    }
    request_body = {
        "size": 5,
        "query": {"bool": {"must": {"multi_match": multi_match}}},
    }
    response = es.search(index=index_name, body=request_body)
    return [hit['_source'] for hit in response['hits']['hits']]

#### Get ground truth data

In [17]:
# Load the first two columns of the ground-truth file, treating the first line as the header.
df_ground_truth = pd.read_csv('data/ground_truth.csv', header = 0, usecols=[0, 1])

In [18]:
# One dict per ground-truth row; the evaluation loops read 'question' and 'document_id' from each.
ground_truth = df_ground_truth.to_dict(orient='records')

In [19]:
print(len(ground_truth), ground_truth[0])

1735 {'question': 'When is it appropriate to use classification rather than regression?', 'document_id': '34a96b07'}


In [20]:
# For every ground-truth question, record which of the returned documents
# (in rank order) is the expected one.
relevance_total = []

for q in tqdm(ground_truth):
    expected_id = q['document_id']
    hits = elastic_search(query=q['question'])
    relevance_total.append([d['id'] == expected_id for d in hits])

100%|██████████| 1735/1735 [01:47<00:00, 16.09it/s]


#### Calculate hit rate and MRR

In [21]:
def hit_rate(relevance_total):
    """Fraction of queries whose result list contains the expected document.

    Args:
        relevance_total: one list of booleans per query; each boolean says
            whether the result at that rank is the expected document.

    Returns:
        The number of lists containing at least one True divided by the number
        of lists. Returns 0.0 when `relevance_total` is empty (this previously
        raised ZeroDivisionError).
    """
    if len(relevance_total) == 0:
        return 0.0

    hits = sum(1 for line in relevance_total if True in line)
    return hits / len(relevance_total)

In [22]:
def mrr(relevance_total):
    """Mean reciprocal rank of the expected document over all queries.

    Args:
        relevance_total: one list of booleans per query; each boolean says
            whether the result at that rank is the expected document.

    Returns:
        The average, over all queries, of 1 / rank of the FIRST relevant
        result (ranks start at 1); a query with no relevant result contributes
        0. Returns 0.0 when `relevance_total` is empty.
    """
    if len(relevance_total) == 0:
        return 0.0

    total_score = 0.0
    for line in relevance_total:
        for rank, is_relevant in enumerate(line, start=1):
            if is_relevant:
                total_score += 1 / rank
                # Only the first relevant hit counts; without this break a
                # query with several matching results (e.g. duplicated ids)
                # would contribute more than 1.
                break

    return total_score / len(relevance_total)

In [23]:
# Keyword-search baseline: (hit rate, MRR) over the ground-truth questions.
# NOTE(review): the original trailing note here read "question3, answer3", but
# `elastic_search` as defined in this notebook boosts only `question` (^3) —
# confirm which configuration produced the recorded output.
(hit_rate(relevance_total), mrr(relevance_total))

(0.7688760806916427, 0.706695485110471)

Elastic Search 
```
(0.768, 0.706)
```

#### Using minisearch

In [24]:
import minisearch

# In-memory index: question/answer are declared as text fields, topic/id as keyword fields.
ms = minisearch.Index(
    text_fields=['question', 'answer'],
    keyword_fields=['topic','id'],
)

# Fit the index on the same records that were sent to Elasticsearch.
ms.fit(data_dict)

<minisearch.Index at 0x26fc23a5ac0>

In [25]:
def mini_search(query):
    """Search the minisearch index `ms` for `query`, requesting 5 results."""
    return ms.search(query=query, num_results=5)

In [26]:
# Same relevance bookkeeping as for Elasticsearch, this time querying minisearch.
relevance_total = []

for q in tqdm(ground_truth):
    expected_id = q['document_id']
    hits = mini_search(query=q['question'])
    relevance_total.append([d['id'] == expected_id for d in hits])

100%|██████████| 1735/1735 [00:05<00:00, 337.10it/s]


In [27]:
(hit_rate(relevance_total), mrr(relevance_total))

(0.8242074927953891, 0.7583669548511048)

minisearch 
```
(0.82, 0.75)
```

### Vector Evaluation

In [28]:
from sentence_transformers import SentenceTransformer

  from tqdm.autonotebook import tqdm, trange


In [29]:
# Sentence-embedding model used for both documents and queries.
model_name = 'multi-qa-MiniLM-L6-cos-v1'
# Must equal the model's output dimension, since it sets `dims` of the
# dense_vector fields below — presumably 384 for this model; verify.
embedding_size = 384 
model = SentenceTransformer(model_name)

In [30]:
# Same text/keyword fields as the text-only index, plus three indexed
# cosine-similarity dense vectors of size `embedding_size`.
dense_vector_field = {
    "type": "dense_vector",
    "dims": embedding_size,
    "index": True,
    "similarity": "cosine",
}
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "topic": {"type": "keyword"},
            "id": {"type": "keyword"},
            "answer_vector": dict(dense_vector_field),
            "question_vector": dict(dense_vector_field),
            "question_answer_vector": dict(dense_vector_field),
        }
    },
}

# Drop the index if it already exists, then create it with the new mappings.
es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'python-qa-index'})

Embed the answer, the question, and the combined question + answer text of every document

In [31]:
# Attach three embeddings to every record: answer only, question only, and
# question + ' ' + answer. The dicts in data_dict are updated in place, and
# vector_data_dict collects those same dict objects.
vector_data_dict = []
for doc in tqdm(data_dict):
    doc.update(
        answer_vector=model.encode(doc['answer']),
        question_vector=model.encode(doc['question']),
        question_answer_vector=model.encode(doc['question'] + ' ' + doc['answer']),
    )
    vector_data_dict.append(doc)

100%|██████████| 639/639 [01:47<00:00,  5.96it/s]


Creating index on data

In [538]:
# Index the embedded records one by one; no explicit document id is passed here.
print('\n\n[[DEBUG] Adding data to index...')
for row in tqdm(vector_data_dict):
    es.index(index=index_name, document=row)



[[DEBUG] Adding data to index...


100%|██████████| 639/639 [00:41<00:00, 15.42it/s]


In [539]:
print(es.info())

{'name': '7d50c3ad219e', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'GkHNY1cCQQGIGcngnIggvA', 'version': {'number': '8.15.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '1a77947f34deddb41af25e6f0ddb8e830159c179', 'build_date': '2024-08-05T10:05:34.233336849Z', 'build_snapshot': False, 'lucene_version': '9.11.1', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


In [540]:
def elastic_search_knn(field, vector):
    """Run a kNN search on one dense-vector field and return up to 5 documents.

    Args:
        field: name of the dense_vector field to search.
        vector: query embedding.

    Returns:
        The `_source` dict of every hit, limited to the answer, question,
        topic and id fields.
    """
    request_body = {
        "knn": {
            "field": field,
            "query_vector": vector,
            "k": 5,
            "num_candidates": 10000,
        },
        "_source": ["answer", "question", "topic", "id"],
    }
    response = es.search(index=index_name, body=request_body)
    return [hit['_source'] for hit in response['hits']['hits']]

#### Question vector

In [541]:
def question_vector_knn(q):
    """Embed the ground-truth question and kNN-search the `question_vector` field."""
    query_embedding = model.encode(q['question'])
    return elastic_search_knn('question_vector', query_embedding)

In [542]:
def evaluate(ground_truth, search_function):
    """Score a search function against the ground truth.

    Args:
        ground_truth: iterable of dicts carrying a 'document_id'; each dict is
            passed whole to `search_function`.
        search_function: callable taking one ground-truth dict and returning
            result dicts that each carry an 'id'.

    Returns:
        dict with the 'hit_rate' and 'mrr' computed over all queries.
    """
    relevance_total = []
    for q in tqdm(ground_truth):
        expected_id = q['document_id']
        relevance_total.append([d['id'] == expected_id for d in search_function(q)])

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [543]:
evaluate(ground_truth, question_vector_knn)

100%|██████████| 1735/1735 [02:26<00:00, 11.87it/s]


{'hit_rate': 0.8582132564841498, 'mrr': 0.8392699327569638}

#### Answer vector

In [544]:
def answer_vector_knn(q):
    """Embed the ground-truth question and kNN-search the `answer_vector` field."""
    query_embedding = model.encode(q['question'])
    return elastic_search_knn('answer_vector', query_embedding)

In [545]:
# NOTE: `evaluate` is defined more than once in this notebook with the same
# behavior; this cell re-declares it.
def evaluate(ground_truth, search_function):
    """Score a search function against the ground truth.

    Args:
        ground_truth: iterable of dicts carrying a 'document_id'; each dict is
            passed whole to `search_function`.
        search_function: callable taking one ground-truth dict and returning
            result dicts that each carry an 'id'.

    Returns:
        dict with the 'hit_rate' and 'mrr' computed over all queries.
    """
    relevance_total = []
    for q in tqdm(ground_truth):
        expected_id = q['document_id']
        relevance_total.append([d['id'] == expected_id for d in search_function(q)])

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [546]:
evaluate(ground_truth, answer_vector_knn)

100%|██████████| 1735/1735 [02:29<00:00, 11.58it/s]


{'hit_rate': 0.8553314121037464, 'mrr': 0.810528338136407}

#### Question-Answer vector

In [547]:
def question_answer_vector_knn(q):
    """Embed the ground-truth question and kNN-search the `question_answer_vector` field."""
    query_embedding = model.encode(q['question'])
    return elastic_search_knn('question_answer_vector', query_embedding)

In [548]:
# NOTE: `evaluate` is defined more than once in this notebook with the same
# behavior; this cell re-declares it.
def evaluate(ground_truth, search_function):
    """Score a search function against the ground truth.

    Args:
        ground_truth: iterable of dicts carrying a 'document_id'; each dict is
            passed whole to `search_function`.
        search_function: callable taking one ground-truth dict and returning
            result dicts that each carry an 'id'.

    Returns:
        dict with the 'hit_rate' and 'mrr' computed over all queries.
    """
    relevance_total = []
    for q in tqdm(ground_truth):
        expected_id = q['document_id']
        relevance_total.append([d['id'] == expected_id for d in search_function(q)])

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [549]:
evaluate(ground_truth, question_answer_vector_knn)

100%|██████████| 1735/1735 [03:00<00:00,  9.60it/s]


{'hit_rate': 0.8795389048991354, 'mrr': 0.8574159462055707}

```
hit rate: 0.87, mrr: 0.85
```

### Hybrid Search Evaluation

#### Create index

In [None]:
# Sentence-embedding model used for both documents and queries (same settings
# as in the vector-evaluation section).
model_name = 'multi-qa-MiniLM-L6-cos-v1'
# Sets `dims` of the dense_vector fields below — presumably 384 matches this
# model's output size; verify.
embedding_size = 384 
model = SentenceTransformer(model_name)

In [None]:
# Text/keyword fields plus three indexed cosine-similarity dense vectors of
# size `embedding_size`.
dense_vector_field = {
    "type": "dense_vector",
    "dims": embedding_size,
    "index": True,
    "similarity": "cosine",
}
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            "question": {"type": "text"},
            "answer": {"type": "text"},
            "topic": {"type": "keyword"},
            "id": {"type": "keyword"},
            "answer_vector": dict(dense_vector_field),
            "question_vector": dict(dense_vector_field),
            "question_answer_vector": dict(dense_vector_field),
        }
    },
}

# Drop the index if it already exists, then create it with these mappings.
es.indices.delete(index=index_name, ignore_unavailable=True)
es.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'python-qa-index'})

Embed the answer, the question, and the combined question + answer text of every document

In [None]:
# Attach three embeddings to every record: answer only, question only, and
# question + ' ' + answer. The dicts in data_dict are updated in place, and
# vector_data_dict collects those same dict objects.
vector_data_dict = []
for doc in tqdm(data_dict):
    doc.update(
        answer_vector=model.encode(doc['answer']),
        question_vector=model.encode(doc['question']),
        question_answer_vector=model.encode(doc['question'] + ' ' + doc['answer']),
    )
    vector_data_dict.append(doc)

100%|██████████| 639/639 [01:47<00:00,  5.96it/s]


Creating index on data

In [40]:
# Index the embedded records one by one; no explicit document id is passed here.
print('\n\n[[DEBUG] Adding data to index...')
for row in tqdm(vector_data_dict):
    es.index(index=index_name, document=row)



[[DEBUG] Adding data to index...


100%|██████████| 639/639 [00:45<00:00, 14.08it/s]


In [46]:
print(es.info())

{'name': '9983a944bbfc', 'cluster_name': 'docker-cluster', 'cluster_uuid': '1GfPU_vTQuyANA_6FK8Mqw', 'version': {'number': '8.7.0', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '09520b59b6bc1057340b55750186466ea715e30e', 'build_date': '2023-03-27T16:31:09.816451435Z', 'build_snapshot': False, 'lucene_version': '9.5.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'}


#### Create search function

In [67]:
def hybrid_elastic_search(field, query, query_vector):
    """Send one request combining keyword and kNN search; return up to 5 documents.

    Args:
        field: dense_vector field used for the kNN clause.
        query: query text for the `best_fields` multi_match over question,
            answer and topic.
        query_vector: embedding of the query.

    The keyword clause carries boost 0.6 and the kNN clause boost 0.4.

    Returns:
        The `_source` dict of every hit, limited to the answer, question,
        topic and id fields.
    """
    request_body = {
        "knn": {
            "field": field,
            "query_vector": query_vector,
            "k": 5,
            "num_candidates": 1000,
            "boost": 0.4,
        },
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question", "answer", "topic"],
                        "type": "best_fields",
                        "boost": 0.6,
                    },
                },
            }
        },
        "_source": ["answer", "question", "topic", "id"],
        "size": 5,
    }
    response = es.search(index=index_name, body=request_body)
    return [hit['_source'] for hit in response['hits']['hits']]

#### Question-Answer vector

In [68]:
def question_hybrid(q):
    """Run the hybrid search for a ground-truth question on `question_answer_vector`."""
    question_text = q['question']
    return hybrid_elastic_search(
        'question_answer_vector',
        question_text,
        model.encode(question_text),
    )

In [69]:
# NOTE: `evaluate` is defined more than once in this notebook with the same
# behavior; this cell re-declares it.
def evaluate(ground_truth, search_function):
    """Score a search function against the ground truth.

    Args:
        ground_truth: iterable of dicts carrying a 'document_id'; each dict is
            passed whole to `search_function`.
        search_function: callable taking one ground-truth dict and returning
            result dicts that each carry an 'id'.

    Returns:
        dict with the 'hit_rate' and 'mrr' computed over all queries.
    """
    relevance_total = []
    for q in tqdm(ground_truth):
        expected_id = q['document_id']
        relevance_total.append([d['id'] == expected_id for d in search_function(q)])

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [70]:
evaluate(ground_truth, question_hybrid)

100%|██████████| 1735/1735 [02:53<00:00,  9.97it/s]


{'hit_rate': 0.8005763688760807, 'mrr': 0.7226993275696453}

In [None]:
# NOTE(review): this repeats the question-vector kNN evaluation from the
# "Question vector" section, presumably for side-by-side comparison with the
# hybrid result above — confirm, or drop the duplicate run.
evaluate(ground_truth, question_vector_knn)

100%|██████████| 1735/1735 [02:26<00:00, 11.87it/s]


{'hit_rate': 0.8582132564841498, 'mrr': 0.8392699327569638}

Best Performing Retrieval:

Elastic Search with Question-Answer vector 
```
hit rate: 0.87, mrr: 0.85
```