In [1]:
import pandas as pd

In [2]:
# Evaluation set. Later cells read the 'id', 'context', 'question' and
# 'answer_text' columns from this frame.
df = pd.read_csv('eval_data.csv')


In [3]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
# Open the Milvus connection. No arguments are passed, so whatever defaults
# pymilvus applies are used -- NOTE(review): confirm host/port for this setup.
connections.connect()
from tqdm.autonotebook import tqdm


In [4]:
# Name of the Milvus collection used throughout this notebook.
TABLE_NAME = 'eval_question_answering'
# Module-level handle to that collection; stays None until a later cell binds it.
collection = None

In [None]:
# Drop any previously stored collection so that the run starts clean.
def create_mqa():
    """Drop and recreate the Milvus collection ``TABLE_NAME`` and index it.

    Any existing collection with that name is dropped first. The new collection
    has an INT64 primary key ``id`` and a 1024-dimensional FLOAT_VECTOR field
    ``embedding``, on which an IVF_FLAT index (inner product, nlist=200) is
    created.

    The resulting handle is stored in the module-level ``collection`` variable,
    which the other cells of this notebook read.
    """
    # Without this declaration the assignment below would only bind a local
    # name, and the notebook-wide handle would keep its previous value.
    global collection

    if utility.has_collection(TABLE_NAME):
        Collection(name=TABLE_NAME).drop()

    id_field = FieldSchema(name="id", dtype=DataType.INT64, description="int64", is_primary=True)
    embedding_field = FieldSchema(
        name="embedding",
        dtype=DataType.FLOAT_VECTOR,
        description="float vector",
        dim=1024,
        is_primary=False,
    )
    schema = CollectionSchema(fields=[id_field, embedding_field], description="collection description")
    collection = Collection(name=TABLE_NAME, schema=schema)

    default_index = {"index_type": "IVF_FLAT", "metric_type": 'IP', "params": {"nlist": 200}}
    collection.create_index(field_name="embedding", index_params=default_index)

# Reuse the collection left behind by an earlier run, if there is one. This
# statement runs at module level, where assignment already targets the global
# name, so no `global` declaration is needed.
if utility.has_collection(TABLE_NAME):
    collection = Collection(name=TABLE_NAME)

In [None]:
# Drop and recreate the collection, then print the module-level handle as a
# quick sanity check.
create_mqa()
print(collection)


In [None]:
from sentence_transformers import SentenceTransformer

# Sentence-embedding model; encode() below uses it for both contexts and questions.
model = SentenceTransformer("AswiN037/sentence-t-roberta-large-wechsel-tamil")
print("Retriever model loaded")
def encode(text):
    """Embed ``text`` with the retriever model.

    The result of ``model.encode`` is converted with ``.tolist()`` and wrapped
    in an outer list, so the return value is a one-element list.
    """
    vector = model.encode(text).tolist()
    return [vector]

In [None]:
# new 
def push_context_to_milvus(batch=50):
    """Embed and insert the next ``batch`` contexts from ``df`` into Milvus.

    Rows are taken in order, starting at the row whose label equals the number
    of entities the collection currently reports, so repeated calls walk
    through ``df`` one batch at a time.

    Args:
        batch: Maximum number of rows to insert in this call. Defaults to 50,
            the value that was previously hard-coded. Pass ``len(df)`` to push
            every row that is still missing in a single call.

    Returns:
        ``collection.num_entities`` as reported after the inserts.
    """
    start = collection.num_entities
    stop = min(len(df), start + batch)
    for row in range(start, stop):
        emb = encode(df['context'][row])
        ids = [int(df['id'][row])]
        # Inserted row by row so that rows already pushed survive a failure
        # later in the batch.
        collection.insert([ids, emb])
    # NOTE(review): depending on the pymilvus version, num_entities may only
    # count flushed data -- confirm before relying on it to resume.
    return collection.num_entities

In [None]:
# Each execution inserts at most one batch of 50 contexts; re-run this cell to
# index further rows of df.
push_context_to_milvus()

In [None]:
def find_similar(emb):
    """Search the collection for the stored embeddings closest to ``emb``.

    Calls ``collection.load()`` and then runs an inner-product search
    (nprobe=10) on the ``embedding`` field with strong consistency, asking for
    up to 10 hits and the ``id`` output field. The search result is returned
    unchanged.
    """
    search_param = {"metric_type": "IP", "params": {"nprobe": 10}}
    collection.load()
    hits = collection.search(
        data=emb,
        anns_field="embedding",
        param=search_param,
        limit=10,
        expr=None,
        output_fields=["id"],
        consistency_level="Strong",
    )
    return hits

In [None]:
def rertieve_id_for_question():
    """Look up one context id for every question in ``df``.

    Returns:
        A list of ``(question_row, context_id)`` tuples: the row label of the
        question in ``df`` and the id of the first search hit for it.
    """
    questions = df['question']
    pairs = []
    for row in range(len(df)):
        hits = find_similar(encode(questions[row]))
        # Result set of the first (only) query, first id in it.
        pairs.append((row, hits[0].ids[0]))
    return pairs

In [None]:
# Run retrieval for every question; yields (question row, retrieved context id) pairs.
retriever_result = rertieve_id_for_question()

In [None]:
# Tabulate the (question, retrieved context) pairs and persist them as JSON.
df_retrieved = pd.DataFrame(retriever_result, columns=['question_id', 'context_id'])
df_retrieved.to_json("weschel_encoder_result.json")


In [None]:
from transformers import  pipeline
# Question-answering pipeline; the same checkpoint name is passed as both model
# and tokenizer.
model_name = "AswiN037/xlm-roberta-squad-tamil"
answer_extract = pipeline('question-answering', model=model_name, tokenizer=model_name)


In [None]:

# The retrieved context ids are the primary keys inserted from df['id'], which
# need not coincide with df's row labels. Resolve contexts through the id
# column (first occurrence wins) instead of indexing df by the retrieved id.
context_by_id = {}
for row_id, row_context in zip(df['id'], df['context']):
    context_by_id.setdefault(int(row_id), row_context)

# Each entry is (original answer, predicted answer).
answer_extract_result = []
for r_q_id, r_c_id in zip(df_retrieved['question_id'], df_retrieved['context_id']):
    question = df['question'][r_q_id]
    original_answer = df['answer_text'][r_q_id]
    context = context_by_id[int(r_c_id)]
    qc = {
        "context": context,
        "question": question,
    }
    predicted_answer = answer_extract(qc)['answer']
    answer_extract_result.append((original_answer, predicted_answer))

In [None]:
# Tabulate the (actual, predicted) answer pairs and persist them as JSON.
df_extracted_answer = pd.DataFrame(answer_extract_result, columns=['Actual', 'Predicted'])
df_extracted_answer.to_json('weschel_encoder_xlm_robert.json')