# ENCODER

In [None]:
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import AutoModel, AutoTokenizer



In [None]:
# Tokenizer and encoder are both loaded from the same pretrained checkpoint name.
_CHECKPOINT = 'ai4bharat/indic-bert'
tokenizer, model = (
    AutoTokenizer.from_pretrained(_CHECKPOINT),
    AutoModel.from_pretrained(_CHECKPOINT),
)

In [None]:
import torch
import torch.nn.functional as F

# Mean pooling: average the token embeddings, ignoring masked-out positions
def mean_pooling(model_output, attention_mask):
    """Return the mask-weighted mean of ``model_output.last_hidden_state``.

    The attention mask is expanded to the hidden-state shape, so positions
    whose mask value is 0 contribute neither to the sum nor to the divisor.
    The divisor is clamped to 1e-9 so a fully masked row does not divide by
    zero.
    """
    hidden = model_output.last_hidden_state
    mask = attention_mask.unsqueeze(-1).expand(hidden.size()).float()
    summed = (hidden * mask).sum(1)
    counts = mask.sum(1).clamp(min=1e-9)
    return summed / counts
#Encode text
def encode(texts):
    """Embed ``texts`` with the module-level ``tokenizer`` and ``model``.

    Args:
        texts: A string or a list of strings, passed straight to the tokenizer.

    Returns:
        A list of L2-normalised, mean-pooled embeddings (each a list of
        floats).  Because the tokenizer is called with
        ``return_overflowing_tokens=True``, an input longer than 512 tokens
        is split into chunks (stride 128), and there is one embedding per
        chunk -- so the result can hold more rows than there were inputs.
    """
    # Tokenize sentences
    doc_stride = 128
    encoded_input = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors='pt',
        max_length=512,
        stride=doc_stride,
        return_overflowing_tokens=True,
    )
    # The chunk-to-sample mapping is not a model input.  Use a default so a
    # tokenizer that does not emit the key does not raise KeyError here.
    encoded_input.pop("overflow_to_sample_mapping", None)

    # Compute token embeddings
    with torch.no_grad():
        model_output = model(**encoded_input, return_dict=True)

    # Perform pooling
    embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

    # Normalize embeddings
    embeddings = F.normalize(embeddings, p=2, dim=1)

    return embeddings.tolist()

# MYSQL

In [24]:
import os

import pymysql

# Connection settings can be overridden through ODQA_MYSQL_* environment
# variables so credentials do not have to live in the source.
# TODO(review): the literal fallbacks keep existing setups working; remove the
# password fallback once the environment variable is configured everywhere.
conn = pymysql.connect(
    host=os.environ.get("ODQA_MYSQL_HOST", 'localhost'),
    user=os.environ.get("ODQA_MYSQL_USER", 'aswin'),
    port=int(os.environ.get("ODQA_MYSQL_PORT", "3306")),
    password=os.environ.get("ODQA_MYSQL_PASSWORD", 'Mysql@123'),
    database=os.environ.get("ODQA_MYSQL_DATABASE", 'ODQA'),
    local_infile=True,
)
# Shared cursor used by the helper functions defined in this section.
cursor = conn.cursor()
print("odqa")
# Name of the MySQL table holding the question/context/answer rows.
TABLE_NAME = 'QA_DATASET'

odqa


In [25]:
def create_context_table():
    """Drop the ``TABLE_NAME`` table if it exists and create it again, empty.

    The table has an auto-increment ``id`` plus ``question``, ``context``,
    ``answer`` and ``answer_start`` columns.  Text columns use the same
    ``utf8mb4_bin`` collation as the table default, so column and table
    character sets agree.

    A driver error while creating the table is printed rather than raised;
    an error from the initial DROP still propagates to the caller.
    """
    # Deleting previously stored table for a clean run
    cursor.execute(f"DROP TABLE IF EXISTS {TABLE_NAME};")

    sql = f"""
            CREATE TABLE if not exists {TABLE_NAME} (
                id int(10) NOT NULL AUTO_INCREMENT,
                question TEXT COLLATE utf8mb4_bin NOT NULL,
                context MEDIUMTEXT COLLATE utf8mb4_bin NOT NULL,
                answer  TEXT COLLATE utf8mb4_bin NOT NULL,
                answer_start int(5) NOT NULL,
                PRIMARY KEY (id)
            ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_bin
            AUTO_INCREMENT=1 ;"""
    try:
        cursor.execute(sql)
    except pymysql.MySQLError as e:
        print("can't create a MySQL table: ", e)
    else:
        print(f"{TABLE_NAME} table created successfully!")



In [26]:
def execute_query(query, params=None):
    """Run ``query`` on the shared cursor and return every fetched row.

    Args:
        query: SQL text, optionally containing ``%s`` placeholders.
        params: Optional values for the placeholders.  When given they are
            bound by the driver instead of being formatted into the SQL
            string by the caller.  ``None`` (the default) executes ``query``
            as-is.

    Returns:
        The result of ``cursor.fetchall()``, or ``None`` when the driver
        reports an error (the error is printed, not raised).
    """
    try:
        cursor.execute(query, params)
        return cursor.fetchall()
    except pymysql.MySQLError as e:
        print("can't execute MySQL query: ", e)
        return None



In [31]:
# Row id interpolated into the ad-hoc lookup query built in the next cell.
# NOTE(review): this module-level name shadows the builtin ``id``.
id = 1

In [34]:
# Look up the context of the single QA_DATASET row selected by ``id``.
q = f"select context from QA_DATASET where id = {id}"
# NOTE(review): this assignment immediately overwrites the query above, so
# only the range query against the ``context`` table is actually executed.
q = f"select * from context where id between 0 and 10 ;"
res = execute_query(q)

In [None]:
res

In [None]:
def insert_data(dataset):
    """Insert question/answer records into ``QA_DATASET`` and commit.

    Args:
        dataset: Iterable of mappings, each providing the keys
            ``"question"``, ``"context"``, ``"answer"`` and
            ``"answer_start"``.

    Raises:
        KeyError: If a record lacks one of the keys; nothing has been sent
            to the database at that point.
        Exception: Any error raised while inserting is re-raised after the
            open transaction has been rolled back.
    """
    sql = "INSERT INTO QA_DATASET (question, context, answer, answer_start) VALUES (%s, %s, %s, %s)"
    rows = [
        (data["question"], data["context"], data["answer"], data["answer_start"])
        for data in dataset
    ]
    try:
        cursor.executemany(sql, rows)
    except Exception:
        # Do not leave a half-inserted batch pending on the connection.
        conn.rollback()
        raise
    conn.commit()

def extract_context(id):
    """Fetch the ``context`` column of the ``QA_DATASET`` row with this id.

    Args:
        id: Row id.  It is coerced with ``int()`` before being placed into
            the SQL text, so arbitrary strings cannot alter the query.

    Returns:
        The first row returned by ``execute_query`` -- the row object that
        holds the single selected ``context`` column, not the bare string.

    Raises:
        ValueError, TypeError: If ``id`` cannot be converted to ``int``.
        IndexError: If the query failed or matched no row.
    """
    row_id = int(id)
    q = f"select context from QA_DATASET where id = {row_id}"
    res = execute_query(q)
    if not res:
        raise IndexError(f"no QA_DATASET row found for id {row_id}")
    return res[0]

# MILVUS

In [1]:
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
connections.connect()
import odqa_mysql as odqa_mysql
import odqa_encoder as odqa_encoder

odqa


In [2]:
from tqdm.autonotebook import tqdm

In [3]:
import json

# Name of the Milvus collection used by the functions in this section.
TABLE_NAME = 'question_answering'
# Module-level handle to that collection; None until it is bound further down.
collection = None



In [8]:
#Deleting previously stored collection for clean run
def create_mqa(dim=1024):
    """Drop and re-create the ``TABLE_NAME`` Milvus collection with an index.

    The collection has three fields: ``ind`` (INT64 primary key), ``id``
    (INT64) and ``embedding`` (float vector).  An IVF_FLAT index with the
    inner-product metric is created on ``embedding``.

    Args:
        dim: Dimensionality of the ``embedding`` vectors.  Defaults to 1024,
            the value previously hard-coded here.

    Returns:
        The newly created ``Collection``.  The module-level ``collection``
        handle is rebound to it as well, so other functions in this module
        use the fresh collection rather than a stale or ``None`` handle.
    """
    global collection

    if utility.has_collection(TABLE_NAME):
        Collection(name=TABLE_NAME).drop()

    fields = [
        FieldSchema(name="ind", dtype=DataType.INT64, description="int64", is_primary=True),
        FieldSchema(name="id", dtype=DataType.INT64, description="int64", is_primary=False),
        FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, description="float vector", dim=dim, is_primary=False),
    ]
    schema = CollectionSchema(fields=fields, description="collection description")
    collection = Collection(name=TABLE_NAME, schema=schema)

    default_index = {"index_type": "IVF_FLAT", "metric_type": 'IP', "params": {"nlist": 200}}
    collection.create_index(field_name="embedding", index_params=default_index)
    return collection

# Bind the module-level handle when the collection already exists; otherwise
# ``collection`` keeps its previous value.
if utility.has_collection(TABLE_NAME):
    collection = Collection(name=TABLE_NAME)

In [9]:
# Parameters handed to collection.search(): inner-product ("IP") metric, the
# same metric_type the index is created with in create_mqa, and nprobe = 10.
search_params = {"metric_type": "IP", "params": {"nprobe": 10}}




def push_context_to_milvus():
    """Encode the next batch of MySQL contexts and insert them into Milvus.

    Progress is kept in ``database_handler.json``:

    * ``milvus_rows`` -- highest MySQL id already pushed,
    * ``batch``       -- number of ids to process per call,
    * ``milvus_index`` -- next free value for the Milvus primary key ``ind``
      (falls back to ``milvus_rows`` when the key is absent).

    Rows with ``start < id <= start + batch`` are read.  The lower bound is
    exclusive so the row at the previous batch boundary is not pushed again
    on the next call.

    Returns:
        A two-line summary string with the MySQL row count and the Milvus
        entity count.

    Raises:
        RuntimeError: If the batch query failed; the progress file is left
            untouched in that case.
    """
    print("\n\n")
    db_fp = r"database_handler.json"
    with open(db_fp) as file:
        database_handler = json.load(file)

    start = int(database_handler['milvus_rows'])
    end = start + int(database_handler["batch"])
    # Resume the primary-key counter from where the previous run stopped.
    index = int(database_handler.get('milvus_index', start))

    # NOTE(review): rows are unpacked as (id, context), so the ``context``
    # table is assumed to have exactly those two columns -- confirm.
    query = f"select * from context where id between {start + 1} and {end} ;"
    res = odqa_mysql.execute_query(query)
    if res is None:
        raise RuntimeError("could not read contexts from MySQL; nothing was pushed")

    for row_id, context in tqdm(res):
        # NOTE(review): the encoder result is wrapped in a one-element list
        # and inserted as a single vector -- confirm against odqa_encoder.
        emb = [odqa_encoder.encode(context)]
        indexs = list(range(index, index + len(emb)))
        ids = [row_id] * len(emb)
        index += len(emb)
        collection.insert([indexs, ids, emb])

    database_handler['milvus_rows'] = end
    database_handler['milvus_index'] = index

    with open(db_fp, "w") as file:
        json.dump(database_handler, file)

    mysql_size = odqa_mysql.execute_query("select count(*) from QA_DATASET")[0][0]
    return f"mysql : {mysql_size}\nmilvus : {collection.num_entities}"

In [6]:
# Drop any existing collection and create a fresh, indexed one.
create_mqa()

In [10]:
# Push one batch of contexts; the returned summary string is shown as the cell output.
push_context_to_milvus()






100%|██████████| 10/10 [00:20<00:00,  2.04s/it]


'mysql : 368\nmilvus : 10'

In [59]:
val = [[-0.32235443592071533, -0.15411481261253357, -0.338736355304718, -0.11812705546617508, -0.1453520953655243, 0.09137594699859619, -0.2227887362241745, -0.060651686042547226, -0.3328101336956024, 0.0895385891199112, 0.27272829413414, 0.47644010186195374, -0.2735048830509186, 0.16963237524032593, -0.2065221220254898, -0.07273852825164795, 0.11340218037366867, 0.028661157935857773, -0.029131906107068062, 0.04160211980342865, 0.06410396099090576, -0.015747789293527603, 0.13380853831768036, 0.08997336775064468, -0.09963123500347137, 0.10795332491397858, 0.09433432668447495, 0.08334070444107056, -0.3104367256164551, -0.174935445189476, -0.000488787074573338, 0.06462708115577698, 0.1596061736345291, 0.043419819325208664, 0.4734404981136322, 0.571754515171051, -0.21498072147369385, -0.08099639415740967, -0.09227392077445984, 0.12245827168226242, 0.029332740232348442, -0.009343566372990608, 0.04166129231452942, -0.12336704134941101, -0.16149765253067017, -0.15697848796844482, -0.4947028160095215, 0.25432088971138, -0.14352379739284515, 0.23318178951740265, 0.013301362283527851, -0.5161556005477905, 0.14713893830776215, -0.06232636421918869, -0.1566370278596878, -0.08993060141801834, 0.1698639690876007, -0.04441910237073898, 0.6242244243621826, -0.28356707096099854, -0.034220583736896515, 0.2836962938308716, -0.16827942430973053, 0.23677025735378265, -0.3203761577606201, 0.0774579644203186, 0.21145904064178467, -0.09599912166595459, -0.20016638934612274, 0.031235672533512115, -0.5524901747703552, 0.1435542106628418, 0.2560126781463623, 0.15027247369289398, 0.15505163371562958, 0.07975465804338455, -0.03615735098719597, -0.11947239935398102, 0.0087112532928586, -0.050386056303977966, 0.03253942355513573, 0.002014781115576625, -0.08569740504026413, -0.01989923045039177, -0.31593942642211914, 0.04531010985374451, -0.3545777499675751, 0.13169972598552704, -0.22778084874153137, -0.34302398562431335, -0.21585260331630707, 0.06736195832490921, 0.10703694820404053, 
0.060225412249565125, -0.5337704420089722, 0.2277696430683136, -0.3169878423213959, 0.4251852333545685, -0.22468768060207367, -0.032569900155067444, -0.10536202043294907, 0.02893897145986557, -0.29901421070098877, 0.11286560446023941, -0.26057037711143494, -0.006071037612855434, 0.12167683988809586, 0.014700202271342278, -0.1605110913515091, -0.1136801540851593, 0.2257898449897766, 0.4061618149280548, -0.20501971244812012, -0.258598268032074, -0.10119038820266724, -0.2881211042404175, 0.06998644769191742, 0.3858989477157593, -0.13522252440452576, -0.059779565781354904, 0.05307925492525101, -0.21888367831707, 0.03828355297446251, -0.17809806764125824, -0.23276178538799286, 0.22979974746704102, -0.39177370071411133, -0.1599680632352829, 0.22854246199131012, -0.09191564470529556, -0.3289462625980377, -0.04491579905152321, 0.2546848654747009, -0.2205338031053543, -0.12239175289869308, -0.1670522540807724, 0.14010745286941528, 0.03271207585930824, -0.39113613963127136, 0.32044175267219543, 0.17058570683002472, -0.4269524812698364, -0.17408263683319092, 0.051222190260887146, 0.1609659492969513, -0.20106886327266693, -0.017109660431742668, -0.5881782174110413, -0.23694436252117157, 0.19408711791038513, 0.3692284822463989, 0.41474393010139465, -0.050386521965265274, -0.2565793991088867, -0.006927132606506348, -0.10671048611402512, 0.04594831541180611, 0.23760570585727692, -0.008567793294787407, -0.21412897109985352, 0.0006066945497877896, 0.18789425492286682, -0.0916498601436615, 0.33851543068885803, -0.2578064203262329, 0.06778804212808609, 0.2908920347690582, -0.05731724202632904, -0.41258808970451355, -0.1208016499876976, -0.04588640481233597, -0.24870780110359192, -0.11979990452528, 0.36423540115356445, -0.095649853348732, -0.2417655736207962, -0.1256885528564453, -0.26503458619117737, 0.08065865188837051, 0.1133996918797493, -0.011927610263228416, 0.07300273329019547, 0.17524953186511993, -0.12816956639289856, 0.26934105157852173, 0.043946053832769394, 
0.03386685624718666, -0.3185825049877167, -0.06401261687278748, 0.2606874108314514, 0.03282832354307175, -0.3187115490436554, -0.415275514125824, 0.09936948865652084, 0.19112062454223633, -0.12371739000082016, -0.13322080671787262, 0.020644068717956543, -0.23498770594596863, 0.10751079022884369, 0.15384870767593384, -0.082929328083992, -0.3464660048484802, 0.22702966630458832, 0.21614116430282593, 0.15906085073947906, -0.1459553986787796, 0.32091209292411804, -0.1859057992696762, 0.17273662984371185, 0.11299451440572739, 0.2932707369327545, -0.2645028829574585, 0.15675552189350128, -0.13557861745357513, 0.08457371592521667, 0.27275389432907104, 0.19819563627243042, 0.18172574043273926, -0.2414999008178711, -0.22732527554035187, 0.2449416071176529, 0.09191843867301941, -0.09461793303489685, 0.07776325196027756, -0.07607558369636536, 0.3715945780277252, 0.006083988584578037, 0.33570989966392517, -0.16209352016448975, 0.25132524967193604, -0.2154540717601776, 0.3072347044944763, 0.08285646140575409, -0.14441315829753876, -0.2889159321784973, 0.12402459233999252, 0.07934363931417465, -0.12581166625022888, -0.02790514938533306, -0.08536842465400696, -0.17620965838432312, 0.2687890827655792, 0.006654575001448393, -0.18067055940628052, -0.26540255546569824, 0.13408800959587097, -0.25518596172332764, 0.05850676819682121, 0.048658937215805054, -0.10851670801639557, 0.0337260439991951, 0.3430430591106415, 0.3974270224571228, 0.19699808955192566, -0.10070876032114029, -0.31844455003738403, -0.41177457571029663, 0.1675693392753601, 0.4494423270225525, 0.08198948204517365, -0.008144254796206951, 0.3540792167186737, 0.09247156232595444, -0.18021562695503235, 0.11195240914821625, -0.3542245924472809, 0.3404114544391632, 0.06277056783437729, -0.10108459740877151, -0.07433482259511948, 0.38768336176872253, -0.17170830070972443, -0.14242643117904663, -0.09991893917322159, 0.11198045313358307, 0.12368699908256531, -0.24588696658611298, 0.20377761125564575, 0.42603781819343567, 
0.15891623497009277, -0.018342595547437668, 0.18034151196479797, 0.02259465679526329, 0.2400689721107483, -0.01584763452410698, -0.13729286193847656, 0.3606187105178833, -0.3485390245914459, -0.13281278312206268, 0.33754199743270874, -0.36230918765068054, -0.1558678299188614, 0.29062509536743164, -0.1281113475561142, 0.11051997542381287, 0.05584849789738655, -0.08629444986581802, 0.17695188522338867, -0.055308304727077484, -0.21405385434627533, -0.10526812076568604, 0.19761551916599274, -0.1420060694217682, -0.11736565083265305, 0.33968159556388855, -0.3085418939590454, 0.19375216960906982, 0.4369267523288727, 0.3851618468761444, -0.2893314063549042, -0.010019159875810146, 0.34326881170272827, -0.41626203060150146, -0.2545163035392761, 0.023946434259414673, 0.3332706689834595, -0.1579488068819046, -0.45066043734550476, -0.08591263741254807, 0.11639949679374695, -0.3198464512825012, -0.043897759169340134, 0.08796108514070511, -0.0269604604691267, -0.0883440151810646, 0.2703898847103119, 0.14964546263217926, -0.42753496766090393, -0.15868140757083893, -0.3689490854740143, 0.013858850114047527, -0.08923260122537613, 0.45234864950180054, 0.051579706370830536, 0.07783153653144836, -0.19495455920696259, -0.06783583015203476, -0.27540409564971924, 0.2776864767074585, -0.2062920182943344, 0.02852771058678627, -0.10697177052497864, 0.021358363330364227, 0.022313082590699196, 0.16047346591949463, -0.25550377368927, -0.15099415183067322, -0.10583832859992981, -0.25895658135414124, 0.12606745958328247, 0.33186811208724976, 0.1510196328163147, -0.191482275724411, 0.15891651809215546, 0.06926427781581879, -0.2379395216703415, -0.18552488088607788, 0.07995918393135071, 0.060438793152570724, -0.19690006971359253, 0.3035007119178772, -0.07484272867441177, -0.13947036862373352, 0.08503194153308868, 0.09590385854244232, 0.1560872346162796, 0.189806267619133, 0.13225150108337402, -0.02540009841322899, 0.28527820110321045, 0.375622421503067, -0.10517147183418274, 0.3388400375843048, 
0.12232764810323715, -0.1600469946861267, 0.15722164511680603, -0.10400206595659256, 0.017789585515856743, 0.0008199057774618268, 0.2537330389022827, -0.10067691653966904, 0.15873664617538452, 0.24766471982002258, 0.3311566114425659, -0.23267200589179993, -0.03630908951163292, 0.09716028720140457, -0.20941464602947235, 0.25100046396255493, 0.1452886462211609, 0.017099061980843544, 0.2533751428127289, -0.3800129294395447, 0.0975002571940422, 0.03913199156522751, 0.24344828724861145, -0.5646995902061462, -0.04189790040254593, 1.1586886644363403, -0.25617605447769165, -0.007782048545777798, -0.06694837659597397, -0.44205179810523987, 0.1941697597503662, 0.07883992791175842, 0.301050066947937, 0.06740200519561768, -0.158853679895401, -0.48599398136138916, -0.1099635511636734, 0.34172606468200684, -0.19247967004776, 0.35681384801864624, 0.1951245367527008, 0.017653103917837143, -0.11452025920152664, 0.15391796827316284, -0.23987099528312683, -0.159428671002388, -0.11250938475131989, 0.11769941449165344, -0.019589092582464218, 0.1479639858007431, -0.17647632956504822, -0.092124342918396, 0.2199317365884781, 0.11439824104309082, 0.06362220644950867, -0.10664204508066177, 0.11502096056938171, -0.013105558231472969, -0.015050675719976425, -0.21116560697555542, 0.3217304050922394, 0.016487322747707367, 0.11887635290622711, 0.09286145865917206, 0.010744158178567886, 0.3396081328392029, -0.15873759984970093, -0.12834084033966064, -0.21136805415153503, -0.09672597050666809, 0.25197315216064453, 0.08593542873859406, -0.2874579429626465, 0.05470048263669014, -0.08827845752239227, -0.41169703006744385, -0.26290762424468994, -0.1877925544977188, -0.20669116079807281, -0.09392619878053665, 0.32837268710136414, 0.058087415993213654, 0.06795723736286163, -0.4262283444404602, -0.06300100684165955, 0.11568284034729004, -0.29100996255874634, -0.28871428966522217, -0.05026092380285263, -0.158347487449646, 0.3669024705886841, -0.07742597162723541, -0.4861496388912201, 0.056818537414073944, 
-0.047346845269203186, -0.05479009076952934, 0.603597104549408, 0.6183546185493469, 0.3111165761947632, 0.1096542701125145, 0.035165514796972275, -0.12814587354660034, -0.15026861429214478, -0.16125790774822235, -0.25822100043296814, 0.24308034777641296, 0.19937586784362793, 0.15843318402767181, 0.2726357579231262, 0.15619045495986938, -0.26513659954071045, 0.5379045605659485, 0.10561535507440567, -0.026165056973695755, -0.5527903437614441, -0.2397429198026657, 0.06249536573886871, 0.0006053712568245828, -0.14037686586380005, 0.06852992624044418, -0.04705292731523514, 0.06575135886669159, 0.07280155271291733, -0.02559276856482029, 0.3462737798690796, 0.1642923653125763, -0.045234449207782745, -0.15907394886016846, 0.055268824100494385, -0.11473226547241211, -0.19215644896030426, 0.1439947783946991, -0.09522424638271332, 0.2818288505077362, -0.29713404178619385, -0.12218398600816727, -0.4804271459579468, -0.2509973645210266, 0.11917711794376373, -0.14349347352981567, -0.09922085702419281, 0.12670938670635223, -0.24041035771369934, -0.07401324808597565, -0.02228129841387272, -0.05217767134308815, -0.09188726544380188, 0.06881454586982727, 0.27919018268585205, 0.23706500232219696, -0.05948856845498085, -0.24451406300067902, 0.3711407482624054, -6.588300311705098e-05, 0.1277533620595932, -0.052795618772506714, -0.044397905468940735, 0.01321118138730526, 0.2204769253730774, 0.33426013588905334, -0.1415136456489563, -0.009066058322787285, -0.6169691681861877, 0.23132623732089996, -0.0030628214590251446, 0.35647475719451904, 0.034064870327711105, 0.1062946617603302, 0.22080864012241364, 0.05851142853498459, -0.38685736060142517, 0.2524290978908539, -0.056838445365428925, 1.6484153270721436, 0.09419509023427963, -0.34114646911621094, 0.2996199429035187, -0.049768559634685516, 0.07094986736774445, -0.1610587239265442, 0.19429993629455566, 0.03707920014858246, 0.09474655985832214, 0.31484267115592957, 0.12670424580574036, 0.0697910413146019, 0.1592167466878891, 
0.1270555704832077, 0.40010127425193787, 0.08349665999412537, -0.1724693924188614, 0.2830675542354584, 0.004229601006954908, -0.36183324456214905, 0.1440754532814026, 0.05363713949918747, 0.09344227612018585, 0.28420546650886536, -0.1889534294605255, -0.010545507073402405, 0.021384261548519135, -0.054729048162698746, 0.08455728739500046, 0.4224550724029541, -0.29392802715301514, 0.0574335902929306, -0.30571210384368896, 0.05138574540615082, -0.12291917204856873, 0.4147696793079376, -0.44075408577919006, 0.26360884308815, -0.26287052035331726, -0.20384450256824493, 0.001038753311149776, -0.21011106669902802, -0.0649319589138031, -0.2560479938983917, -0.018281441181898117, -0.011217368766665459, 0.1944352388381958, -0.13973651826381683, 0.2256287932395935, 0.21902845799922943, 0.4210990071296692, 0.22337251901626587, 0.14182697236537933, -0.23501542210578918, 0.06990373134613037, -0.1020665168762207, -0.21465615928173065, -0.11023249477148056, 0.12562574446201324, -0.32006552815437317, 0.3851131796836853, 0.5438449382781982, -0.09516500681638718, -0.2564687728881836, -0.2401149570941925, 0.18717436492443085, -0.26218897104263306, -0.17636655271053314, 0.6574936509132385, -0.24249812960624695, 0.08274167031049728, -0.055729612708091736, -0.20456992089748383, -0.009902840480208397, -0.17821183800697327, -0.10578413307666779, 0.05533887445926666, 0.08116312325000763, -0.43721839785575867, 0.3634927570819855, 0.11609441787004471, -0.017900655046105385, -0.25103357434272766, -10.190662384033203, -0.0869196280837059, 0.06041968613862991, -0.22028738260269165, -0.10373219847679138, -0.2520028352737427, 0.4061976671218872, -0.25840461254119873, 0.2316199094057083, -0.18681949377059937, 0.37440434098243713, -0.06840169429779053, 0.0797077864408493, -0.27683183550834656, -0.05195033550262451, -0.4243795573711395, 0.03857417777180672, 0.31875741481781006, -0.17499180138111115, -0.12726730108261108, 0.022553514689207077, -0.12852950394153595, -0.17462116479873657, 
0.28504419326782227, 0.0196926798671484, 0.04956952482461929, -0.04022049531340599, -0.07183274626731873, 0.04439162835478783, -0.05337719991803169, 0.12968598306179047, 0.3247431814670563, 0.04235389456152916, -0.3440907895565033, 0.2145535796880722, 0.5700027942657471, 0.04259880259633064, -0.5415158271789551, -0.13342328369617462, 0.22251145541667938, 0.12457302212715149, 0.06607748568058014, -2.898782968521118, 0.0796944797039032, -0.11199775338172913, -0.1932663768529892, -0.14335528016090393, -0.3464726507663727, 0.21198207139968872, 0.09303416311740875, 0.08489514887332916, -0.002410206478089094, 0.06960011273622513, 0.12256116420030594, 0.15649431943893433, -0.08284012973308563, -0.11390355974435806, 0.08344937860965729, -0.21112293004989624, -0.0327921099960804, 0.42965462803840637, -0.0269633736461401, 0.07325229048728943, -0.07117760926485062, -0.21081720292568207, -0.3394417464733124, -0.05019677057862282, -0.14377766847610474, -0.06404658406972885, 0.10856836289167404, -0.2329503893852234, -0.009976823814213276, -0.4097945988178253, 0.18664643168449402, -0.17123478651046753, 0.00675891013815999, -0.02200661413371563, 0.005480742081999779, -0.15244726836681366, 0.15562354028224945, 0.18818506598472595, 0.10133695602416992, 0.19456440210342407, 0.30905500054359436, 0.0674196109175682, -0.16446012258529663, 0.05304621905088425, -0.3838998079299927, -0.01689225807785988, -0.3003225326538086, 0.0875445306301117, 0.13395023345947266, -0.1483515053987503, 0.021435685455799103, 0.2734033763408661, -0.1628648042678833, 0.08677637577056885, -0.2778036296367645, 0.1418071985244751, -0.042441487312316895, 0.45874035358428955, -0.1148260161280632, 0.2897411286830902, 0.012639479711651802, -0.29688677191734314, -0.13707201182842255, -0.47205787897109985, -0.19403494894504547, 0.3417533040046692, -0.12477108091115952, 0.2971839904785156, 0.09297007322311401, 0.06659331172704697, -0.14933273196220398, -0.1779976189136505, -0.1452379673719406, 0.03066452592611313, 
-0.25738853216171265, -0.028548546135425568, -0.20171326398849487, -0.396677702665329, -0.29924556612968445, 0.20015273988246918, -0.15254411101341248, 0.07489748299121857, -0.00016018252063076943, -0.2835170030593872, -0.03620348498225212, 0.1184823215007782, -0.27221885323524475, -0.21294987201690674, -0.18707884848117828, 0.2141011357307434, 0.06419214606285095, -0.1385880559682846, 0.046695537865161896, 0.12287876009941101, 0.04573548585176468, -0.13094089925289154, -0.04289059340953827, -0.11107193678617477, -0.01210386399179697, 0.1947573870420456, -0.31037285923957825, 0.27574196457862854, 0.0741373598575592, -0.18758553266525269, 0.24366150796413422, 0.08314979821443558, -0.022474555298686028, 0.020937614142894745, -0.15296076238155365, 0.28660285472869873, 0.11723250895738602, -0.39981189370155334, -0.16381579637527466, 0.04408017173409462, -0.5227293372154236, -0.04603147879242897, 0.0866566002368927, -0.22299101948738098, -0.14861685037612915, -0.17349295318126678, 0.06863177567720413, -0.2668343782424927, -0.12336914986371994, -0.1179567202925682, 0.3649806082248688, -0.2984587550163269, -0.28951913118362427, 0.3215509057044983, 0.2729945480823517, 0.05255197733640671, 0.10826665908098221, 0.22920046746730804, -0.06565851718187332, -0.44626155495643616, 0.2949139475822449, 0.07401397079229355, 0.15319859981536865, -0.1542404443025589, -0.26862236857414246, -0.07377006113529205, -0.20474494993686676, -0.312465101480484, 0.28770262002944946, 0.2641927897930145, -0.1951998621225357, 0.10939332097768784, -0.0145217664539814, -0.20798738300800323, -0.12224169820547104, -0.18903027474880219, -0.13436359167099, 0.24907906353473663, 0.2482711672782898, -0.14385747909545898, -0.20620723068714142, -0.19584867358207703, 0.02166503109037876, -0.004893223289400339, -0.08628492802381516, 0.3522568643093109, -0.04512610286474228, 0.24495407938957214, 0.06468342244625092, -0.04708687961101532, 0.19676275551319122, 0.39373135566711426, 0.06051156297326088, 
-0.15115080773830414, 0.5217702388763428, 0.08091078698635101, -0.3845408856868744, -0.056500956416130066, -0.18722033500671387, 0.14128507673740387, -0.5432021617889404, -0.04783369600772858, -0.26562750339508057, 0.32573986053466797, -0.07785440236330032, 0.05178811401128769, -0.04938144609332085, -0.1218980923295021, 0.07650583982467651, 0.23301979899406433, -0.06229274719953537, 0.18069332838058472, 0.31456780433654785, 0.02705739252269268, 0.09008312970399857, -0.05865590274333954, 0.2809014916419983, -0.2623142898082733, -0.026369402185082436, -0.1793598234653473, 0.011120377108454704, -0.11366450786590576, -0.009927981533110142, 0.05474455654621124, 0.04818350076675415, 0.13646967709064484, 0.25642329454421997, -0.0341450460255146, 0.14487163722515106, -0.20054489374160767, -0.07310570776462555, 0.19121527671813965, -0.14469045400619507, 0.12653107941150665, 0.06261742115020752, -0.1474534422159195, -0.02133093774318695, -0.36100438237190247, -0.1506786048412323, 0.025962550193071365, 0.273864209651947, -0.15331655740737915, -0.06239619478583336, 0.3305407762527466, 0.11003141850233078, -0.18594005703926086, -0.04942525178194046, 0.21983259916305542, 0.24847139418125153, 0.1657058298587799, 0.09479910880327225, -0.13686898350715637, 0.1640421748161316, 0.10944391787052155, -0.17377859354019165, -0.19537214934825897, 0.10450676083564758, -0.2051854431629181, -0.038712963461875916, -0.24785590171813965, -0.003523549996316433, 0.2693016231060028, 0.12486040592193604, -0.12239481508731842, 0.15440532565116882, 0.10967830568552017, 0.890360951423645, -0.046961646527051926, 0.07082017511129379, -0.2194899022579193, -0.17472529411315918, 0.10425935685634613, -0.16719427704811096, -0.0002233766863355413, 0.2340196967124939, 0.259307324886322, 0.30561915040016174, -0.27514880895614624, -0.1057141125202179, -0.13474032282829285, -0.09951220452785492, -0.26301446557044983, -0.09546706080436707, -0.28721103072166443, -0.04132963716983795, -0.19574688374996185, 
0.24073179066181183, -0.29522791504859924, -0.20778173208236694, -0.19671446084976196, 0.42220523953437805, 0.06912880390882492, -0.22265389561653137, -0.027279861271381378, 0.015022948384284973, -0.03813406825065613, -0.008571094833314419, 0.0230477973818779, 0.2586926221847534, 0.18644651770591736, -0.0655607283115387, 0.03347012773156166, 0.3575823903083801, -0.021993979811668396, 0.10766961425542831, -0.026051077991724014, -0.18273748457431793, -0.17524568736553192, 0.014442745596170425, -0.03743363544344902, 0.09170683473348618, 0.1350557506084442, -0.2189628928899765, 0.3889719247817993, -0.1568765640258789, 0.23986069858074188, 0.13027040660381317, 0.1280180662870407, -0.4278815686702728, 0.17082306742668152, 0.27815762162208557, 0.23238740861415863, -0.42840397357940674, 0.07363121211528778, 0.06223312392830849, -0.35150378942489624, -0.14736784994602203, 0.23525425791740417, -0.076148122549057, -0.020401697605848312, -0.13576640188694, 0.28511860966682434, -0.10956791788339615, -0.18416239321231842, 0.11404156684875488, -0.11214175820350647, 0.2590857744216919, -0.20566673576831818, 0.21962526440620422, 0.15270034968852997, -0.18922573328018188, -0.029977014288306236, 0.29008039832115173, -0.47275829315185547, 0.14029794931411743, -0.03648200258612633, -0.00830860435962677, 0.21501128375530243, 0.3015373945236206, -0.3453731834888458, -0.38697388768196106, -0.1936315894126892, 0.024018654599785805, -0.27707040309906006, 0.036604154855012894, 0.18546029925346375, -0.005322598852217197, -0.11017399281263351, 0.5249467492103577, -0.4886886179447174, -0.4184633195400238, -0.1357334405183792, 0.027974823489785194, 0.018195059150457382, -0.18541888892650604, 0.09766358137130737, 0.2124328911304474, 0.09028207510709763, -0.02203476056456566, 0.2424597442150116, 0.02595249004662037, -0.11051066964864731, 0.22457821667194366, 0.04711220785975456, 0.046404141932725906, 0.265415221452713]]

In [72]:
def find_similar(emb, limit=10):
    """Search the ``embedding`` field of the collection for nearest vectors.

    Args:
        emb: List of query vectors, passed as ``data`` to
            ``collection.search``.
        limit: Maximum number of hits per query vector.  Defaults to 10, the
            value previously hard-coded here.

    Returns:
        Whatever ``collection.search`` returns; each hit carries the ``id``
        and ``ind`` output fields.
    """
    collection.load()
    return collection.search(
        data=emb,
        anns_field="embedding",
        param=search_params,
        limit=limit,
        output_fields=['id', 'ind'],
        expr=None,
        consistency_level="Strong",
    )


In [73]:
len(val[0])

1024

In [74]:
res = find_similar(val)

In [75]:
res[0].ids

[0, 8, 5, 4, 9, 6, 7, 1, 3, 2]

In [76]:
res

<pymilvus.orm.search.SearchResult at 0x250fba96430>

In [77]:
res[0]

<pymilvus.orm.search.Hits at 0x250cd1db9a0>

In [78]:
# Print each element of the search result (one entry per query vector).
# NOTE(review): the loop variable ``re`` would shadow the stdlib ``re`` module
# if that module were imported in this namespace.
for re in res:
    print(re)

['(distance: 129.51051330566406, id: 0)', '(distance: 129.07215881347656, id: 8)', '(distance: 127.6363525390625, id: 5)', '(distance: 127.57453918457031, id: 4)', '(distance: 127.57299041748047, id: 9)', '(distance: 127.15343475341797, id: 6)', '(distance: 127.04383850097656, id: 7)', '(distance: 126.84539031982422, id: 1)', '(distance: 126.68315124511719, id: 3)', '(distance: 126.5791244506836, id: 2)']


In [79]:
def query(expr="id < 6", limit=10):
    """Run a scalar (non-vector) query against the collection.

    Args:
        expr: Boolean filter expression.  Defaults to ``"id < 6"``, the
            value previously hard-coded here.
        limit: Maximum number of rows to return.  Defaults to 10.

    Returns:
        Whatever ``collection.query`` returns, with the ``id`` field
        requested.  The output captured in this notebook shows the primary
        key ``ind`` returned alongside ``id``.
    """
    collection.load()
    # ``anns_field`` and ``param`` are vector-search arguments and are not
    # passed to query().
    return collection.query(
        expr=expr,
        output_fields=["id"],
        limit=limit,
        consistency_level="Strong",
    )

In [80]:
res = query()

In [81]:
res

[{'ind': 3, 'id': 4},
 {'ind': 0, 'id': 1},
 {'ind': 1, 'id': 2},
 {'ind': 2, 'id': 3},
 {'ind': 4, 'id': 5}]

# EXTRACTOR

In [None]:
from transformers import  pipeline

# model_name = "deepset/xlm-roberta-large-squad2"
# Checkpoint name used for both the model and the tokenizer of the pipeline.
model_name = "AswiN037/xlm-roberta-squad-tamil"

# Question-answering pipeline built from that checkpoint.
answer_extract = pipeline('question-answering', model=model_name, tokenizer=model_name)


# Evaluation

In [1]:
from datasets import load_metric

In [7]:
# Load the ROUGE metric used below to score predictions against references.
# NOTE(review): ``datasets.load_metric`` is believed to be deprecated in newer
# ``datasets`` releases in favour of the separate ``evaluate`` package --
# confirm against the pinned version.
rouge = load_metric("rouge")

In [8]:
# !pip install absl-py
# !pip install rouge_score


In [9]:
res = [['206', ' 206'],
 ['காசுமீரில்', ' காசுமீரில்'],
 ['சர் அலெக்ஸாண்டர் ஃபிளெமிங்', ' அலெக் ஸாண்டர் ஃப்பௌமிங்.'],
 ['தாலாட்டு', '\n(ஆயர்பாடி…)'],
 ['சூரியனும்', ' சூரியன்'],
 ['IBM', ' IBM'],
 ['அலெக்ஸாண்டர் கிரகாம் பெல்', ' பெல்'],
 ['1914ம்', ' (1914'],
 ['அக்டோபர் 12, 1993', ' அக்டோபர் 12, 1993[1]'],
 ['27', ' 27'],
 ['உருகுவே', ' பிரேசில்'],
 ['பிரான்ஸ்', ' பிரான்ஸ்'],
 ['30,368,609', '\n(km²)'],
 ['185 மீட்டர்', ' 185 மீட்டர்'],
 ['கி.பி.1510', '\nகி.பி.1510-ம்'],
 ['கி.மு. ஐந்தாம் நூற்றாண்டில்', ' (கி.மு 470/469'],
 ['60 நொடிகள்', ' 60'],
 ['பசுபிக்', ' பசுபிக்'],
 ['லிஸ்பன்', ' லிஸ்பன்'],
 ['உடல் திசு ஆய்வு', ' (Men Get Breast Cancer'],
 ['சசி', ' சசி'],
 ['1,376  கிலோ மீட்டர்', ' 1370 கிமீ'],
 ['1000', ' 1025நீர்  1000ஈத்தைல்'],
 ['மூங்கில்', ' மூங்கிலால்'],
 ['புளோரிடாவில்', ' புளோரிடாவில்'],
 ['208', ' 206'],
 ['இந்தியத் திரைப்பட இசைப் பாடகர்', ' பின்னணிப்பாடகர்'],
 ['5488', ' 5488'],
 ['சனவரி 28, 1892', ' 1995'],
 ['அலெக்ஸாண்டர்-எட்மண்ட பெக்கெரெலின்', ' ஆர்க்கிமிடீஸ்'],
 ['அகுவாபா', ' அகுவாபா'],
 ['பிரிட்டனில்', ' ஸ்பெயினில்,'],
 ['புதன்', ' (Mercury)'],
 ['நெமடோடா', '\nஎக்டிசாசோவாக்கள்'],
 ['26 மே 2001', ' 26 மே 2001'],
 ['8 லட்சத்துக்கு', ' 8 லட்சத்துக்கு மேற்பட்டவை'],
 ['அ. இர. ரகுமான்', ' நியமிக்கப்பட்டார்.[5]'],
 ['தொற்று', '\nதொற்று'],
 ['தமிழ்', ' தமிழ்'],
 ['Hibiscus rosa-sinensis', ' செவ்வரத்தை'],
 ['கரிகாலன்', ' கரிகாலன்'],
 ['சுவீடனில்', ' சுவீடனில்'],
 ['மேரிகியூரி மற்றும் பியரிகியூரி தம்பதியரால்',
  ' மேரிகியூரி மற்றும் பியரிகியூரி'],
 ['10,911', ' 4,694 மீட்டர்.'],
 ['பசிபிக் பெருங்கடலாகும்', ' அத்திலாந்திக் பெருங்கடல்'],
 ['புதன்', ' வாகும்.'],
 ['செவுள்கள், நுரையீரலால்', ' (நுரையீரல்'],
 ['1,568.7 square kilometres', ' 1,568.7 square kilometres'],
 ['தோக்கியோ', ' தோக்கியோ'],
 ['1000', ' 1000'],
 ['விஷம்', ' விஷம் கொடுத்துக்'],
 ['மெசொப்பொத்தேமியர்கள்', ' எகிப்தியர்கள்'],
 ['பிடி', ' பிடி'],
 ['ஈராக்', ' Mesopotamia.'],
 ['சனவரி 26ஆம்', ' குடியரசு நாள் அன்று'],
 ['ஆறாம் முகம்மது', '\nமுதலாம் சுலைமானின்'],
 ['செவ்வாய்', ' செவ்வாயின்'],
 ['1901', ' 1901'],
 ['29', ' 14'],
 ['வெளிப்பரப்பு அடர்த்தியான சாம்பல் நிறத்திலும், உள்நிறை மஞ்சள், வெள்ளை',
  ' சாம்பல்'],
 ['1623', ' 27 அக்டோபர் 1605),[3][4]'],
 ['பைக்கால்', ' பைக்கால்'],
 ['அவுஸ்திரேலியா', ' ஆஸ்திரேலிய'],
 ['ஒழுங்கற்ற', ' இரண்டாவது சிறிய'],
 ['மாட்ரிட்', ' மாட்ரிட்.'],
 ['என்றிகோ பெர்மியின்', ' என்றிகோ பெர்மியின்'],
 ['70 ஆண்டுகள்', ' 70 ஆண்டுகள்).'],
 ['பெங்களூரு', ' பெங்களூரில்'],
 ['ஆறு', ' ஆறு'],
 ['ஐந்தில் ஒரு பங்கு', ' ஐந்தில் ஒரு பங்கு'],
 ['1982', ' 1982ம்'],
 ['ஆல்ஃபிரட் நோபல்', ' ஆல்ஃபிரட் நோபெல்'],
 ['தென் அமெரிக்க', ' தென் அமெரிக்க'],
 ['சாக்ரடீசு', ' பிளேட்டோவும்'],
 ['பழங்குடி மொழி', ' பழங்குடி மொழி'],
 ['சர்தார் வல்லப்பாய் படேல்', ' சர்தார் வல்லப்பாய் படேல்'],
 ['70', ' 70 ஆண்டுகள்).'],
 ['7870', ' 7870வெள்ளீயம்'],
 ['ஆர்கெண்ட்டம்', ' ஆர்கெண்ட்டம்'],
 ['டிசம்பர் 25, 1642', ' (டிசம்பர் 25, 1642'],
 ['கெய்ரோ', ' கெய்ரோ'],
 ['எட்டு கோள்களையும், ஐந்து குறுங்கோள்களையும்', ' எட்டு'],
 ['கொலம்பசு', ' கொலம்பஸ்'],
 ['2008', ' 1957'],
 ['350,000', ' (International Code of Nomenclature for Cultivated Plants)'],
 ['இரண்டு மில்லியன் சதுர கிலோமீட்டர்', '\n1,972,550 சதுர கிலோமீட்டர்'],
 ['1935', ' 1935'],
 ['3', ' 3'],
 ['ஜான் ஷெப்பர்ட் பேரோன்', ' ஜான் ஷெப்பர்ட் பேரோன்'],
 ['பிரெஞ்சு', ' பிரெஞ்சு'],
 ['24', ' 24']]

In [14]:
# Split every [reference, prediction] pair of ``res`` into two parallel lists.
reference = [ref for ref, _pre in res]
predictions = [pre for _ref, pre in res]


In [17]:
len(predictions), len(reference)

(91, 91)

In [15]:
# Score the predicted answers against the reference answers with ROUGE.
# NOTE(review): most answers are Tamil text; the default ROUGE tokenizer is
# believed to keep only ASCII letters and digits, which would drop Tamil
# tokens entirely -- verify before trusting these scores.
rouge.compute(predictions=predictions, references=reference)

{'rouge1': AggregateScore(low=Score(precision=0.16112637362637366, recall=0.16483516483516483, fmeasure=0.1641025641025641), mid=Score(precision=0.24358974358974356, recall=0.25274725274725274, fmeasure=0.2468864468864469), high=Score(precision=0.3461996336996337, recall=0.3626373626373626, fmeasure=0.35239926739926736)),
 'rouge2': AggregateScore(low=Score(precision=0.005494505494505495, recall=0.01098901098901099, fmeasure=0.007326007326007326), mid=Score(precision=0.038461538461538464, recall=0.04395604395604396, fmeasure=0.04029304029304029), high=Score(precision=0.07706043956043944, recall=0.08791208791208792, fmeasure=0.08424908424908424)),
 'rougeL': AggregateScore(low=Score(precision=0.15929487179487184, recall=0.16483516483516483, fmeasure=0.16117216117216115), mid=Score(precision=0.24725274725274726, recall=0.25274725274725274, fmeasure=0.2490842490842491), high=Score(precision=0.32967032967032966, recall=0.34065934065934067, fmeasure=0.33342490842490835)),
 'rougeLsum': Aggr