# Построение вопросно-ответной системы с использованием RAG

В этом ноутбуке продемонстрирована базовая модель на основе БД Qdrant, с кодированием SentenceTransformer

### импорты

In [27]:
from IPython.display import clear_output
import numpy as np
import pandas as pd
from tqdm import tqdm
import random

In [28]:
# фиксируем seed
random.seed(42)
np.random.seed(42)

### загрузка датасета

In [7]:
# # загрузка csv файла из google drive
# !wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1dgnwG1AZhspMJaDR6jz828qZkSnZg25t' -O full_dataset.csv

In [8]:
rag_dataset = pd.read_csv('full_dataset.csv')
rag_dataset

Unnamed: 0,context,question,answer
0,Caption: Tasmanian berry grower Nic Hansen sho...,What is the Berry Export Summary 2028 and what...,The Berry Export Summary 2028 is a dedicated e...
1,RWSN Collaborations\nSouthern Africa Self-supp...,What are some of the benefits reported from ha...,Benefits reported from having access to Self-s...
2,All Android applications categories\nDescripti...,What are the unique features of the Coolands f...,The unique features of the Coolands for Twitte...
3,"How unequal is India? The question is simple, ...",What is the main difference between the Nation...,The main difference between the NSS and the IH...
4,Gunnar Nelson took his time on the feet agains...,How did Gunnar Nelson win the fight against Za...,Gunnar Nelson won the fight against Zak Cummin...
...,...,...,...
11995,"Fuzzy's Ultra Premium Vodka\nThe Myth, The Man...",What are some of the achievements of Fuzzy Zoe...,Fuzzy Zoeller is known for his golfing success...
11996,Swedish Grand Prix rider Malin Nilsson got mar...,Who did Malin Nilsson marry on 2 June 2018?,"Malin Nilsson got married to her partner, Germ..."
11997,The Cracchiolo Law Library of the James E. Rog...,What is the Fellowship in Law Librarianship of...,The Fellowship in Law Librarianship is a progr...
11998,2nd physical eMAG store opens in Mammut\nOnlin...,Where has the second physical eMAG store been ...,The second physical eMAG store has been opened...


In [9]:
# вывод одного сэмпла
rag_dataset.values[0]

array(['Caption: Tasmanian berry grower Nic Hansen showing Macau chef Antimo Merone around his property as part of export engagement activities.\nTHE RISE and rise of the Australian strawberry, raspberry and blackberry industries has seen the sectors redouble their international trade focus, with the release of a dedicated export plan to grow their global presence over the next 10 years.\nDriven by significant grower input, the Berry Export Summary 2028 maps the sectors’ current position, where they want to be, high-opportunity markets and next steps.\nHort Innovation trade manager Jenny Van de Meeberg said the value and volume of raspberry and blackberry exports rose by 100 per cent between 2016 and 2017. She said the Australian strawberry industry experienced similar success with an almost 30 per cent rise in export volume and a 26 per cent rise in value to $32.6M over the same period.\n“Australian berry sectors are in a firm position at the moment,” she said. “Production, adoption o

Видны посторонние символы, такие как перевод строки \n

Имеется 3 колонки и 12000 строк

In [None]:
# Предобработка текстов
df = rag_dataset.copy()

# удаление строк с пустыми значениями (None)
df = df.dropna()

# приведем к нижнему регистру тексты во всех колонках
df['context'] = df['context'].apply(lambda x: x.lower())
df['question'] = df['question'].apply(lambda x: x.lower())
df['answer'] = df['answer'].apply(lambda x: x.lower())

# удаление/замена на пробел знака перевода строки в колонке 'context'
df['context'] = df['context'].apply(lambda x: x.replace('\n', ' '))
# удаление найденных дубликатов по 'question'
df.duplicated(subset=['question'], keep=False)
df.drop_duplicates(subset = ['question'], keep = 'first', inplace = True)

# удаление двух строк на другом языке
df.drop(index=[7453, 10225], inplace=True)
df.reset_index(drop=True, inplace=True) # обновление индексов



## QDRANT Baseline

In [None]:
# %pip install -U sentence-transformers
# %pip install -U qdrant-client
# clear_output()

In [None]:
from qdrant_client import models, QdrantClient
from qdrant_client.models import VectorParams, Distance
from sentence_transformers import SentenceTransformer,CrossEncoder


In [15]:
# инициализация энкодера
encoder = SentenceTransformer("all-MiniLM-L6-v2")
clear_output()
# https://huggingface.co/sentence-transformers/all-MiniLM-L6-v2
# https://www.sbert.net/docs/sentence_transformer/pretrained_models.html

In [13]:
qdrant = QdrantClient(url = "http://localhost:6333")

In [16]:
# создание коллекции
qdrant.recreate_collection(
    collection_name="rag",
    vectors_config=models.VectorParams(
        size=encoder.get_sentence_embedding_dimension(),
        distance=models.Distance.COSINE,
    ),
)

  qdrant.recreate_collection(


True

In [18]:
docs = df
docs['id'] = docs.index
docs.shape, docs.columns

((11981, 4), Index(['context', 'question', 'answer', 'id'], dtype='object'))

In [21]:
# https://qdrant.tech/documentation/concepts/points/
# загрузка данных в БД
qdrant.upload_points(
    collection_name="rag",
    points=[
        models.PointStruct(
            id=row[1]["id"],
            vector=encoder.encode(row[1]["context"]),
            payload={ "context":row[1]["context"]}
        )
        for row in docs.iterrows()
    ],
)


In [22]:
qdrant.count(
    collection_name="rag",
    exact=True,
)

CountResult(count=11981)

In [25]:
query_string = df.question[3000]

hits = qdrant.search(
    collection_name="rag",
    query_vector=encoder.encode(query_string).tolist(),
    limit=1,
)

print(query_string)
for hit in hits:
    print(hit.payload, "score:")
    print(hit.id)
    print(hit.score)

what was the purpose of the lordstown plant according to the context?
{'context': "by tracey o'neill special to the standard north kingstown– monday night’s regular meeting of the north kingstown town council began in joint work session with the planning commission. they tackled economic development issues, focusing on the town’s vision for the future and the revitalization of the post road corridor. at the heart of the discussion were the issues of multi-level, mixed-use development and the possible integration of big-box stores into development plans. planning director jonathan j. reiner presented an overview of the benefits to the town’s moving forward with a plan that includes assimilation of both. he noted the evolution of the times and the present harsh economic climate that north kingstown and surrounding communities are experiencing. “if we allow box stores, route 2 is the most likely area that will receive them.” mixed-use development allows for a combination of residential, b

In [30]:
# в цикле проходим по датасету и для каждого вопроса находим id ответа из БД складваем id в список top_ind_search

top_ind_search = []
for index, row in tqdm(df.iterrows()):
    query_string = row['question']
    hits = qdrant.search(
    collection_name="rag",
    query_vector=encoder.encode(query_string).tolist(),
    limit=1,
    )
    top_ind_search.append(hits[0].id)



11981it [05:25, 36.85it/s]


In [32]:
# делаем колонку top_ind_search и сравниваем оригинальный id и найденный в БД, в колонку eval ставим 1 при совпадении идентификаторов
df['top_ind_search'] = top_ind_search
df['eval'] = (df['top_ind_search'] == df.id).astype(int)

In [None]:
df

Unnamed: 0,context,question,answer,id,top_ind_search,eval
0,caption: tasmanian berry grower nic hansen sho...,what is the berry export summary 2028 and what...,the berry export summary 2028 is a dedicated e...,0,0,1
1,rwsn collaborations southern africa self-suppl...,what are some of the benefits reported from ha...,benefits reported from having access to self-s...,1,1,1
2,all android applications categories descriptio...,what are the unique features of the coolands f...,the unique features of the coolands for twitte...,2,2,1
3,"how unequal is india? the question is simple, ...",what is the main difference between the nation...,the main difference between the nss and the ih...,3,3,1
4,gunnar nelson took his time on the feet agains...,how did gunnar nelson win the fight against za...,gunnar nelson won the fight against zak cummin...,4,4,1
...,...,...,...,...,...,...
11976,"fuzzy's ultra premium vodka the myth, the man,...",what are some of the achievements of fuzzy zoe...,fuzzy zoeller is known for his golfing success...,11976,11976,1
11977,swedish grand prix rider malin nilsson got mar...,who did malin nilsson marry on 2 june 2018?,"malin nilsson got married to her partner, germ...",11977,11977,1
11978,the cracchiolo law library of the james e. rog...,what is the fellowship in law librarianship of...,the fellowship in law librarianship is a progr...,11978,11978,1
11979,2nd physical emag store opens in mammut online...,where has the second physical emag store been ...,the second physical emag store has been opened...,11979,11979,1


In [33]:
# количество правильных/неправильных идентификаторов
df['eval'].value_counts(normalize = True)

eval
1    0.754445
0    0.245555
Name: proportion, dtype: float64

In [34]:
# расчет через precision
from sklearn.metrics import precision_score

precision = precision_score(df.id, df.top_ind_search, average='micro')
print(precision)

0.7544445371838745


Качество (encoder - SentenceTransformer("all-MiniLM-L6-v2")) 75%