# Построение вопросно-ответной системы с использованием RAG

В этом ноутбуке продемонстрирована базовая модель (baseline) поиска контекста на основе векторной БД Qdrant с кодированием текстов методом TF-IDF.

### импорты

In [32]:
from IPython.display import clear_output
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
import string
import nltk
from nltk.tokenize import word_tokenize

from qdrant_client import QdrantClient, models
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [4]:
# Seed both Python's and NumPy's global random generators with the same value.
SEED = 42
for seed_fn in (random.seed, np.random.seed):
    seed_fn(SEED)

### загрузка датасета

In [14]:
# Download the CSV file from Google Drive (uncomment the line below to run it)
#!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1dgnwG1AZhspMJaDR6jz828qZkSnZg25t' -O full_dataset.csv

In [20]:
# Load the dataset from the local CSV file and display it.
dataset_path = 'full_dataset.csv'
rag_dataset = pd.read_csv(dataset_path)
rag_dataset

Unnamed: 0,context,question,answer
0,Caption: Tasmanian berry grower Nic Hansen sho...,What is the Berry Export Summary 2028 and what...,The Berry Export Summary 2028 is a dedicated e...
1,RWSN Collaborations\nSouthern Africa Self-supp...,What are some of the benefits reported from ha...,Benefits reported from having access to Self-s...
2,All Android applications categories\nDescripti...,What are the unique features of the Coolands f...,The unique features of the Coolands for Twitte...
3,"How unequal is India? The question is simple, ...",What is the main difference between the Nation...,The main difference between the NSS and the IH...
4,Gunnar Nelson took his time on the feet agains...,How did Gunnar Nelson win the fight against Za...,Gunnar Nelson won the fight against Zak Cummin...
...,...,...,...
11995,"Fuzzy's Ultra Premium Vodka\nThe Myth, The Man...",What are some of the achievements of Fuzzy Zoe...,Fuzzy Zoeller is known for his golfing success...
11996,Swedish Grand Prix rider Malin Nilsson got mar...,Who did Malin Nilsson marry on 2 June 2018?,"Malin Nilsson got married to her partner, Germ..."
11997,The Cracchiolo Law Library of the James E. Rog...,What is the Fellowship in Law Librarianship of...,The Fellowship in Law Librarianship is a progr...
11998,2nd physical eMAG store opens in Mammut\nOnlin...,Where has the second physical eMAG store been ...,The second physical eMAG store has been opened...


In [22]:
# Show a single sample: the first row as a raw array of column values.
first_sample = rag_dataset.values[0]
first_sample

array(['Caption: Tasmanian berry grower Nic Hansen showing Macau chef Antimo Merone around his property as part of export engagement activities.\nTHE RISE and rise of the Australian strawberry, raspberry and blackberry industries has seen the sectors redouble their international trade focus, with the release of a dedicated export plan to grow their global presence over the next 10 years.\nDriven by significant grower input, the Berry Export Summary 2028 maps the sectors’ current position, where they want to be, high-opportunity markets and next steps.\nHort Innovation trade manager Jenny Van de Meeberg said the value and volume of raspberry and blackberry exports rose by 100 per cent between 2016 and 2017. She said the Australian strawberry industry experienced similar success with an almost 30 per cent rise in export volume and a 26 per cent rise in value to $32.6M over the same period.\n“Australian berry sectors are in a firm position at the moment,” she said. “Production, adoption o

Видны посторонние символы, такие как перевод строки \n

Имеется 3 колонки и 12000 строк

In [24]:
# Text preprocessing; work on a copy so the raw dataset stays untouched.
df = rag_dataset.copy()

# Drop rows that contain missing values.
df = df.dropna()

# Lower-case the text in every column.
for column in ('context', 'question', 'answer'):
    df[column] = df[column].str.lower()

# Replace newline characters in 'context' with spaces.
df['context'] = df['context'].str.replace('\n', ' ', regex=False)

# Keep only the first row for each duplicated 'question'.
df = df.drop_duplicates(subset=['question'], keep='first')

# Drop two rows written in another language. The labels refer to the index as
# loaded (no reset has happened yet), so the index is renumbered only afterwards.
df = df.drop(index=[7453, 10225])
df = df.reset_index(drop=True)

In [30]:
# Load the English stop-word list, downloading the NLTK corpus on first use so
# the cell also works in an environment where it has not been installed yet.
try:
    english_stop_words = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords', quiet=True)
    english_stop_words = nltk.corpus.stopwords.words('english')

# Tokens to drop from the contexts: stop words plus a few standalone symbols,
# and every single punctuation character.
stop_words = set(english_stop_words + ['-', '–', '&'])
punctuation = set(string.punctuation)
tokens_to_drop = stop_words | punctuation

# Split each context on whitespace and keep only the tokens that are not in
# the drop set.
df['context'] = df['context'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in tokens_to_drop)
)



## QDRANT Baseline

In [34]:
# Hold out 20% of the rows for evaluation and renumber both parts from zero.
train_part, test_part = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_part.reset_index(drop=True)
test_df = test_part.reset_index(drop=True)

In [36]:
# Set up the TF-IDF encoder (unigrams and bigrams).
tfidf_params = dict(
    stop_words='english',
    ngram_range=(1, 2),
    max_df=0.85,
    sublinear_tf=True,
)
tfidf_vectorizer_train = TfidfVectorizer(**tfidf_params)

In [42]:
%%time
# Fit the vectorizer on the training contexts only, then encode the test
# contexts and the full corpus with that fitted vectorizer.
train_contexts = train_df["context"]
tfidf_vectorizer_train.fit(train_contexts)
context_test, context_all = (
    tfidf_vectorizer_train.transform(frame['context']) for frame in (test_df, df)
)

CPU times: total: 11.2 s
Wall time: 11.3 s


In [46]:
# Connect to the vector database.
qdrant_url = "http://localhost:6333"
client = QdrantClient(url=qdrant_url, timeout=100000)

In [48]:
# Create the collection (sparse vectors only) unless it already exists.
# Guarding the call lets the cell be re-run against the same Qdrant instance
# without failing because the collection is already there.
if not client.collection_exists(collection_name='sparse-coll'):
    sparse_index = models.SparseIndexParams(on_disk=False)
    client.create_collection(
        collection_name='sparse-coll',
        vectors_config={},  # no dense vectors are configured
        sparse_vectors_config={
            "text": models.SparseVectorParams(index=sparse_index),
        },
    )

True

In [60]:
# Build the list of points to upload: one point per test context, holding its
# sparse TF-IDF vector and the context text as payload.
points = []
for point_id, source_text in enumerate(test_df["context"]):
    sparse_row = context_test[point_id]
    sparse_vector = models.SparseVector(
        indices=sparse_row.indices.tolist(),
        values=sparse_row.data.tolist(),
    )
    point = models.PointStruct(
        id=point_id,
        payload={'source_text': source_text},
        vector={'text': sparse_vector},
    )
    points.append(point)

In [62]:
# Upload the points to the collection. wait=True makes each upload request
# block until the server has applied it, so the count and search cells that
# follow do not run against a partially written collection.
client.upload_points(
    collection_name='sparse-coll',
    points=points,
    wait=True,
)

In [66]:
# Exact number of points currently stored in the collection.
client.count(collection_name="sparse-coll", exact=True)

CountResult(count=2397)

In [75]:
# Top-1 retrieval check: search with each test question and count how often
# the returned context text equals the context paired with that question.
correct = 0
for question, expected_context in zip(test_df['question'], test_df['context']):
    encoded_question = tfidf_vectorizer_train.transform([question])[0]
    search_result = client.query_points(
        collection_name='sparse-coll',
        query=models.SparseVector(
            indices=encoded_question.indices.tolist(),
            values=encoded_question.data.tolist(),
        ),
        using="text",
        limit=1,
    )
    retrieved_texts = [point.payload['source_text'] for point in search_result.points]
    if expected_context in retrieved_texts:
        correct += 1

In [70]:
# Share of test questions whose paired context was retrieved.
accuracy = correct / len(test_df)
print(accuracy)

0.8585732165206508


Точность поиска top-1 на тестовой выборке (encoder — TF-IDF): 85.9%