### импорты

In [1]:
from IPython.display import clear_output
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score

import os
import nltk
from nltk.tokenize import sent_tokenize
#nltk.download('wordnet')
#nltk.download('punkt')
#nltk.download('stopwords')
#nltk.download('averaged_perceptron_tagger')

from gensim.models import FastText

from qdrant_client import models, QdrantClient
from qdrant_client.models import VectorParams, Distance
#import warnings
#warnings.filterwarnings("ignore")

### загрузка датасета

In [2]:
# Download the CSV dataset from Google Drive and save it as full_dataset.csv
# NOTE(review): --no-check-certificate turns off TLS certificate verification — confirm it is really needed
!wget --no-check-certificate 'https://docs.google.com/uc?export=download&id=1dgnwG1AZhspMJaDR6jz828qZkSnZg25t' -O full_dataset.csv

--2024-11-03 07:07:15--  https://docs.google.com/uc?export=download&id=1dgnwG1AZhspMJaDR6jz828qZkSnZg25t
Resolving docs.google.com (docs.google.com)... 108.177.11.139, 108.177.11.138, 108.177.11.101, ...
Connecting to docs.google.com (docs.google.com)|108.177.11.139|:443... connected.
HTTP request sent, awaiting response... 303 See Other
Location: https://drive.usercontent.google.com/download?id=1dgnwG1AZhspMJaDR6jz828qZkSnZg25t&export=download [following]
--2024-11-03 07:07:15--  https://drive.usercontent.google.com/download?id=1dgnwG1AZhspMJaDR6jz828qZkSnZg25t&export=download
Resolving drive.usercontent.google.com (drive.usercontent.google.com)... 74.125.134.132, 2607:f8b0:400c:c00::84
Connecting to drive.usercontent.google.com (drive.usercontent.google.com)|74.125.134.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 45451452 (43M) [application/octet-stream]
Saving to: ‘full_dataset.csv’


2024-11-03 07:07:20 (88.4 MB/s) - ‘full_dataset.csv’ saved [454514

In [5]:
# Load the downloaded CSV into a DataFrame and preview the first three rows
df = pd.read_csv('full_dataset.csv')
df.head(3)

Unnamed: 0,context,question,answer
0,Caption: Tasmanian berry grower Nic Hansen sho...,What is the Berry Export Summary 2028 and what...,The Berry Export Summary 2028 is a dedicated e...
1,RWSN Collaborations\nSouthern Africa Self-supp...,What are some of the benefits reported from ha...,Benefits reported from having access to Self-s...
2,All Android applications categories\nDescripti...,What are the unique features of the Coolands f...,The unique features of the Coolands for Twitte...


In [7]:
# Show a single sample: the first row as a raw array
df.values[0]

array(['Caption: Tasmanian berry grower Nic Hansen showing Macau chef Antimo Merone around his property as part of export engagement activities.\nTHE RISE and rise of the Australian strawberry, raspberry and blackberry industries has seen the sectors redouble their international trade focus, with the release of a dedicated export plan to grow their global presence over the next 10 years.\nDriven by significant grower input, the Berry Export Summary 2028 maps the sectors’ current position, where they want to be, high-opportunity markets and next steps.\nHort Innovation trade manager Jenny Van de Meeberg said the value and volume of raspberry and blackberry exports rose by 100 per cent between 2016 and 2017. She said the Australian strawberry industry experienced similar success with an almost 30 per cent rise in export volume and a 26 per cent rise in value to $32.6M over the same period.\n“Australian berry sectors are in a firm position at the moment,” she said. “Production, adoption o

In [9]:
# Column dtypes and non-null counts (used to spot missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12000 entries, 0 to 11999
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   context   12000 non-null  object
 1   question  11997 non-null  object
 2   answer    11997 non-null  object
dtypes: object(3)
memory usage: 281.4+ KB


In [11]:
# Drop rows with missing values, then exact duplicate rows, then rows that
# repeat an already-seen question; finally renumber the index from 0.
df = (
    df.dropna()
      .drop_duplicates()
      .drop_duplicates(subset='question')
      .reset_index(drop=True)
)

In [13]:
# Replace every newline character in the 'context' column with a space
df['context'] = df['context'].str.replace('\n', ' ', regex=False)

In [15]:
# Fixed stop-word set: NLTK's English stop words plus a few punctuation tokens.
# The list literal repeats '-'; building a set collapses the duplicate.
STOPWORDS = set(nltk.corpus.stopwords.words('english') + ['-', '-', '–','&'])
def corpus(text):
    """Flatten a Series of strings into one list of lower-cased tokens.

    Every element of ``text`` is split on whitespace through ``.str.split()``
    and the per-row token lists are concatenated in row order.
    """
    token_lists = text.str.split().values.tolist()
    flat_tokens = []
    for row_tokens in token_lists:
        flat_tokens.extend(token.lower() for token in row_tokens)
    return flat_tokens

In [17]:
def words_only(text):
    """Return the runs of ASCII letters and digits in ``text``, joined by single spaces."""
    alnum_runs = re.findall("[A-Za-z0-9]+", text)
    return " ".join(alnum_runs)

def remove_word(text):
    """Drop every whitespace-separated token whose length is 2 or less.

    Tokens of three or more characters are kept and re-joined with single spaces.
    """
    long_enough = filter(lambda token: len(token) > 2, text.split())
    return " ".join(long_enough)

def remove_stopwords(text, mystopwords = STOPWORDS):
    """Remove stop words from a whitespace-tokenized string.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace.
    mystopwords : container of str
        Tokens to drop (defaults to the module-level ``STOPWORDS``).

    Returns
    -------
    str
        Remaining tokens joined by single spaces. For input that has no
        ``split`` method (e.g. ``None`` or a float NaN) an empty string is
        returned, preserving the original best-effort behavior.
    """
    try:
        tokens = text.split()
    except AttributeError:
        # Only non-string input is tolerated; any other error is a real bug
        # and is no longer hidden by a bare ``except``.
        return ""
    return " ".join(token for token in tokens if token not in mystopwords)
        
def lemmatize(text):
    """Lemmatize every whitespace-separated word of ``text`` with NLTK's WordNet lemmatizer.

    Parameters
    ----------
    text : str
        Input text; it is split on whitespace.

    Returns
    -------
    str
        Lemmatized words joined by single spaces. For input that has no
        ``split`` method a single space is returned, as before.

    Notes
    -----
    Errors raised by the lemmatizer itself are no longer swallowed. With the
    previous bare ``except`` any such failure silently produced " " for every
    text.
    """
    try:
        words = text.split()
    except AttributeError:
        # Keep the original fallback for non-string input only
        return " "
    lem = nltk.WordNetLemmatizer()
    return " ".join(lem.lemmatize(w) for w in words)

def prep(text):
    """Run the whole cleaning pipeline on one string.

    Steps, in order: lower-case, keep alphanumeric runs (``words_only``),
    drop short tokens (``remove_word``), lemmatize, remove stop words.
    """
    cleaned = text.lower()
    for step in (words_only, remove_word, lemmatize, remove_stopwords):
        cleaned = step(cleaned)
    return cleaned

In [19]:
def clear_text(text):
    """Split ``text`` into sentences, preprocess each one and return a flat token list.

    Sentences come from ``sent_tokenize``; each is cleaned with ``prep`` and
    split on whitespace, and the tokens are concatenated in sentence order.
    """
    tokens = []
    for sentence in sent_tokenize(text):
        tokens.extend(prep(sentence).split())
    return tokens

In [21]:
def get_emb(tokens, model):
    """Build one embedding for a token list by averaging its word vectors.

    Parameters
    ----------
    tokens : list of str
        Tokens to embed.
    model : object
        Must expose ``model.wv`` (supports ``in`` and ``[]`` lookup by token)
        and ``model.vector_size``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens found in ``model.wv``.
        A zero vector of length ``model.vector_size`` is returned when
        ``tokens`` is empty or when none of the tokens is known to the model.
    """
    if not tokens:
        return np.zeros(model.vector_size)
    vectors = [model.wv[token] for token in tokens if token in model.wv]
    if not vectors:
        # Averaging an empty list would yield a scalar NaN (plus a warning)
        # and break every downstream cosine-similarity computation.
        return np.zeros(model.vector_size)
    return np.mean(vectors, axis=0)

In [23]:
# Work on a copy so the cleaned source frame `df` stays unchanged
df_clear = df.copy()

In [25]:
# Tokenize and normalize both text columns with clear_text
df_clear['context_token'] = df_clear['context'].apply(clear_text)
df_clear['question_token'] = df_clear['question'].apply(clear_text)

In [61]:
# Row-wise concatenation of the two token columns: each element is
# question tokens followed by context tokens (assumes both columns hold lists)
all_text = df_clear['question_token'] + df_clear['context_token']

In [63]:
def max_vector(text):
    """Return the length of the longest element in ``text``.

    Parameters
    ----------
    text : iterable of sized objects
        For example a Series of token lists.

    Returns
    -------
    int
        The largest ``len(element)``; 0 when ``text`` is empty.
    """
    # Built-in max with a default replaces the manual loop, which also
    # shadowed the ``max`` builtin with a local variable.
    return max((len(words) for words in text), default=0)

In [65]:
# Length of the longest combined (question + context) token list.
# NOTE(review): this is a document length, yet the notebook rounds it up and
# uses it as the embedding dimensionality — confirm that coupling is intended.
vector_size = max_vector(all_text)
vector_size

1072

Максимальная длина списка токенов (вопрос + контекст) — 1072. Округлим это значение до 1100 и возьмём его в качестве размерности вектора эмбеддинга.

In [67]:
# Use one worker thread per CPU core; os.cpu_count() returns None when the
# count cannot be determined, so fall back to a single worker in that case.
workers = os.cpu_count() or 1

In [69]:
# Build and train the FastText model on the combined token lists (1100-dimensional vectors);
# %time reports how long training takes.
# NOTE(review): with workers > 1 the fixed seed presumably does not make training fully reproducible — verify against the gensim docs
%time  model = FastText(all_text, workers=workers, vector_size=1100, min_count=1, sg = 1, window=10, seed=42)

CPU times: total: 1h 8min 13s
Wall time: 2min 28s


In [70]:
# Averaged word-vector embedding for every context and every question
df_clear['context_emb'] = df_clear['context_token'].apply(get_emb, model=model)
df_clear['question_emb'] = df_clear['question_token'].apply(get_emb, model=model)

In [71]:
# Sanity-check the model on a single word: its five nearest neighbours
model.wv.most_similar(positive=["caution"], topn=5)

[('precaution', 0.9085862040519714),
 ('aution', 0.8958996534347534),
 ('diminution', 0.8340831398963928),
 ('cautioned', 0.8101760745048523),
 ('ablution', 0.8084231019020081)]

In [75]:
# Preview the frame with the new token and embedding columns
df_clear.head(3)

Unnamed: 0,context,question,answer,context_token,question_token,context_emb,question_emb
0,Caption: Tasmanian berry grower Nic Hansen sho...,What is the Berry Export Summary 2028 and what...,The Berry Export Summary 2028 is a dedicated e...,"[caption, tasmanian, berry, grower, nic, hanse...","[berry, export, summary, 2028, purpose]","[-0.10235369, -0.1930192, -0.024718583, 0.1287...","[-0.25707194, -0.32706696, -0.02874051, 0.2370..."
1,RWSN Collaborations Southern Africa Self-suppl...,What are some of the benefits reported from ha...,Benefits reported from having access to Self-s...,"[rwsn, collaboration, southern, africa, self, ...","[benefit, reported, access, self, supply, wate...","[-0.05442216, -0.11562431, -0.033297185, 0.096...","[-0.021468008, -0.15709007, -0.09668655, 0.145..."
2,All Android applications categories Descriptio...,What are the unique features of the Coolands f...,The unique features of the Coolands for Twitte...,"[android, application, category, description, ...","[unique, feature, coolands, twitter, app]","[-0.1050532, -0.17029549, -0.011876792, 0.0955...","[-0.077749975, -0.23713961, 0.013542196, 0.061..."


In [77]:
def cos_sim(df, model):
    """Cosine similarity between the context and question embedding of each row.

    ``df`` must provide the 'context_emb' and 'question_emb' columns holding
    numpy vectors. ``model`` is accepted but not used. Returns a list with
    one similarity value per row, in row order.
    """
    return [
        cosine_similarity(context_vec.reshape(1, -1), question_vec.reshape(1, -1))[0][0]
        for context_vec, question_vec in zip(df['context_emb'], df['question_emb'])
    ]

In [79]:
# Similarity between each question and its own context.
# Despite the name, 'cos_dist' holds what cos_sim returns, not a distance.
cos_dist = cos_sim(df_clear, model)
df_clear['cos_dist'] = cos_dist

In [81]:
# Preview the frame with the added 'cos_dist' column
df_clear.head(3)

Unnamed: 0,context,question,answer,context_token,question_token,context_emb,question_emb,cos_dist
0,Caption: Tasmanian berry grower Nic Hansen sho...,What is the Berry Export Summary 2028 and what...,The Berry Export Summary 2028 is a dedicated e...,"[caption, tasmanian, berry, grower, nic, hanse...","[berry, export, summary, 2028, purpose]","[-0.10235369, -0.1930192, -0.024718583, 0.1287...","[-0.25707194, -0.32706696, -0.02874051, 0.2370...",0.818019
1,RWSN Collaborations Southern Africa Self-suppl...,What are some of the benefits reported from ha...,Benefits reported from having access to Self-s...,"[rwsn, collaboration, southern, africa, self, ...","[benefit, reported, access, self, supply, wate...","[-0.05442216, -0.11562431, -0.033297185, 0.096...","[-0.021468008, -0.15709007, -0.09668655, 0.145...",0.916283
2,All Android applications categories Descriptio...,What are the unique features of the Coolands f...,The unique features of the Coolands for Twitte...,"[android, application, category, description, ...","[unique, feature, coolands, twitter, app]","[-0.1050532, -0.17029549, -0.011876792, 0.0955...","[-0.077749975, -0.23713961, 0.013542196, 0.061...",0.80964


In [83]:
def search(query, df, model, top_k=3):
    """Rank the contexts in ``df`` by cosine similarity to ``query``.

    Parameters
    ----------
    query : str
        Raw question text; it is cleaned with ``clear_text`` and embedded
        with ``get_emb``.
    df : pandas.DataFrame
        Must contain a 'context_emb' column of equal-length vectors.
    model : object
        Embedding model passed through to ``get_emb``.
    top_k : int, default 3
        Number of best matches to return.

    Returns
    -------
    pandas.DataFrame
        The 'context_emb' column of the ``top_k`` most similar rows, best
        match first; the index identifies the matched rows.
    """
    query_vec = get_emb(clear_text(query), model)
    # The full context matrix is rebuilt from the column on every call
    similarities = cosine_similarity([query_vec], list(df['context_emb']))[0]
    # Reverse first, then slice: same result as ``[-top_k:][::-1]`` for
    # top_k >= 1, but top_k == 0 now selects nothing instead of every row.
    top_indices = similarities.argsort()[::-1][:top_k]
    return df.iloc[top_indices][['context_emb']]

In [87]:
# For every question take the index of the single best-matching context
top_index = [
    search(row['question'], df_clear, model, top_k=1).index[0]
    for _, row in tqdm(df_clear.iterrows())
]

# 'cos' is 1 when the retrieved context belongs to the same row as the question
df_clear['top_index'] = top_index
is_own_context = df_clear['top_index'] == df_clear.index
df_clear['cos'] = is_own_context.astype(int)

11983it [09:33, 20.89it/s]


#### Поиск по всему датасету отработал за 9 минут 33 секунды

In [91]:
# Preview the frame with the 'top_index' and 'cos' columns
df_clear.head(3)

Unnamed: 0,context,question,answer,context_token,question_token,context_emb,question_emb,cos_dist,top_index,cos
0,Caption: Tasmanian berry grower Nic Hansen sho...,What is the Berry Export Summary 2028 and what...,The Berry Export Summary 2028 is a dedicated e...,"[caption, tasmanian, berry, grower, nic, hanse...","[berry, export, summary, 2028, purpose]","[-0.10235369, -0.1930192, -0.024718583, 0.1287...","[-0.25707194, -0.32706696, -0.02874051, 0.2370...",0.818019,0,1
1,RWSN Collaborations Southern Africa Self-suppl...,What are some of the benefits reported from ha...,Benefits reported from having access to Self-s...,"[rwsn, collaboration, southern, africa, self, ...","[benefit, reported, access, self, supply, wate...","[-0.05442216, -0.11562431, -0.033297185, 0.096...","[-0.021468008, -0.15709007, -0.09668655, 0.145...",0.916283,1,1
2,All Android applications categories Descriptio...,What are the unique features of the Coolands f...,The unique features of the Coolands for Twitte...,"[android, application, category, description, ...","[unique, feature, coolands, twitter, app]","[-0.1050532, -0.17029549, -0.011876792, 0.0955...","[-0.077749975, -0.23713961, 0.013542196, 0.061...",0.80964,303,0


In [93]:
# Share of questions whose top-1 result is (1) / is not (0) their own context
df_clear['cos'].value_counts(normalize = True)

cos
0    0.633648
1    0.366352
Name: proportion, dtype: float64

In [95]:
precision_cos = precision_score(df_clear.index, df_clear.top_index, average='micro')

print(precision_cos)

0.3663523324710006


#### Получили точность (top-1) около 37%

In [99]:
# Connect to the vector database. The endpoint can be overridden through the
# QDRANT_URL environment variable; the default is the local instance.
QDRANT_URL = os.environ.get("QDRANT_URL", "http://localhost:6333")
qdrant = QdrantClient(url = QDRANT_URL)

In [101]:
def create_collection(name, vec_size):
    """Create a Qdrant collection for cosine-distance vectors, if it is missing.

    Parameters
    ----------
    name : str
        Collection name.
    vec_size : int
        Dimensionality of the stored vectors.

    Notes
    -----
    Uses the module-level ``qdrant`` client. When the collection already
    exists it is left untouched, so re-running the notebook against a server
    that kept the collection does not fail.
    NOTE(review): an existing collection keeps its previous vector size and
    points — delete it manually if the embedding size changed.
    """
    if qdrant.collection_exists(collection_name=name):
        return
    qdrant.create_collection(
        collection_name=name,
        vectors_config=models.VectorParams(
            size=vec_size,
            distance=models.Distance.COSINE,
        ),
    )

In [103]:
def upload_batch(collection_name, points):
    """Upload ``points`` to the collection ``collection_name`` via the module-level ``qdrant`` client."""
    upload_kwargs = {"collection_name": collection_name, "points": points}
    qdrant.upload_points(**upload_kwargs)

In [105]:
# Use the row index as the point id stored in the vector database
df_clear['id'] = df_clear.index

In [107]:
# Create the 'rag' collection; take the vector size from the trained model
# instead of repeating the literal 1100 so the two cannot drift apart
create_collection('rag', model.vector_size)

In [109]:
# Build the list of points to upload: id, context embedding and the
# context text as payload
points = [
    models.PointStruct(
        id=row['id'],
        vector=row["context_emb"],
        payload={"context": row["context"]},
    )
    for _, row in tqdm(df_clear.iterrows())
]

11983it [00:07, 1502.87it/s]


In [111]:
# Upload all points to the 'rag' collection and time the call
%time upload_batch('rag', points)

CPU times: total: 3.11 s
Wall time: 8.04 s


In [113]:
# Count the points in the 'rag' collection to confirm the upload
qdrant.count(
    collection_name="rag",
    exact=True,
)

CountResult(count=11983)

In [115]:
# Retrieve the closest stored context for one sample question
query_string = df.question[777]

hits = qdrant.search(
    collection_name="rag",
    query_vector = get_emb(clear_text(query_string), model),
    limit=1,
)

print(query_string)
for hit in hits:
    # Label each value with its own name; "score:" used to be printed next
    # to the payload while id and score had no label at all
    print(hit.payload)
    print("id:", hit.id)
    print("score:", hit.score)

What changes were noticed in the family's house after Selina's intervention?
{'context': 'Hi Selina, “I feel so fortunate to have met you Selina. Thank you so much for your gift that you share to help others in need. The property that’s been in the family for five generations, and the forty-something year old house that currently stands on it, truly needed your healing and cleansing. After the cleansing, my whole family noticed the difference when they walked in the house. My daughter who was most affected by the spiritual activity said, without hesitation, that it “felt empty” – meaning no more spirits in the house. My parents were also very pleased and my father even began to speak of some tragedies which happened over the years I’d never heard of. We are all feeling blessed that you could bring our family and the spirits peace, and the property to a content state of being. My daughter is no longer afraid to be in the house by herself and our pets are no longer restless and afraid. B

In [117]:
# Top-1 retrieval through Qdrant for every question
top_index = []
for _, row in tqdm(df_clear.iterrows()):
    question_vec = get_emb(clear_text(row['question']), model)
    nearest = qdrant.search(collection_name="rag", query_vector=question_vec, limit=1)
    top_index.append(nearest[0].id)

11983it [03:39, 54.51it/s]


#### Как видим, используя векторную базу данных, мы увеличили скорость обработки примерно в 2,6 раза (9:33 против 3:39).

In [121]:
# Store the ids returned by Qdrant; 'eval_qd' is 1 when the returned id
# equals the row's own id
df_clear['top_index_qd'] = top_index
df_clear['eval_qd'] = (df_clear['top_index_qd'] == df_clear.id).astype(int)

In [123]:
# Share of questions for which Qdrant returned (1) / did not return (0) the row's own context
df_clear['eval_qd'].value_counts(normalize = True)

eval_qd
0    0.633648
1    0.366352
Name: proportion, dtype: float64