## Baseline с qdrant и tf-idf

In [None]:
from qdrant_client import QdrantClient, models
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

In [None]:
df = pd.read_csv('cleared_dataset.csv')

In [99]:
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
train_df = train_df.reset_index(drop=True)
test_df = test_df.reset_index(drop=True)

In [100]:
tfidf_vectorizer_train = TfidfVectorizer(stop_words='english', ngram_range=(1, 2), max_df=0.85, sublinear_tf=True)

In [101]:
tfidf_vectorizer_train.fit(train_df["context"])
context_test = tfidf_vectorizer_train.transform(test_df['context'])

In [103]:
context_all = tfidf_vectorizer_train.transform(df['context'])

In [102]:
context_test

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 647543 stored elements and shape (2395, 2317171)>

In [104]:
client = QdrantClient(url="http://localhost:6333", timeout=100000)

In [113]:
client.delete_collection(collection_name="sparse-coll")

True

In [114]:
client.create_collection(
    collection_name='sparse-coll',
    vectors_config={},
    sparse_vectors_config={
        "text": models.SparseVectorParams(
            index=models.SparseIndexParams(
                on_disk=False,
            )
        )
    },
)

True

In [106]:
print(test_df.iloc[0]["context"])
print(test_df.iloc[0]["question"])

A North London pub Friday night? You’ll lucky get ‘please’ ‘thank you’ bar staff. The fact Ollie patiently endured pleading smiles ran customer customer pretty amazing. The extra fact finally found spare five minutes indulge sewing knitting blog even amazing. ‘Yes, Karen. My rock god credentials mean I indulge monumental scale give silly witterings iota street cred. No, I won’t tell best place buy skinny jeans. No, Karen, I won’t give half pint cider house, matter much beg.’ I’m wearing Golden Vintage Cardigan took seven weeks knit. You glimpse photo JUST. It’s pattern Baby Cocktails I’m big fan knitting designs. That MMJ24 blog post today nothing short miracle. I’ve REALLY busy, getting work clear desk … four-day break Paris. But means I missing So Zo’s Brighton meet-up. I don’t know whether laugh cry. I imagine everyone Brighton chatting, shopping, comparing notes… Without me!!!! On hand… fabric, Paris, fabric, Paris, credit cards, I’m worth it, fabric, Paris. The fact I booked hotel

In [None]:
# Следует использовать points=models.Batch - загрузку в 1 запросе
points = []
for i in range(context_test.shape[0]):
    indices = context_test[i].indices.tolist()
    data = context_test[i].data.tolist()
    client.upsert(
        collection_name='sparse-coll',
        points = [
                models.PointStruct(
                    id=i,
                    payload={
                        'source_text': test_df.iloc[i]["context"]
                    },
                    vector={
                        'text': models.SparseVector(
                            indices=indices, values=data
                        )
                    },
                )
        ]
    )

In [116]:
correct = 0
for i, row in test_df.iterrows():
    query_text = row['question']
    query_vec = tfidf_vectorizer_train.transform([query_text])
    query_indices = query_vec[0].indices.tolist()
    query_data = query_vec[0].data.tolist()
    result = client.query_points(
        collection_name='sparse-coll',
        query=models.SparseVector(
            indices=query_indices,
            values=query_data,
        ),
        using="text",
        limit=1
    )
    top_n = len(result.points)
    res = [result.points[i].payload['source_text'] for i in range(top_n)]
    if row['context'] in res:
        correct += 1
    
print(correct)

2048


In [118]:
print(correct/len(test_df))

0.8551148225469729


### Hybrid search

In [None]:
from sentence_transformers import SentenceTransformer, util


model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2', device='mps')

In [120]:
embeded = model.encode(test_df['context'].tolist())

In [121]:
embeded = embeded.tolist()

In [122]:
len(embeded[0])

384

In [123]:
client.delete_collection(collection_name="hybrid_collection")

True

In [124]:
client.create_collection(
    collection_name='hybrid_collection',
    vectors_config={
        "dense_text": models.VectorParams(
            size=len(embeded[0]),
            distance=models.Distance.COSINE,
        )
    },
    sparse_vectors_config={
        "sparse_text": models.SparseVectorParams(
            index=models.SparseIndexParams(
                on_disk=False,
            )
        )
    },
)

True

In [126]:
for i in range(context_test.shape[0]):
    indices = context_test[i].indices.tolist()
    data = context_test[i].data.tolist()
    embed = embeded[i]
    context = test_df.iloc[i]["context"]
    client.upsert(
        collection_name='hybrid_collection',
        points = [
            models.PointStruct(
                id=i,
                payload={
                    'source_text': context
                },
                vector={
                    'dense_text': embed,
                    'sparse_text': models.SparseVector(
                        indices=indices, values=data
                    ),
                },
            )
        ]
    )

In [None]:
correct = 0
for i, row in test_df.iterrows():
    query_text = row['question']
    query_sparse = tfidf_vectorizer_train.transform([query_text])
    query_indices = query_vec[0].indices.tolist()
    query_data = query_vec[0].data.tolist()
    query_dense = model.encode(query_text).tolist()
    result = client.query_points(
        collection_name="hybrid_collection",
        prefetch=[
            models.Prefetch(
                query=models.SparseVector(indices=query_indices, values=query_data),
                using="sparse_text",
                limit=20,
            ),
            models.Prefetch(
                query=query_dense,
                using="dense_text",
                limit=20,
            ),
        ],
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=1
    )
    top_n = len(result.points)
    res = [result.points[i].payload['source_text'] for i in range(top_n)]
    if row['context'] in res:
        correct += 1


print(correct)
    

In [135]:
print(correct/len(test_df))

0.407098121085595


### TO-DO: Fine-tune sentence-transofrmers 
[Источник](https://sbert.net/docs/sentence_transformer/training_overview.html)